From e626f85a483cafc3d2011864d7526670df188ffd Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Micha=C5=82=20Staniewski?= Date: Mon, 4 Aug 2025 16:21:31 +0200 Subject: [PATCH 1/3] Fix allocation lookup in EnqueuedPool (#19638) This PR updates the allocation management algorithm to address cases where it fails to find suitable allocations due to how it performs lower bound searches. Example: ``` Freelist { Allocation(align=64, size=128), Allocation(align=64, size=256), Allocation(align=4096, size=128), Allocation(align=4096, size=1024), } ``` If we request `align=64`, `size=512`, the current code looks at `Allocation(align=4096, size=128)` and skips the rest, even though `Allocation(align=4096, size=1024)` would work. This PR introduces grouping the allocations by queue and alignment. --- source/adapters/level_zero/enqueued_pool.cpp | 128 +++++++++++++------ source/adapters/level_zero/enqueued_pool.hpp | 42 ++++-- test/adapters/level_zero/enqueue_alloc.cpp | 44 +++++++ 3 files changed, 163 insertions(+), 51 deletions(-) diff --git a/source/adapters/level_zero/enqueued_pool.cpp b/source/adapters/level_zero/enqueued_pool.cpp index 0df60df92f..637006256c 100644 --- a/source/adapters/level_zero/enqueued_pool.cpp +++ b/source/adapters/level_zero/enqueued_pool.cpp @@ -13,32 +13,71 @@ #include -EnqueuedPool::~EnqueuedPool() { cleanup(); } +namespace { std::optional -EnqueuedPool::getBestFit(size_t Size, size_t Alignment, void *Queue) { - auto Lock = std::lock_guard(Mutex); +getBestFitHelper(size_t Size, size_t Alignment, void *Queue, + EnqueuedPool::AllocationGroupMap &Freelist) { + // Iterate over the alignments for a given queue. + auto GroupIt = Freelist.lower_bound({Queue, Alignment}); + for (; GroupIt != Freelist.end() && GroupIt->first.Queue == Queue; + ++GroupIt) { + auto &AllocSet = GroupIt->second; + // Find the first allocation that is large enough. + auto AllocIt = AllocSet.lower_bound({nullptr, Size, nullptr, nullptr, 0}); + if (AllocIt != AllocSet.end()) { + auto BestFit = *AllocIt; + AllocSet.erase(AllocIt); + if (AllocSet.empty()) { + Freelist.erase(GroupIt); + } + return BestFit; + } + } + return std::nullopt; +} - Allocation Alloc = {nullptr, Size, nullptr, Queue, Alignment}; +void removeFromFreelist(const EnqueuedPool::Allocation &Alloc, + EnqueuedPool::AllocationGroupMap &Freelist, + bool IsGlobal) { + const EnqueuedPool::AllocationGroupKey Key = { + IsGlobal ? nullptr : Alloc.Queue, Alloc.Alignment}; - auto It = Freelist.lower_bound(Alloc); - if (It != Freelist.end() && It->Size >= Size && It->Queue == Queue && - It->Alignment >= Alignment) { - Allocation BestFit = *It; - Freelist.erase(It); + auto GroupIt = Freelist.find(Key); + assert(GroupIt != Freelist.end() && "Allocation group not found in freelist"); - return BestFit; + auto &AllocSet = GroupIt->second; + auto AllocIt = AllocSet.find(Alloc); + assert(AllocIt != AllocSet.end() && "Allocation not found in group"); + + AllocSet.erase(AllocIt); + if (AllocSet.empty()) { + Freelist.erase(GroupIt); } +} - // To make sure there's no match on other queues, we need to reset it to - // nullptr and try again. - Alloc.Queue = nullptr; - It = Freelist.lower_bound(Alloc); +} // namespace - if (It != Freelist.end() && It->Size >= Size && It->Alignment >= Alignment) { - Allocation BestFit = *It; - Freelist.erase(It); +EnqueuedPool::~EnqueuedPool() { cleanup(); } +std::optional +EnqueuedPool::getBestFit(size_t Size, size_t Alignment, void *Queue) { + auto Lock = std::lock_guard(Mutex); + + // First, try to find the best fit in the queue-specific freelist. + auto BestFit = getBestFitHelper(Size, Alignment, Queue, FreelistByQueue); + if (BestFit) { + // Remove the allocation from the global freelist as well. + removeFromFreelist(*BestFit, FreelistGlobal, true); + return BestFit; + } + + // If no fit was found in the queue-specific freelist, try the global + // freelist. + BestFit = getBestFitHelper(Size, Alignment, nullptr, FreelistGlobal); + if (BestFit) { + // Remove the allocation from the queue-specific freelist. + removeFromFreelist(*BestFit, FreelistByQueue, false); return BestFit; } @@ -52,45 +91,54 @@ void EnqueuedPool::insert(void *Ptr, size_t Size, ur_event_handle_t Event, uintptr_t Address = (uintptr_t)Ptr; size_t Alignment = Address & (~Address + 1); - Freelist.emplace(Allocation{Ptr, Size, Event, Queue, Alignment}); + Allocation Alloc = {Ptr, Size, Event, Queue, Alignment}; + FreelistByQueue[{Queue, Alignment}].emplace(Alloc); + FreelistGlobal[{nullptr, Alignment}].emplace(Alloc); } bool EnqueuedPool::cleanup() { auto Lock = std::lock_guard(Mutex); - auto FreedAllocations = !Freelist.empty(); + auto FreedAllocations = !FreelistGlobal.empty(); auto Ret [[maybe_unused]] = UR_RESULT_SUCCESS; - for (auto It : Freelist) { - Ret = MemFreeFn(It.Ptr); - assert(Ret == UR_RESULT_SUCCESS); - - if (It.Event) - EventReleaseFn(It.Event); + for (const auto &[GroupKey, AllocSet] : FreelistGlobal) { + for (const auto &Alloc : AllocSet) { + Ret = MemFreeFn(Alloc.Ptr); + assert(Ret == UR_RESULT_SUCCESS); + + if (Alloc.Event) { + EventReleaseFn(Alloc.Event); + } + } } - Freelist.clear(); + + FreelistGlobal.clear(); + FreelistByQueue.clear(); return FreedAllocations; } bool EnqueuedPool::cleanupForQueue(void *Queue) { auto Lock = std::lock_guard(Mutex); - - Allocation Alloc = {nullptr, 0, nullptr, Queue, 0}; - // first allocation on the freelist with the specific queue - auto It = Freelist.lower_bound(Alloc); - bool FreedAllocations = false; auto Ret [[maybe_unused]] = UR_RESULT_SUCCESS; - while (It != Freelist.end() && It->Queue == Queue) { - Ret = MemFreeFn(It->Ptr); - assert(Ret == UR_RESULT_SUCCESS); - - if (It->Event) - EventReleaseFn(It->Event); - - // Erase the current allocation and move to the next one - It = Freelist.erase(It); + auto GroupIt = FreelistByQueue.lower_bound({Queue, 0}); + while (GroupIt != FreelistByQueue.end() && GroupIt->first.Queue == Queue) { + auto &AllocSet = GroupIt->second; + for (const auto &Alloc : AllocSet) { + Ret = MemFreeFn(Alloc.Ptr); + assert(Ret == UR_RESULT_SUCCESS); + + if (Alloc.Event) { + EventReleaseFn(Alloc.Event); + } + + removeFromFreelist(Alloc, FreelistGlobal, true); + } + + // Move to the next group. + GroupIt = FreelistByQueue.erase(GroupIt); FreedAllocations = true; } diff --git a/source/adapters/level_zero/enqueued_pool.hpp b/source/adapters/level_zero/enqueued_pool.hpp index 66577b7652..8d582d2414 100644 --- a/source/adapters/level_zero/enqueued_pool.hpp +++ b/source/adapters/level_zero/enqueued_pool.hpp @@ -13,6 +13,7 @@ #include "ur_api.h" #include "ur_pool_manager.hpp" +#include #include #include @@ -43,25 +44,44 @@ class EnqueuedPool { bool cleanup(); bool cleanupForQueue(void *Queue); -private: - struct Comparator { - bool operator()(const Allocation &lhs, const Allocation &rhs) const { + // Allocations are grouped by queue and alignment. + struct AllocationGroupKey { + void *Queue; + size_t Alignment; + }; + + struct GroupComparator { + bool operator()(const AllocationGroupKey &lhs, + const AllocationGroupKey &rhs) const { if (lhs.Queue != rhs.Queue) { - return lhs.Queue < rhs.Queue; // Compare by queue handle first - } - if (lhs.Alignment != rhs.Alignment) { - return lhs.Alignment < rhs.Alignment; // Then by alignment + return lhs.Queue < rhs.Queue; } + return lhs.Alignment < rhs.Alignment; + } + }; + + // Then, the allocations are sorted by size. + struct SizeComparator { + bool operator()(const Allocation &lhs, const Allocation &rhs) const { if (lhs.Size != rhs.Size) { - return lhs.Size < rhs.Size; // Then by size + return lhs.Size < rhs.Size; } - return lhs.Ptr < rhs.Ptr; // Finally by pointer address + return lhs.Ptr < rhs.Ptr; } }; - using AllocationSet = std::set; + using AllocationGroup = std::set; + using AllocationGroupMap = + std::map; + +private: ur_mutex Mutex; - AllocationSet Freelist; + + // Freelist grouped by queue and alignment. + AllocationGroupMap FreelistByQueue; + // Freelist grouped by alignment only. + AllocationGroupMap FreelistGlobal; + event_release_callback_t EventReleaseFn; memory_free_callback_t MemFreeFn; }; diff --git a/test/adapters/level_zero/enqueue_alloc.cpp b/test/adapters/level_zero/enqueue_alloc.cpp index c2c44f317e..fa7c3ac55c 100644 --- a/test/adapters/level_zero/enqueue_alloc.cpp +++ b/test/adapters/level_zero/enqueue_alloc.cpp @@ -758,3 +758,47 @@ TEST_P(urL0EnqueueAllocMultiQueueMultiDeviceTest, ASSERT_NE(freeEvent, nullptr); } } + +using urL0EnqueueAllocStandaloneTest = uur::urQueueTest; +UUR_INSTANTIATE_DEVICE_TEST_SUITE(urL0EnqueueAllocStandaloneTest); + +TEST_P(urL0EnqueueAllocStandaloneTest, ReuseFittingAllocation) { + ur_usm_pool_handle_t pool = nullptr; + ur_usm_pool_desc_t pool_desc = {}; + ASSERT_SUCCESS(urUSMPoolCreate(context, &pool_desc, &pool)); + + auto makeAllocation = [&](uint32_t alignment, size_t size, void **ptr) { + const ur_usm_device_desc_t usm_device_desc{ + UR_STRUCTURE_TYPE_USM_DEVICE_DESC, nullptr, + /* device flags */ 0}; + + const ur_usm_desc_t usm_desc{UR_STRUCTURE_TYPE_USM_DESC, &usm_device_desc, + UR_USM_ADVICE_FLAG_DEFAULT, alignment}; + + ASSERT_SUCCESS( + urUSMDeviceAlloc(context, device, &usm_desc, pool, size, ptr)); + }; + + std::array allocations = {}; + makeAllocation(64, 128, &allocations[0]); + makeAllocation(64, 256, &allocations[1]); + makeAllocation(4096, 512, &allocations[2]); + makeAllocation(4096, 8192, &allocations[3]); + + ASSERT_SUCCESS( + urEnqueueUSMFreeExp(queue, pool, allocations[0], 0, nullptr, nullptr)); + ASSERT_SUCCESS( + urEnqueueUSMFreeExp(queue, pool, allocations[1], 0, nullptr, nullptr)); + ASSERT_SUCCESS( + urEnqueueUSMFreeExp(queue, pool, allocations[2], 0, nullptr, nullptr)); + ASSERT_SUCCESS( + urEnqueueUSMFreeExp(queue, pool, allocations[3], 0, nullptr, nullptr)); + + void *ptr = nullptr; + ASSERT_SUCCESS(urEnqueueUSMDeviceAllocExp(queue, pool, 8192, nullptr, 0, + nullptr, &ptr, nullptr)); + + ASSERT_EQ(ptr, allocations[3]); // Fitting allocation should be reused. + ASSERT_SUCCESS(urEnqueueUSMFreeExp(queue, pool, ptr, 0, nullptr, nullptr)); + ASSERT_SUCCESS(urQueueFinish(queue)); +} From 31787fc3134dc95596aad701842a633ba89c4c58 Mon Sep 17 00:00:00 2001 From: aarongreig Date: Mon, 4 Aug 2025 17:31:53 +0100 Subject: [PATCH 2/3] Revert "[UR][SYCL] Introduce UR api to set kernel args + launch in one call." (#19661) Reverts intel/llvm#18764 Looks like the naive level zero implementation now results in a performance regression, I think it's best to revert until l0 can better match the new interface. --- include/ur_api.h | 184 ----------- include/ur_api_funcs.def | 1 - include/ur_ddi.h | 10 - include/ur_print.h | 40 --- include/ur_print.hpp | 308 +----------------- .../EXP-ENQUEUE-KERNEL-LAUNCH-WITH-ARGS.rst | 77 ----- .../exp-enqueue-kernel-launch-with-args.yml | 170 ---------- scripts/core/registry.yml | 3 - scripts/parse_specs.py | 4 +- scripts/templates/helper.py | 7 +- scripts/templates/print.hpp.mako | 29 +- source/adapters/cuda/enqueue.cpp | 55 ---- source/adapters/cuda/ur_interface_loader.cpp | 1 - source/adapters/hip/enqueue.cpp | 55 ---- source/adapters/hip/ur_interface_loader.cpp | 1 - source/adapters/level_zero/kernel.cpp | 167 ---------- .../level_zero/ur_interface_loader.cpp | 2 - .../level_zero/ur_interface_loader.hpp | 9 - .../level_zero/v2/command_list_manager.cpp | 58 ---- .../level_zero/v2/command_list_manager.hpp | 10 - source/adapters/level_zero/v2/kernel.cpp | 29 +- source/adapters/level_zero/v2/queue_api.cpp | 16 - source/adapters/level_zero/v2/queue_api.hpp | 5 - .../v2/queue_immediate_in_order.hpp | 16 - .../v2/queue_immediate_out_of_order.hpp | 18 - source/adapters/mock/ur_mockddi.cpp | 104 ------ source/adapters/native_cpu/enqueue.cpp | 42 --- source/adapters/native_cpu/kernel.cpp | 27 +- source/adapters/native_cpu/kernel.hpp | 31 +- .../native_cpu/ur_interface_loader.cpp | 1 - source/adapters/offload/enqueue.cpp | 37 --- source/adapters/offload/kernel.cpp | 9 + source/adapters/offload/kernel.hpp | 11 +- .../adapters/offload/ur_interface_loader.cpp | 1 - source/adapters/opencl/enqueue.cpp | 103 ------ .../adapters/opencl/ur_interface_loader.cpp | 1 - source/common/stype_map_helpers.def | 3 - .../loader/layers/sanitizer/asan/asan_ddi.cpp | 134 -------- .../loader/layers/sanitizer/msan/msan_ddi.cpp | 137 -------- .../loader/layers/sanitizer/tsan/tsan_ddi.cpp | 135 -------- source/loader/layers/tracing/ur_trcddi.cpp | 96 ------ source/loader/layers/validation/ur_valddi.cpp | 120 ------- source/loader/loader.def.in | 5 - source/loader/loader.map.in | 5 - source/loader/ur_ldrddi.cpp | 61 ---- source/loader/ur_libapi.cpp | 98 ------ source/loader/ur_print.cpp | 32 -- source/ur_api.cpp | 89 ----- test/conformance/CMakeLists.txt | 1 - .../CMakeLists.txt | 9 - .../urEnqueueKernelLaunchWithArgsExp.cpp | 303 ----------------- 51 files changed, 73 insertions(+), 2797 deletions(-) delete mode 100644 scripts/core/EXP-ENQUEUE-KERNEL-LAUNCH-WITH-ARGS.rst delete mode 100644 scripts/core/exp-enqueue-kernel-launch-with-args.yml delete mode 100644 test/conformance/exp_enqueue_kernel_launch_with_args/CMakeLists.txt delete mode 100644 test/conformance/exp_enqueue_kernel_launch_with_args/urEnqueueKernelLaunchWithArgsExp.cpp diff --git a/include/ur_api.h b/include/ur_api.h index 5882015c55..1bba8a950e 100644 --- a/include/ur_api.h +++ b/include/ur_api.h @@ -475,8 +475,6 @@ typedef enum ur_function_t { UR_FUNCTION_MEMORY_EXPORT_EXPORT_MEMORY_HANDLE_EXP = 287, /// Enumerator for ::urBindlessImagesSupportsImportingHandleTypeExp UR_FUNCTION_BINDLESS_IMAGES_SUPPORTS_IMPORTING_HANDLE_TYPE_EXP = 288, - /// Enumerator for ::urEnqueueKernelLaunchWithArgsExp - UR_FUNCTION_ENQUEUE_KERNEL_LAUNCH_WITH_ARGS_EXP = 289, /// @cond UR_FUNCTION_FORCE_UINT32 = 0x7fffffff /// @endcond @@ -590,8 +588,6 @@ typedef enum ur_structure_type_t { UR_STRUCTURE_TYPE_EXP_ENQUEUE_NATIVE_COMMAND_PROPERTIES = 0x3000, /// ::ur_exp_enqueue_ext_properties_t UR_STRUCTURE_TYPE_EXP_ENQUEUE_EXT_PROPERTIES = 0x4000, - /// ::ur_exp_kernel_arg_properties_t - UR_STRUCTURE_TYPE_EXP_KERNEL_ARG_PROPERTIES = 0x5000, /// @cond UR_STRUCTURE_TYPE_FORCE_UINT32 = 0x7fffffff /// @endcond @@ -12847,166 +12843,6 @@ UR_APIEXPORT ur_result_t UR_APICALL urUsmP2PPeerAccessGetInfoExp( /// propName. size_t *pPropSizeRet); -#if !defined(__GNUC__) -#pragma endregion -#endif -// Intel 'oneAPI' Unified Runtime Experimental API for setting args at kernel -// launch -#if !defined(__GNUC__) -#pragma region enqueue_kernel_launch_with_args_(experimental) -#endif -/////////////////////////////////////////////////////////////////////////////// -/// @brief What kind of kernel arg is this -typedef enum ur_exp_kernel_arg_type_t { - /// Kernel arg is a value. - UR_EXP_KERNEL_ARG_TYPE_VALUE = 0, - /// Kernel arg is a pointer. - UR_EXP_KERNEL_ARG_TYPE_POINTER = 1, - /// Kernel arg is a memory object. - UR_EXP_KERNEL_ARG_TYPE_MEM_OBJ = 2, - /// Kernel arg is a local allocation. - UR_EXP_KERNEL_ARG_TYPE_LOCAL = 3, - /// Kernel arg is a sampler. - UR_EXP_KERNEL_ARG_TYPE_SAMPLER = 4, - /// @cond - UR_EXP_KERNEL_ARG_TYPE_FORCE_UINT32 = 0x7fffffff - /// @endcond - -} ur_exp_kernel_arg_type_t; - -/////////////////////////////////////////////////////////////////////////////// -/// @brief Mem obj/properties tuple -typedef struct ur_exp_kernel_arg_mem_obj_tuple_t { - /// [in] Handle of a memory object - ur_mem_handle_t hMem; - /// [in] Memory flags to associate with `hMem`. Allowed values are: - /// ::UR_MEM_FLAG_READ_WRITE, ::UR_MEM_FLAG_WRITE_ONLY, - /// ::UR_MEM_FLAG_READ_ONLY. - ur_mem_flags_t flags; - -} ur_exp_kernel_arg_mem_obj_tuple_t; - -/////////////////////////////////////////////////////////////////////////////// -/// @brief Typesafe container for a kernel argument value -typedef union ur_exp_kernel_arg_value_t { - /// [in] argument value represented as matching arg type. - /// The data pointed to will be copied and therefore can be reused on return. - const void *value; - /// [in] Allocation obtained by USM allocation or virtual memory mapping - /// operation, or pointer to a literal value. - const void *pointer; - /// [in] Struct containing a memory object and associated flags. - ur_exp_kernel_arg_mem_obj_tuple_t memObjTuple; - /// [in] Handle of a sampler object. - ur_sampler_handle_t sampler; - -} ur_exp_kernel_arg_value_t; - -/////////////////////////////////////////////////////////////////////////////// -/// @brief Kernel arg properties -typedef struct ur_exp_kernel_arg_properties_t { - /// [in] type of this structure, must be - /// ::UR_STRUCTURE_TYPE_EXP_KERNEL_ARG_PROPERTIES - ur_structure_type_t stype; - /// [in,out][optional] pointer to extension-specific structure - void *pNext; - /// [in] type of the kernel arg - ur_exp_kernel_arg_type_t type; - /// [in] index of the kernel arg - uint32_t index; - /// [in] size of the kernel arg - size_t size; - /// [in][tagged_by(type)] Union containing the argument value. - ur_exp_kernel_arg_value_t value; - -} ur_exp_kernel_arg_properties_t; - -/////////////////////////////////////////////////////////////////////////////// -/// @brief Enqueue a command to execute a kernel -/// -/// @remarks -/// _Analogues_ -/// - **clEnqueueNDRangeKernel** -/// -/// @returns -/// - ::UR_RESULT_SUCCESS -/// - ::UR_RESULT_ERROR_UNINITIALIZED -/// - ::UR_RESULT_ERROR_DEVICE_LOST -/// - ::UR_RESULT_ERROR_ADAPTER_SPECIFIC -/// - ::UR_RESULT_ERROR_INVALID_NULL_HANDLE -/// + `NULL == hQueue` -/// + `NULL == hKernel` -/// - ::UR_RESULT_ERROR_INVALID_NULL_POINTER -/// + `NULL == pGlobalWorkSize` -/// + `launchPropList == NULL && numPropsInLaunchPropList > 0` -/// + `pArgs == NULL && numArgs > 0` -/// - ::UR_RESULT_ERROR_INVALID_ENUMERATION -/// + `NULL != pArgs && ::UR_EXP_KERNEL_ARG_TYPE_SAMPLER < pArgs->type` -/// - ::UR_RESULT_ERROR_INVALID_QUEUE -/// - ::UR_RESULT_ERROR_INVALID_KERNEL -/// - ::UR_RESULT_ERROR_INVALID_EVENT -/// - ::UR_RESULT_ERROR_INVALID_EVENT_WAIT_LIST -/// + `phEventWaitList == NULL && numEventsInWaitList > 0` -/// + `phEventWaitList != NULL && numEventsInWaitList == 0` -/// + If event objects in phEventWaitList are not valid events. -/// - ::UR_RESULT_ERROR_IN_EVENT_LIST_EXEC_STATUS -/// + An event in `phEventWaitList` has ::UR_EVENT_STATUS_ERROR. -/// - ::UR_RESULT_ERROR_INVALID_WORK_DIMENSION -/// + `pGlobalWorkSize[0] == 0 || pGlobalWorkSize[1] == 0 || -/// pGlobalWorkSize[2] == 0` -/// - ::UR_RESULT_ERROR_INVALID_WORK_GROUP_SIZE -/// + `pLocalWorkSize && (pLocalWorkSize[0] == 0 || pLocalWorkSize[1] == -/// 0 || pLocalWorkSize[2] == 0)` -/// - ::UR_RESULT_ERROR_INVALID_VALUE -/// - ::UR_RESULT_ERROR_INVALID_KERNEL_ARGS - "The kernel argument values -/// have not been specified." -/// - ::UR_RESULT_ERROR_OUT_OF_HOST_MEMORY -/// - ::UR_RESULT_ERROR_OUT_OF_RESOURCES -/// - ::UR_RESULT_ERROR_INVALID_OPERATION -/// + If any property in `launchPropList` isn't supported by the device. -UR_APIEXPORT ur_result_t UR_APICALL urEnqueueKernelLaunchWithArgsExp( - /// [in] handle of the queue object - ur_queue_handle_t hQueue, - /// [in] handle of the kernel object - ur_kernel_handle_t hKernel, - /// [in] number of dimensions, from 1 to 3, to specify the global and - /// work-group work-items - uint32_t workDim, - /// [in][optional] pointer to an array of workDim unsigned values that - /// specify the offset used to calculate the global ID of a work-item - const size_t *pGlobalWorkOffset, - /// [in] pointer to an array of workDim unsigned values that specify the - /// number of global work-items in workDim that will execute the kernel - /// function - const size_t *pGlobalWorkSize, - /// [in][optional] pointer to an array of workDim unsigned values that - /// specify the number of local work-items forming a work-group that will - /// execute the kernel function. - /// If nullptr, the runtime implementation will choose the work-group size. - const size_t *pLocalWorkSize, - /// [in] Number of entries in pArgs - uint32_t numArgs, - /// [in][optional][range(0, numArgs)] pointer to a list of kernel arg - /// properties. - const ur_exp_kernel_arg_properties_t *pArgs, - /// [in] size of the launch prop list - uint32_t numPropsInLaunchPropList, - /// [in][optional][range(0, numPropsInLaunchPropList)] pointer to a list - /// of launch properties - const ur_kernel_launch_property_t *launchPropList, - /// [in] size of the event wait list - uint32_t numEventsInWaitList, - /// [in][optional][range(0, numEventsInWaitList)] pointer to a list of - /// events that must be complete before the kernel execution. - /// If nullptr, the numEventsInWaitList must be 0, indicating that no wait - /// event. - const ur_event_handle_t *phEventWaitList, - /// [out][optional][alloc] return an event object that identifies this - /// particular kernel execution instance. If phEventWaitList and phEvent - /// are not NULL, phEvent must not refer to an element of the - /// phEventWaitList array. - ur_event_handle_t *phEvent); - #if !defined(__GNUC__) #pragma endregion #endif @@ -14609,26 +14445,6 @@ typedef struct ur_enqueue_write_host_pipe_params_t { ur_event_handle_t **pphEvent; } ur_enqueue_write_host_pipe_params_t; -/////////////////////////////////////////////////////////////////////////////// -/// @brief Function parameters for urEnqueueKernelLaunchWithArgsExp -/// @details Each entry is a pointer to the parameter passed to the function; -/// allowing the callback the ability to modify the parameter's value -typedef struct ur_enqueue_kernel_launch_with_args_exp_params_t { - ur_queue_handle_t *phQueue; - ur_kernel_handle_t *phKernel; - uint32_t *pworkDim; - const size_t **ppGlobalWorkOffset; - const size_t **ppGlobalWorkSize; - const size_t **ppLocalWorkSize; - uint32_t *pnumArgs; - const ur_exp_kernel_arg_properties_t **ppArgs; - uint32_t *pnumPropsInLaunchPropList; - const ur_kernel_launch_property_t **plaunchPropList; - uint32_t *pnumEventsInWaitList; - const ur_event_handle_t **pphEventWaitList; - ur_event_handle_t **pphEvent; -} ur_enqueue_kernel_launch_with_args_exp_params_t; - /////////////////////////////////////////////////////////////////////////////// /// @brief Function parameters for urEnqueueEventsWaitWithBarrierExt /// @details Each entry is a pointer to the parameter passed to the function; diff --git a/include/ur_api_funcs.def b/include/ur_api_funcs.def index 97092258a5..f0c92445b9 100644 --- a/include/ur_api_funcs.def +++ b/include/ur_api_funcs.def @@ -133,7 +133,6 @@ _UR_API(urEnqueueDeviceGlobalVariableRead) _UR_API(urEnqueueReadHostPipe) _UR_API(urEnqueueWriteHostPipe) _UR_API(urEnqueueEventsWaitWithBarrierExt) -_UR_API(urEnqueueKernelLaunchWithArgsExp) _UR_API(urEnqueueUSMDeviceAllocExp) _UR_API(urEnqueueUSMSharedAllocExp) _UR_API(urEnqueueUSMHostAllocExp) diff --git a/include/ur_ddi.h b/include/ur_ddi.h index b1033a027a..8ab686aa58 100644 --- a/include/ur_ddi.h +++ b/include/ur_ddi.h @@ -1097,15 +1097,6 @@ UR_DLLEXPORT ur_result_t UR_APICALL urGetEnqueueProcAddrTable( typedef ur_result_t(UR_APICALL *ur_pfnGetEnqueueProcAddrTable_t)( ur_api_version_t, ur_enqueue_dditable_t *); -/////////////////////////////////////////////////////////////////////////////// -/// @brief Function-pointer for urEnqueueKernelLaunchWithArgsExp -typedef ur_result_t(UR_APICALL *ur_pfnEnqueueKernelLaunchWithArgsExp_t)( - ur_queue_handle_t, ur_kernel_handle_t, uint32_t, const size_t *, - const size_t *, const size_t *, uint32_t, - const ur_exp_kernel_arg_properties_t *, uint32_t, - const ur_kernel_launch_property_t *, uint32_t, const ur_event_handle_t *, - ur_event_handle_t *); - /////////////////////////////////////////////////////////////////////////////// /// @brief Function-pointer for urEnqueueUSMDeviceAllocExp typedef ur_result_t(UR_APICALL *ur_pfnEnqueueUSMDeviceAllocExp_t)( @@ -1156,7 +1147,6 @@ typedef ur_result_t(UR_APICALL *ur_pfnEnqueueNativeCommandExp_t)( /////////////////////////////////////////////////////////////////////////////// /// @brief Table of EnqueueExp functions pointers typedef struct ur_enqueue_exp_dditable_t { - ur_pfnEnqueueKernelLaunchWithArgsExp_t pfnKernelLaunchWithArgsExp; ur_pfnEnqueueUSMDeviceAllocExp_t pfnUSMDeviceAllocExp; ur_pfnEnqueueUSMSharedAllocExp_t pfnUSMSharedAllocExp; ur_pfnEnqueueUSMHostAllocExp_t pfnUSMHostAllocExp; diff --git a/include/ur_print.h b/include/ur_print.h index 3e1f03a3aa..8130df0c5b 100644 --- a/include/ur_print.h +++ b/include/ur_print.h @@ -1415,36 +1415,6 @@ UR_APIEXPORT ur_result_t UR_APICALL urPrintExpPeerInfo(enum ur_exp_peer_info_t value, char *buffer, const size_t buff_size, size_t *out_size); -/////////////////////////////////////////////////////////////////////////////// -/// @brief Print ur_exp_kernel_arg_type_t enum -/// @returns -/// - ::UR_RESULT_SUCCESS -/// - ::UR_RESULT_ERROR_INVALID_SIZE -/// - `buff_size < out_size` -UR_APIEXPORT ur_result_t UR_APICALL -urPrintExpKernelArgType(enum ur_exp_kernel_arg_type_t value, char *buffer, - const size_t buff_size, size_t *out_size); - -/////////////////////////////////////////////////////////////////////////////// -/// @brief Print ur_exp_kernel_arg_mem_obj_tuple_t struct -/// @returns -/// - ::UR_RESULT_SUCCESS -/// - ::UR_RESULT_ERROR_INVALID_SIZE -/// - `buff_size < out_size` -UR_APIEXPORT ur_result_t UR_APICALL urPrintExpKernelArgMemObjTuple( - const struct ur_exp_kernel_arg_mem_obj_tuple_t params, char *buffer, - const size_t buff_size, size_t *out_size); - -/////////////////////////////////////////////////////////////////////////////// -/// @brief Print ur_exp_kernel_arg_properties_t struct -/// @returns -/// - ::UR_RESULT_SUCCESS -/// - ::UR_RESULT_ERROR_INVALID_SIZE -/// - `buff_size < out_size` -UR_APIEXPORT ur_result_t UR_APICALL urPrintExpKernelArgProperties( - const struct ur_exp_kernel_arg_properties_t params, char *buffer, - const size_t buff_size, size_t *out_size); - /////////////////////////////////////////////////////////////////////////////// /// @brief Print ur_exp_enqueue_ext_flag_t enum /// @returns @@ -2714,16 +2684,6 @@ UR_APIEXPORT ur_result_t UR_APICALL urPrintEnqueueWriteHostPipeParams( const struct ur_enqueue_write_host_pipe_params_t *params, char *buffer, const size_t buff_size, size_t *out_size); -/////////////////////////////////////////////////////////////////////////////// -/// @brief Print ur_enqueue_kernel_launch_with_args_exp_params_t struct -/// @returns -/// - ::UR_RESULT_SUCCESS -/// - ::UR_RESULT_ERROR_INVALID_SIZE -/// - `buff_size < out_size` -UR_APIEXPORT ur_result_t UR_APICALL urPrintEnqueueKernelLaunchWithArgsExpParams( - const struct ur_enqueue_kernel_launch_with_args_exp_params_t *params, - char *buffer, const size_t buff_size, size_t *out_size); - /////////////////////////////////////////////////////////////////////////////// /// @brief Print ur_enqueue_events_wait_with_barrier_ext_params_t struct /// @returns diff --git a/include/ur_print.hpp b/include/ur_print.hpp index 93cc0d5f2b..91c9973a3a 100644 --- a/include/ur_print.hpp +++ b/include/ur_print.hpp @@ -50,8 +50,6 @@ inline ur_result_t printFlag(std::ostream &os, uint32_t flag); template inline ur_result_t printTagged(std::ostream &os, const void *ptr, T value, size_t size); -template -inline ur_result_t printArray(std::ostream &os, const T *ptr); inline ur_result_t printStruct(std::ostream &os, const void *ptr); @@ -265,10 +263,6 @@ template <> inline ur_result_t printTagged(std::ostream &os, const void *ptr, ur_exp_peer_info_t value, size_t size); -inline ur_result_t printUnion(std::ostream &os, - const union ur_exp_kernel_arg_value_t params, - const enum ur_exp_kernel_arg_type_t tag); - template <> inline ur_result_t printFlag(std::ostream &os, uint32_t flag); @@ -598,14 +592,6 @@ operator<<(std::ostream &os, [[maybe_unused]] const struct ur_exp_command_buffer_update_kernel_launch_desc_t params); inline std::ostream &operator<<(std::ostream &os, enum ur_exp_peer_info_t value); -inline std::ostream &operator<<(std::ostream &os, - enum ur_exp_kernel_arg_type_t value); -inline std::ostream &operator<<( - std::ostream &os, - [[maybe_unused]] const struct ur_exp_kernel_arg_mem_obj_tuple_t params); -inline std::ostream & -operator<<(std::ostream &os, - [[maybe_unused]] const struct ur_exp_kernel_arg_properties_t params); inline std::ostream &operator<<(std::ostream &os, enum ur_exp_enqueue_ext_flag_t value); inline std::ostream &operator<<( @@ -1288,9 +1274,6 @@ inline std::ostream &operator<<(std::ostream &os, enum ur_function_t value) { case UR_FUNCTION_BINDLESS_IMAGES_SUPPORTS_IMPORTING_HANDLE_TYPE_EXP: os << "UR_FUNCTION_BINDLESS_IMAGES_SUPPORTS_IMPORTING_HANDLE_TYPE_EXP"; break; - case UR_FUNCTION_ENQUEUE_KERNEL_LAUNCH_WITH_ARGS_EXP: - os << "UR_FUNCTION_ENQUEUE_KERNEL_LAUNCH_WITH_ARGS_EXP"; - break; default: os << "unknown enumerator"; break; @@ -1460,9 +1443,6 @@ inline std::ostream &operator<<(std::ostream &os, case UR_STRUCTURE_TYPE_EXP_ENQUEUE_EXT_PROPERTIES: os << "UR_STRUCTURE_TYPE_EXP_ENQUEUE_EXT_PROPERTIES"; break; - case UR_STRUCTURE_TYPE_EXP_KERNEL_ARG_PROPERTIES: - os << "UR_STRUCTURE_TYPE_EXP_KERNEL_ARG_PROPERTIES"; - break; default: os << "unknown enumerator"; break; @@ -1779,12 +1759,6 @@ inline ur_result_t printStruct(std::ostream &os, const void *ptr) { (const ur_exp_enqueue_ext_properties_t *)ptr; printPtr(os, pstruct); } break; - - case UR_STRUCTURE_TYPE_EXP_KERNEL_ARG_PROPERTIES: { - const ur_exp_kernel_arg_properties_t *pstruct = - (const ur_exp_kernel_arg_properties_t *)ptr; - printPtr(os, pstruct); - } break; default: os << "unknown enumerator"; return UR_RESULT_ERROR_INVALID_ENUMERATION; @@ -10920,7 +10894,13 @@ printUnion(std::ostream &os, case UR_KERNEL_LAUNCH_PROPERTY_ID_CLUSTER_DIMENSION: os << ".clusterDim = {"; - ur::details::printArray<3>(os, params.clusterDim); + for (auto i = 0; i < 3; i++) { + if (i != 0) { + os << ", "; + } + + os << (params.clusterDim[i]); + } os << "}"; break; @@ -11565,7 +11545,13 @@ operator<<(std::ostream &os, const struct ur_exp_sampler_addr_modes_t params) { os << ", "; os << ".addrModes = {"; - ur::details::printArray<3>(os, params.addrModes); + for (auto i = 0; i < 3; i++) { + if (i != 0) { + os << ", "; + } + + os << (params.addrModes[i]); + } os << "}"; os << "}"; @@ -12284,141 +12270,6 @@ inline ur_result_t printTagged(std::ostream &os, const void *ptr, } } // namespace ur::details -/////////////////////////////////////////////////////////////////////////////// -/// @brief Print operator for the ur_exp_kernel_arg_type_t type -/// @returns -/// std::ostream & -inline std::ostream &operator<<(std::ostream &os, - enum ur_exp_kernel_arg_type_t value) { - switch (value) { - case UR_EXP_KERNEL_ARG_TYPE_VALUE: - os << "UR_EXP_KERNEL_ARG_TYPE_VALUE"; - break; - case UR_EXP_KERNEL_ARG_TYPE_POINTER: - os << "UR_EXP_KERNEL_ARG_TYPE_POINTER"; - break; - case UR_EXP_KERNEL_ARG_TYPE_MEM_OBJ: - os << "UR_EXP_KERNEL_ARG_TYPE_MEM_OBJ"; - break; - case UR_EXP_KERNEL_ARG_TYPE_LOCAL: - os << "UR_EXP_KERNEL_ARG_TYPE_LOCAL"; - break; - case UR_EXP_KERNEL_ARG_TYPE_SAMPLER: - os << "UR_EXP_KERNEL_ARG_TYPE_SAMPLER"; - break; - default: - os << "unknown enumerator"; - break; - } - return os; -} -/////////////////////////////////////////////////////////////////////////////// -/// @brief Print operator for the ur_exp_kernel_arg_mem_obj_tuple_t type -/// @returns -/// std::ostream & -inline std::ostream & -operator<<(std::ostream &os, - const struct ur_exp_kernel_arg_mem_obj_tuple_t params) { - os << "(struct ur_exp_kernel_arg_mem_obj_tuple_t){"; - - os << ".hMem = "; - - ur::details::printPtr(os, (params.hMem)); - - os << ", "; - os << ".flags = "; - - ur::details::printFlag(os, (params.flags)); - - os << "}"; - return os; -} -namespace ur::details { - -/////////////////////////////////////////////////////////////////////////////// -// @brief Print ur_exp_kernel_arg_value_t union -inline ur_result_t printUnion(std::ostream &os, - const union ur_exp_kernel_arg_value_t params, - const enum ur_exp_kernel_arg_type_t tag) { - os << "(union ur_exp_kernel_arg_value_t){"; - - switch (tag) { - case UR_EXP_KERNEL_ARG_TYPE_VALUE: - - os << ".value = "; - - ur::details::printPtr(os, (params.value)); - - break; - case UR_EXP_KERNEL_ARG_TYPE_POINTER: - - os << ".pointer = "; - - ur::details::printPtr(os, (params.pointer)); - - break; - case UR_EXP_KERNEL_ARG_TYPE_MEM_OBJ: - - os << ".memObjTuple = "; - - os << (params.memObjTuple); - - break; - case UR_EXP_KERNEL_ARG_TYPE_SAMPLER: - - os << ".sampler = "; - - ur::details::printPtr(os, (params.sampler)); - - break; - default: - os << ""; - return UR_RESULT_ERROR_INVALID_ENUMERATION; - } - os << "}"; - return UR_RESULT_SUCCESS; -} -} // namespace ur::details -/////////////////////////////////////////////////////////////////////////////// -/// @brief Print operator for the ur_exp_kernel_arg_properties_t type -/// @returns -/// std::ostream & -inline std::ostream & -operator<<(std::ostream &os, - const struct ur_exp_kernel_arg_properties_t params) { - os << "(struct ur_exp_kernel_arg_properties_t){"; - - os << ".stype = "; - - os << (params.stype); - - os << ", "; - os << ".pNext = "; - - ur::details::printStruct(os, (params.pNext)); - - os << ", "; - os << ".type = "; - - os << (params.type); - - os << ", "; - os << ".index = "; - - os << (params.index); - - os << ", "; - os << ".size = "; - - os << (params.size); - - os << ", "; - os << ".value = "; - ur::details::printUnion(os, (params.value), params.type); - - os << "}"; - return os; -} /////////////////////////////////////////////////////////////////////////////// /// @brief Print operator for the ur_exp_enqueue_ext_flag_t type /// @returns @@ -17068,114 +16919,6 @@ inline std::ostream &operator<<( return os; } -/////////////////////////////////////////////////////////////////////////////// -/// @brief Print operator for the -/// ur_enqueue_kernel_launch_with_args_exp_params_t type -/// @returns -/// std::ostream & -inline std::ostream & -operator<<(std::ostream &os, [[maybe_unused]] const struct - ur_enqueue_kernel_launch_with_args_exp_params_t *params) { - - os << ".hQueue = "; - - ur::details::printPtr(os, *(params->phQueue)); - - os << ", "; - os << ".hKernel = "; - - ur::details::printPtr(os, *(params->phKernel)); - - os << ", "; - os << ".workDim = "; - - os << *(params->pworkDim); - - os << ", "; - os << ".pGlobalWorkOffset = "; - - ur::details::printPtr(os, *(params->ppGlobalWorkOffset)); - - os << ", "; - os << ".pGlobalWorkSize = "; - - ur::details::printPtr(os, *(params->ppGlobalWorkSize)); - - os << ", "; - os << ".pLocalWorkSize = "; - - ur::details::printPtr(os, *(params->ppLocalWorkSize)); - - os << ", "; - os << ".numArgs = "; - - os << *(params->pnumArgs); - - os << ", "; - os << ".pArgs = "; - ur::details::printPtr(os, reinterpret_cast(*(params->ppArgs))); - if (*(params->ppArgs) != NULL) { - os << " {"; - for (size_t i = 0; i < *params->pnumArgs; ++i) { - if (i != 0) { - os << ", "; - } - - os << (*(params->ppArgs))[i]; - } - os << "}"; - } - - os << ", "; - os << ".numPropsInLaunchPropList = "; - - os << *(params->pnumPropsInLaunchPropList); - - os << ", "; - os << ".launchPropList = "; - ur::details::printPtr( - os, reinterpret_cast(*(params->plaunchPropList))); - if (*(params->plaunchPropList) != NULL) { - os << " {"; - for (size_t i = 0; i < *params->pnumPropsInLaunchPropList; ++i) { - if (i != 0) { - os << ", "; - } - - os << (*(params->plaunchPropList))[i]; - } - os << "}"; - } - - os << ", "; - os << ".numEventsInWaitList = "; - - os << *(params->pnumEventsInWaitList); - - os << ", "; - os << ".phEventWaitList = "; - ur::details::printPtr( - os, reinterpret_cast(*(params->pphEventWaitList))); - if (*(params->pphEventWaitList) != NULL) { - os << " {"; - for (size_t i = 0; i < *params->pnumEventsInWaitList; ++i) { - if (i != 0) { - os << ", "; - } - - ur::details::printPtr(os, (*(params->pphEventWaitList))[i]); - } - os << "}"; - } - - os << ", "; - os << ".phEvent = "; - - ur::details::printPtr(os, *(params->pphEvent)); - - return os; -} - /////////////////////////////////////////////////////////////////////////////// /// @brief Print operator for the /// ur_enqueue_events_wait_with_barrier_ext_params_t type @@ -21304,25 +21047,6 @@ inline ur_result_t printPtr(std::ostream &os, const T *ptr) { return UR_RESULT_SUCCESS; } - -/////////////////////////////////////////////////////////////////////////////// -// @brief Print array of literals -template -inline ur_result_t printArray(std::ostream &os, const T *ptr) { - if (ptr == NULL) { - return printPtr(os, ptr); - } - - for (size_t i = 0; i < size; i++) { - if (i != 0) { - os << ", "; - } - - os << ptr[i]; - } - - return UR_RESULT_SUCCESS; -} } // namespace ur::details namespace ur::extras { @@ -21717,10 +21441,6 @@ inline ur_result_t UR_APICALL printFunctionParams(std::ostream &os, case UR_FUNCTION_ENQUEUE_WRITE_HOST_PIPE: { os << (const struct ur_enqueue_write_host_pipe_params_t *)params; } break; - case UR_FUNCTION_ENQUEUE_KERNEL_LAUNCH_WITH_ARGS_EXP: { - os << (const struct ur_enqueue_kernel_launch_with_args_exp_params_t *) - params; - } break; case UR_FUNCTION_ENQUEUE_EVENTS_WAIT_WITH_BARRIER_EXT: { os << (const struct ur_enqueue_events_wait_with_barrier_ext_params_t *) params; diff --git a/scripts/core/EXP-ENQUEUE-KERNEL-LAUNCH-WITH-ARGS.rst b/scripts/core/EXP-ENQUEUE-KERNEL-LAUNCH-WITH-ARGS.rst deleted file mode 100644 index 703cd1e935..0000000000 --- a/scripts/core/EXP-ENQUEUE-KERNEL-LAUNCH-WITH-ARGS.rst +++ /dev/null @@ -1,77 +0,0 @@ -<% - OneApi=tags['$OneApi'] - x=tags['$x'] - X=x.upper() -%> - -.. _experimental-enqueue-kernel-launch-with-args: - -================================================================================ -Enqueue Kernel Launch With Args -================================================================================ - -.. warning:: - - Experimental features: - - * May be replaced, updated, or removed at any time. - * Do not require maintaining API/ABI stability of their own additions over - time. - * Do not require conformance testing of their own additions. - - - -Motivation --------------------------------------------------------------------------------- - -If an application is setting a kernel's args and launching that kernel in the -same place, we can eliminate some overhead by allowing this to be accomplished -with one API call, rather than requiring one call for each argument and one to -launch. This also aligns with developments in the Level Zero backend, as well -as how CUDA and HIP handle kernel args. - -API --------------------------------------------------------------------------------- - -Enums -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -* ${x}_structure_type_t - ${X}_STRUCTURE_TYPE_EXP_KERNEL_ARG_PROPERTIES - -* ${x}_exp_kernel_arg_type_t - ${X}_EXP_KERNEL_ARG_TYPE_VALUE - ${X}_EXP_KERNEL_ARG_TYPE_POINTER - ${X}_EXP_KERNEL_ARG_TYPE_MEM_OBJ - ${X}_EXP_KERNEL_ARG_TYPE_LOCAL - ${X}_EXP_KERNEL_ARG_TYPE_SAMPLER - -Types -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -* ${x}_exp_kernel_arg_mem_obj_tuple_t -* ${x}_exp_kernel_arg_value_t -* ${x}_exp_kernel_arg_properties_t - -Functions -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -* ${x}EnqueueKernelLaunchWithArgsExp - -Changelog --------------------------------------------------------------------------------- - -+-----------+---------------------------------------------+ -| Revision | Changes | -+===========+=============================================+ -| 1.0 | Initial Draft | -+-----------+---------------------------------------------+ - -Support --------------------------------------------------------------------------------- - -Adapters must support this feature. A naive implementation can easily be -constructed as a wrapper around the existing APIs for setting kernel args and -launching. - -Contributors --------------------------------------------------------------------------------- - -* Aaron Greig `aaron.greig@codeplay.com `_ diff --git a/scripts/core/exp-enqueue-kernel-launch-with-args.yml b/scripts/core/exp-enqueue-kernel-launch-with-args.yml deleted file mode 100644 index 6656b6a6d0..0000000000 --- a/scripts/core/exp-enqueue-kernel-launch-with-args.yml +++ /dev/null @@ -1,170 +0,0 @@ -# -# Copyright (C) 2025 Intel Corporation -# -# Part of the Unified-Runtime Project, under the Apache License v2.0 with LLVM Exceptions. -# See LICENSE.TXT -# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -# -# See YaML.md for syntax definition -# ---- #-------------------------------------------------------------------------- -type: header -desc: "Intel $OneApi Unified Runtime Experimental API for setting args at kernel launch" -ordinal: "100" ---- #-------------------------------------------------------------------------- -type: enum -desc: "What kind of kernel arg is this" -class: $xKernel -name: $x_exp_kernel_arg_type_t -etors: - - name: VALUE - desc: "Kernel arg is a value." - - name: POINTER - desc: "Kernel arg is a pointer." - - name: MEM_OBJ - desc: "Kernel arg is a memory object." - - name: LOCAL - desc: "Kernel arg is a local allocation." - - name: SAMPLER - desc: "Kernel arg is a sampler." ---- #-------------------------------------------------------------------------- -type: struct -desc: "Mem obj/properties tuple" -name: $x_exp_kernel_arg_mem_obj_tuple_t -members: - - type: $x_mem_handle_t - name: hMem - desc: "[in] Handle of a memory object" - - type: $x_mem_flags_t - name: flags - desc: "[in] Memory flags to associate with `hMem`. Allowed values are: $X_MEM_FLAG_READ_WRITE, $X_MEM_FLAG_WRITE_ONLY, $X_MEM_FLAG_READ_ONLY." ---- #-------------------------------------------------------------------------- -# We have redundant entries in the union (value + pointer) to make printing work -# as it relies on the tags and we can currently only have one tag per member. -type: union -desc: "Typesafe container for a kernel argument value" -name: $x_exp_kernel_arg_value_t -tag: $x_exp_kernel_arg_type_t -members: - - type: "const void*" - name: value - desc: | - [in] argument value represented as matching arg type. - The data pointed to will be copied and therefore can be reused on return. - tag: $X_EXP_KERNEL_ARG_TYPE_VALUE - - type: "const void*" - name: pointer - desc: "[in] Allocation obtained by USM allocation or virtual memory mapping operation, or pointer to a literal value." - tag: $X_EXP_KERNEL_ARG_TYPE_POINTER - - type: $x_exp_kernel_arg_mem_obj_tuple_t - name: memObjTuple - desc: "[in] Struct containing a memory object and associated flags." - tag: $X_EXP_KERNEL_ARG_TYPE_MEM_OBJ - - type: $x_sampler_handle_t - name: sampler - desc: "[in] Handle of a sampler object." - tag: $X_EXP_KERNEL_ARG_TYPE_SAMPLER ---- #-------------------------------------------------------------------------- -type: struct -desc: "Kernel arg properties" -name: $x_exp_kernel_arg_properties_t -base: $x_base_properties_t -members: - - type: $x_exp_kernel_arg_type_t - name: type - desc: "[in] type of the kernel arg" - - type: uint32_t - name: index - desc: "[in] index of the kernel arg" - - type: size_t - name: size - desc: "[in] size of the kernel arg" - - type: $x_exp_kernel_arg_value_t - name: value - desc: "[in][tagged_by(type)] Union containing the argument value." ---- #-------------------------------------------------------------------------- -type: enum -extend: true -desc: "Structure Type experimental enumerations." -name: $x_structure_type_t -etors: - - name: EXP_KERNEL_ARG_PROPERTIES - desc: $x_exp_kernel_arg_properties_t - value: "0x5000" ---- #-------------------------------------------------------------------------- -type: function -desc: "Enqueue a command to execute a kernel" -class: $xEnqueue -name: KernelLaunchWithArgsExp -ordinal: "0" -analogue: - - "**clEnqueueNDRangeKernel**" -params: - - type: $x_queue_handle_t - name: hQueue - desc: "[in] handle of the queue object" - - type: $x_kernel_handle_t - name: hKernel - desc: "[in] handle of the kernel object" - - type: uint32_t - name: workDim - desc: "[in] number of dimensions, from 1 to 3, to specify the global and work-group work-items" - - type: "const size_t*" - name: pGlobalWorkOffset - desc: "[in][optional] pointer to an array of workDim unsigned values that specify the offset used to calculate the global ID of a work-item" - - type: "const size_t*" - name: pGlobalWorkSize - desc: "[in] pointer to an array of workDim unsigned values that specify the number of global work-items in workDim that will execute the kernel function" - - type: "const size_t*" - name: pLocalWorkSize - desc: | - [in][optional] pointer to an array of workDim unsigned values that specify the number of local work-items forming a work-group that will execute the kernel function. - If nullptr, the runtime implementation will choose the work-group size. - - type: uint32_t - name: numArgs - desc: "[in] Number of entries in pArgs" - - type: "const $x_exp_kernel_arg_properties_t*" - name: pArgs - desc: "[in][optional][range(0, numArgs)] pointer to a list of kernel arg properties." - - type: uint32_t - name: numPropsInLaunchPropList - desc: "[in] size of the launch prop list" - - type: const $x_kernel_launch_property_t* - name: launchPropList - desc: "[in][optional][range(0, numPropsInLaunchPropList)] pointer to a list of launch properties" - - type: uint32_t - name: numEventsInWaitList - desc: "[in] size of the event wait list" - - type: "const $x_event_handle_t*" - name: phEventWaitList - desc: | - [in][optional][range(0, numEventsInWaitList)] pointer to a list of events that must be complete before the kernel execution. - If nullptr, the numEventsInWaitList must be 0, indicating that no wait event. - - type: $x_event_handle_t* - name: phEvent - desc: | - [out][optional][alloc] return an event object that identifies this particular kernel execution instance. If phEventWaitList and phEvent are not NULL, phEvent must not refer to an element of the phEventWaitList array. -returns: - - $X_RESULT_ERROR_INVALID_QUEUE - - $X_RESULT_ERROR_INVALID_KERNEL - - $X_RESULT_ERROR_INVALID_EVENT - - $X_RESULT_ERROR_INVALID_EVENT_WAIT_LIST: - - "`phEventWaitList == NULL && numEventsInWaitList > 0`" - - "`phEventWaitList != NULL && numEventsInWaitList == 0`" - - "If event objects in phEventWaitList are not valid events." - - $X_RESULT_ERROR_IN_EVENT_LIST_EXEC_STATUS: - - "An event in `phEventWaitList` has $X_EVENT_STATUS_ERROR." - - $X_RESULT_ERROR_INVALID_WORK_DIMENSION: - - "`pGlobalWorkSize[0] == 0 || pGlobalWorkSize[1] == 0 || pGlobalWorkSize[2] == 0`" - - $X_RESULT_ERROR_INVALID_WORK_GROUP_SIZE: - - "`pLocalWorkSize && (pLocalWorkSize[0] == 0 || pLocalWorkSize[1] == 0 || pLocalWorkSize[2] == 0)`" - - $X_RESULT_ERROR_INVALID_VALUE - - $X_RESULT_ERROR_INVALID_KERNEL_ARGS - - "The kernel argument values have not been specified." - - $X_RESULT_ERROR_OUT_OF_HOST_MEMORY - - $X_RESULT_ERROR_OUT_OF_RESOURCES - - $X_RESULT_ERROR_INVALID_OPERATION: - - "If any property in `launchPropList` isn't supported by the device." - - $X_RESULT_ERROR_INVALID_NULL_POINTER: - - "`launchPropList == NULL && numPropsInLaunchPropList > 0`" - - "`pArgs == NULL && numArgs > 0`" diff --git a/scripts/core/registry.yml b/scripts/core/registry.yml index 0646945b08..349ac97a27 100644 --- a/scripts/core/registry.yml +++ b/scripts/core/registry.yml @@ -670,9 +670,6 @@ etors: - name: BINDLESS_IMAGES_SUPPORTS_IMPORTING_HANDLE_TYPE_EXP desc: Enumerator for $xBindlessImagesSupportsImportingHandleTypeExp value: '288' -- name: ENQUEUE_KERNEL_LAUNCH_WITH_ARGS_EXP - desc: Enumerator for $xEnqueueKernelLaunchWithArgsExp - value: '289' --- type: enum desc: Defines structure types diff --git a/scripts/parse_specs.py b/scripts/parse_specs.py index 8adbc6de19..9fdb69eedc 100644 --- a/scripts/parse_specs.py +++ b/scripts/parse_specs.py @@ -909,9 +909,7 @@ def _append(lst, key, val): rets[idx][key].append(val) def append_nullchecks(param, accessor: str): - if type_traits.is_pointer(param["type"]) or type_traits.is_array( - param["type"] - ): + if type_traits.is_pointer(param["type"]): _append( rets, "$X_RESULT_ERROR_INVALID_NULL_POINTER", diff --git a/scripts/templates/helper.py b/scripts/templates/helper.py index 5b4fc0c18c..00de01e347 100644 --- a/scripts/templates/helper.py +++ b/scripts/templates/helper.py @@ -987,17 +987,12 @@ def make_param_lines( words = [] if "type*" in format: - ptname = tname + "*" - if type_traits.is_array(item["type"]): - ptname += "*" - words.append(ptname) + words.append(tname + "*") name = "p" + name elif "type" in format: words.append(tname) if "name" in format: words.append(name) - if type_traits.is_array(item["type"]) and "type" in format: - words.append(f"[{type_traits.get_array_length(item['type'])}]") prologue = " ".join(words) if "delim" in format: diff --git a/scripts/templates/print.hpp.mako b/scripts/templates/print.hpp.mako index ed94cd9227..4481847130 100644 --- a/scripts/templates/print.hpp.mako +++ b/scripts/templates/print.hpp.mako @@ -99,7 +99,14 @@ def findMemberType(_item): ${x}::details::printUnion(os, ${deref}(params${access}${item['name']}), params${access}${th.param_traits.tagged_member(item)}); %elif th.type_traits.is_array(item['type']): os << ".${iname} = {"; - ${x}::details::printArray<${th.type_traits.get_array_length(item['type'])}>(os, ${deref}params${access}${pname}); + for(auto i = 0; i < ${th.type_traits.get_array_length(item['type'])}; i++){ + if(i != 0){ + os << ", "; + } + <%call expr="member(iname, itype, True)"> + ${deref}(params${access}${item['name']}[i]) + + } os << "}"; %elif typename is not None: os << ".${iname} = "; @@ -133,7 +140,6 @@ inline constexpr bool is_handle_v = is_handle::value; template inline ${x}_result_t printPtr(std::ostream &os, const T *ptr); template inline ${x}_result_t printFlag(std::ostream &os, uint32_t flag); template inline ${x}_result_t printTagged(std::ostream &os, const void *ptr, T value, size_t size); -template inline ur_result_t printArray(std::ostream &os, const T *ptr); %for spec in specs: %for obj in spec['objects']: @@ -558,25 +564,6 @@ template inline ${x}_result_t printPtr(std::ostream &os, const T *p return ${X}_RESULT_SUCCESS; } - -/////////////////////////////////////////////////////////////////////////////// -// @brief Print array of literals -template -inline ur_result_t printArray(std::ostream &os, const T *ptr) { - if(ptr == NULL) { - return printPtr(os, ptr); - } - - for (size_t i = 0; i < size; i++) { - if (i != 0) { - os << ", "; - } - - os << ptr[i]; - } - - return ${X}_RESULT_SUCCESS; -} } // namespace ${x}::details namespace ${x}::extras { diff --git a/source/adapters/cuda/enqueue.cpp b/source/adapters/cuda/enqueue.cpp index 091e8e9d53..8eb00ccab2 100644 --- a/source/adapters/cuda/enqueue.cpp +++ b/source/adapters/cuda/enqueue.cpp @@ -15,7 +15,6 @@ #include "kernel.hpp" #include "memory.hpp" #include "queue.hpp" -#include "sampler.hpp" #include #include @@ -620,60 +619,6 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueKernelLaunch( #endif // CUDA_VERSION >= 11080 } -UR_APIEXPORT ur_result_t UR_APICALL urEnqueueKernelLaunchWithArgsExp( - ur_queue_handle_t hQueue, ur_kernel_handle_t hKernel, uint32_t workDim, - const size_t *pGlobalWorkOffset, const size_t *pGlobalWorkSize, - const size_t *pLocalWorkSize, uint32_t numArgs, - const ur_exp_kernel_arg_properties_t *pArgs, - uint32_t numPropsInLaunchPropList, - const ur_kernel_launch_property_t *launchPropList, - uint32_t numEventsInWaitList, const ur_event_handle_t *phEventWaitList, - ur_event_handle_t *phEvent) { - try { - for (uint32_t i = 0; i < numArgs; i++) { - switch (pArgs[i].type) { - case UR_EXP_KERNEL_ARG_TYPE_LOCAL: { - hKernel->setKernelLocalArg(pArgs[i].index, pArgs[i].size); - break; - } - case UR_EXP_KERNEL_ARG_TYPE_VALUE: { - hKernel->setKernelArg(pArgs[i].index, pArgs[i].size, - pArgs[i].value.value); - break; - } - case UR_EXP_KERNEL_ARG_TYPE_POINTER: { - // setKernelArg is expecting a pointer to our argument - hKernel->setKernelArg(pArgs[i].index, pArgs[i].size, - &pArgs[i].value.pointer); - break; - } - case UR_EXP_KERNEL_ARG_TYPE_MEM_OBJ: { - ur_kernel_arg_mem_obj_properties_t Props = { - UR_STRUCTURE_TYPE_KERNEL_ARG_MEM_OBJ_PROPERTIES, nullptr, - pArgs[i].value.memObjTuple.flags}; - UR_CALL(urKernelSetArgMemObj(hKernel, pArgs[i].index, &Props, - pArgs[i].value.memObjTuple.hMem)); - break; - } - case UR_EXP_KERNEL_ARG_TYPE_SAMPLER: { - uint32_t SamplerProps = pArgs[i].value.sampler->Props; - hKernel->setKernelArg(pArgs[i].index, sizeof(uint32_t), - (void *)&SamplerProps); - break; - } - default: - return UR_RESULT_ERROR_INVALID_ENUMERATION; - } - } - } catch (ur_result_t Err) { - return Err; - } - return urEnqueueKernelLaunch(hQueue, hKernel, workDim, pGlobalWorkOffset, - pGlobalWorkSize, pLocalWorkSize, - numPropsInLaunchPropList, launchPropList, - numEventsInWaitList, phEventWaitList, phEvent); -} - /// Set parameters for general 3D memory copy. /// If the source and/or destination is on the device, SrcPtr and/or DstPtr /// must be a pointer to a CUdeviceptr diff --git a/source/adapters/cuda/ur_interface_loader.cpp b/source/adapters/cuda/ur_interface_loader.cpp index a9b072472b..8430df0ab0 100644 --- a/source/adapters/cuda/ur_interface_loader.cpp +++ b/source/adapters/cuda/ur_interface_loader.cpp @@ -454,7 +454,6 @@ UR_DLLEXPORT ur_result_t UR_APICALL urGetEnqueueExpProcAddrTable( pDdiTable->pfnUSMHostAllocExp = urEnqueueUSMHostAllocExp; pDdiTable->pfnUSMFreeExp = urEnqueueUSMFreeExp; pDdiTable->pfnCommandBufferExp = urEnqueueCommandBufferExp; - pDdiTable->pfnKernelLaunchWithArgsExp = urEnqueueKernelLaunchWithArgsExp; return UR_RESULT_SUCCESS; } diff --git a/source/adapters/hip/enqueue.cpp b/source/adapters/hip/enqueue.cpp index 54ea1ca91a..dc0fac8050 100644 --- a/source/adapters/hip/enqueue.cpp +++ b/source/adapters/hip/enqueue.cpp @@ -16,7 +16,6 @@ #include "logger/ur_logger.hpp" #include "memory.hpp" #include "queue.hpp" -#include "sampler.hpp" #include "ur_api.h" #include @@ -341,60 +340,6 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueKernelLaunch( return UR_RESULT_SUCCESS; } -UR_APIEXPORT ur_result_t UR_APICALL urEnqueueKernelLaunchWithArgsExp( - ur_queue_handle_t hQueue, ur_kernel_handle_t hKernel, uint32_t workDim, - const size_t *pGlobalWorkOffset, const size_t *pGlobalWorkSize, - const size_t *pLocalWorkSize, uint32_t numArgs, - const ur_exp_kernel_arg_properties_t *pArgs, - uint32_t numPropsInLaunchPropList, - const ur_kernel_launch_property_t *launchPropList, - uint32_t numEventsInWaitList, const ur_event_handle_t *phEventWaitList, - ur_event_handle_t *phEvent) { - try { - for (uint32_t i = 0; i < numArgs; i++) { - switch (pArgs[i].type) { - case UR_EXP_KERNEL_ARG_TYPE_LOCAL: { - hKernel->setKernelLocalArg(pArgs[i].index, pArgs[i].size); - break; - } - case UR_EXP_KERNEL_ARG_TYPE_VALUE: { - hKernel->setKernelArg(pArgs[i].index, pArgs[i].size, - pArgs[i].value.value); - break; - } - case UR_EXP_KERNEL_ARG_TYPE_POINTER: { - // setKernelArg is expecting a pointer to our argument - hKernel->setKernelArg(pArgs[i].index, pArgs[i].size, - &pArgs[i].value.pointer); - break; - } - case UR_EXP_KERNEL_ARG_TYPE_MEM_OBJ: { - ur_kernel_arg_mem_obj_properties_t Props = { - UR_STRUCTURE_TYPE_KERNEL_ARG_MEM_OBJ_PROPERTIES, nullptr, - pArgs[i].value.memObjTuple.flags}; - UR_CALL(urKernelSetArgMemObj(hKernel, pArgs[i].index, &Props, - pArgs[i].value.memObjTuple.hMem)); - break; - } - case UR_EXP_KERNEL_ARG_TYPE_SAMPLER: { - uint32_t SamplerProps = pArgs[i].value.sampler->Props; - hKernel->setKernelArg(pArgs[i].index, sizeof(uint32_t), - (void *)&SamplerProps); - break; - } - default: - return UR_RESULT_ERROR_INVALID_ENUMERATION; - } - } - } catch (ur_result_t Err) { - return Err; - } - return urEnqueueKernelLaunch(hQueue, hKernel, workDim, pGlobalWorkOffset, - pGlobalWorkSize, pLocalWorkSize, - numPropsInLaunchPropList, launchPropList, - numEventsInWaitList, phEventWaitList, phEvent); -} - /// Enqueues a wait on the given queue for all events. /// See \ref enqueueEventWait /// diff --git a/source/adapters/hip/ur_interface_loader.cpp b/source/adapters/hip/ur_interface_loader.cpp index d8ec6bb3b5..dfb4382cad 100644 --- a/source/adapters/hip/ur_interface_loader.cpp +++ b/source/adapters/hip/ur_interface_loader.cpp @@ -447,7 +447,6 @@ UR_DLLEXPORT ur_result_t UR_APICALL urGetEnqueueExpProcAddrTable( pDdiTable->pfnTimestampRecordingExp = urEnqueueTimestampRecordingExp; pDdiTable->pfnNativeCommandExp = urEnqueueNativeCommandExp; pDdiTable->pfnCommandBufferExp = urEnqueueCommandBufferExp; - pDdiTable->pfnKernelLaunchWithArgsExp = urEnqueueKernelLaunchWithArgsExp; return UR_RESULT_SUCCESS; } diff --git a/source/adapters/level_zero/kernel.cpp b/source/adapters/level_zero/kernel.cpp index 06d1366a11..b6d3d2e64c 100644 --- a/source/adapters/level_zero/kernel.cpp +++ b/source/adapters/level_zero/kernel.cpp @@ -56,173 +56,6 @@ ur_result_t urKernelGetSuggestedLocalWorkSize( return UR_RESULT_SUCCESS; } -inline ur_result_t KernelSetArgValueHelper( - ur_kernel_handle_t Kernel, - /// [in] argument index in range [0, num args - 1] - uint32_t ArgIndex, - /// [in] size of argument type - size_t ArgSize, - /// [in] argument value represented as matching arg type. - const void *PArgValue) { - // OpenCL: "the arg_value pointer can be NULL or point to a NULL value - // in which case a NULL value will be used as the value for the argument - // declared as a pointer to global or constant memory in the kernel" - // - // We don't know the type of the argument but it seems that the only time - // SYCL RT would send a pointer to NULL in 'arg_value' is when the argument - // is a NULL pointer. Treat a pointer to NULL in 'arg_value' as a NULL. - if (ArgSize == sizeof(void *) && PArgValue && - *(void **)(const_cast(PArgValue)) == nullptr) { - PArgValue = nullptr; - } - - if (ArgIndex > Kernel->ZeKernelProperties->numKernelArgs - 1) { - return UR_RESULT_ERROR_INVALID_KERNEL_ARGUMENT_INDEX; - } - - ze_result_t ZeResult = ZE_RESULT_SUCCESS; - if (Kernel->ZeKernelMap.empty()) { - auto ZeKernel = Kernel->ZeKernel; - ZeResult = ZE_CALL_NOCHECK(zeKernelSetArgumentValue, - (ZeKernel, ArgIndex, ArgSize, PArgValue)); - } else { - for (auto It : Kernel->ZeKernelMap) { - auto ZeKernel = It.second; - ZeResult = ZE_CALL_NOCHECK(zeKernelSetArgumentValue, - (ZeKernel, ArgIndex, ArgSize, PArgValue)); - } - } - - if (ZeResult == ZE_RESULT_ERROR_INVALID_ARGUMENT) { - return UR_RESULT_ERROR_INVALID_KERNEL_ARGUMENT_SIZE; - } - - return ze2urResult(ZeResult); -} - -inline ur_result_t KernelSetArgMemObjHelper( - /// [in] handle of the kernel object - ur_kernel_handle_t Kernel, - /// [in] argument index in range [0, num args - 1] - uint32_t ArgIndex, - /// [in][optional] pointer to Memory object properties. - const ur_kernel_arg_mem_obj_properties_t *Properties, - /// [in][optional] handle of Memory object. - ur_mem_handle_t ArgValue) { - // The ArgValue may be a NULL pointer in which case a NULL value is used for - // the kernel argument declared as a pointer to global or constant memory. - - if (ArgIndex > Kernel->ZeKernelProperties->numKernelArgs - 1) { - return UR_RESULT_ERROR_INVALID_KERNEL_ARGUMENT_INDEX; - } - - ur_mem_handle_t_ *UrMem = ur_cast(ArgValue); - - ur_mem_handle_t_::access_mode_t UrAccessMode = ur_mem_handle_t_::read_write; - if (Properties) { - switch (Properties->memoryAccess) { - case UR_MEM_FLAG_READ_WRITE: - UrAccessMode = ur_mem_handle_t_::read_write; - break; - case UR_MEM_FLAG_WRITE_ONLY: - UrAccessMode = ur_mem_handle_t_::write_only; - break; - case UR_MEM_FLAG_READ_ONLY: - UrAccessMode = ur_mem_handle_t_::read_only; - break; - case 0: - break; - default: - return UR_RESULT_ERROR_INVALID_ARGUMENT; - } - } - auto Arg = UrMem ? UrMem : nullptr; - Kernel->PendingArguments.push_back( - {ArgIndex, sizeof(void *), Arg, UrAccessMode}); - - return UR_RESULT_SUCCESS; -} - -ur_result_t urEnqueueKernelLaunchWithArgsExp( - /// [in] handle of the queue object - ur_queue_handle_t Queue, - /// [in] handle of the kernel object - ur_kernel_handle_t Kernel, - /// [in] number of dimensions, from 1 to 3, to specify the global and - /// work-group work-items - uint32_t workDim, - /// [in][optional] pointer to an array of workDim unsigned values that - /// specify the offset used to calculate the global ID of a work-item - const size_t *GlobalWorkOffset, - /// [in] pointer to an array of workDim unsigned values that specify the - /// number of global work-items in workDim that will execute the kernel - /// function - const size_t *GlobalWorkSize, - /// [in][optional] pointer to an array of workDim unsigned values that - /// specify the number of local work-items forming a work-group that will - /// execute the kernel function. - /// If nullptr, the runtime implementation will choose the work-group size. - const size_t *LocalWorkSize, - /// [in] size of the event wait list - uint32_t NumArgs, - /// [in][optional][range(0, numArgs)] pointer to a list of kernel arg - /// properties. - const ur_exp_kernel_arg_properties_t *Args, - /// [in] size of the launch prop list - uint32_t NumPropsInLaunchPropList, - /// [in][range(0, numPropsInLaunchPropList)] pointer to a list of launch - /// properties - const ur_kernel_launch_property_t *LaunchPropList, - uint32_t NumEventsInWaitList, - /// [in][optional][range(0, numEventsInWaitList)] pointer to a list of - /// events that must be complete before the kernel execution. If - /// nullptr, the numEventsInWaitList must be 0, indicating that no wait - /// event. - const ur_event_handle_t *EventWaitList, - /// [in,out][optional] return an event object that identifies this - /// particular kernel execution instance. - ur_event_handle_t *OutEvent) { - { - std::scoped_lock Guard(Kernel->Mutex); - for (uint32_t i = 0; i < NumArgs; i++) { - switch (Args[i].type) { - case UR_EXP_KERNEL_ARG_TYPE_LOCAL: - UR_CALL(KernelSetArgValueHelper(Kernel, Args[i].index, Args[i].size, - nullptr)); - break; - case UR_EXP_KERNEL_ARG_TYPE_VALUE: - UR_CALL(KernelSetArgValueHelper(Kernel, Args[i].index, Args[i].size, - Args[i].value.value)); - break; - case UR_EXP_KERNEL_ARG_TYPE_POINTER: - UR_CALL(KernelSetArgValueHelper(Kernel, Args[i].index, Args[i].size, - &Args[i].value.pointer)); - break; - case UR_EXP_KERNEL_ARG_TYPE_MEM_OBJ: { - ur_kernel_arg_mem_obj_properties_t Properties = { - UR_STRUCTURE_TYPE_KERNEL_ARG_MEM_OBJ_PROPERTIES, nullptr, - Args[i].value.memObjTuple.flags}; - UR_CALL(KernelSetArgMemObjHelper(Kernel, Args[i].index, &Properties, - Args[i].value.memObjTuple.hMem)); - break; - } - case UR_EXP_KERNEL_ARG_TYPE_SAMPLER: { - UR_CALL(KernelSetArgValueHelper(Kernel, Args[i].index, Args[i].size, - &Args[i].value.sampler->ZeSampler)); - break; - } - default: - return UR_RESULT_ERROR_INVALID_ENUMERATION; - } - } - } - // Normalize so each dimension has at least one work item - return level_zero::urEnqueueKernelLaunch( - Queue, Kernel, workDim, GlobalWorkOffset, GlobalWorkSize, LocalWorkSize, - NumPropsInLaunchPropList, LaunchPropList, NumEventsInWaitList, - EventWaitList, OutEvent); -} - ur_result_t urEnqueueKernelLaunch( /// [in] handle of the queue object ur_queue_handle_t Queue, diff --git a/source/adapters/level_zero/ur_interface_loader.cpp b/source/adapters/level_zero/ur_interface_loader.cpp index 4276f97f5d..13d7274e7a 100644 --- a/source/adapters/level_zero/ur_interface_loader.cpp +++ b/source/adapters/level_zero/ur_interface_loader.cpp @@ -225,8 +225,6 @@ UR_APIEXPORT ur_result_t UR_APICALL urGetEnqueueExpProcAddrTable( return result; } - pDdiTable->pfnKernelLaunchWithArgsExp = - ur::level_zero::urEnqueueKernelLaunchWithArgsExp; pDdiTable->pfnUSMDeviceAllocExp = ur::level_zero::urEnqueueUSMDeviceAllocExp; pDdiTable->pfnUSMSharedAllocExp = ur::level_zero::urEnqueueUSMSharedAllocExp; pDdiTable->pfnUSMHostAllocExp = ur::level_zero::urEnqueueUSMHostAllocExp; diff --git a/source/adapters/level_zero/ur_interface_loader.hpp b/source/adapters/level_zero/ur_interface_loader.hpp index b0d683e7a5..df8e93c1f7 100644 --- a/source/adapters/level_zero/ur_interface_loader.hpp +++ b/source/adapters/level_zero/ur_interface_loader.hpp @@ -804,15 +804,6 @@ ur_result_t urUsmP2PPeerAccessGetInfoExp(ur_device_handle_t commandDevice, ur_exp_peer_info_t propName, size_t propSize, void *pPropValue, size_t *pPropSizeRet); -ur_result_t urEnqueueKernelLaunchWithArgsExp( - ur_queue_handle_t hQueue, ur_kernel_handle_t hKernel, uint32_t workDim, - const size_t *pGlobalWorkOffset, const size_t *pGlobalWorkSize, - const size_t *pLocalWorkSize, uint32_t numArgs, - const ur_exp_kernel_arg_properties_t *pArgs, - uint32_t numPropsInLaunchPropList, - const ur_kernel_launch_property_t *launchPropList, - uint32_t numEventsInWaitList, const ur_event_handle_t *phEventWaitList, - ur_event_handle_t *phEvent); ur_result_t urEnqueueEventsWaitWithBarrierExt( ur_queue_handle_t hQueue, const ur_exp_enqueue_ext_properties_t *pProperties, diff --git a/source/adapters/level_zero/v2/command_list_manager.cpp b/source/adapters/level_zero/v2/command_list_manager.cpp index 04e202265d..728db1360b 100644 --- a/source/adapters/level_zero/v2/command_list_manager.cpp +++ b/source/adapters/level_zero/v2/command_list_manager.cpp @@ -11,7 +11,6 @@ #include "command_list_manager.hpp" #include "../helpers/kernel_helpers.hpp" #include "../helpers/memory_helpers.hpp" -#include "../sampler.hpp" #include "../ur_interface_loader.hpp" #include "command_buffer.hpp" #include "context.hpp" @@ -976,60 +975,3 @@ ur_result_t ur_command_list_manager::releaseSubmittedKernels() { submittedKernels.clear(); return UR_RESULT_SUCCESS; } - -ur_result_t ur_command_list_manager::appendKernelLaunchWithArgsExp( - ur_kernel_handle_t hKernel, uint32_t workDim, - const size_t *pGlobalWorkOffset, const size_t *pGlobalWorkSize, - const size_t *pLocalWorkSize, uint32_t numArgs, - const ur_exp_kernel_arg_properties_t *pArgs, - uint32_t numPropsInLaunchPropList, - const ur_kernel_launch_property_t *launchPropList, - uint32_t numEventsInWaitList, const ur_event_handle_t *phEventWaitList, - ur_event_handle_t phEvent) { - TRACK_SCOPE_LATENCY( - "ur_queue_immediate_in_order_t::enqueueKernelLaunchWithArgsExp"); - { - std::scoped_lock guard(hKernel->Mutex); - for (uint32_t argIndex = 0; argIndex < numArgs; argIndex++) { - switch (pArgs[argIndex].type) { - case UR_EXP_KERNEL_ARG_TYPE_LOCAL: - UR_CALL(hKernel->setArgValue(pArgs[argIndex].index, - pArgs[argIndex].size, nullptr, nullptr)); - break; - case UR_EXP_KERNEL_ARG_TYPE_VALUE: - UR_CALL(hKernel->setArgValue(pArgs[argIndex].index, - pArgs[argIndex].size, nullptr, - pArgs[argIndex].value.value)); - break; - case UR_EXP_KERNEL_ARG_TYPE_POINTER: - UR_CALL(hKernel->setArgPointer(pArgs[argIndex].index, nullptr, - pArgs[argIndex].value.pointer)); - break; - case UR_EXP_KERNEL_ARG_TYPE_MEM_OBJ: - // TODO: import helper for converting ur flags to internal equivalent - UR_CALL(hKernel->addPendingMemoryAllocation( - {pArgs[argIndex].value.memObjTuple.hMem, - ur_mem_buffer_t::device_access_mode_t::read_write, - pArgs[argIndex].index})); - break; - case UR_EXP_KERNEL_ARG_TYPE_SAMPLER: { - UR_CALL( - hKernel->setArgValue(argIndex, sizeof(void *), nullptr, - &pArgs[argIndex].value.sampler->ZeSampler)); - break; - } - default: - return UR_RESULT_ERROR_INVALID_ENUMERATION; - } - } - } - - UR_CALL(appendKernelLaunch(hKernel, workDim, pGlobalWorkOffset, - pGlobalWorkSize, pLocalWorkSize, - numPropsInLaunchPropList, launchPropList, - numEventsInWaitList, phEventWaitList, phEvent)); - - recordSubmittedKernel(hKernel); - - return UR_RESULT_SUCCESS; -} diff --git a/source/adapters/level_zero/v2/command_list_manager.hpp b/source/adapters/level_zero/v2/command_list_manager.hpp index 2a18f9b919..a7eafa8f9c 100644 --- a/source/adapters/level_zero/v2/command_list_manager.hpp +++ b/source/adapters/level_zero/v2/command_list_manager.hpp @@ -231,16 +231,6 @@ struct ur_command_list_manager { const ur_event_handle_t *phEventWaitList, ur_event_handle_t phEvent); - ur_result_t appendKernelLaunchWithArgsExp( - ur_kernel_handle_t hKernel, uint32_t workDim, - const size_t *pGlobalWorkOffset, const size_t *pGlobalWorkSize, - const size_t *pLocalWorkSize, uint32_t numArgs, - const ur_exp_kernel_arg_properties_t *pArgs, - uint32_t numPropsInLaunchPropList, - const ur_kernel_launch_property_t *launchPropList, - uint32_t numEventsInWaitList, const ur_event_handle_t *phEventWaitList, - ur_event_handle_t phEvent); - private: ur_result_t appendGenericCommandListsExp( uint32_t numCommandLists, ze_command_list_handle_t *phCommandLists, diff --git a/source/adapters/level_zero/v2/kernel.cpp b/source/adapters/level_zero/v2/kernel.cpp index f48a41154e..173b51ffc4 100644 --- a/source/adapters/level_zero/v2/kernel.cpp +++ b/source/adapters/level_zero/v2/kernel.cpp @@ -417,17 +417,19 @@ ur_result_t urKernelSetArgPointer( return exceptionToResult(std::current_exception()); } -static ur_mem_buffer_t::device_access_mode_t -memAccessFromKernelProperties(const ur_mem_flags_t &Flags) { - switch (Flags) { - case UR_MEM_FLAG_READ_WRITE: - return ur_mem_buffer_t::device_access_mode_t::read_write; - case UR_MEM_FLAG_WRITE_ONLY: - return ur_mem_buffer_t::device_access_mode_t::write_only; - case UR_MEM_FLAG_READ_ONLY: - return ur_mem_buffer_t::device_access_mode_t::read_only; - default: - return ur_mem_buffer_t::device_access_mode_t::read_write; +static ur_mem_buffer_t::device_access_mode_t memAccessFromKernelProperties( + const ur_kernel_arg_mem_obj_properties_t *pProperties) { + if (pProperties) { + switch (pProperties->memoryAccess) { + case UR_MEM_FLAG_READ_WRITE: + return ur_mem_buffer_t::device_access_mode_t::read_write; + case UR_MEM_FLAG_WRITE_ONLY: + return ur_mem_buffer_t::device_access_mode_t::write_only; + case UR_MEM_FLAG_READ_ONLY: + return ur_mem_buffer_t::device_access_mode_t::read_only; + default: + return ur_mem_buffer_t::device_access_mode_t::read_write; + } } return ur_mem_buffer_t::device_access_mode_t::read_write; } @@ -441,10 +443,7 @@ urKernelSetArgMemObj(ur_kernel_handle_t hKernel, uint32_t argIndex, std::scoped_lock guard(hKernel->Mutex); UR_CALL(hKernel->addPendingMemoryAllocation( - {hArgValue, - memAccessFromKernelProperties(pProperties ? pProperties->memoryAccess - : 0), - argIndex})); + {hArgValue, memAccessFromKernelProperties(pProperties), argIndex})); return UR_RESULT_SUCCESS; } catch (...) { diff --git a/source/adapters/level_zero/v2/queue_api.cpp b/source/adapters/level_zero/v2/queue_api.cpp index 660ed54406..d043a68dca 100644 --- a/source/adapters/level_zero/v2/queue_api.cpp +++ b/source/adapters/level_zero/v2/queue_api.cpp @@ -440,22 +440,6 @@ ur_result_t urEnqueueTimestampRecordingExp( } catch (...) { return exceptionToResult(std::current_exception()); } -ur_result_t urEnqueueKernelLaunchWithArgsExp( - ur_queue_handle_t hQueue, ur_kernel_handle_t hKernel, uint32_t workDim, - const size_t *pGlobalWorkOffset, const size_t *pGlobalWorkSize, - const size_t *pLocalWorkSize, uint32_t numArgs, - const ur_exp_kernel_arg_properties_t *pArgs, - uint32_t numPropsInLaunchPropList, - const ur_kernel_launch_property_t *launchPropList, - uint32_t numEventsInWaitList, const ur_event_handle_t *phEventWaitList, - ur_event_handle_t *phEvent) try { - return hQueue->get().enqueueKernelLaunchWithArgsExp( - hKernel, workDim, pGlobalWorkOffset, pGlobalWorkSize, pLocalWorkSize, - numArgs, pArgs, numPropsInLaunchPropList, launchPropList, - numEventsInWaitList, phEventWaitList, phEvent); -} catch (...) { - return exceptionToResult(std::current_exception()); -} ur_result_t urEnqueueEventsWaitWithBarrierExt( ur_queue_handle_t hQueue, const ur_exp_enqueue_ext_properties_t *pProperties, diff --git a/source/adapters/level_zero/v2/queue_api.hpp b/source/adapters/level_zero/v2/queue_api.hpp index 47425c5772..b710f9d56b 100644 --- a/source/adapters/level_zero/v2/queue_api.hpp +++ b/source/adapters/level_zero/v2/queue_api.hpp @@ -162,11 +162,6 @@ struct ur_queue_t_ { virtual ur_result_t enqueueTimestampRecordingExp(bool, uint32_t, const ur_event_handle_t *, ur_event_handle_t *) = 0; - virtual ur_result_t enqueueKernelLaunchWithArgsExp( - ur_kernel_handle_t, uint32_t, const size_t *, const size_t *, - const size_t *, uint32_t, const ur_exp_kernel_arg_properties_t *, - uint32_t, const ur_kernel_launch_property_t *, uint32_t, - const ur_event_handle_t *, ur_event_handle_t *) = 0; virtual ur_result_t enqueueEventsWaitWithBarrierExt(const ur_exp_enqueue_ext_properties_t *, uint32_t, const ur_event_handle_t *, diff --git a/source/adapters/level_zero/v2/queue_immediate_in_order.hpp b/source/adapters/level_zero/v2/queue_immediate_in_order.hpp index 7b5f36da47..74b37d1b40 100644 --- a/source/adapters/level_zero/v2/queue_immediate_in_order.hpp +++ b/source/adapters/level_zero/v2/queue_immediate_in_order.hpp @@ -453,22 +453,6 @@ struct ur_queue_immediate_in_order_t : ur_object, ur_queue_t_ { createEventIfRequested(eventPool.get(), phEvent, this)); } - ur_result_t enqueueKernelLaunchWithArgsExp( - ur_kernel_handle_t hKernel, uint32_t workDim, - const size_t *pGlobalWorkOffset, const size_t *pGlobalWorkSize, - const size_t *pLocalWorkSize, uint32_t numArgs, - const ur_exp_kernel_arg_properties_t *pArgs, - uint32_t numPropsInLaunchPropList, - const ur_kernel_launch_property_t *launchPropList, - uint32_t numEventsInWaitList, const ur_event_handle_t *phEventWaitList, - ur_event_handle_t *phEvent) override { - return commandListManager.lock()->appendKernelLaunchWithArgsExp( - hKernel, workDim, pGlobalWorkOffset, pGlobalWorkSize, pLocalWorkSize, - numArgs, pArgs, numPropsInLaunchPropList, launchPropList, - numEventsInWaitList, phEventWaitList, - createEventIfRequested(eventPool.get(), phEvent, this)); - } - ur::RefCount RefCount; }; diff --git a/source/adapters/level_zero/v2/queue_immediate_out_of_order.hpp b/source/adapters/level_zero/v2/queue_immediate_out_of_order.hpp index 5712375a84..07e8743154 100644 --- a/source/adapters/level_zero/v2/queue_immediate_out_of_order.hpp +++ b/source/adapters/level_zero/v2/queue_immediate_out_of_order.hpp @@ -505,24 +505,6 @@ struct ur_queue_immediate_out_of_order_t : ur_object, ur_queue_t_ { createEventIfRequested(eventPool.get(), phEvent, this)); } - ur_result_t enqueueKernelLaunchWithArgsExp( - ur_kernel_handle_t hKernel, uint32_t workDim, - const size_t *pGlobalWorkOffset, const size_t *pGlobalWorkSize, - const size_t *pLocalWorkSize, uint32_t numArgs, - const ur_exp_kernel_arg_properties_t *pArgs, - uint32_t numPropsInLaunchPropList, - const ur_kernel_launch_property_t *launchPropList, - uint32_t numEventsInWaitList, const ur_event_handle_t *phEventWaitList, - ur_event_handle_t *phEvent) override { - auto commandListId = getNextCommandListId(); - return commandListManagers.lock()[commandListId] - .appendKernelLaunchWithArgsExp( - hKernel, workDim, pGlobalWorkOffset, pGlobalWorkSize, - pLocalWorkSize, numArgs, pArgs, numPropsInLaunchPropList, - launchPropList, numEventsInWaitList, phEventWaitList, - createEventIfRequested(eventPool.get(), phEvent, this)); - } - ur::RefCount RefCount; }; diff --git a/source/adapters/mock/ur_mockddi.cpp b/source/adapters/mock/ur_mockddi.cpp index 74cb1accfa..39d67fff43 100644 --- a/source/adapters/mock/ur_mockddi.cpp +++ b/source/adapters/mock/ur_mockddi.cpp @@ -11915,107 +11915,6 @@ __urdlllocal ur_result_t UR_APICALL urUsmP2PPeerAccessGetInfoExp( return exceptionToResult(std::current_exception()); } -/////////////////////////////////////////////////////////////////////////////// -/// @brief Intercept function for urEnqueueKernelLaunchWithArgsExp -__urdlllocal ur_result_t UR_APICALL urEnqueueKernelLaunchWithArgsExp( - /// [in] handle of the queue object - ur_queue_handle_t hQueue, - /// [in] handle of the kernel object - ur_kernel_handle_t hKernel, - /// [in] number of dimensions, from 1 to 3, to specify the global and - /// work-group work-items - uint32_t workDim, - /// [in][optional] pointer to an array of workDim unsigned values that - /// specify the offset used to calculate the global ID of a work-item - const size_t *pGlobalWorkOffset, - /// [in] pointer to an array of workDim unsigned values that specify the - /// number of global work-items in workDim that will execute the kernel - /// function - const size_t *pGlobalWorkSize, - /// [in][optional] pointer to an array of workDim unsigned values that - /// specify the number of local work-items forming a work-group that will - /// execute the kernel function. - /// If nullptr, the runtime implementation will choose the work-group size. - const size_t *pLocalWorkSize, - /// [in] Number of entries in pArgs - uint32_t numArgs, - /// [in][optional][range(0, numArgs)] pointer to a list of kernel arg - /// properties. - const ur_exp_kernel_arg_properties_t *pArgs, - /// [in] size of the launch prop list - uint32_t numPropsInLaunchPropList, - /// [in][optional][range(0, numPropsInLaunchPropList)] pointer to a list - /// of launch properties - const ur_kernel_launch_property_t *launchPropList, - /// [in] size of the event wait list - uint32_t numEventsInWaitList, - /// [in][optional][range(0, numEventsInWaitList)] pointer to a list of - /// events that must be complete before the kernel execution. - /// If nullptr, the numEventsInWaitList must be 0, indicating that no wait - /// event. - const ur_event_handle_t *phEventWaitList, - /// [out][optional][alloc] return an event object that identifies this - /// particular kernel execution instance. If phEventWaitList and phEvent - /// are not NULL, phEvent must not refer to an element of the - /// phEventWaitList array. - ur_event_handle_t *phEvent) try { - ur_result_t result = UR_RESULT_SUCCESS; - - ur_enqueue_kernel_launch_with_args_exp_params_t params = { - &hQueue, - &hKernel, - &workDim, - &pGlobalWorkOffset, - &pGlobalWorkSize, - &pLocalWorkSize, - &numArgs, - &pArgs, - &numPropsInLaunchPropList, - &launchPropList, - &numEventsInWaitList, - &phEventWaitList, - &phEvent}; - - auto beforeCallback = reinterpret_cast( - mock::getCallbacks().get_before_callback( - "urEnqueueKernelLaunchWithArgsExp")); - if (beforeCallback) { - result = beforeCallback(¶ms); - if (result != UR_RESULT_SUCCESS) { - return result; - } - } - - auto replaceCallback = reinterpret_cast( - mock::getCallbacks().get_replace_callback( - "urEnqueueKernelLaunchWithArgsExp")); - if (replaceCallback) { - result = replaceCallback(¶ms); - } else { - - // optional output handle - if (phEvent) { - *phEvent = mock::createDummyHandle(); - } - result = UR_RESULT_SUCCESS; - } - - if (result != UR_RESULT_SUCCESS) { - return result; - } - - auto afterCallback = reinterpret_cast( - mock::getCallbacks().get_after_callback( - "urEnqueueKernelLaunchWithArgsExp")); - if (afterCallback) { - return afterCallback(¶ms); - } - - return result; -} catch (...) { - return exceptionToResult(std::current_exception()); -} - /////////////////////////////////////////////////////////////////////////////// /// @brief Intercept function for urEnqueueEventsWaitWithBarrierExt __urdlllocal ur_result_t UR_APICALL urEnqueueEventsWaitWithBarrierExt( @@ -12521,9 +12420,6 @@ UR_DLLEXPORT ur_result_t UR_APICALL urGetEnqueueExpProcAddrTable( ur_result_t result = UR_RESULT_SUCCESS; - pDdiTable->pfnKernelLaunchWithArgsExp = - driver::urEnqueueKernelLaunchWithArgsExp; - pDdiTable->pfnUSMDeviceAllocExp = driver::urEnqueueUSMDeviceAllocExp; pDdiTable->pfnUSMSharedAllocExp = driver::urEnqueueUSMSharedAllocExp; diff --git a/source/adapters/native_cpu/enqueue.cpp b/source/adapters/native_cpu/enqueue.cpp index 4c780031f8..5fecdc5b8f 100644 --- a/source/adapters/native_cpu/enqueue.cpp +++ b/source/adapters/native_cpu/enqueue.cpp @@ -621,45 +621,3 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueNativeCommandExp( const ur_event_handle_t *, ur_event_handle_t *) { return UR_RESULT_ERROR_UNSUPPORTED_FEATURE; } - -UR_APIEXPORT ur_result_t UR_APICALL urEnqueueKernelLaunchWithArgsExp( - ur_queue_handle_t hQueue, ur_kernel_handle_t hKernel, uint32_t workDim, - const size_t *pGlobalWorkOffset, const size_t *pGlobalWorkSize, - const size_t *pLocalWorkSize, uint32_t numArgs, - const ur_exp_kernel_arg_properties_t *pArgs, - uint32_t numPropsInLaunchPropList, - const ur_kernel_launch_property_t *launchPropList, - uint32_t numEventsInWaitList, const ur_event_handle_t *phEventWaitList, - ur_event_handle_t *phEvent) { - for (uint32_t argIndex = 0; argIndex < numArgs; argIndex++) { - switch (pArgs[argIndex].type) { - case UR_EXP_KERNEL_ARG_TYPE_VALUE: - UR_CALL(hKernel->addArg(pArgs[argIndex].value.value, - pArgs[argIndex].index, pArgs[argIndex].size)); - break; - case UR_EXP_KERNEL_ARG_TYPE_POINTER: - UR_CALL( - hKernel->addPtrArg(const_cast(pArgs[argIndex].value.pointer), - pArgs[argIndex].index)); - break; - case UR_EXP_KERNEL_ARG_TYPE_MEM_OBJ: { - auto MemObj = pArgs[argIndex].value.memObjTuple.hMem; - UR_CALL(hKernel->addMemObjArg(MemObj, pArgs[argIndex].index)); - break; - } - case UR_EXP_KERNEL_ARG_TYPE_LOCAL: - UR_CALL( - hKernel->addLocalArg(pArgs[argIndex].index, pArgs[argIndex].size)); - break; - case UR_EXP_KERNEL_ARG_TYPE_SAMPLER: - return UR_RESULT_ERROR_UNSUPPORTED_FEATURE; - break; - default: - return UR_RESULT_ERROR_INVALID_ENUMERATION; - } - } - return urEnqueueKernelLaunch(hQueue, hKernel, workDim, pGlobalWorkOffset, - pGlobalWorkSize, pLocalWorkSize, - numPropsInLaunchPropList, launchPropList, - numEventsInWaitList, phEventWaitList, phEvent); -} diff --git a/source/adapters/native_cpu/kernel.cpp b/source/adapters/native_cpu/kernel.cpp index f67c7653d0..ac11331357 100644 --- a/source/adapters/native_cpu/kernel.cpp +++ b/source/adapters/native_cpu/kernel.cpp @@ -61,14 +61,21 @@ UR_APIEXPORT ur_result_t UR_APICALL urKernelSetArgValue( // TODO: error checking UR_ASSERT(hKernel, UR_RESULT_ERROR_INVALID_NULL_HANDLE); + UR_ASSERT(argSize, UR_RESULT_ERROR_INVALID_KERNEL_ARGUMENT_SIZE); - return hKernel->addArg(pArgValue, argIndex, argSize); + hKernel->addArg(pArgValue, argIndex, argSize); + + return UR_RESULT_SUCCESS; } UR_APIEXPORT ur_result_t UR_APICALL urKernelSetArgLocal( ur_kernel_handle_t hKernel, uint32_t argIndex, size_t argSize, const ur_kernel_arg_local_properties_t * /*pProperties*/) { - return hKernel->addLocalArg(argIndex, argSize); + // emplace a placeholder kernel arg, gets replaced with a pointer to the + // memory pool before enqueueing the kernel. + hKernel->addPtrArg(nullptr, argIndex); + hKernel->_localArgInfo.emplace_back(argIndex, argSize); + return UR_RESULT_SUCCESS; } UR_APIEXPORT ur_result_t UR_APICALL urKernelGetInfo(ur_kernel_handle_t hKernel, @@ -204,8 +211,11 @@ UR_APIEXPORT ur_result_t UR_APICALL urKernelSetArgPointer( const void *pArgValue) { UR_ASSERT(hKernel, UR_RESULT_ERROR_INVALID_NULL_HANDLE); + UR_ASSERT(pArgValue, UR_RESULT_ERROR_INVALID_NULL_POINTER); + + hKernel->addPtrArg(const_cast(pArgValue), argIndex); - return hKernel->addPtrArg(const_cast(pArgValue), argIndex); + return UR_RESULT_SUCCESS; } UR_APIEXPORT ur_result_t UR_APICALL @@ -232,7 +242,16 @@ urKernelSetArgMemObj(ur_kernel_handle_t hKernel, uint32_t argIndex, UR_ASSERT(hKernel, UR_RESULT_ERROR_INVALID_NULL_HANDLE); - return hKernel->addMemObjArg(hArgValue, argIndex); + // Taken from ur/adapters/cuda/kernel.cpp + // zero-sized buffers are expected to be null. + if (hArgValue == nullptr) { + hKernel->addPtrArg(nullptr, argIndex); + return UR_RESULT_SUCCESS; + } + + hKernel->addArgReference(hArgValue); + hKernel->addPtrArg(hArgValue->_mem, argIndex); + return UR_RESULT_SUCCESS; } UR_APIEXPORT ur_result_t UR_APICALL urKernelSetSpecializationConstants( diff --git a/source/adapters/native_cpu/kernel.hpp b/source/adapters/native_cpu/kernel.hpp index 285b1c00d0..8daf23feb6 100644 --- a/source/adapters/native_cpu/kernel.hpp +++ b/source/adapters/native_cpu/kernel.hpp @@ -181,44 +181,17 @@ struct ur_kernel_handle_t_ : RefCounted { return Result; } - inline ur_result_t addArg(const void *Ptr, size_t Index, size_t Size) { - UR_ASSERT(Size, UR_RESULT_ERROR_INVALID_KERNEL_ARGUMENT_SIZE); + void addArg(const void *Ptr, size_t Index, size_t Size) { Args.addArg(Index, Size, Ptr); - return UR_RESULT_SUCCESS; } - inline ur_result_t addPtrArg(void *Ptr, size_t Index) { - UR_ASSERT(Ptr, UR_RESULT_ERROR_INVALID_NULL_POINTER); - Args.addPtrArg(Index, Ptr); - return UR_RESULT_SUCCESS; - } + void addPtrArg(void *Ptr, size_t Index) { Args.addPtrArg(Index, Ptr); } void addArgReference(ur_mem_handle_t Arg) { Arg->incrementReferenceCount(); ReferencedArgs.push_back(Arg); } - inline ur_result_t addMemObjArg(ur_mem_handle_t ArgValue, size_t Index) { - // Taken from ur/adapters/cuda/kernel.cpp - // zero-sized buffers are expected to be null. - if (ArgValue == nullptr) { - addPtrArg(nullptr, Index); - return UR_RESULT_SUCCESS; - } - - addArgReference(ArgValue); - addPtrArg(ArgValue->_mem, Index); - return UR_RESULT_SUCCESS; - } - - inline ur_result_t addLocalArg(size_t Index, size_t Size) { - // emplace a placeholder kernel arg, gets replaced with a pointer to the - // memory pool before enqueueing the kernel. - Args.addPtrArg(Index, nullptr); - _localArgInfo.emplace_back(Index, Size); - return UR_RESULT_SUCCESS; - } - private: void removeArgReferences() { for (auto arg : ReferencedArgs) diff --git a/source/adapters/native_cpu/ur_interface_loader.cpp b/source/adapters/native_cpu/ur_interface_loader.cpp index 7bc2f999a0..3f6fe061b4 100644 --- a/source/adapters/native_cpu/ur_interface_loader.cpp +++ b/source/adapters/native_cpu/ur_interface_loader.cpp @@ -431,7 +431,6 @@ UR_DLLEXPORT ur_result_t UR_APICALL urGetEnqueueExpProcAddrTable( pDdiTable->pfnTimestampRecordingExp = urEnqueueTimestampRecordingExp; pDdiTable->pfnNativeCommandExp = urEnqueueNativeCommandExp; pDdiTable->pfnCommandBufferExp = urEnqueueCommandBufferExp; - pDdiTable->pfnKernelLaunchWithArgsExp = urEnqueueKernelLaunchWithArgsExp; return UR_RESULT_SUCCESS; } diff --git a/source/adapters/offload/enqueue.cpp b/source/adapters/offload/enqueue.cpp index cd89280c5a..b1a1edac52 100644 --- a/source/adapters/offload/enqueue.cpp +++ b/source/adapters/offload/enqueue.cpp @@ -274,40 +274,3 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueMemUnmap( return Result; } - -UR_APIEXPORT ur_result_t UR_APICALL urEnqueueKernelLaunchWithArgsExp( - ur_queue_handle_t hQueue, ur_kernel_handle_t hKernel, uint32_t workDim, - const size_t *pGlobalWorkOffset, const size_t *pGlobalWorkSize, - const size_t *pLocalWorkSize, uint32_t numArgs, - const ur_exp_kernel_arg_properties_t *pArgs, - uint32_t numPropsInLaunchPropList, - const ur_kernel_launch_property_t *launchPropList, - uint32_t numEventsInWaitList, const ur_event_handle_t *phEventWaitList, - ur_event_handle_t *phEvent) { - for (uint32_t i = 0; i < numArgs; i++) { - switch (pArgs[i].type) { - case UR_EXP_KERNEL_ARG_TYPE_POINTER: - hKernel->Args.addArg(pArgs[i].index, sizeof(pArgs[i].value.pointer), - &pArgs[i].value.pointer); - break; - case UR_EXP_KERNEL_ARG_TYPE_VALUE: - hKernel->Args.addArg(pArgs[i].index, pArgs[i].size, pArgs[i].value.value); - break; - case UR_EXP_KERNEL_ARG_TYPE_MEM_OBJ: - hKernel->Args.addMemObjArg(pArgs[i].index, - pArgs[i].value.memObjTuple.hMem, - pArgs[i].value.memObjTuple.flags); - break; - case UR_EXP_KERNEL_ARG_TYPE_LOCAL: - case UR_EXP_KERNEL_ARG_TYPE_SAMPLER: - return UR_RESULT_ERROR_UNSUPPORTED_FEATURE; - default: - return UR_RESULT_ERROR_INVALID_ENUMERATION; - } - } - - return urEnqueueKernelLaunch(hQueue, hKernel, workDim, pGlobalWorkOffset, - pGlobalWorkSize, pLocalWorkSize, - numPropsInLaunchPropList, launchPropList, - numEventsInWaitList, phEventWaitList, phEvent); -} diff --git a/source/adapters/offload/kernel.cpp b/source/adapters/offload/kernel.cpp index 02a7ee3a3f..58c4f6cf7f 100644 --- a/source/adapters/offload/kernel.cpp +++ b/source/adapters/offload/kernel.cpp @@ -9,6 +9,7 @@ //===----------------------------------------------------------------------===// #include "kernel.hpp" +#include "memory.hpp" #include "program.hpp" #include "ur2offload.hpp" #include @@ -87,11 +88,19 @@ UR_APIEXPORT ur_result_t UR_APICALL urKernelSetArgMemObj(ur_kernel_handle_t hKernel, uint32_t argIndex, const ur_kernel_arg_mem_obj_properties_t *Properties, ur_mem_handle_t hArgValue) { + // Handle zero-sized buffers + if (hArgValue == nullptr) { + hKernel->Args.addArg(argIndex, 0, nullptr); + return UR_RESULT_SUCCESS; + } + ur_mem_flags_t MemAccess = Properties ? Properties->memoryAccess : static_cast(UR_MEM_FLAG_READ_WRITE); hKernel->Args.addMemObjArg(argIndex, hArgValue, MemAccess); + auto Ptr = std::get(hArgValue->Mem).Ptr; + hKernel->Args.addArg(argIndex, sizeof(void *), &Ptr); return UR_RESULT_SUCCESS; } diff --git a/source/adapters/offload/kernel.hpp b/source/adapters/offload/kernel.hpp index a5e7f16f2b..83866b5974 100644 --- a/source/adapters/offload/kernel.hpp +++ b/source/adapters/offload/kernel.hpp @@ -18,7 +18,6 @@ #include #include "common.hpp" -#include "memory.hpp" struct ur_kernel_handle_t_ : RefCounted { @@ -57,12 +56,7 @@ struct ur_kernel_handle_t_ : RefCounted { } void addMemObjArg(int Index, ur_mem_handle_t hMem, ur_mem_flags_t Flags) { - // Handle zero-sized buffers - if (hMem == nullptr) { - addArg(Index, 0, nullptr); - return; - } - + assert(hMem && "Invalid mem handle"); // If a memobj is already set at this index, update the entry rather // than adding a duplicate one for (auto &Arg : MemObjArgs) { @@ -72,9 +66,6 @@ struct ur_kernel_handle_t_ : RefCounted { } } MemObjArgs.push_back(MemObjArg{hMem, Index, Flags}); - - auto Ptr = std::get(hMem->Mem).Ptr; - addArg(Index, sizeof(void *), &Ptr); } const args_ptr_t &getPointers() const noexcept { return Pointers; } diff --git a/source/adapters/offload/ur_interface_loader.cpp b/source/adapters/offload/ur_interface_loader.cpp index e8a4fa60f7..02de9df99f 100644 --- a/source/adapters/offload/ur_interface_loader.cpp +++ b/source/adapters/offload/ur_interface_loader.cpp @@ -383,7 +383,6 @@ UR_DLLEXPORT ur_result_t UR_APICALL urGetEnqueueExpProcAddrTable( pDdiTable->pfnTimestampRecordingExp = nullptr; pDdiTable->pfnNativeCommandExp = nullptr; - pDdiTable->pfnKernelLaunchWithArgsExp = urEnqueueKernelLaunchWithArgsExp; return UR_RESULT_SUCCESS; } diff --git a/source/adapters/opencl/enqueue.cpp b/source/adapters/opencl/enqueue.cpp index 1bd75b6b56..63b7b45426 100644 --- a/source/adapters/opencl/enqueue.cpp +++ b/source/adapters/opencl/enqueue.cpp @@ -16,10 +16,6 @@ #include "memory.hpp" #include "program.hpp" #include "queue.hpp" -#include "sampler.hpp" - -#include -#include cl_map_flags convertURMapFlagsToCL(ur_map_flags_t URFlags) { cl_map_flags CLFlags = 0; @@ -505,102 +501,3 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueWriteHostPipe( return UR_RESULT_SUCCESS; } - -UR_APIEXPORT ur_result_t UR_APICALL urEnqueueKernelLaunchWithArgsExp( - ur_queue_handle_t hQueue, ur_kernel_handle_t hKernel, uint32_t workDim, - const size_t *pGlobalWorkOffset, const size_t *pGlobalWorkSize, - const size_t *pLocalWorkSize, uint32_t numArgs, - const ur_exp_kernel_arg_properties_t *pArgs, - uint32_t numPropsInLaunchPropList, - const ur_kernel_launch_property_t *launchPropList, - uint32_t numEventsInWaitList, const ur_event_handle_t *phEventWaitList, - ur_event_handle_t *phEvent) { - for (uint32_t propIndex = 0; propIndex < numPropsInLaunchPropList; - propIndex++) { - // Adapters that don't support cooperative kernels are currently expected - // to ignore COOPERATIVE launch properties. Ideally we should avoid passing - // these at the SYCL RT level instead, see - // https://github.com/intel/llvm/issues/18421 - if (launchPropList[propIndex].id == UR_KERNEL_LAUNCH_PROPERTY_ID_IGNORE || - launchPropList[propIndex].id == - UR_KERNEL_LAUNCH_PROPERTY_ID_COOPERATIVE) { - continue; - } - return UR_RESULT_ERROR_INVALID_OPERATION; - } - - clSetKernelArgMemPointerINTEL_fn SetKernelArgMemPointerPtr = nullptr; - UR_RETURN_ON_FAILURE( - cl_ext::getExtFuncFromContext( - hQueue->Context->CLContext, - ur::cl::getAdapter()->fnCache.clSetKernelArgMemPointerINTELCache, - cl_ext::SetKernelArgMemPointerName, &SetKernelArgMemPointerPtr)); - - for (uint32_t i = 0; i < numArgs; i++) { - switch (pArgs[i].type) { - case UR_EXP_KERNEL_ARG_TYPE_LOCAL: - CL_RETURN_ON_FAILURE(clSetKernelArg(hKernel->CLKernel, - static_cast(pArgs[i].index), - pArgs[i].size, nullptr)); - break; - case UR_EXP_KERNEL_ARG_TYPE_VALUE: - CL_RETURN_ON_FAILURE(clSetKernelArg(hKernel->CLKernel, - static_cast(pArgs[i].index), - pArgs[i].size, pArgs[i].value.value)); - break; - case UR_EXP_KERNEL_ARG_TYPE_MEM_OBJ: { - cl_mem mem = pArgs[i].value.memObjTuple.hMem - ? pArgs[i].value.memObjTuple.hMem->CLMemory - : nullptr; - CL_RETURN_ON_FAILURE(clSetKernelArg(hKernel->CLKernel, - static_cast(pArgs[i].index), - pArgs[i].size, &mem)); - break; - } - case UR_EXP_KERNEL_ARG_TYPE_POINTER: - CL_RETURN_ON_FAILURE(SetKernelArgMemPointerPtr( - hKernel->CLKernel, static_cast(pArgs[i].index), - pArgs[i].value.pointer)); - break; - case UR_EXP_KERNEL_ARG_TYPE_SAMPLER: { - CL_RETURN_ON_FAILURE(clSetKernelArg( - hKernel->CLKernel, static_cast(pArgs[i].index), - pArgs[i].size, &pArgs[i].value.sampler->CLSampler)); - break; - } - default: - return UR_RESULT_ERROR_INVALID_ENUMERATION; - } - } - - std::vector compiledLocalWorksize; - if (!pLocalWorkSize) { - cl_device_id device = nullptr; - CL_RETURN_ON_FAILURE(clGetCommandQueueInfo( - hQueue->CLQueue, CL_QUEUE_DEVICE, sizeof(device), &device, nullptr)); - // This query always returns size_t[3], if nothing was specified it - // returns all zeroes. - size_t queriedLocalWorkSize[3] = {0, 0, 0}; - CL_RETURN_ON_FAILURE(clGetKernelWorkGroupInfo( - hKernel->CLKernel, device, CL_KERNEL_COMPILE_WORK_GROUP_SIZE, - sizeof(size_t[3]), queriedLocalWorkSize, nullptr)); - if (queriedLocalWorkSize[0] != 0) { - for (uint32_t i = 0; i < 3; i++) { - compiledLocalWorksize.push_back(queriedLocalWorkSize[i]); - } - } - } - - cl_event Event; - std::vector CLWaitEvents(numEventsInWaitList); - MapUREventsToCL(numEventsInWaitList, phEventWaitList, CLWaitEvents); - CL_RETURN_ON_FAILURE(clEnqueueNDRangeKernel( - hQueue->CLQueue, hKernel->CLKernel, workDim, pGlobalWorkOffset, - pGlobalWorkSize, - compiledLocalWorksize.empty() ? pLocalWorkSize - : compiledLocalWorksize.data(), - numEventsInWaitList, CLWaitEvents.data(), ifUrEvent(phEvent, Event))); - - UR_RETURN_ON_FAILURE(createUREvent(Event, hQueue->Context, hQueue, phEvent)); - return UR_RESULT_SUCCESS; -} diff --git a/source/adapters/opencl/ur_interface_loader.cpp b/source/adapters/opencl/ur_interface_loader.cpp index 18cc6a7965..c619fa36b1 100644 --- a/source/adapters/opencl/ur_interface_loader.cpp +++ b/source/adapters/opencl/ur_interface_loader.cpp @@ -434,7 +434,6 @@ UR_DLLEXPORT ur_result_t UR_APICALL urGetEnqueueExpProcAddrTable( pDdiTable->pfnTimestampRecordingExp = urEnqueueTimestampRecordingExp; pDdiTable->pfnNativeCommandExp = urEnqueueNativeCommandExp; pDdiTable->pfnCommandBufferExp = urEnqueueCommandBufferExp; - pDdiTable->pfnKernelLaunchWithArgsExp = urEnqueueKernelLaunchWithArgsExp; return UR_RESULT_SUCCESS; } diff --git a/source/common/stype_map_helpers.def b/source/common/stype_map_helpers.def index efd69e6ae4..7970582639 100644 --- a/source/common/stype_map_helpers.def +++ b/source/common/stype_map_helpers.def @@ -168,6 +168,3 @@ struct stype_map template <> struct stype_map : stype_map_impl {}; -template <> -struct stype_map - : stype_map_impl {}; diff --git a/source/loader/layers/sanitizer/asan/asan_ddi.cpp b/source/loader/layers/sanitizer/asan/asan_ddi.cpp index a8d26f5498..899ff6a850 100644 --- a/source/loader/layers/sanitizer/asan/asan_ddi.cpp +++ b/source/loader/layers/sanitizer/asan/asan_ddi.cpp @@ -1637,119 +1637,6 @@ __urdlllocal ur_result_t UR_APICALL urDeviceGetInfo( return pfnGetInfo(hDevice, propName, propSize, pPropValue, pPropSizeRet); } -/////////////////////////////////////////////////////////////////////////////// -/// @brief Intercept function for urEnqueueKernelLaunch -ur_result_t urEnqueueKernelLaunchWithArgsExp( - /// [in] handle of the queue object - ur_queue_handle_t hQueue, - /// [in] handle of the kernel object - ur_kernel_handle_t hKernel, - /// [in] number of dimensions, from 1 to 3, to specify the global and - /// work-group work-items - uint32_t workDim, - /// [in][optional] pointer to an array of workDim unsigned values that - /// specify the offset used to calculate the global ID of a work-item - const size_t *pGlobalWorkOffset, - /// [in] pointer to an array of workDim unsigned values that specify the - /// number of global work-items in workDim that will execute the kernel - /// function - const size_t *pGlobalWorkSize, - /// [in][optional] pointer to an array of workDim unsigned values that - /// specify the number of local work-items forming a work-group that will - /// execute the kernel function. - /// If nullptr, the runtime implementation will choose the work-group size. - const size_t *pLocalWorkSize, - /// [in] Number of entries in pArgs - uint32_t numArgs, - /// [in][optional][range(0, numArgs)] pointer to a list of kernel arg - /// properties. - const ur_exp_kernel_arg_properties_t *pArgs, - /// [in] size of the launch prop list - uint32_t numPropsInLaunchPropList, - /// [in][range(0, numPropsInLaunchPropList)] pointer to a list of launch - /// properties - const ur_kernel_launch_property_t *launchPropList, - /// [in] size of the event wait list - uint32_t numEventsInWaitList, - /// [in][optional][range(0, numEventsInWaitList)] pointer to a list of - /// events that must be complete before the kernel execution. If - /// nullptr, the numEventsInWaitList must be 0, indicating that no wait - /// event. - const ur_event_handle_t *phEventWaitList, - /// [out][optional] return an event object that identifies this - /// particular kernel execution instance. - ur_event_handle_t *phEvent) { - // This mutex is to prevent concurrent kernel launches across different queues - // as the DeviceASAN local/private shadow memory does not support concurrent - // kernel launches now. - std::scoped_lock Guard( - getAsanInterceptor()->KernelLaunchMutex); - - UR_LOG_L(getContext()->logger, DEBUG, - "==== urEnqueueKernelLaunchWithArgsExp"); - - // We need to set all the args now rather than letting LaunchWithArgs handle - // them. This is because some implementations of - // urKernelGetSuggestedLocalWorkSize, which is used in preLaunchKernel, rely - // on all the args being set. - for (uint32_t ArgPropIndex = 0; ArgPropIndex < numArgs; ArgPropIndex++) { - switch (pArgs[ArgPropIndex].type) { - case UR_EXP_KERNEL_ARG_TYPE_LOCAL: { - UR_CALL(ur_sanitizer_layer::asan::urKernelSetArgLocal( - hKernel, pArgs[ArgPropIndex].index, pArgs[ArgPropIndex].size, - nullptr)); - break; - } - case UR_EXP_KERNEL_ARG_TYPE_POINTER: { - UR_CALL(ur_sanitizer_layer::asan::urKernelSetArgPointer( - hKernel, pArgs[ArgPropIndex].index, nullptr, - pArgs[ArgPropIndex].value.pointer)); - break; - } - case UR_EXP_KERNEL_ARG_TYPE_VALUE: { - UR_CALL(ur_sanitizer_layer::asan::urKernelSetArgValue( - hKernel, pArgs[ArgPropIndex].index, pArgs[ArgPropIndex].size, nullptr, - pArgs[ArgPropIndex].value.value)); - break; - } - case UR_EXP_KERNEL_ARG_TYPE_MEM_OBJ: { - ur_kernel_arg_mem_obj_properties_t Properties = { - UR_STRUCTURE_TYPE_KERNEL_ARG_MEM_OBJ_PROPERTIES, nullptr, - pArgs[ArgPropIndex].value.memObjTuple.flags}; - UR_CALL(ur_sanitizer_layer::asan::urKernelSetArgMemObj( - hKernel, pArgs[ArgPropIndex].index, &Properties, - pArgs[ArgPropIndex].value.memObjTuple.hMem)); - break; - } - case UR_EXP_KERNEL_ARG_TYPE_SAMPLER: { - auto pfnKernelSetArgSampler = - getContext()->urDdiTable.Kernel.pfnSetArgSampler; - UR_CALL(pfnKernelSetArgSampler(hKernel, pArgs[ArgPropIndex].index, - nullptr, - pArgs[ArgPropIndex].value.sampler)); - break; - } - default: - return UR_RESULT_ERROR_INVALID_ENUMERATION; - } - } - - LaunchInfo LaunchInfo(GetContext(hQueue), GetDevice(hQueue), pGlobalWorkSize, - pLocalWorkSize, pGlobalWorkOffset, 3); - UR_CALL(LaunchInfo.Data.syncToDevice(hQueue)); - - UR_CALL(getAsanInterceptor()->preLaunchKernel(hKernel, hQueue, LaunchInfo)); - - UR_CALL(getContext()->urDdiTable.EnqueueExp.pfnKernelLaunchWithArgsExp( - hQueue, hKernel, workDim, pGlobalWorkOffset, pGlobalWorkSize, - LaunchInfo.LocalWorkSize.data(), 0, nullptr, numPropsInLaunchPropList, - launchPropList, numEventsInWaitList, phEventWaitList, phEvent)); - - UR_CALL(getAsanInterceptor()->postLaunchKernel(hKernel, hQueue, LaunchInfo)); - - return UR_RESULT_SUCCESS; -} - /////////////////////////////////////////////////////////////////////////////// /// @brief Exported function for filling application's Adapter table /// with current process' addresses @@ -2065,22 +1952,6 @@ __urdlllocal ur_result_t UR_APICALL urGetDeviceProcAddrTable( return result; } -/// @brief Exported function for filling application's ProgramExp table -/// with current process' addresses -/// -/// @returns -/// - ::UR_RESULT_SUCCESS -/// - ::UR_RESULT_ERROR_INVALID_NULL_POINTER -ur_result_t urGetEnqueueExpProcAddrTable( - /// [in,out] pointer to table of DDI function pointers - ur_enqueue_exp_dditable_t *pDdiTable) { - ur_result_t result = UR_RESULT_SUCCESS; - - pDdiTable->pfnKernelLaunchWithArgsExp = - ur_sanitizer_layer::asan::urEnqueueKernelLaunchWithArgsExp; - - return result; -} template struct NotSupportedApi; @@ -2277,11 +2148,6 @@ ur_result_t initAsanDDITable(ur_dditable_t *dditable) { UR_API_VERSION_CURRENT, &dditable->VirtualMem); } - if (UR_RESULT_SUCCESS == result) { - result = ur_sanitizer_layer::asan::urGetEnqueueExpProcAddrTable( - &dditable->EnqueueExp); - } - if (result != UR_RESULT_SUCCESS) { UR_LOG_L(getContext()->logger, ERR, "Initialize ASAN DDI table failed: {}", result); diff --git a/source/loader/layers/sanitizer/msan/msan_ddi.cpp b/source/loader/layers/sanitizer/msan/msan_ddi.cpp index 13868606dc..e2bbb166a5 100644 --- a/source/loader/layers/sanitizer/msan/msan_ddi.cpp +++ b/source/loader/layers/sanitizer/msan/msan_ddi.cpp @@ -1809,122 +1809,6 @@ ur_result_t urEnqueueUSMMemcpy2D( return UR_RESULT_SUCCESS; } -/////////////////////////////////////////////////////////////////////////////// -/// @brief Intercept function for urEnqueueKernelLaunch -ur_result_t urEnqueueKernelLaunchWithArgsExp( - /// [in] handle of the queue object - ur_queue_handle_t hQueue, - /// [in] handle of the kernel object - ur_kernel_handle_t hKernel, - /// [in] number of dimensions, from 1 to 3, to specify the global and - /// work-group work-items - uint32_t workDim, - /// [in][optional] pointer to an array of workDim unsigned values that - /// specify the offset used to calculate the global ID of a work-item - const size_t *pGlobalWorkOffset, - /// [in] pointer to an array of workDim unsigned values that specify the - /// number of global work-items in workDim that will execute the kernel - /// function - const size_t *pGlobalWorkSize, - /// [in][optional] pointer to an array of workDim unsigned values that - /// specify the number of local work-items forming a work-group that will - /// execute the kernel function. - /// If nullptr, the runtime implementation will choose the work-group size. - const size_t *pLocalWorkSize, - /// [in] Number of entries in pArgs - uint32_t numArgs, - /// [in][optional][range(0, numArgs)] pointer to a list of kernel arg - /// properties. - const ur_exp_kernel_arg_properties_t *pArgs, - /// [in] size of the launch prop list - uint32_t numPropsInLaunchPropList, - /// [in][range(0, numPropsInLaunchPropList)] pointer to a list of launch - /// properties - const ur_kernel_launch_property_t *launchPropList, - /// [in] size of the event wait list - uint32_t numEventsInWaitList, - /// [in][optional][range(0, numEventsInWaitList)] pointer to a list of - /// events that must be complete before the kernel execution. If - /// nullptr, the numEventsInWaitList must be 0, indicating that no wait - /// event. - const ur_event_handle_t *phEventWaitList, - /// [out][optional] return an event object that identifies this - /// particular kernel execution instance. - ur_event_handle_t *phEvent) { - // This mutex is to prevent concurrent kernel launches across different queues - // as the DeviceMSAN local/private shadow memory does not support concurrent - // kernel launches now. - std::scoped_lock Guard( - getMsanInterceptor()->KernelLaunchMutex); - - UR_LOG_L(getContext()->logger, DEBUG, - "==== urEnqueueKernelLaunchWithArgsExp"); - - // We need to set all the args now rather than letting LaunchWithArgs handle - // them. This is because some implementations of - // urKernelGetSuggestedLocalWorkSize, which is used in preLaunchKernel, rely - // on all the args being set. - for (uint32_t ArgPropIndex = 0; ArgPropIndex < numArgs; ArgPropIndex++) { - switch (pArgs[ArgPropIndex].type) { - case UR_EXP_KERNEL_ARG_TYPE_LOCAL: { - UR_CALL(ur_sanitizer_layer::msan::urKernelSetArgLocal( - hKernel, pArgs[ArgPropIndex].index, pArgs[ArgPropIndex].size, - nullptr)); - break; - } - case UR_EXP_KERNEL_ARG_TYPE_POINTER: { - auto pfnKernelSetArgPointer = - getContext()->urDdiTable.Kernel.pfnSetArgPointer; - UR_CALL(pfnKernelSetArgPointer(hKernel, pArgs[ArgPropIndex].index, - nullptr, - pArgs[ArgPropIndex].value.pointer)); - break; - } - case UR_EXP_KERNEL_ARG_TYPE_VALUE: { - UR_CALL(ur_sanitizer_layer::msan::urKernelSetArgValue( - hKernel, pArgs[ArgPropIndex].index, pArgs[ArgPropIndex].size, nullptr, - pArgs[ArgPropIndex].value.value)); - break; - } - case UR_EXP_KERNEL_ARG_TYPE_MEM_OBJ: { - ur_kernel_arg_mem_obj_properties_t Properties = { - UR_STRUCTURE_TYPE_KERNEL_ARG_MEM_OBJ_PROPERTIES, nullptr, - pArgs[ArgPropIndex].value.memObjTuple.flags}; - UR_CALL(ur_sanitizer_layer::msan::urKernelSetArgMemObj( - hKernel, pArgs[ArgPropIndex].index, &Properties, - pArgs[ArgPropIndex].value.memObjTuple.hMem)); - break; - } - case UR_EXP_KERNEL_ARG_TYPE_SAMPLER: { - auto pfnKernelSetArgSampler = - getContext()->urDdiTable.Kernel.pfnSetArgSampler; - UR_CALL(pfnKernelSetArgSampler(hKernel, pArgs[ArgPropIndex].index, - nullptr, - pArgs[ArgPropIndex].value.sampler)); - break; - } - default: - return UR_RESULT_ERROR_INVALID_ENUMERATION; - } - } - - USMLaunchInfo LaunchInfo(GetContext(hQueue), GetDevice(hQueue), - pGlobalWorkSize, pLocalWorkSize, pGlobalWorkOffset, - 3); - UR_CALL(LaunchInfo.initialize()); - - UR_CALL(getMsanInterceptor()->preLaunchKernel(hKernel, hQueue, LaunchInfo)); - - UR_CALL(getContext()->urDdiTable.EnqueueExp.pfnKernelLaunchWithArgsExp( - hQueue, hKernel, workDim, pGlobalWorkOffset, pGlobalWorkSize, - LaunchInfo.LocalWorkSize.data(), 0, nullptr, numPropsInLaunchPropList, - launchPropList, numEventsInWaitList, phEventWaitList, phEvent)); - - UR_CALL(getMsanInterceptor()->postLaunchKernel(hKernel, hQueue, LaunchInfo)); - - return UR_RESULT_SUCCESS; -} - /////////////////////////////////////////////////////////////////////////////// /// @brief Exported function for filling application's Adapter table /// with current process' addresses @@ -2103,22 +1987,6 @@ ur_result_t urGetUSMProcAddrTable( return result; } -/// @brief Exported function for filling application's ProgramExp table -/// with current process' addresses -/// -/// @returns -/// - ::UR_RESULT_SUCCESS -/// - ::UR_RESULT_ERROR_INVALID_NULL_POINTER -ur_result_t urGetEnqueueExpProcAddrTable( - /// [in,out] pointer to table of DDI function pointers - ur_enqueue_exp_dditable_t *pDdiTable) { - ur_result_t result = UR_RESULT_SUCCESS; - - pDdiTable->pfnKernelLaunchWithArgsExp = - ur_sanitizer_layer::msan::urEnqueueKernelLaunchWithArgsExp; - - return result; -} ur_result_t urCheckVersion(ur_api_version_t version) { if (UR_MAJOR_VERSION(ur_sanitizer_layer::getContext()->version) != @@ -2184,11 +2052,6 @@ ur_result_t initMsanDDITable(ur_dditable_t *dditable) { result = ur_sanitizer_layer::msan::urGetUSMProcAddrTable(&dditable->USM); } - if (UR_RESULT_SUCCESS == result) { - result = ur_sanitizer_layer::msan::urGetEnqueueExpProcAddrTable( - &dditable->EnqueueExp); - } - if (result != UR_RESULT_SUCCESS) { UR_LOG_L(getContext()->logger, ERR, "Initialize MSAN DDI table failed: {}", result); diff --git a/source/loader/layers/sanitizer/tsan/tsan_ddi.cpp b/source/loader/layers/sanitizer/tsan/tsan_ddi.cpp index 7934dbf596..61849ac0b3 100644 --- a/source/loader/layers/sanitizer/tsan/tsan_ddi.cpp +++ b/source/loader/layers/sanitizer/tsan/tsan_ddi.cpp @@ -1337,120 +1337,6 @@ ur_result_t urEnqueueKernelLaunch( return UR_RESULT_SUCCESS; } -/////////////////////////////////////////////////////////////////////////////// -/// @brief Intercept function for urEnqueueKernelLaunch -ur_result_t urEnqueueKernelLaunchWithArgsExp( - /// [in] handle of the queue object - ur_queue_handle_t hQueue, - /// [in] handle of the kernel object - ur_kernel_handle_t hKernel, - /// [in] number of dimensions, from 1 to 3, to specify the global and - /// work-group work-items - uint32_t workDim, - /// [in][optional] pointer to an array of workDim unsigned values that - /// specify the offset used to calculate the global ID of a work-item - const size_t *pGlobalWorkOffset, - /// [in] pointer to an array of workDim unsigned values that specify the - /// number of global work-items in workDim that will execute the kernel - /// function - const size_t *pGlobalWorkSize, - /// [in][optional] pointer to an array of workDim unsigned values that - /// specify the number of local work-items forming a work-group that will - /// execute the kernel function. - /// If nullptr, the runtime implementation will choose the work-group size. - const size_t *pLocalWorkSize, - /// [in] Number of entries in pArgs - uint32_t numArgs, - /// [in][optional][range(0, numArgs)] pointer to a list of kernel arg - /// properties. - const ur_exp_kernel_arg_properties_t *pArgs, - /// [in] size of the launch prop list - uint32_t numPropsInLaunchPropList, - /// [in][range(0, numPropsInLaunchPropList)] pointer to a list of launch - /// properties - const ur_kernel_launch_property_t *launchPropList, - /// [in] size of the event wait list - uint32_t numEventsInWaitList, - /// [in][optional][range(0, numEventsInWaitList)] pointer to a list of - /// events that must be complete before the kernel execution. If - /// nullptr, the numEventsInWaitList must be 0, indicating that no wait - /// event. - const ur_event_handle_t *phEventWaitList, - /// [out][optional] return an event object that identifies this - /// particular kernel execution instance. - ur_event_handle_t *phEvent) { - // This mutex is to prevent concurrent kernel launches across different queues - // as the DeviceTSAN local shadow memory does not support concurrent - // kernel launches now. - std::scoped_lock Guard( - getTsanInterceptor()->KernelLaunchMutex); - - UR_LOG_L(getContext()->logger, DEBUG, - "==== urEnqueueKernelLaunchWithArgsExp"); - - // We need to set all the args now rather than letting LaunchWithArgs handle - // them. This is because some implementations of - // urKernelGetSuggestedLocalWorkSize, which is used in preLaunchKernel, rely - // on all the args being set. - for (uint32_t ArgPropIndex = 0; ArgPropIndex < numArgs; ArgPropIndex++) { - switch (pArgs[ArgPropIndex].type) { - case UR_EXP_KERNEL_ARG_TYPE_LOCAL: { - UR_CALL(ur_sanitizer_layer::tsan::urKernelSetArgLocal( - hKernel, pArgs[ArgPropIndex].index, pArgs[ArgPropIndex].size, - nullptr)); - break; - } - case UR_EXP_KERNEL_ARG_TYPE_POINTER: { - auto pfnKernelSetArgPointer = - getContext()->urDdiTable.Kernel.pfnSetArgPointer; - UR_CALL(pfnKernelSetArgPointer(hKernel, pArgs[ArgPropIndex].index, - nullptr, - pArgs[ArgPropIndex].value.pointer)); - break; - } - case UR_EXP_KERNEL_ARG_TYPE_VALUE: { - UR_CALL(ur_sanitizer_layer::tsan::urKernelSetArgValue( - hKernel, pArgs[ArgPropIndex].index, pArgs[ArgPropIndex].size, nullptr, - pArgs[ArgPropIndex].value.value)); - break; - } - case UR_EXP_KERNEL_ARG_TYPE_MEM_OBJ: { - ur_kernel_arg_mem_obj_properties_t Properties = { - UR_STRUCTURE_TYPE_KERNEL_ARG_MEM_OBJ_PROPERTIES, nullptr, - pArgs[ArgPropIndex].value.memObjTuple.flags}; - UR_CALL(ur_sanitizer_layer::tsan::urKernelSetArgMemObj( - hKernel, pArgs[ArgPropIndex].index, &Properties, - pArgs[ArgPropIndex].value.memObjTuple.hMem)); - break; - } - case UR_EXP_KERNEL_ARG_TYPE_SAMPLER: { - auto pfnKernelSetArgSampler = - getContext()->urDdiTable.Kernel.pfnSetArgSampler; - UR_CALL(pfnKernelSetArgSampler(hKernel, pArgs[ArgPropIndex].index, - nullptr, - pArgs[ArgPropIndex].value.sampler)); - break; - } - default: - return UR_RESULT_ERROR_INVALID_ENUMERATION; - } - } - - LaunchInfo LaunchInfo(GetContext(hQueue), GetDevice(hQueue), pGlobalWorkSize, - pLocalWorkSize, pGlobalWorkOffset, 3); - - UR_CALL(getTsanInterceptor()->preLaunchKernel(hKernel, hQueue, LaunchInfo)); - - UR_CALL(getContext()->urDdiTable.EnqueueExp.pfnKernelLaunchWithArgsExp( - hQueue, hKernel, workDim, pGlobalWorkOffset, pGlobalWorkSize, - pLocalWorkSize, 0, nullptr, numPropsInLaunchPropList, launchPropList, - numEventsInWaitList, phEventWaitList, phEvent)); - - UR_CALL(getTsanInterceptor()->postLaunchKernel(hKernel, hQueue, LaunchInfo)); - - return UR_RESULT_SUCCESS; -} - ur_result_t urCheckVersion(ur_api_version_t version) { if (UR_MAJOR_VERSION(ur_sanitizer_layer::getContext()->version) != UR_MAJOR_VERSION(version) || @@ -1661,22 +1547,6 @@ __urdlllocal ur_result_t UR_APICALL urGetEnqueueProcAddrTable( return UR_RESULT_SUCCESS; } -/// @brief Exported function for filling application's ProgramExp table -/// with current process' addresses -/// -/// @returns -/// - ::UR_RESULT_SUCCESS -/// - ::UR_RESULT_ERROR_INVALID_NULL_POINTER -ur_result_t urGetEnqueueExpProcAddrTable( - /// [in,out] pointer to table of DDI function pointers - ur_enqueue_exp_dditable_t *pDdiTable) { - ur_result_t result = UR_RESULT_SUCCESS; - - pDdiTable->pfnKernelLaunchWithArgsExp = - ur_sanitizer_layer::tsan::urEnqueueKernelLaunchWithArgsExp; - - return result; -} } // namespace tsan ur_result_t initTsanDDITable(ur_dditable_t *dditable) { @@ -1726,11 +1596,6 @@ ur_result_t initTsanDDITable(ur_dditable_t *dditable) { ur_sanitizer_layer::tsan::urGetEnqueueProcAddrTable(&dditable->Enqueue); } - if (UR_RESULT_SUCCESS == result) { - result = ur_sanitizer_layer::tsan::urGetEnqueueExpProcAddrTable( - &dditable->EnqueueExp); - } - if (result != UR_RESULT_SUCCESS) { UR_LOG_L(getContext()->logger, ERR, "Initialize TSAN DDI table failed: {}", result); diff --git a/source/loader/layers/tracing/ur_trcddi.cpp b/source/loader/layers/tracing/ur_trcddi.cpp index 1cac607be8..e96e1cbffd 100644 --- a/source/loader/layers/tracing/ur_trcddi.cpp +++ b/source/loader/layers/tracing/ur_trcddi.cpp @@ -10093,98 +10093,6 @@ __urdlllocal ur_result_t UR_APICALL urUsmP2PPeerAccessGetInfoExp( return result; } -/////////////////////////////////////////////////////////////////////////////// -/// @brief Intercept function for urEnqueueKernelLaunchWithArgsExp -__urdlllocal ur_result_t UR_APICALL urEnqueueKernelLaunchWithArgsExp( - /// [in] handle of the queue object - ur_queue_handle_t hQueue, - /// [in] handle of the kernel object - ur_kernel_handle_t hKernel, - /// [in] number of dimensions, from 1 to 3, to specify the global and - /// work-group work-items - uint32_t workDim, - /// [in][optional] pointer to an array of workDim unsigned values that - /// specify the offset used to calculate the global ID of a work-item - const size_t *pGlobalWorkOffset, - /// [in] pointer to an array of workDim unsigned values that specify the - /// number of global work-items in workDim that will execute the kernel - /// function - const size_t *pGlobalWorkSize, - /// [in][optional] pointer to an array of workDim unsigned values that - /// specify the number of local work-items forming a work-group that will - /// execute the kernel function. - /// If nullptr, the runtime implementation will choose the work-group size. - const size_t *pLocalWorkSize, - /// [in] Number of entries in pArgs - uint32_t numArgs, - /// [in][optional][range(0, numArgs)] pointer to a list of kernel arg - /// properties. - const ur_exp_kernel_arg_properties_t *pArgs, - /// [in] size of the launch prop list - uint32_t numPropsInLaunchPropList, - /// [in][optional][range(0, numPropsInLaunchPropList)] pointer to a list - /// of launch properties - const ur_kernel_launch_property_t *launchPropList, - /// [in] size of the event wait list - uint32_t numEventsInWaitList, - /// [in][optional][range(0, numEventsInWaitList)] pointer to a list of - /// events that must be complete before the kernel execution. - /// If nullptr, the numEventsInWaitList must be 0, indicating that no wait - /// event. - const ur_event_handle_t *phEventWaitList, - /// [out][optional][alloc] return an event object that identifies this - /// particular kernel execution instance. If phEventWaitList and phEvent - /// are not NULL, phEvent must not refer to an element of the - /// phEventWaitList array. - ur_event_handle_t *phEvent) { - auto pfnKernelLaunchWithArgsExp = - getContext()->urDdiTable.EnqueueExp.pfnKernelLaunchWithArgsExp; - - if (nullptr == pfnKernelLaunchWithArgsExp) - return UR_RESULT_ERROR_UNSUPPORTED_FEATURE; - - ur_enqueue_kernel_launch_with_args_exp_params_t params = { - &hQueue, - &hKernel, - &workDim, - &pGlobalWorkOffset, - &pGlobalWorkSize, - &pLocalWorkSize, - &numArgs, - &pArgs, - &numPropsInLaunchPropList, - &launchPropList, - &numEventsInWaitList, - &phEventWaitList, - &phEvent}; - uint64_t instance = getContext()->notify_begin( - UR_FUNCTION_ENQUEUE_KERNEL_LAUNCH_WITH_ARGS_EXP, - "urEnqueueKernelLaunchWithArgsExp", ¶ms); - - auto &logger = getContext()->logger; - UR_LOG_L(logger, INFO, " ---> urEnqueueKernelLaunchWithArgsExp\n"); - - ur_result_t result = pfnKernelLaunchWithArgsExp( - hQueue, hKernel, workDim, pGlobalWorkOffset, pGlobalWorkSize, - pLocalWorkSize, numArgs, pArgs, numPropsInLaunchPropList, launchPropList, - numEventsInWaitList, phEventWaitList, phEvent); - - getContext()->notify_end(UR_FUNCTION_ENQUEUE_KERNEL_LAUNCH_WITH_ARGS_EXP, - "urEnqueueKernelLaunchWithArgsExp", ¶ms, &result, - instance); - - if (logger.getLevel() <= UR_LOGGER_LEVEL_INFO) { - std::ostringstream args_str; - ur::extras::printFunctionParams( - args_str, UR_FUNCTION_ENQUEUE_KERNEL_LAUNCH_WITH_ARGS_EXP, ¶ms); - UR_LOG_L(logger, INFO, - " <--- urEnqueueKernelLaunchWithArgsExp({}) -> {};\n", - args_str.str(), result); - } - - return result; -} - /////////////////////////////////////////////////////////////////////////////// /// @brief Intercept function for urEnqueueEventsWaitWithBarrierExt __urdlllocal ur_result_t UR_APICALL urEnqueueEventsWaitWithBarrierExt( @@ -10795,10 +10703,6 @@ __urdlllocal ur_result_t UR_APICALL urGetEnqueueExpProcAddrTable( ur_result_t result = UR_RESULT_SUCCESS; - dditable.pfnKernelLaunchWithArgsExp = pDdiTable->pfnKernelLaunchWithArgsExp; - pDdiTable->pfnKernelLaunchWithArgsExp = - ur_tracing_layer::urEnqueueKernelLaunchWithArgsExp; - dditable.pfnUSMDeviceAllocExp = pDdiTable->pfnUSMDeviceAllocExp; pDdiTable->pfnUSMDeviceAllocExp = ur_tracing_layer::urEnqueueUSMDeviceAllocExp; diff --git a/source/loader/layers/validation/ur_valddi.cpp b/source/loader/layers/validation/ur_valddi.cpp index 9dd572ecd3..6f33aaa856 100644 --- a/source/loader/layers/validation/ur_valddi.cpp +++ b/source/loader/layers/validation/ur_valddi.cpp @@ -10867,122 +10867,6 @@ __urdlllocal ur_result_t UR_APICALL urUsmP2PPeerAccessGetInfoExp( return result; } -/////////////////////////////////////////////////////////////////////////////// -/// @brief Intercept function for urEnqueueKernelLaunchWithArgsExp -__urdlllocal ur_result_t UR_APICALL urEnqueueKernelLaunchWithArgsExp( - /// [in] handle of the queue object - ur_queue_handle_t hQueue, - /// [in] handle of the kernel object - ur_kernel_handle_t hKernel, - /// [in] number of dimensions, from 1 to 3, to specify the global and - /// work-group work-items - uint32_t workDim, - /// [in][optional] pointer to an array of workDim unsigned values that - /// specify the offset used to calculate the global ID of a work-item - const size_t *pGlobalWorkOffset, - /// [in] pointer to an array of workDim unsigned values that specify the - /// number of global work-items in workDim that will execute the kernel - /// function - const size_t *pGlobalWorkSize, - /// [in][optional] pointer to an array of workDim unsigned values that - /// specify the number of local work-items forming a work-group that will - /// execute the kernel function. - /// If nullptr, the runtime implementation will choose the work-group size. - const size_t *pLocalWorkSize, - /// [in] Number of entries in pArgs - uint32_t numArgs, - /// [in][optional][range(0, numArgs)] pointer to a list of kernel arg - /// properties. - const ur_exp_kernel_arg_properties_t *pArgs, - /// [in] size of the launch prop list - uint32_t numPropsInLaunchPropList, - /// [in][optional][range(0, numPropsInLaunchPropList)] pointer to a list - /// of launch properties - const ur_kernel_launch_property_t *launchPropList, - /// [in] size of the event wait list - uint32_t numEventsInWaitList, - /// [in][optional][range(0, numEventsInWaitList)] pointer to a list of - /// events that must be complete before the kernel execution. - /// If nullptr, the numEventsInWaitList must be 0, indicating that no wait - /// event. - const ur_event_handle_t *phEventWaitList, - /// [out][optional][alloc] return an event object that identifies this - /// particular kernel execution instance. If phEventWaitList and phEvent - /// are not NULL, phEvent must not refer to an element of the - /// phEventWaitList array. - ur_event_handle_t *phEvent) { - auto pfnKernelLaunchWithArgsExp = - getContext()->urDdiTable.EnqueueExp.pfnKernelLaunchWithArgsExp; - - if (nullptr == pfnKernelLaunchWithArgsExp) { - return UR_RESULT_ERROR_UNINITIALIZED; - } - - if (getContext()->enableParameterValidation) { - if (NULL == pGlobalWorkSize) - return UR_RESULT_ERROR_INVALID_NULL_POINTER; - - if (launchPropList == NULL && numPropsInLaunchPropList > 0) - return UR_RESULT_ERROR_INVALID_NULL_POINTER; - - if (pArgs == NULL && numArgs > 0) - return UR_RESULT_ERROR_INVALID_NULL_POINTER; - - if (NULL == hQueue) - return UR_RESULT_ERROR_INVALID_NULL_HANDLE; - - if (NULL == hKernel) - return UR_RESULT_ERROR_INVALID_NULL_HANDLE; - - if (NULL != pArgs && UR_EXP_KERNEL_ARG_TYPE_SAMPLER < pArgs->type) - return UR_RESULT_ERROR_INVALID_ENUMERATION; - - if (phEventWaitList == NULL && numEventsInWaitList > 0) - return UR_RESULT_ERROR_INVALID_EVENT_WAIT_LIST; - - if (phEventWaitList != NULL && numEventsInWaitList == 0) - return UR_RESULT_ERROR_INVALID_EVENT_WAIT_LIST; - - if (pGlobalWorkSize[0] == 0 || pGlobalWorkSize[1] == 0 || - pGlobalWorkSize[2] == 0) - return UR_RESULT_ERROR_INVALID_WORK_DIMENSION; - - if (pLocalWorkSize && (pLocalWorkSize[0] == 0 || pLocalWorkSize[1] == 0 || - pLocalWorkSize[2] == 0)) - return UR_RESULT_ERROR_INVALID_WORK_GROUP_SIZE; - - if (phEventWaitList != NULL && numEventsInWaitList > 0) { - for (uint32_t i = 0; i < numEventsInWaitList; ++i) { - if (phEventWaitList[i] == NULL) { - return UR_RESULT_ERROR_INVALID_EVENT_WAIT_LIST; - } - } - } - } - - if (getContext()->enableLifetimeValidation && - !getContext()->refCountContext->isReferenceValid(hQueue)) { - URLOG_CTX_INVALID_REFERENCE(hQueue); - } - - if (getContext()->enableLifetimeValidation && - !getContext()->refCountContext->isReferenceValid(hKernel)) { - URLOG_CTX_INVALID_REFERENCE(hKernel); - } - - ur_result_t result = pfnKernelLaunchWithArgsExp( - hQueue, hKernel, workDim, pGlobalWorkOffset, pGlobalWorkSize, - pLocalWorkSize, numArgs, pArgs, numPropsInLaunchPropList, launchPropList, - numEventsInWaitList, phEventWaitList, phEvent); - - if (getContext()->enableLeakChecking && result == UR_RESULT_SUCCESS && - phEvent) { - getContext()->refCountContext->createRefCount(*phEvent); - } - - return result; -} - /////////////////////////////////////////////////////////////////////////////// /// @brief Intercept function for urEnqueueEventsWaitWithBarrierExt __urdlllocal ur_result_t UR_APICALL urEnqueueEventsWaitWithBarrierExt( @@ -11618,10 +11502,6 @@ UR_DLLEXPORT ur_result_t UR_APICALL urGetEnqueueExpProcAddrTable( ur_result_t result = UR_RESULT_SUCCESS; - dditable.pfnKernelLaunchWithArgsExp = pDdiTable->pfnKernelLaunchWithArgsExp; - pDdiTable->pfnKernelLaunchWithArgsExp = - ur_validation_layer::urEnqueueKernelLaunchWithArgsExp; - dditable.pfnUSMDeviceAllocExp = pDdiTable->pfnUSMDeviceAllocExp; pDdiTable->pfnUSMDeviceAllocExp = ur_validation_layer::urEnqueueUSMDeviceAllocExp; diff --git a/source/loader/loader.def.in b/source/loader/loader.def.in index 516f465840..3ad4714931 100644 --- a/source/loader/loader.def.in +++ b/source/loader/loader.def.in @@ -76,7 +76,6 @@ EXPORTS urEnqueueEventsWaitWithBarrier urEnqueueEventsWaitWithBarrierExt urEnqueueKernelLaunch - urEnqueueKernelLaunchWithArgsExp urEnqueueMemBufferCopy urEnqueueMemBufferCopyRect urEnqueueMemBufferFill @@ -289,7 +288,6 @@ EXPORTS urPrintEnqueueEventsWaitWithBarrierExtParams urPrintEnqueueEventsWaitWithBarrierParams urPrintEnqueueKernelLaunchParams - urPrintEnqueueKernelLaunchWithArgsExpParams urPrintEnqueueMemBufferCopyParams urPrintEnqueueMemBufferCopyRectParams urPrintEnqueueMemBufferFillParams @@ -350,9 +348,6 @@ EXPORTS urPrintExpImageCopyFlags urPrintExpImageCopyRegion urPrintExpImageMemType - urPrintExpKernelArgMemObjTuple - urPrintExpKernelArgProperties - urPrintExpKernelArgType urPrintExpPeerInfo urPrintExpSamplerAddrModes urPrintExpSamplerCubemapFilterMode diff --git a/source/loader/loader.map.in b/source/loader/loader.map.in index a0e5b81244..fde803f9aa 100644 --- a/source/loader/loader.map.in +++ b/source/loader/loader.map.in @@ -76,7 +76,6 @@ urEnqueueEventsWaitWithBarrier; urEnqueueEventsWaitWithBarrierExt; urEnqueueKernelLaunch; - urEnqueueKernelLaunchWithArgsExp; urEnqueueMemBufferCopy; urEnqueueMemBufferCopyRect; urEnqueueMemBufferFill; @@ -289,7 +288,6 @@ urPrintEnqueueEventsWaitWithBarrierExtParams; urPrintEnqueueEventsWaitWithBarrierParams; urPrintEnqueueKernelLaunchParams; - urPrintEnqueueKernelLaunchWithArgsExpParams; urPrintEnqueueMemBufferCopyParams; urPrintEnqueueMemBufferCopyRectParams; urPrintEnqueueMemBufferFillParams; @@ -350,9 +348,6 @@ urPrintExpImageCopyFlags; urPrintExpImageCopyRegion; urPrintExpImageMemType; - urPrintExpKernelArgMemObjTuple; - urPrintExpKernelArgProperties; - urPrintExpKernelArgType; urPrintExpPeerInfo; urPrintExpSamplerAddrModes; urPrintExpSamplerCubemapFilterMode; diff --git a/source/loader/ur_ldrddi.cpp b/source/loader/ur_ldrddi.cpp index 0a09a3072c..5c2c3a41af 100644 --- a/source/loader/ur_ldrddi.cpp +++ b/source/loader/ur_ldrddi.cpp @@ -5734,65 +5734,6 @@ __urdlllocal ur_result_t UR_APICALL urUsmP2PPeerAccessGetInfoExp( pPropValue, pPropSizeRet); } -/////////////////////////////////////////////////////////////////////////////// -/// @brief Intercept function for urEnqueueKernelLaunchWithArgsExp -__urdlllocal ur_result_t UR_APICALL urEnqueueKernelLaunchWithArgsExp( - /// [in] handle of the queue object - ur_queue_handle_t hQueue, - /// [in] handle of the kernel object - ur_kernel_handle_t hKernel, - /// [in] number of dimensions, from 1 to 3, to specify the global and - /// work-group work-items - uint32_t workDim, - /// [in][optional] pointer to an array of workDim unsigned values that - /// specify the offset used to calculate the global ID of a work-item - const size_t *pGlobalWorkOffset, - /// [in] pointer to an array of workDim unsigned values that specify the - /// number of global work-items in workDim that will execute the kernel - /// function - const size_t *pGlobalWorkSize, - /// [in][optional] pointer to an array of workDim unsigned values that - /// specify the number of local work-items forming a work-group that will - /// execute the kernel function. - /// If nullptr, the runtime implementation will choose the work-group size. - const size_t *pLocalWorkSize, - /// [in] Number of entries in pArgs - uint32_t numArgs, - /// [in][optional][range(0, numArgs)] pointer to a list of kernel arg - /// properties. - const ur_exp_kernel_arg_properties_t *pArgs, - /// [in] size of the launch prop list - uint32_t numPropsInLaunchPropList, - /// [in][optional][range(0, numPropsInLaunchPropList)] pointer to a list - /// of launch properties - const ur_kernel_launch_property_t *launchPropList, - /// [in] size of the event wait list - uint32_t numEventsInWaitList, - /// [in][optional][range(0, numEventsInWaitList)] pointer to a list of - /// events that must be complete before the kernel execution. - /// If nullptr, the numEventsInWaitList must be 0, indicating that no wait - /// event. - const ur_event_handle_t *phEventWaitList, - /// [out][optional][alloc] return an event object that identifies this - /// particular kernel execution instance. If phEventWaitList and phEvent - /// are not NULL, phEvent must not refer to an element of the - /// phEventWaitList array. - ur_event_handle_t *phEvent) { - - auto *dditable = *reinterpret_cast(hQueue); - - auto *pfnKernelLaunchWithArgsExp = - dditable->EnqueueExp.pfnKernelLaunchWithArgsExp; - if (nullptr == pfnKernelLaunchWithArgsExp) - return UR_RESULT_ERROR_UNINITIALIZED; - - // forward to device-platform - return pfnKernelLaunchWithArgsExp( - hQueue, hKernel, workDim, pGlobalWorkOffset, pGlobalWorkSize, - pLocalWorkSize, numArgs, pArgs, numPropsInLaunchPropList, launchPropList, - numEventsInWaitList, phEventWaitList, phEvent); -} - /////////////////////////////////////////////////////////////////////////////// /// @brief Intercept function for urEnqueueEventsWaitWithBarrierExt __urdlllocal ur_result_t UR_APICALL urEnqueueEventsWaitWithBarrierExt( @@ -6303,8 +6244,6 @@ UR_DLLEXPORT ur_result_t UR_APICALL urGetEnqueueExpProcAddrTable( if (ur_loader::getContext()->platforms.size() != 1 || ur_loader::getContext()->forceIntercept) { // return pointers to loader's DDIs - pDdiTable->pfnKernelLaunchWithArgsExp = - ur_loader::urEnqueueKernelLaunchWithArgsExp; pDdiTable->pfnUSMDeviceAllocExp = ur_loader::urEnqueueUSMDeviceAllocExp; pDdiTable->pfnUSMSharedAllocExp = ur_loader::urEnqueueUSMSharedAllocExp; pDdiTable->pfnUSMHostAllocExp = ur_loader::urEnqueueUSMHostAllocExp; diff --git a/source/loader/ur_libapi.cpp b/source/loader/ur_libapi.cpp index 59edc89920..a31b639ae5 100644 --- a/source/loader/ur_libapi.cpp +++ b/source/loader/ur_libapi.cpp @@ -10560,104 +10560,6 @@ ur_result_t UR_APICALL urUsmP2PPeerAccessGetInfoExp( return exceptionToResult(std::current_exception()); } -/////////////////////////////////////////////////////////////////////////////// -/// @brief Enqueue a command to execute a kernel -/// -/// @remarks -/// _Analogues_ -/// - **clEnqueueNDRangeKernel** -/// -/// @returns -/// - ::UR_RESULT_SUCCESS -/// - ::UR_RESULT_ERROR_UNINITIALIZED -/// - ::UR_RESULT_ERROR_DEVICE_LOST -/// - ::UR_RESULT_ERROR_ADAPTER_SPECIFIC -/// - ::UR_RESULT_ERROR_INVALID_NULL_HANDLE -/// + `NULL == hQueue` -/// + `NULL == hKernel` -/// - ::UR_RESULT_ERROR_INVALID_NULL_POINTER -/// + `NULL == pGlobalWorkSize` -/// + `launchPropList == NULL && numPropsInLaunchPropList > 0` -/// + `pArgs == NULL && numArgs > 0` -/// - ::UR_RESULT_ERROR_INVALID_ENUMERATION -/// + `NULL != pArgs && ::UR_EXP_KERNEL_ARG_TYPE_SAMPLER < pArgs->type` -/// - ::UR_RESULT_ERROR_INVALID_QUEUE -/// - ::UR_RESULT_ERROR_INVALID_KERNEL -/// - ::UR_RESULT_ERROR_INVALID_EVENT -/// - ::UR_RESULT_ERROR_INVALID_EVENT_WAIT_LIST -/// + `phEventWaitList == NULL && numEventsInWaitList > 0` -/// + `phEventWaitList != NULL && numEventsInWaitList == 0` -/// + If event objects in phEventWaitList are not valid events. -/// - ::UR_RESULT_ERROR_IN_EVENT_LIST_EXEC_STATUS -/// + An event in `phEventWaitList` has ::UR_EVENT_STATUS_ERROR. -/// - ::UR_RESULT_ERROR_INVALID_WORK_DIMENSION -/// + `pGlobalWorkSize[0] == 0 || pGlobalWorkSize[1] == 0 || -/// pGlobalWorkSize[2] == 0` -/// - ::UR_RESULT_ERROR_INVALID_WORK_GROUP_SIZE -/// + `pLocalWorkSize && (pLocalWorkSize[0] == 0 || pLocalWorkSize[1] == -/// 0 || pLocalWorkSize[2] == 0)` -/// - ::UR_RESULT_ERROR_INVALID_VALUE -/// - ::UR_RESULT_ERROR_INVALID_KERNEL_ARGS - "The kernel argument values -/// have not been specified." -/// - ::UR_RESULT_ERROR_OUT_OF_HOST_MEMORY -/// - ::UR_RESULT_ERROR_OUT_OF_RESOURCES -/// - ::UR_RESULT_ERROR_INVALID_OPERATION -/// + If any property in `launchPropList` isn't supported by the device. -ur_result_t UR_APICALL urEnqueueKernelLaunchWithArgsExp( - /// [in] handle of the queue object - ur_queue_handle_t hQueue, - /// [in] handle of the kernel object - ur_kernel_handle_t hKernel, - /// [in] number of dimensions, from 1 to 3, to specify the global and - /// work-group work-items - uint32_t workDim, - /// [in][optional] pointer to an array of workDim unsigned values that - /// specify the offset used to calculate the global ID of a work-item - const size_t *pGlobalWorkOffset, - /// [in] pointer to an array of workDim unsigned values that specify the - /// number of global work-items in workDim that will execute the kernel - /// function - const size_t *pGlobalWorkSize, - /// [in][optional] pointer to an array of workDim unsigned values that - /// specify the number of local work-items forming a work-group that will - /// execute the kernel function. - /// If nullptr, the runtime implementation will choose the work-group size. - const size_t *pLocalWorkSize, - /// [in] Number of entries in pArgs - uint32_t numArgs, - /// [in][optional][range(0, numArgs)] pointer to a list of kernel arg - /// properties. - const ur_exp_kernel_arg_properties_t *pArgs, - /// [in] size of the launch prop list - uint32_t numPropsInLaunchPropList, - /// [in][optional][range(0, numPropsInLaunchPropList)] pointer to a list - /// of launch properties - const ur_kernel_launch_property_t *launchPropList, - /// [in] size of the event wait list - uint32_t numEventsInWaitList, - /// [in][optional][range(0, numEventsInWaitList)] pointer to a list of - /// events that must be complete before the kernel execution. - /// If nullptr, the numEventsInWaitList must be 0, indicating that no wait - /// event. - const ur_event_handle_t *phEventWaitList, - /// [out][optional][alloc] return an event object that identifies this - /// particular kernel execution instance. If phEventWaitList and phEvent - /// are not NULL, phEvent must not refer to an element of the - /// phEventWaitList array. - ur_event_handle_t *phEvent) try { - auto pfnKernelLaunchWithArgsExp = - ur_lib::getContext()->urDdiTable.EnqueueExp.pfnKernelLaunchWithArgsExp; - if (nullptr == pfnKernelLaunchWithArgsExp) - return UR_RESULT_ERROR_UNINITIALIZED; - - return pfnKernelLaunchWithArgsExp( - hQueue, hKernel, workDim, pGlobalWorkOffset, pGlobalWorkSize, - pLocalWorkSize, numArgs, pArgs, numPropsInLaunchPropList, launchPropList, - numEventsInWaitList, phEventWaitList, phEvent); -} catch (...) { - return exceptionToResult(std::current_exception()); -} - /////////////////////////////////////////////////////////////////////////////// /// @brief Enqueue a barrier command which waits a list of events to complete /// before it completes, with optional extended properties diff --git a/source/loader/ur_print.cpp b/source/loader/ur_print.cpp index 0fee8b9ee2..f3d5c96e37 100644 --- a/source/loader/ur_print.cpp +++ b/source/loader/ur_print.cpp @@ -1138,30 +1138,6 @@ ur_result_t urPrintExpPeerInfo(enum ur_exp_peer_info_t value, char *buffer, return str_copy(&ss, buffer, buff_size, out_size); } -ur_result_t urPrintExpKernelArgType(enum ur_exp_kernel_arg_type_t value, - char *buffer, const size_t buff_size, - size_t *out_size) { - std::stringstream ss; - ss << value; - return str_copy(&ss, buffer, buff_size, out_size); -} - -ur_result_t urPrintExpKernelArgMemObjTuple( - const struct ur_exp_kernel_arg_mem_obj_tuple_t params, char *buffer, - const size_t buff_size, size_t *out_size) { - std::stringstream ss; - ss << params; - return str_copy(&ss, buffer, buff_size, out_size); -} - -ur_result_t urPrintExpKernelArgProperties( - const struct ur_exp_kernel_arg_properties_t params, char *buffer, - const size_t buff_size, size_t *out_size) { - std::stringstream ss; - ss << params; - return str_copy(&ss, buffer, buff_size, out_size); -} - ur_result_t urPrintExpEnqueueExtFlags(enum ur_exp_enqueue_ext_flag_t value, char *buffer, const size_t buff_size, size_t *out_size) { @@ -1893,14 +1869,6 @@ ur_result_t urPrintEnqueueWriteHostPipeParams( return str_copy(&ss, buffer, buff_size, out_size); } -ur_result_t urPrintEnqueueKernelLaunchWithArgsExpParams( - const struct ur_enqueue_kernel_launch_with_args_exp_params_t *params, - char *buffer, const size_t buff_size, size_t *out_size) { - std::stringstream ss; - ss << params; - return str_copy(&ss, buffer, buff_size, out_size); -} - ur_result_t urPrintEnqueueEventsWaitWithBarrierExtParams( const struct ur_enqueue_events_wait_with_barrier_ext_params_t *params, char *buffer, const size_t buff_size, size_t *out_size) { diff --git a/source/ur_api.cpp b/source/ur_api.cpp index 771e27c3b8..da84b7f50f 100644 --- a/source/ur_api.cpp +++ b/source/ur_api.cpp @@ -9190,95 +9190,6 @@ ur_result_t UR_APICALL urUsmP2PPeerAccessGetInfoExp( return result; } -/////////////////////////////////////////////////////////////////////////////// -/// @brief Enqueue a command to execute a kernel -/// -/// @remarks -/// _Analogues_ -/// - **clEnqueueNDRangeKernel** -/// -/// @returns -/// - ::UR_RESULT_SUCCESS -/// - ::UR_RESULT_ERROR_UNINITIALIZED -/// - ::UR_RESULT_ERROR_DEVICE_LOST -/// - ::UR_RESULT_ERROR_ADAPTER_SPECIFIC -/// - ::UR_RESULT_ERROR_INVALID_NULL_HANDLE -/// + `NULL == hQueue` -/// + `NULL == hKernel` -/// - ::UR_RESULT_ERROR_INVALID_NULL_POINTER -/// + `NULL == pGlobalWorkSize` -/// + `launchPropList == NULL && numPropsInLaunchPropList > 0` -/// + `pArgs == NULL && numArgs > 0` -/// - ::UR_RESULT_ERROR_INVALID_ENUMERATION -/// + `NULL != pArgs && ::UR_EXP_KERNEL_ARG_TYPE_SAMPLER < pArgs->type` -/// - ::UR_RESULT_ERROR_INVALID_QUEUE -/// - ::UR_RESULT_ERROR_INVALID_KERNEL -/// - ::UR_RESULT_ERROR_INVALID_EVENT -/// - ::UR_RESULT_ERROR_INVALID_EVENT_WAIT_LIST -/// + `phEventWaitList == NULL && numEventsInWaitList > 0` -/// + `phEventWaitList != NULL && numEventsInWaitList == 0` -/// + If event objects in phEventWaitList are not valid events. -/// - ::UR_RESULT_ERROR_IN_EVENT_LIST_EXEC_STATUS -/// + An event in `phEventWaitList` has ::UR_EVENT_STATUS_ERROR. -/// - ::UR_RESULT_ERROR_INVALID_WORK_DIMENSION -/// + `pGlobalWorkSize[0] == 0 || pGlobalWorkSize[1] == 0 || -/// pGlobalWorkSize[2] == 0` -/// - ::UR_RESULT_ERROR_INVALID_WORK_GROUP_SIZE -/// + `pLocalWorkSize && (pLocalWorkSize[0] == 0 || pLocalWorkSize[1] == -/// 0 || pLocalWorkSize[2] == 0)` -/// - ::UR_RESULT_ERROR_INVALID_VALUE -/// - ::UR_RESULT_ERROR_INVALID_KERNEL_ARGS - "The kernel argument values -/// have not been specified." -/// - ::UR_RESULT_ERROR_OUT_OF_HOST_MEMORY -/// - ::UR_RESULT_ERROR_OUT_OF_RESOURCES -/// - ::UR_RESULT_ERROR_INVALID_OPERATION -/// + If any property in `launchPropList` isn't supported by the device. -ur_result_t UR_APICALL urEnqueueKernelLaunchWithArgsExp( - /// [in] handle of the queue object - ur_queue_handle_t hQueue, - /// [in] handle of the kernel object - ur_kernel_handle_t hKernel, - /// [in] number of dimensions, from 1 to 3, to specify the global and - /// work-group work-items - uint32_t workDim, - /// [in][optional] pointer to an array of workDim unsigned values that - /// specify the offset used to calculate the global ID of a work-item - const size_t *pGlobalWorkOffset, - /// [in] pointer to an array of workDim unsigned values that specify the - /// number of global work-items in workDim that will execute the kernel - /// function - const size_t *pGlobalWorkSize, - /// [in][optional] pointer to an array of workDim unsigned values that - /// specify the number of local work-items forming a work-group that will - /// execute the kernel function. - /// If nullptr, the runtime implementation will choose the work-group size. - const size_t *pLocalWorkSize, - /// [in] Number of entries in pArgs - uint32_t numArgs, - /// [in][optional][range(0, numArgs)] pointer to a list of kernel arg - /// properties. - const ur_exp_kernel_arg_properties_t *pArgs, - /// [in] size of the launch prop list - uint32_t numPropsInLaunchPropList, - /// [in][optional][range(0, numPropsInLaunchPropList)] pointer to a list - /// of launch properties - const ur_kernel_launch_property_t *launchPropList, - /// [in] size of the event wait list - uint32_t numEventsInWaitList, - /// [in][optional][range(0, numEventsInWaitList)] pointer to a list of - /// events that must be complete before the kernel execution. - /// If nullptr, the numEventsInWaitList must be 0, indicating that no wait - /// event. - const ur_event_handle_t *phEventWaitList, - /// [out][optional][alloc] return an event object that identifies this - /// particular kernel execution instance. If phEventWaitList and phEvent - /// are not NULL, phEvent must not refer to an element of the - /// phEventWaitList array. - ur_event_handle_t *phEvent) { - ur_result_t result = UR_RESULT_SUCCESS; - return result; -} - /////////////////////////////////////////////////////////////////////////////// /// @brief Enqueue a barrier command which waits a list of events to complete /// before it completes, with optional extended properties diff --git a/test/conformance/CMakeLists.txt b/test/conformance/CMakeLists.txt index c1ca49f8e9..5d579dbbf5 100644 --- a/test/conformance/CMakeLists.txt +++ b/test/conformance/CMakeLists.txt @@ -79,7 +79,6 @@ set(TEST_SUBDIRECTORIES_DPCXX "integration" "exp_command_buffer" "exp_enqueue_native" - "exp_enqueue_kernel_launch_with_args" "exp_usm_p2p" "memory-migrate" "usm" diff --git a/test/conformance/exp_enqueue_kernel_launch_with_args/CMakeLists.txt b/test/conformance/exp_enqueue_kernel_launch_with_args/CMakeLists.txt deleted file mode 100644 index d03e5ef7c0..0000000000 --- a/test/conformance/exp_enqueue_kernel_launch_with_args/CMakeLists.txt +++ /dev/null @@ -1,9 +0,0 @@ -# Copyright (C) 2025 Intel Corporation -# Part of the Unified-Runtime Project, under the Apache License v2.0 with LLVM Exceptions. -# See LICENSE.TXT -# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception - -add_conformance_kernels_test( - exp_kernel_launch_with_args - urEnqueueKernelLaunchWithArgsExp.cpp -) diff --git a/test/conformance/exp_enqueue_kernel_launch_with_args/urEnqueueKernelLaunchWithArgsExp.cpp b/test/conformance/exp_enqueue_kernel_launch_with_args/urEnqueueKernelLaunchWithArgsExp.cpp deleted file mode 100644 index 093bc56004..0000000000 --- a/test/conformance/exp_enqueue_kernel_launch_with_args/urEnqueueKernelLaunchWithArgsExp.cpp +++ /dev/null @@ -1,303 +0,0 @@ -// Copyright (C) 2025 Intel Corporation -// Part of the Unified-Runtime Project, under the Apache License v2.0 with LLVM -// Exceptions. See LICENSE.TXT -// -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception - -#include - -#include - -// This test runs a kernel with a mix of local memory, pointer and value args. -struct urEnqueueKernelLaunchWithArgsTest : uur::urKernelExecutionTest { - void SetUp() override { - program_name = "saxpy_usm_local_mem"; - UUR_RETURN_ON_FATAL_FAILURE(urKernelExecutionTest::SetUp()); - - ASSERT_SUCCESS(urPlatformGetInfo(platform, UR_PLATFORM_INFO_BACKEND, - sizeof(backend), &backend, nullptr)); - - // HIP has extra args for local memory so we define an offset for arg - // indices here for updating - hip_arg_offset = backend == UR_BACKEND_HIP ? 3 : 0; - ur_device_usm_access_capability_flags_t shared_usm_flags; - ASSERT_SUCCESS( - uur::GetDeviceUSMSingleSharedSupport(device, shared_usm_flags)); - if (!(shared_usm_flags & UR_DEVICE_USM_ACCESS_CAPABILITY_FLAG_ACCESS)) { - GTEST_SKIP() << "Shared USM is not supported."; - } - - const size_t allocation_size = - sizeof(uint32_t) * global_size[0] * local_size[0]; - for (auto &shared_ptr : shared_ptrs) { - ASSERT_SUCCESS(urUSMSharedAlloc(context, device, nullptr, nullptr, - allocation_size, &shared_ptr)); - ASSERT_NE(shared_ptr, nullptr); - - std::vector pattern(allocation_size); - uur::generateMemFillPattern(pattern); - std::memcpy(shared_ptr, pattern.data(), allocation_size); - } - uint32_t current_index = 0; - // Index 0 is local_mem_a arg - args.push_back({UR_STRUCTURE_TYPE_EXP_KERNEL_ARG_PROPERTIES, - nullptr, - UR_EXP_KERNEL_ARG_TYPE_LOCAL, - current_index++, - local_mem_a_size, - {nullptr}}); - - // Hip has extra args for local mem at index 1-3 - ur_exp_kernel_arg_value_t argValue = {}; - if (backend == UR_BACKEND_HIP) { - argValue.value = &hip_local_offset; - ur_exp_kernel_arg_properties_t local_offset = { - UR_STRUCTURE_TYPE_EXP_KERNEL_ARG_PROPERTIES, - nullptr, - UR_EXP_KERNEL_ARG_TYPE_VALUE, - current_index++, - sizeof(hip_local_offset), - argValue}; - args.push_back(local_offset); - local_offset.index = current_index++; - args.push_back(local_offset); - local_offset.index = current_index++; - args.push_back(local_offset); - } - - // Index 1 is local_mem_b arg - args.push_back({UR_STRUCTURE_TYPE_EXP_KERNEL_ARG_PROPERTIES, - nullptr, - UR_EXP_KERNEL_ARG_TYPE_LOCAL, - current_index++, - local_mem_b_size, - {nullptr}}); - - if (backend == UR_BACKEND_HIP) { - argValue.value = &hip_local_offset; - ur_exp_kernel_arg_properties_t local_offset = { - UR_STRUCTURE_TYPE_EXP_KERNEL_ARG_PROPERTIES, - nullptr, - UR_EXP_KERNEL_ARG_TYPE_VALUE, - current_index++, - sizeof(hip_local_offset), - argValue}; - args.push_back(local_offset); - local_offset.index = current_index++; - args.push_back(local_offset); - local_offset.index = current_index++; - args.push_back(local_offset); - } - - // Index 2 is output - argValue.pointer = shared_ptrs[0]; - args.push_back({UR_STRUCTURE_TYPE_EXP_KERNEL_ARG_PROPERTIES, nullptr, - UR_EXP_KERNEL_ARG_TYPE_POINTER, current_index++, - sizeof(shared_ptrs[0]), argValue}); - // Index 3 is A - argValue.value = &A; - args.push_back({UR_STRUCTURE_TYPE_EXP_KERNEL_ARG_PROPERTIES, nullptr, - UR_EXP_KERNEL_ARG_TYPE_VALUE, current_index++, sizeof(A), - argValue}); - // Index 4 is X - argValue.pointer = shared_ptrs[1]; - args.push_back({UR_STRUCTURE_TYPE_EXP_KERNEL_ARG_PROPERTIES, nullptr, - UR_EXP_KERNEL_ARG_TYPE_POINTER, current_index++, - sizeof(shared_ptrs[1]), argValue}); - // Index 5 is Y - argValue.pointer = shared_ptrs[2]; - args.push_back({UR_STRUCTURE_TYPE_EXP_KERNEL_ARG_PROPERTIES, nullptr, - UR_EXP_KERNEL_ARG_TYPE_POINTER, current_index++, - sizeof(shared_ptrs[2]), argValue}); - } - - void Validate(uint32_t *output, uint32_t *X, uint32_t *Y, uint32_t A, - size_t length, size_t local_size) { - for (size_t i = 0; i < length; i++) { - uint32_t result = A * X[i] + Y[i] + local_size; - ASSERT_EQ(result, output[i]); - } - } - - virtual void TearDown() override { - for (auto &shared_ptr : shared_ptrs) { - if (shared_ptr) { - EXPECT_SUCCESS(urUSMFree(context, shared_ptr)); - } - } - - UUR_RETURN_ON_FATAL_FAILURE(urKernelExecutionTest::TearDown()); - } - - static constexpr size_t local_size[3] = {4, 1, 1}; - static constexpr size_t local_mem_a_size = local_size[0] * sizeof(uint32_t); - static constexpr size_t local_mem_b_size = local_mem_a_size * 2; - static constexpr size_t global_size[3] = {16, 1, 1}; - static constexpr size_t global_offset[3] = {0, 0, 0}; - static constexpr uint32_t workDim = 3; - static constexpr uint32_t A = 42; - std::array shared_ptrs = {nullptr, nullptr, nullptr, nullptr, - nullptr}; - - uint32_t hip_arg_offset = 0; - static constexpr uint64_t hip_local_offset = 0; - ur_backend_t backend{}; - std::vector args; -}; -UUR_INSTANTIATE_DEVICE_TEST_SUITE(urEnqueueKernelLaunchWithArgsTest); - -TEST_P(urEnqueueKernelLaunchWithArgsTest, Success) { - ASSERT_SUCCESS(urEnqueueKernelLaunchWithArgsExp( - queue, kernel, workDim, global_offset, global_size, local_size, - args.size(), args.data(), 0, nullptr, 0, nullptr, nullptr)); - ASSERT_SUCCESS(urQueueFinish(queue)); - - uint32_t *output = (uint32_t *)shared_ptrs[0]; - uint32_t *X = (uint32_t *)shared_ptrs[1]; - uint32_t *Y = (uint32_t *)shared_ptrs[2]; - Validate(output, X, Y, A, global_size[0], local_size[0]); -} - -TEST_P(urEnqueueKernelLaunchWithArgsTest, InvalidNullHandleQueue) { - ASSERT_EQ_RESULT(UR_RESULT_ERROR_INVALID_NULL_HANDLE, - urEnqueueKernelLaunchWithArgsExp( - nullptr, kernel, workDim, global_offset, global_size, - local_size, args.size(), args.data(), 0, nullptr, 0, - nullptr, nullptr)); -} - -TEST_P(urEnqueueKernelLaunchWithArgsTest, InvalidNullHandleKernel) { - ASSERT_EQ_RESULT(UR_RESULT_ERROR_INVALID_NULL_HANDLE, - urEnqueueKernelLaunchWithArgsExp( - queue, nullptr, workDim, global_offset, global_size, - local_size, args.size(), args.data(), 0, nullptr, 0, - nullptr, nullptr)); -} - -TEST_P(urEnqueueKernelLaunchWithArgsTest, InvalidNullPointerGlobalSize) { - ASSERT_EQ_RESULT(UR_RESULT_ERROR_INVALID_NULL_POINTER, - urEnqueueKernelLaunchWithArgsExp( - queue, kernel, workDim, global_offset, nullptr, - local_size, args.size(), args.data(), 0, nullptr, 0, - nullptr, nullptr)); -} - -TEST_P(urEnqueueKernelLaunchWithArgsTest, InvalidNullPointerProperties) { - ASSERT_EQ_RESULT(UR_RESULT_ERROR_INVALID_NULL_POINTER, - urEnqueueKernelLaunchWithArgsExp( - queue, kernel, workDim, global_offset, global_size, - local_size, args.size(), args.data(), 1, nullptr, 0, - nullptr, nullptr)); -} - -TEST_P(urEnqueueKernelLaunchWithArgsTest, InvalidNullPointerArgs) { - ASSERT_EQ_RESULT(UR_RESULT_ERROR_INVALID_NULL_POINTER, - urEnqueueKernelLaunchWithArgsExp( - queue, kernel, workDim, global_offset, global_size, - local_size, args.size(), nullptr, 0, nullptr, 0, nullptr, - nullptr)); -} - -TEST_P(urEnqueueKernelLaunchWithArgsTest, InvalidEventWaitList) { - ASSERT_EQ_RESULT(UR_RESULT_ERROR_INVALID_EVENT_WAIT_LIST, - urEnqueueKernelLaunchWithArgsExp( - queue, kernel, workDim, global_offset, global_size, - local_size, args.size(), args.data(), 0, nullptr, 1, - nullptr, nullptr)); - ur_event_handle_t event = nullptr; - ASSERT_EQ_RESULT(UR_RESULT_ERROR_INVALID_EVENT_WAIT_LIST, - urEnqueueKernelLaunchWithArgsExp( - queue, kernel, workDim, global_offset, global_size, - local_size, args.size(), args.data(), 0, nullptr, 0, - &event, nullptr)); -} - -// This test runs a kernel with a buffer (MEM_OBJ) arg. -struct urEnqueueKernelLaunchWithArgsMemObjTest : uur::urKernelExecutionTest { - void SetUp() override { - program_name = "fill"; - UUR_RETURN_ON_FATAL_FAILURE(urKernelExecutionTest::SetUp()); - - ASSERT_SUCCESS(urPlatformGetInfo(platform, UR_PLATFORM_INFO_BACKEND, - sizeof(backend), &backend, nullptr)); - - ASSERT_SUCCESS(urMemBufferCreate(context, UR_MEM_FLAG_READ_WRITE, - sizeof(val) * global_size[0], nullptr, - &buffer)); - - char zero = 0; - ASSERT_SUCCESS(urEnqueueMemBufferFill(queue, buffer, &zero, sizeof(zero), 0, - buffer_size, 0, nullptr, nullptr)); - ASSERT_SUCCESS(urQueueFinish(queue)); - - // First argument is buffer to fill - unsigned current_arg_index = 0; - ur_exp_kernel_arg_mem_obj_tuple_t buffer_and_properties = {buffer, 0}; - ur_exp_kernel_arg_properties_t arg = { - UR_STRUCTURE_TYPE_EXP_KERNEL_ARG_PROPERTIES, - nullptr, - UR_EXP_KERNEL_ARG_TYPE_MEM_OBJ, - current_arg_index++, - sizeof(buffer), - {nullptr}}; - arg.value.memObjTuple = buffer_and_properties; - args.push_back(arg); - - // Add accessor arguments depending on backend. - // HIP has 3 offset parameters and other backends only have 1. - if (backend == UR_BACKEND_HIP) { - arg.type = UR_EXP_KERNEL_ARG_TYPE_VALUE; - arg.size = sizeof(hip_local_offset); - arg.value.value = &hip_local_offset; - arg.index = current_arg_index++; - args.push_back(arg); - arg.index = current_arg_index++; - args.push_back(arg); - arg.index = current_arg_index++; - args.push_back(arg); - } else { - arg.type = UR_EXP_KERNEL_ARG_TYPE_VALUE; - arg.index = current_arg_index++; - arg.size = sizeof(accessor); - arg.value.value = &accessor; - args.push_back(arg); - } - - // Second user defined argument is scalar to fill with. - arg.type = UR_EXP_KERNEL_ARG_TYPE_VALUE; - arg.index = current_arg_index++; - arg.size = sizeof(val); - arg.value.value = &val; - args.push_back(arg); - } - - void TearDown() override { - if (buffer) { - EXPECT_SUCCESS(urMemRelease(buffer)); - } - - UUR_RETURN_ON_FATAL_FAILURE(urKernelExecutionTest::TearDown()); - } - - static constexpr uint32_t val = 42; - static constexpr size_t global_size[3] = {32, 1, 1}; - static constexpr uint32_t workDim = 3; - static constexpr size_t buffer_size = sizeof(val) * global_size[0]; - static constexpr uint64_t hip_local_offset = 0; - ur_backend_t backend{}; - ur_mem_handle_t buffer = nullptr; - // This is the accessor offset struct sycl kernels expect to accompany buffer args. - struct { - size_t offsets[1] = {0}; - } accessor; - std::vector args; -}; -UUR_INSTANTIATE_DEVICE_TEST_SUITE(urEnqueueKernelLaunchWithArgsMemObjTest); - -TEST_P(urEnqueueKernelLaunchWithArgsMemObjTest, Success) { - ASSERT_SUCCESS(urEnqueueKernelLaunchWithArgsExp( - queue, kernel, workDim, nullptr, global_size, nullptr, args.size(), - args.data(), 0, nullptr, 0, nullptr, nullptr)); - ASSERT_SUCCESS(urQueueFinish(queue)); - ValidateBuffer(buffer, buffer_size, val); -} From 2f223c597d854a87227764fbe44b34cd2247d6b0 Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" Date: Tue, 5 Aug 2025 00:45:22 +0000 Subject: [PATCH 3/3] Update intel/llvm mirror base commit to d25d6d69 --- .github/intel-llvm-mirror-base-commit | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/intel-llvm-mirror-base-commit b/.github/intel-llvm-mirror-base-commit index 513bceac10..8abe736e8e 100644 --- a/.github/intel-llvm-mirror-base-commit +++ b/.github/intel-llvm-mirror-base-commit @@ -1 +1 @@ -9f81215b9beaeeefe7bb1dac850cedf5adb1b343 +d25d6d6919de5c15e69e5b69d848b05876da23a3