diff --git a/sycl/source/detail/device_global_map_entry.cpp b/sycl/source/detail/device_global_map_entry.cpp index f2bc2d5b998fd..908ffbc0bd942 100644 --- a/sycl/source/detail/device_global_map_entry.cpp +++ b/sycl/source/detail/device_global_map_entry.cpp @@ -87,6 +87,48 @@ DeviceGlobalMapEntry::getOrAllocateDeviceGlobalUSM(queue_impl &QueueImpl) { return NewAlloc; } +DeviceGlobalUSMMem & +DeviceGlobalMapEntry::getOrAllocateDeviceGlobalUSM(const context &Context) { + assert(!MIsDeviceImageScopeDecorated && + "USM allocations should not be acquired for device_global with " + "device_image_scope property."); + const std::shared_ptr &CtxImpl = getSyclObjImpl(Context); + const std::shared_ptr &DevImpl = + getSyclObjImpl(CtxImpl->getDevices().front()); + std::lock_guard Lock(MDeviceToUSMPtrMapMutex); + + auto DGUSMPtr = MDeviceToUSMPtrMap.find({DevImpl.get(), CtxImpl.get()}); + if (DGUSMPtr != MDeviceToUSMPtrMap.end()) + return DGUSMPtr->second; + + void *NewDGUSMPtr = detail::usm::alignedAllocInternal( + 0, MDeviceGlobalTSize, CtxImpl.get(), DevImpl.get(), + sycl::usm::alloc::device); + + auto NewAllocIt = MDeviceToUSMPtrMap.emplace( + std::piecewise_construct, + std::forward_as_tuple(DevImpl.get(), CtxImpl.get()), + std::forward_as_tuple(NewDGUSMPtr)); + assert(NewAllocIt.second && + "USM allocation for device and context already happened."); + DeviceGlobalUSMMem &NewAlloc = NewAllocIt.first->second; + + // C++ guarantees members appear in memory in the order they are declared, + // so since the member variable that contains the initial contents of the + // device_global is right after the usm_ptr member variable we can do + // some pointer arithmetic to memcopy over this value to the usm_ptr. This + // value inside of the device_global will be zero-initialized if it was not + // given a value on construction. + MemoryManager::context_copy_usm( + reinterpret_cast( + reinterpret_cast(MDeviceGlobalPtr) + + sizeof(MDeviceGlobalPtr)), + CtxImpl, MDeviceGlobalTSize, NewAlloc.MPtr); + + CtxImpl->addAssociatedDeviceGlobal(MDeviceGlobalPtr); + return NewAlloc; +} + void DeviceGlobalMapEntry::removeAssociatedResources( const context_impl *CtxImpl) { std::lock_guard Lock{MDeviceToUSMPtrMapMutex}; diff --git a/sycl/source/detail/device_global_map_entry.hpp b/sycl/source/detail/device_global_map_entry.hpp index 10d122c967677..cfa86a6639e43 100644 --- a/sycl/source/detail/device_global_map_entry.hpp +++ b/sycl/source/detail/device_global_map_entry.hpp @@ -111,6 +111,11 @@ struct DeviceGlobalMapEntry { // Gets or allocates USM memory for a device_global. DeviceGlobalUSMMem &getOrAllocateDeviceGlobalUSM(queue_impl &QueueImpl); + // This overload allows the allocation to be initialized without a queue. The + // UR adapter in use must report true for + // UR_DEVICE_INFO_USM_CONTEXT_MEMCPY_SUPPORT_EXP to take advantage of this. + DeviceGlobalUSMMem &getOrAllocateDeviceGlobalUSM(const context &Context); + // Removes resources for device_globals associated with the context. void removeAssociatedResources(const context_impl *CtxImpl); diff --git a/sycl/source/detail/kernel_bundle_impl.hpp b/sycl/source/detail/kernel_bundle_impl.hpp index 5f54d75b860ad..b0747f22c725d 100644 --- a/sycl/source/detail/kernel_bundle_impl.hpp +++ b/sycl/source/detail/kernel_bundle_impl.hpp @@ -665,13 +665,21 @@ class kernel_bundle_impl { "'device_image_scope' property"); } - // TODO: Add context-only initialization via `urUSMContextMemcpyExp` instead - // of using a throw-away queue. - queue InitQueue{MContext, Dev}; - auto &USMMem = - Entry->getOrAllocateDeviceGlobalUSM(*getSyclObjImpl(InitQueue)); - InitQueue.wait_and_throw(); - return USMMem.getPtr(); + const auto &DeviceImpl = getSyclObjImpl(Dev); + bool SupportContextMemcpy = false; + DeviceImpl->getAdapter()->call( + DeviceImpl->getHandleRef(), + UR_DEVICE_INFO_USM_CONTEXT_MEMCPY_SUPPORT_EXP, + sizeof(SupportContextMemcpy), &SupportContextMemcpy, nullptr); + if (SupportContextMemcpy) { + return Entry->getOrAllocateDeviceGlobalUSM(MContext).getPtr(); + } else { + queue InitQueue{MContext, Dev}; + auto &USMMem = + Entry->getOrAllocateDeviceGlobalUSM(*getSyclObjImpl(InitQueue)); + InitQueue.wait_and_throw(); + return USMMem.getPtr(); + } } size_t ext_oneapi_get_device_global_size(const std::string &Name) const { diff --git a/sycl/source/detail/memory_manager.cpp b/sycl/source/detail/memory_manager.cpp index 800da83f73760..cc20d700c672f 100644 --- a/sycl/source/detail/memory_manager.cpp +++ b/sycl/source/detail/memory_manager.cpp @@ -891,6 +891,16 @@ void MemoryManager::copy_usm(const void *SrcMem, queue_impl &SrcQueue, DepEvents.data(), OutEvent); } +void MemoryManager::context_copy_usm(const void *SrcMem, ContextImplPtr Context, + size_t Len, void *DstMem) { + if (!SrcMem || !DstMem) + throw exception(make_error_code(errc::invalid), + "NULL pointer argument in memory copy operation."); + const AdapterPtr &Adapter = Context->getAdapter(); + Adapter->call(Context->getHandleRef(), + DstMem, SrcMem, Len); +} + void MemoryManager::fill_usm(void *Mem, queue_impl &Queue, size_t Length, const std::vector &Pattern, std::vector DepEvents, diff --git a/sycl/source/detail/memory_manager.hpp b/sycl/source/detail/memory_manager.hpp index adb68f9f7e421..30d790189fad2 100644 --- a/sycl/source/detail/memory_manager.hpp +++ b/sycl/source/detail/memory_manager.hpp @@ -140,6 +140,9 @@ class MemoryManager { void *DstMem, std::vector DepEvents, ur_event_handle_t *OutEvent); + static void context_copy_usm(const void *SrcMem, ContextImplPtr Context, + size_t Len, void *DstMem); + static void fill_usm(void *DstMem, queue_impl &Queue, size_t Len, const std::vector &Pattern, std::vector DepEvents, diff --git a/unified-runtime/include/ur_api.h b/unified-runtime/include/ur_api.h index 0b6ce6ed4280f..9c967b411131d 100644 --- a/unified-runtime/include/ur_api.h +++ b/unified-runtime/include/ur_api.h @@ -465,6 +465,8 @@ typedef enum ur_function_t { UR_FUNCTION_BINDLESS_IMAGES_FREE_MAPPED_LINEAR_MEMORY_EXP = 271, /// Enumerator for ::urKernelSuggestMaxCooperativeGroupCount UR_FUNCTION_KERNEL_SUGGEST_MAX_COOPERATIVE_GROUP_COUNT = 272, + /// Enumerator for ::urUSMContextMemcpyExp + UR_FUNCTION_USM_CONTEXT_MEMCPY_EXP = 273, /// @cond UR_FUNCTION_FORCE_UINT32 = 0x7fffffff /// @endcond @@ -2419,6 +2421,9 @@ typedef enum ur_device_info_t { /// [::ur_bool_t] Returns true if the device supports the multi device /// compile experimental feature. UR_DEVICE_INFO_MULTI_DEVICE_COMPILE_SUPPORT_EXP = 0x6000, + /// [::ur_bool_t] returns true if the device supports + /// ::urUSMContextMemcpyExp + UR_DEVICE_INFO_USM_CONTEXT_MEMCPY_SUPPORT_EXP = 0x7000, /// @cond UR_DEVICE_INFO_FORCE_UINT32 = 0x7fffffff /// @endcond @@ -2444,7 +2449,7 @@ typedef enum ur_device_info_t { /// - ::UR_RESULT_ERROR_INVALID_NULL_HANDLE /// + `NULL == hDevice` /// - ::UR_RESULT_ERROR_INVALID_ENUMERATION -/// + `::UR_DEVICE_INFO_MULTI_DEVICE_COMPILE_SUPPORT_EXP < propName` +/// + `::UR_DEVICE_INFO_USM_CONTEXT_MEMCPY_SUPPORT_EXP < propName` /// - ::UR_RESULT_ERROR_UNSUPPORTED_ENUMERATION /// + If `propName` is not supported by the adapter. /// - ::UR_RESULT_ERROR_INVALID_SIZE @@ -12414,6 +12419,43 @@ UR_APIEXPORT ur_result_t UR_APICALL urProgramLinkExp( /// [out][alloc] pointer to handle of program object created. ur_program_handle_t *phProgram); +#if !defined(__GNUC__) +#pragma endregion +#endif +// Intel 'oneAPI' Unified Runtime Experimental APIs for USM Context Memcpy +#if !defined(__GNUC__) +#pragma region usm_context_memcpy_(experimental) +#endif +/////////////////////////////////////////////////////////////////////////////// +/// @brief Perform a synchronous, blocking memcpy operation between USM +/// allocations. +/// +/// @returns +/// - ::UR_RESULT_SUCCESS +/// - ::UR_RESULT_ERROR_UNINITIALIZED +/// - ::UR_RESULT_ERROR_DEVICE_LOST +/// - ::UR_RESULT_ERROR_ADAPTER_SPECIFIC +/// - ::UR_RESULT_ERROR_INVALID_NULL_HANDLE +/// + `NULL == hContext` +/// - ::UR_RESULT_ERROR_INVALID_NULL_POINTER +/// + `NULL == pDst` +/// + `NULL == pSrc` +/// - ::UR_RESULT_SUCCESS +/// - ::UR_RESULT_ERROR_ADAPTER_SPECIFIC +/// - ::UR_RESULT_ERROR_INVALID_SIZE +/// + `size == 0` +/// + If `size` is higher than the allocation size of `pSrc` or `pDst` +UR_APIEXPORT ur_result_t UR_APICALL urUSMContextMemcpyExp( + /// [in] Context associated with the device(s) that own the allocations + /// `pSrc` and `pDst`. + ur_context_handle_t hContext, + /// [in] Destination pointer to copy to. + void *pDst, + /// [in] Source pointer to copy from. + const void *pSrc, + /// [in] Size in bytes to be copied. + size_t size); + #if !defined(__GNUC__) #pragma endregion #endif @@ -14535,6 +14577,17 @@ typedef struct ur_usm_pitched_alloc_exp_params_t { size_t **ppResultPitch; } ur_usm_pitched_alloc_exp_params_t; +/////////////////////////////////////////////////////////////////////////////// +/// @brief Function parameters for urUSMContextMemcpyExp +/// @details Each entry is a pointer to the parameter passed to the function; +/// allowing the callback the ability to modify the parameter's value +typedef struct ur_usm_context_memcpy_exp_params_t { + ur_context_handle_t *phContext; + void **ppDst; + const void **ppSrc; + size_t *psize; +} ur_usm_context_memcpy_exp_params_t; + /////////////////////////////////////////////////////////////////////////////// /// @brief Function parameters for urUSMImportExp /// @details Each entry is a pointer to the parameter passed to the function; diff --git a/unified-runtime/include/ur_api_funcs.def b/unified-runtime/include/ur_api_funcs.def index 7b8e0b49f2e5c..6419a99fba364 100644 --- a/unified-runtime/include/ur_api_funcs.def +++ b/unified-runtime/include/ur_api_funcs.def @@ -158,6 +158,7 @@ _UR_API(urUSMPoolSetDevicePoolExp) _UR_API(urUSMPoolGetDevicePoolExp) _UR_API(urUSMPoolTrimToExp) _UR_API(urUSMPitchedAllocExp) +_UR_API(urUSMContextMemcpyExp) _UR_API(urUSMImportExp) _UR_API(urUSMReleaseExp) _UR_API(urBindlessImagesUnsampledImageHandleDestroyExp) diff --git a/unified-runtime/include/ur_ddi.h b/unified-runtime/include/ur_ddi.h index f0c6d14e56a6f..c80b5cee73e18 100644 --- a/unified-runtime/include/ur_ddi.h +++ b/unified-runtime/include/ur_ddi.h @@ -1317,6 +1317,11 @@ typedef ur_result_t(UR_APICALL *ur_pfnUSMPitchedAllocExp_t)( ur_context_handle_t, ur_device_handle_t, const ur_usm_desc_t *, ur_usm_pool_handle_t, size_t, size_t, size_t, void **, size_t *); +/////////////////////////////////////////////////////////////////////////////// +/// @brief Function-pointer for urUSMContextMemcpyExp +typedef ur_result_t(UR_APICALL *ur_pfnUSMContextMemcpyExp_t)( + ur_context_handle_t, void *, const void *, size_t); + /////////////////////////////////////////////////////////////////////////////// /// @brief Function-pointer for urUSMImportExp typedef ur_result_t(UR_APICALL *ur_pfnUSMImportExp_t)(ur_context_handle_t, @@ -1339,6 +1344,7 @@ typedef struct ur_usm_exp_dditable_t { ur_pfnUSMPoolGetDevicePoolExp_t pfnPoolGetDevicePoolExp; ur_pfnUSMPoolTrimToExp_t pfnPoolTrimToExp; ur_pfnUSMPitchedAllocExp_t pfnPitchedAllocExp; + ur_pfnUSMContextMemcpyExp_t pfnContextMemcpyExp; ur_pfnUSMImportExp_t pfnImportExp; ur_pfnUSMReleaseExp_t pfnReleaseExp; } ur_usm_exp_dditable_t; diff --git a/unified-runtime/include/ur_print.h b/unified-runtime/include/ur_print.h index f281f18aff0a5..62964fbb4e2f0 100644 --- a/unified-runtime/include/ur_print.h +++ b/unified-runtime/include/ur_print.h @@ -2945,6 +2945,16 @@ UR_APIEXPORT ur_result_t UR_APICALL urPrintUsmPitchedAllocExpParams( const struct ur_usm_pitched_alloc_exp_params_t *params, char *buffer, const size_t buff_size, size_t *out_size); +/////////////////////////////////////////////////////////////////////////////// +/// @brief Print ur_usm_context_memcpy_exp_params_t struct +/// @returns +/// - ::UR_RESULT_SUCCESS +/// - ::UR_RESULT_ERROR_INVALID_SIZE +/// - `buff_size < out_size` +UR_APIEXPORT ur_result_t UR_APICALL urPrintUsmContextMemcpyExpParams( + const struct ur_usm_context_memcpy_exp_params_t *params, char *buffer, + const size_t buff_size, size_t *out_size); + /////////////////////////////////////////////////////////////////////////////// /// @brief Print ur_usm_import_exp_params_t struct /// @returns diff --git a/unified-runtime/include/ur_print.hpp b/unified-runtime/include/ur_print.hpp index e039cca3cbf85..52613755d7c3e 100644 --- a/unified-runtime/include/ur_print.hpp +++ b/unified-runtime/include/ur_print.hpp @@ -1257,6 +1257,9 @@ inline std::ostream &operator<<(std::ostream &os, enum ur_function_t value) { case UR_FUNCTION_KERNEL_SUGGEST_MAX_COOPERATIVE_GROUP_COUNT: os << "UR_FUNCTION_KERNEL_SUGGEST_MAX_COOPERATIVE_GROUP_COUNT"; break; + case UR_FUNCTION_USM_CONTEXT_MEMCPY_EXP: + os << "UR_FUNCTION_USM_CONTEXT_MEMCPY_EXP"; + break; default: os << "unknown enumerator"; break; @@ -3099,6 +3102,9 @@ inline std::ostream &operator<<(std::ostream &os, enum ur_device_info_t value) { case UR_DEVICE_INFO_MULTI_DEVICE_COMPILE_SUPPORT_EXP: os << "UR_DEVICE_INFO_MULTI_DEVICE_COMPILE_SUPPORT_EXP"; break; + case UR_DEVICE_INFO_USM_CONTEXT_MEMCPY_SUPPORT_EXP: + os << "UR_DEVICE_INFO_USM_CONTEXT_MEMCPY_SUPPORT_EXP"; + break; default: os << "unknown enumerator"; break; @@ -5224,6 +5230,19 @@ inline ur_result_t printTagged(std::ostream &os, const void *ptr, os << ")"; } break; + case UR_DEVICE_INFO_USM_CONTEXT_MEMCPY_SUPPORT_EXP: { + const ur_bool_t *tptr = (const ur_bool_t *)ptr; + if (sizeof(ur_bool_t) > size) { + os << "invalid size (is: " << size + << ", expected: >=" << sizeof(ur_bool_t) << ")"; + return UR_RESULT_ERROR_INVALID_SIZE; + } + os << (const void *)(tptr) << " ("; + + os << *tptr; + + os << ")"; + } break; default: os << "unknown enumerator"; return UR_RESULT_ERROR_INVALID_ENUMERATION; @@ -17831,6 +17850,36 @@ inline std::ostream &operator<<( return os; } +/////////////////////////////////////////////////////////////////////////////// +/// @brief Print operator for the ur_usm_context_memcpy_exp_params_t type +/// @returns +/// std::ostream & +inline std::ostream &operator<<( + std::ostream &os, + [[maybe_unused]] const struct ur_usm_context_memcpy_exp_params_t *params) { + + os << ".hContext = "; + + ur::details::printPtr(os, *(params->phContext)); + + os << ", "; + os << ".pDst = "; + + ur::details::printPtr(os, *(params->ppDst)); + + os << ", "; + os << ".pSrc = "; + + ur::details::printPtr(os, *(params->ppSrc)); + + os << ", "; + os << ".size = "; + + os << *(params->psize); + + return os; +} + /////////////////////////////////////////////////////////////////////////////// /// @brief Print operator for the ur_usm_import_exp_params_t type /// @returns @@ -21269,6 +21318,9 @@ inline ur_result_t UR_APICALL printFunctionParams(std::ostream &os, case UR_FUNCTION_USM_PITCHED_ALLOC_EXP: { os << (const struct ur_usm_pitched_alloc_exp_params_t *)params; } break; + case UR_FUNCTION_USM_CONTEXT_MEMCPY_EXP: { + os << (const struct ur_usm_context_memcpy_exp_params_t *)params; + } break; case UR_FUNCTION_USM_IMPORT_EXP: { os << (const struct ur_usm_import_exp_params_t *)params; } break; diff --git a/unified-runtime/scripts/core/EXP-USM-CONTEXT-MEMCPY.rst b/unified-runtime/scripts/core/EXP-USM-CONTEXT-MEMCPY.rst new file mode 100644 index 0000000000000..9af1999deeb25 --- /dev/null +++ b/unified-runtime/scripts/core/EXP-USM-CONTEXT-MEMCPY.rst @@ -0,0 +1,63 @@ +<% + OneApi=tags['$OneApi'] + x=tags['$x'] + X=x.upper() +%> + +.. _experimental-usm-context-memcpy: + +================================================================================ +USM Context Memcpy +================================================================================ + +.. warning:: + + Experimental features: + + * May be replaced, updated, or removed at any time. + * Do not require maintaining API/ABI stability of their own additions over + time. + * Do not require conformance testing of their own additions. + + +Motivation +-------------------------------------------------------------------------------- + +In order to support device globals there's a need for a blocking USM write +operation that doesn't need a queue. This is to facilitate fast initialization +of the device global memory via native APIs that enable this kind of operation. + +API +-------------------------------------------------------------------------------- + +Enums +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +* ${x}_device_info_t + * ${X}_DEVICE_INFO_USM_CONTEXT_MEMCPY_SUPPORT_EXP + +Functions +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +* ${x}USMContextMemcpyExp + +Changelog +-------------------------------------------------------------------------------- + ++-----------+---------------------------+ +| Revision | Changes | ++===========+===========================+ +| 1.0 | Initial Draft | ++-----------+---------------------------+ + + +Support +-------------------------------------------------------------------------------- + +Adapters which support this experimental feature *must* return true for the new +``${X}_DEVICE_INFO_USM_CONTEXT_MEMCPY_SUPPORT_EXP`` device info query. + + +Contributors +-------------------------------------------------------------------------------- + +* Aaron Greig `aaron.greig@codeplay.com ` diff --git a/unified-runtime/scripts/core/exp-usm-context-memcpy.yml b/unified-runtime/scripts/core/exp-usm-context-memcpy.yml new file mode 100644 index 0000000000000..40c4328e28832 --- /dev/null +++ b/unified-runtime/scripts/core/exp-usm-context-memcpy.yml @@ -0,0 +1,48 @@ +# +# Copyright (C) 2025 Intel Corporation +# +# Part of the Unified-Runtime Project, under the Apache License v2.0 with LLVM Exceptions. +# See LICENSE.TXT +# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +# +# See YaML.md for syntax definition +# +--- #-------------------------------------------------------------------------- +type: header +desc: "Intel $OneApi Unified Runtime Experimental APIs for USM Context Memcpy" +ordinal: "99" +--- #-------------------------------------------------------------------------- +type: enum +extend: true +typed_etors: true +desc: "Extension enums to $x_device_info_t to support $xUSMContextMemcpy" +name: $x_device_info_t +etors: + - name: USM_CONTEXT_MEMCPY_SUPPORT_EXP + value: "0x7000" + desc: "[$x_bool_t] returns true if the device supports $xUSMContextMemcpyExp" +--- #-------------------------------------------------------------------------- +type: function +desc: "Perform a synchronous, blocking memcpy operation between USM allocations." +class: $xUSM +name: ContextMemcpyExp +ordinal: "0" +params: + - type: $x_context_handle_t + name: hContext + desc: "[in] Context associated with the device(s) that own the allocations `pSrc` and `pDst`." + - type: void* + name: pDst + desc: "[in] Destination pointer to copy to." + - type: const void* + name: pSrc + desc: "[in] Source pointer to copy from." + - type: size_t + name: size + desc: "[in] Size in bytes to be copied." +returns: + - $X_RESULT_SUCCESS + - $X_RESULT_ERROR_ADAPTER_SPECIFIC + - $X_RESULT_ERROR_INVALID_SIZE: + - "`size == 0`" + - "If `size` is higher than the allocation size of `pSrc` or `pDst`" diff --git a/unified-runtime/scripts/core/registry.yml b/unified-runtime/scripts/core/registry.yml index cd3ad5f0905d5..81845b0db9afa 100644 --- a/unified-runtime/scripts/core/registry.yml +++ b/unified-runtime/scripts/core/registry.yml @@ -655,6 +655,9 @@ etors: - name: KERNEL_SUGGEST_MAX_COOPERATIVE_GROUP_COUNT desc: Enumerator for $xKernelSuggestMaxCooperativeGroupCount value: '272' +- name: USM_CONTEXT_MEMCPY_EXP + desc: Enumerator for $xUSMContextMemcpyExp + value: '273' --- type: enum desc: Defines structure types diff --git a/unified-runtime/source/adapters/cuda/device.cpp b/unified-runtime/source/adapters/cuda/device.cpp index 2cb43ebc88356..f7ef8c6e8ef2b 100644 --- a/unified-runtime/source/adapters/cuda/device.cpp +++ b/unified-runtime/source/adapters/cuda/device.cpp @@ -1166,6 +1166,8 @@ UR_APIEXPORT ur_result_t UR_APICALL urDeviceGetInfo(ur_device_handle_t hDevice, return ReturnValue(true); case UR_DEVICE_INFO_LOW_POWER_EVENTS_SUPPORT_EXP: return ReturnValue(false); + case UR_DEVICE_INFO_USM_CONTEXT_MEMCPY_SUPPORT_EXP: + return ReturnValue(false); case UR_DEVICE_INFO_USE_NATIVE_ASSERT: return ReturnValue(true); case UR_DEVICE_INFO_USM_P2P_SUPPORT_EXP: diff --git a/unified-runtime/source/adapters/cuda/ur_interface_loader.cpp b/unified-runtime/source/adapters/cuda/ur_interface_loader.cpp index 6fa9687f3979c..e60bdd11c59d7 100644 --- a/unified-runtime/source/adapters/cuda/ur_interface_loader.cpp +++ b/unified-runtime/source/adapters/cuda/ur_interface_loader.cpp @@ -384,6 +384,7 @@ UR_DLLEXPORT ur_result_t UR_APICALL urGetUSMExpProcAddrTable( pDdiTable->pfnPoolSetDevicePoolExp = urUSMPoolSetDevicePoolExp; pDdiTable->pfnPoolGetDevicePoolExp = urUSMPoolGetDevicePoolExp; pDdiTable->pfnPoolTrimToExp = urUSMPoolTrimToExp; + pDdiTable->pfnContextMemcpyExp = urUSMContextMemcpyExp; return UR_RESULT_SUCCESS; } diff --git a/unified-runtime/source/adapters/cuda/usm.cpp b/unified-runtime/source/adapters/cuda/usm.cpp index 723abf4be16c7..a1d8b9455c2f6 100644 --- a/unified-runtime/source/adapters/cuda/usm.cpp +++ b/unified-runtime/source/adapters/cuda/usm.cpp @@ -567,3 +567,9 @@ urUSMPoolTrimToExp(ur_context_handle_t hContext, ur_device_handle_t hDevice, return UR_RESULT_SUCCESS; } + +UR_APIEXPORT ur_result_t UR_APICALL urUSMContextMemcpyExp(ur_context_handle_t, + void *, const void *, + size_t) { + return UR_RESULT_ERROR_UNSUPPORTED_FEATURE; +} diff --git a/unified-runtime/source/adapters/hip/device.cpp b/unified-runtime/source/adapters/hip/device.cpp index f991dbf7db416..74e1f7e18dc2a 100644 --- a/unified-runtime/source/adapters/hip/device.cpp +++ b/unified-runtime/source/adapters/hip/device.cpp @@ -1031,11 +1031,12 @@ UR_APIEXPORT ur_result_t UR_APICALL urDeviceGetInfo(ur_device_handle_t hDevice, } case UR_DEVICE_INFO_COMMAND_BUFFER_EVENT_SUPPORT_EXP: return ReturnValue(false); + case UR_DEVICE_INFO_USM_CONTEXT_MEMCPY_SUPPORT_EXP: + return ReturnValue(false); case UR_DEVICE_INFO_COMMAND_BUFFER_SUBGRAPH_SUPPORT_EXP: return ReturnValue(true); - case UR_DEVICE_INFO_LOW_POWER_EVENTS_SUPPORT_EXP: { + case UR_DEVICE_INFO_LOW_POWER_EVENTS_SUPPORT_EXP: return ReturnValue(false); - } case UR_DEVICE_INFO_USE_NATIVE_ASSERT: return ReturnValue(true); case UR_DEVICE_INFO_USM_P2P_SUPPORT_EXP: @@ -1103,7 +1104,8 @@ UR_APIEXPORT ur_result_t UR_APICALL urDeviceGet(ur_platform_handle_t hPlatform, /// Gets the native HIP handle of a UR device object /// /// \param[in] hDevice The UR device to get the native HIP object of. -/// \param[out] phNativeHandle Set to the native handle of the UR device object. +/// \param[out] phNativeHandle Set to the native handle of the UR device +/// object. /// /// \return UR_RESULT_SUCCESS UR_APIEXPORT ur_result_t UR_APICALL urDeviceGetNativeHandle( diff --git a/unified-runtime/source/adapters/hip/ur_interface_loader.cpp b/unified-runtime/source/adapters/hip/ur_interface_loader.cpp index 9e7bc69982822..be4491d4a7985 100644 --- a/unified-runtime/source/adapters/hip/ur_interface_loader.cpp +++ b/unified-runtime/source/adapters/hip/ur_interface_loader.cpp @@ -381,6 +381,7 @@ UR_DLLEXPORT ur_result_t UR_APICALL urGetUSMExpProcAddrTable( pDdiTable->pfnPoolSetDevicePoolExp = urUSMPoolSetDevicePoolExp; pDdiTable->pfnPoolGetDevicePoolExp = urUSMPoolGetDevicePoolExp; pDdiTable->pfnPoolTrimToExp = urUSMPoolTrimToExp; + pDdiTable->pfnContextMemcpyExp = urUSMContextMemcpyExp; return UR_RESULT_SUCCESS; } diff --git a/unified-runtime/source/adapters/hip/usm.cpp b/unified-runtime/source/adapters/hip/usm.cpp index 1945f6c24e055..f8baf04d88385 100644 --- a/unified-runtime/source/adapters/hip/usm.cpp +++ b/unified-runtime/source/adapters/hip/usm.cpp @@ -532,3 +532,9 @@ UR_APIEXPORT ur_result_t UR_APICALL urUSMPoolTrimToExp(ur_context_handle_t, size_t) { return UR_RESULT_ERROR_UNSUPPORTED_FEATURE; } + +UR_APIEXPORT ur_result_t UR_APICALL urUSMContextMemcpyExp(ur_context_handle_t, + void *, const void *, + size_t) { + return UR_RESULT_ERROR_UNSUPPORTED_FEATURE; +} diff --git a/unified-runtime/source/adapters/level_zero/device.cpp b/unified-runtime/source/adapters/level_zero/device.cpp index e54caf59f3b07..e3e8b45221ad3 100644 --- a/unified-runtime/source/adapters/level_zero/device.cpp +++ b/unified-runtime/source/adapters/level_zero/device.cpp @@ -1215,6 +1215,8 @@ ur_result_t urDeviceGetInfo( return ReturnValue(false); case UR_DEVICE_INFO_HOST_PIPE_READ_WRITE_SUPPORT: return ReturnValue(false); + case UR_DEVICE_INFO_USM_CONTEXT_MEMCPY_SUPPORT_EXP: + return ReturnValue(true); case UR_DEVICE_INFO_USE_NATIVE_ASSERT: return ReturnValue(false); case UR_DEVICE_INFO_USM_P2P_SUPPORT_EXP: diff --git a/unified-runtime/source/adapters/level_zero/ur_interface_loader.cpp b/unified-runtime/source/adapters/level_zero/ur_interface_loader.cpp index fc5dda4d63714..fd3c5b126e1b1 100644 --- a/unified-runtime/source/adapters/level_zero/ur_interface_loader.cpp +++ b/unified-runtime/source/adapters/level_zero/ur_interface_loader.cpp @@ -463,6 +463,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urGetUSMExpProcAddrTable( ur::level_zero::urUSMPoolGetDevicePoolExp; pDdiTable->pfnPoolTrimToExp = ur::level_zero::urUSMPoolTrimToExp; pDdiTable->pfnPitchedAllocExp = ur::level_zero::urUSMPitchedAllocExp; + pDdiTable->pfnContextMemcpyExp = ur::level_zero::urUSMContextMemcpyExp; pDdiTable->pfnImportExp = ur::level_zero::urUSMImportExp; pDdiTable->pfnReleaseExp = ur::level_zero::urUSMReleaseExp; diff --git a/unified-runtime/source/adapters/level_zero/ur_interface_loader.hpp b/unified-runtime/source/adapters/level_zero/ur_interface_loader.hpp index 53704ace96e0f..b6c683ca506d6 100644 --- a/unified-runtime/source/adapters/level_zero/ur_interface_loader.hpp +++ b/unified-runtime/source/adapters/level_zero/ur_interface_loader.hpp @@ -777,6 +777,8 @@ ur_result_t urProgramLinkExp(ur_context_handle_t hContext, uint32_t numDevices, const ur_program_handle_t *phPrograms, const char *pOptions, ur_program_handle_t *phProgram); +ur_result_t urUSMContextMemcpyExp(ur_context_handle_t hContext, void *pDst, + const void *pSrc, size_t size); ur_result_t urUSMImportExp(ur_context_handle_t hContext, void *pMem, size_t size); ur_result_t urUSMReleaseExp(ur_context_handle_t hContext, void *pMem); diff --git a/unified-runtime/source/adapters/level_zero/usm.cpp b/unified-runtime/source/adapters/level_zero/usm.cpp index c6abed7ccabb6..789bed11f8f93 100644 --- a/unified-runtime/source/adapters/level_zero/usm.cpp +++ b/unified-runtime/source/adapters/level_zero/usm.cpp @@ -673,6 +673,17 @@ ur_result_t UR_APICALL urUSMPoolTrimToExp(ur_context_handle_t, ur_usm_pool_handle_t, size_t) { return UR_RESULT_ERROR_UNSUPPORTED_FEATURE; } + +ur_result_t UR_APICALL urUSMContextMemcpyExp(ur_context_handle_t Context, + void *pDst, const void *pSrc, + size_t Size) { + // zeCommandListAppendMemoryCopy must not be called from simultaneous + // threads with the same command list handle, so we need exclusive lock. + std::scoped_lock Lock(Context->ImmediateCommandListMutex); + ZE2UR_CALL(zeCommandListAppendMemoryCopy, (Context->ZeCommandListInit, pDst, + pSrc, Size, nullptr, 0, nullptr)); + return UR_RESULT_SUCCESS; +} } // namespace ur::level_zero static ur_result_t USMFreeImpl(ur_context_handle_t Context, void *Ptr) { diff --git a/unified-runtime/source/adapters/level_zero/v2/usm.cpp b/unified-runtime/source/adapters/level_zero/v2/usm.cpp index ec246a94cc06b..59073cfb2db3a 100644 --- a/unified-runtime/source/adapters/level_zero/v2/usm.cpp +++ b/unified-runtime/source/adapters/level_zero/v2/usm.cpp @@ -565,4 +565,21 @@ ur_result_t urUSMReleaseExp(ur_context_handle_t hContext, void *hostPtr) { return UR_RESULT_SUCCESS; } +ur_result_t UR_APICALL urUSMContextMemcpyExp(ur_context_handle_t hContext, + void *pDst, const void *pSrc, + size_t size) { + ur_device_handle_t hDevice = hContext->getDevices()[0]; + auto Ordinal = static_cast( + hDevice + ->QueueGroup[ur_device_handle_t_::queue_group_info_t::type::Compute] + .ZeOrdinal); + auto commandList = hContext->getCommandListCache().getImmediateCommandList( + hDevice->ZeDevice, {true, Ordinal, true}, + ZE_COMMAND_QUEUE_MODE_SYNCHRONOUS, ZE_COMMAND_QUEUE_PRIORITY_NORMAL, + std::nullopt); + ZE2UR_CALL(zeCommandListAppendMemoryCopy, + (commandList.get(), pDst, pSrc, size, nullptr, 0, nullptr)); + return UR_RESULT_SUCCESS; +} + } // namespace ur::level_zero diff --git a/unified-runtime/source/adapters/mock/ur_mockddi.cpp b/unified-runtime/source/adapters/mock/ur_mockddi.cpp index 62435435772c0..b135876828efb 100644 --- a/unified-runtime/source/adapters/mock/ur_mockddi.cpp +++ b/unified-runtime/source/adapters/mock/ur_mockddi.cpp @@ -11406,6 +11406,55 @@ __urdlllocal ur_result_t UR_APICALL urProgramLinkExp( return exceptionToResult(std::current_exception()); } +/////////////////////////////////////////////////////////////////////////////// +/// @brief Intercept function for urUSMContextMemcpyExp +__urdlllocal ur_result_t UR_APICALL urUSMContextMemcpyExp( + /// [in] Context associated with the device(s) that own the allocations + /// `pSrc` and `pDst`. + ur_context_handle_t hContext, + /// [in] Destination pointer to copy to. + void *pDst, + /// [in] Source pointer to copy from. + const void *pSrc, + /// [in] Size in bytes to be copied. + size_t size) try { + ur_result_t result = UR_RESULT_SUCCESS; + + ur_usm_context_memcpy_exp_params_t params = {&hContext, &pDst, &pSrc, &size}; + + auto beforeCallback = reinterpret_cast( + mock::getCallbacks().get_before_callback("urUSMContextMemcpyExp")); + if (beforeCallback) { + result = beforeCallback(¶ms); + if (result != UR_RESULT_SUCCESS) { + return result; + } + } + + auto replaceCallback = reinterpret_cast( + mock::getCallbacks().get_replace_callback("urUSMContextMemcpyExp")); + if (replaceCallback) { + result = replaceCallback(¶ms); + } else { + + result = UR_RESULT_SUCCESS; + } + + if (result != UR_RESULT_SUCCESS) { + return result; + } + + auto afterCallback = reinterpret_cast( + mock::getCallbacks().get_after_callback("urUSMContextMemcpyExp")); + if (afterCallback) { + return afterCallback(¶ms); + } + + return result; +} catch (...) { + return exceptionToResult(std::current_exception()); +} + /////////////////////////////////////////////////////////////////////////////// /// @brief Intercept function for urUSMImportExp __urdlllocal ur_result_t UR_APICALL urUSMImportExp( @@ -12651,6 +12700,8 @@ UR_DLLEXPORT ur_result_t UR_APICALL urGetUSMExpProcAddrTable( pDdiTable->pfnPitchedAllocExp = driver::urUSMPitchedAllocExp; + pDdiTable->pfnContextMemcpyExp = driver::urUSMContextMemcpyExp; + pDdiTable->pfnImportExp = driver::urUSMImportExp; pDdiTable->pfnReleaseExp = driver::urUSMReleaseExp; diff --git a/unified-runtime/source/adapters/native_cpu/device.cpp b/unified-runtime/source/adapters/native_cpu/device.cpp index 72f76005905f1..81bfe9b23dc18 100644 --- a/unified-runtime/source/adapters/native_cpu/device.cpp +++ b/unified-runtime/source/adapters/native_cpu/device.cpp @@ -454,6 +454,9 @@ UR_APIEXPORT ur_result_t UR_APICALL urDeviceGetInfo(ur_device_handle_t hDevice, case UR_DEVICE_INFO_KERNEL_LAUNCH_CAPABILITIES: return ReturnValue(0); + case UR_DEVICE_INFO_USM_CONTEXT_MEMCPY_SUPPORT_EXP: + return ReturnValue(false); + default: DIE_NO_IMPLEMENTATION; } diff --git a/unified-runtime/source/adapters/native_cpu/ur_interface_loader.cpp b/unified-runtime/source/adapters/native_cpu/ur_interface_loader.cpp index 8b43086cfa434..56ea0c76cc8c4 100644 --- a/unified-runtime/source/adapters/native_cpu/ur_interface_loader.cpp +++ b/unified-runtime/source/adapters/native_cpu/ur_interface_loader.cpp @@ -383,6 +383,7 @@ UR_DLLEXPORT ur_result_t UR_APICALL urGetUSMExpProcAddrTable( return result; } pDdiTable->pfnPitchedAllocExp = urUSMPitchedAllocExp; + pDdiTable->pfnContextMemcpyExp = urUSMContextMemcpyExp; return UR_RESULT_SUCCESS; } diff --git a/unified-runtime/source/adapters/native_cpu/usm.cpp b/unified-runtime/source/adapters/native_cpu/usm.cpp index fe29f348b7e27..c404e910c76ef 100644 --- a/unified-runtime/source/adapters/native_cpu/usm.cpp +++ b/unified-runtime/source/adapters/native_cpu/usm.cpp @@ -182,3 +182,9 @@ UR_APIEXPORT ur_result_t UR_APICALL urUSMPoolTrimToExp(ur_context_handle_t, size_t) { DIE_NO_IMPLEMENTATION; } + +UR_APIEXPORT ur_result_t UR_APICALL urUSMContextMemcpyExp(ur_context_handle_t, + void *, const void *, + size_t) { + DIE_NO_IMPLEMENTATION; +} diff --git a/unified-runtime/source/adapters/opencl/device.cpp b/unified-runtime/source/adapters/opencl/device.cpp index 1dbde401219ac..bc6bd489ea957 100644 --- a/unified-runtime/source/adapters/opencl/device.cpp +++ b/unified-runtime/source/adapters/opencl/device.cpp @@ -1454,6 +1454,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urDeviceGetInfo(ur_device_handle_t hDevice, case UR_DEVICE_INFO_BINDLESS_SAMPLE_1D_USM_SUPPORT_EXP: case UR_DEVICE_INFO_BINDLESS_SAMPLE_2D_USM_SUPPORT_EXP: case UR_DEVICE_INFO_BINDLESS_IMAGES_GATHER_SUPPORT_EXP: + case UR_DEVICE_INFO_USM_CONTEXT_MEMCPY_SUPPORT_EXP: return ReturnValue(false); case UR_DEVICE_INFO_IMAGE_PITCH_ALIGN_EXP: case UR_DEVICE_INFO_MAX_IMAGE_LINEAR_WIDTH_EXP: diff --git a/unified-runtime/source/adapters/opencl/ur_interface_loader.cpp b/unified-runtime/source/adapters/opencl/ur_interface_loader.cpp index 8015e632b4f0c..f74876d53d541 100644 --- a/unified-runtime/source/adapters/opencl/ur_interface_loader.cpp +++ b/unified-runtime/source/adapters/opencl/ur_interface_loader.cpp @@ -264,6 +264,7 @@ UR_DLLEXPORT ur_result_t UR_APICALL urGetUSMExpProcAddrTable( pDdiTable->pfnImportExp = urUSMImportExp; pDdiTable->pfnReleaseExp = urUSMReleaseExp; + pDdiTable->pfnContextMemcpyExp = urUSMContextMemcpyExp; return UR_RESULT_SUCCESS; } diff --git a/unified-runtime/source/adapters/opencl/usm.cpp b/unified-runtime/source/adapters/opencl/usm.cpp index 76ee6a2a0497a..e3c510c745766 100644 --- a/unified-runtime/source/adapters/opencl/usm.cpp +++ b/unified-runtime/source/adapters/opencl/usm.cpp @@ -800,3 +800,9 @@ UR_APIEXPORT ur_result_t UR_APICALL urUSMPoolTrimToExp(ur_context_handle_t, size_t) { return UR_RESULT_ERROR_UNSUPPORTED_FEATURE; } + +UR_APIEXPORT ur_result_t UR_APICALL urUSMContextMemcpyExp(ur_context_handle_t, + void *, const void *, + size_t) { + return UR_RESULT_ERROR_UNSUPPORTED_FEATURE; +} diff --git a/unified-runtime/source/loader/layers/tracing/ur_trcddi.cpp b/unified-runtime/source/loader/layers/tracing/ur_trcddi.cpp index 25fb682587635..49fb1c6424f7e 100644 --- a/unified-runtime/source/loader/layers/tracing/ur_trcddi.cpp +++ b/unified-runtime/source/loader/layers/tracing/ur_trcddi.cpp @@ -9649,6 +9649,47 @@ __urdlllocal ur_result_t UR_APICALL urProgramLinkExp( return result; } +/////////////////////////////////////////////////////////////////////////////// +/// @brief Intercept function for urUSMContextMemcpyExp +__urdlllocal ur_result_t UR_APICALL urUSMContextMemcpyExp( + /// [in] Context associated with the device(s) that own the allocations + /// `pSrc` and `pDst`. + ur_context_handle_t hContext, + /// [in] Destination pointer to copy to. + void *pDst, + /// [in] Source pointer to copy from. + const void *pSrc, + /// [in] Size in bytes to be copied. + size_t size) { + auto pfnContextMemcpyExp = + getContext()->urDdiTable.USMExp.pfnContextMemcpyExp; + + if (nullptr == pfnContextMemcpyExp) + return UR_RESULT_ERROR_UNSUPPORTED_FEATURE; + + ur_usm_context_memcpy_exp_params_t params = {&hContext, &pDst, &pSrc, &size}; + uint64_t instance = getContext()->notify_begin( + UR_FUNCTION_USM_CONTEXT_MEMCPY_EXP, "urUSMContextMemcpyExp", ¶ms); + + auto &logger = getContext()->logger; + UR_LOG_L(logger, INFO, " ---> urUSMContextMemcpyExp\n"); + + ur_result_t result = pfnContextMemcpyExp(hContext, pDst, pSrc, size); + + getContext()->notify_end(UR_FUNCTION_USM_CONTEXT_MEMCPY_EXP, + "urUSMContextMemcpyExp", ¶ms, &result, instance); + + if (logger.getLevel() <= UR_LOGGER_LEVEL_INFO) { + std::ostringstream args_str; + ur::extras::printFunctionParams( + args_str, UR_FUNCTION_USM_CONTEXT_MEMCPY_EXP, ¶ms); + UR_LOG_L(logger, INFO, " <--- urUSMContextMemcpyExp({}) -> {};\n", + args_str.str(), result); + } + + return result; +} + /////////////////////////////////////////////////////////////////////////////// /// @brief Intercept function for urUSMImportExp __urdlllocal ur_result_t UR_APICALL urUSMImportExp( @@ -11097,6 +11138,9 @@ __urdlllocal ur_result_t UR_APICALL urGetUSMExpProcAddrTable( dditable.pfnPitchedAllocExp = pDdiTable->pfnPitchedAllocExp; pDdiTable->pfnPitchedAllocExp = ur_tracing_layer::urUSMPitchedAllocExp; + dditable.pfnContextMemcpyExp = pDdiTable->pfnContextMemcpyExp; + pDdiTable->pfnContextMemcpyExp = ur_tracing_layer::urUSMContextMemcpyExp; + dditable.pfnImportExp = pDdiTable->pfnImportExp; pDdiTable->pfnImportExp = ur_tracing_layer::urUSMImportExp; diff --git a/unified-runtime/source/loader/layers/validation/ur_valddi.cpp b/unified-runtime/source/loader/layers/validation/ur_valddi.cpp index e6876eae64237..b92ab79a386ae 100644 --- a/unified-runtime/source/loader/layers/validation/ur_valddi.cpp +++ b/unified-runtime/source/loader/layers/validation/ur_valddi.cpp @@ -569,7 +569,7 @@ __urdlllocal ur_result_t UR_APICALL urDeviceGetInfo( if (NULL == hDevice) return UR_RESULT_ERROR_INVALID_NULL_HANDLE; - if (UR_DEVICE_INFO_MULTI_DEVICE_COMPILE_SUPPORT_EXP < propName) + if (UR_DEVICE_INFO_USM_CONTEXT_MEMCPY_SUPPORT_EXP < propName) return UR_RESULT_ERROR_INVALID_ENUMERATION; if (propSize == 0 && pPropValue != NULL) @@ -10416,6 +10416,49 @@ __urdlllocal ur_result_t UR_APICALL urProgramLinkExp( return result; } +/////////////////////////////////////////////////////////////////////////////// +/// @brief Intercept function for urUSMContextMemcpyExp +__urdlllocal ur_result_t UR_APICALL urUSMContextMemcpyExp( + /// [in] Context associated with the device(s) that own the allocations + /// `pSrc` and `pDst`. + ur_context_handle_t hContext, + /// [in] Destination pointer to copy to. + void *pDst, + /// [in] Source pointer to copy from. + const void *pSrc, + /// [in] Size in bytes to be copied. + size_t size) { + auto pfnContextMemcpyExp = + getContext()->urDdiTable.USMExp.pfnContextMemcpyExp; + + if (nullptr == pfnContextMemcpyExp) { + return UR_RESULT_ERROR_UNINITIALIZED; + } + + if (getContext()->enableParameterValidation) { + if (NULL == pDst) + return UR_RESULT_ERROR_INVALID_NULL_POINTER; + + if (NULL == pSrc) + return UR_RESULT_ERROR_INVALID_NULL_POINTER; + + if (NULL == hContext) + return UR_RESULT_ERROR_INVALID_NULL_HANDLE; + + if (size == 0) + return UR_RESULT_ERROR_INVALID_SIZE; + } + + if (getContext()->enableLifetimeValidation && + !getContext()->refCountContext->isReferenceValid(hContext)) { + URLOG_CTX_INVALID_REFERENCE(hContext); + } + + ur_result_t result = pfnContextMemcpyExp(hContext, pDst, pSrc, size); + + return result; +} + /////////////////////////////////////////////////////////////////////////////// /// @brief Intercept function for urUSMImportExp __urdlllocal ur_result_t UR_APICALL urUSMImportExp( @@ -11902,6 +11945,9 @@ UR_DLLEXPORT ur_result_t UR_APICALL urGetUSMExpProcAddrTable( dditable.pfnPitchedAllocExp = pDdiTable->pfnPitchedAllocExp; pDdiTable->pfnPitchedAllocExp = ur_validation_layer::urUSMPitchedAllocExp; + dditable.pfnContextMemcpyExp = pDdiTable->pfnContextMemcpyExp; + pDdiTable->pfnContextMemcpyExp = ur_validation_layer::urUSMContextMemcpyExp; + dditable.pfnImportExp = pDdiTable->pfnImportExp; pDdiTable->pfnImportExp = ur_validation_layer::urUSMImportExp; diff --git a/unified-runtime/source/loader/loader.def.in b/unified-runtime/source/loader/loader.def.in index 8161d6a906fe6..0aa69405b8561 100644 --- a/unified-runtime/source/loader/loader.def.in +++ b/unified-runtime/source/loader/loader.def.in @@ -489,6 +489,7 @@ EXPORTS urPrintUsmAdviceFlags urPrintUsmAllocInfo urPrintUsmAllocLocationDesc + urPrintUsmContextMemcpyExpParams urPrintUsmDesc urPrintUsmDeviceAllocParams urPrintUsmDeviceDesc @@ -565,6 +566,7 @@ EXPORTS urSamplerGetNativeHandle urSamplerRelease urSamplerRetain + urUSMContextMemcpyExp urUSMDeviceAlloc urUSMFree urUSMGetMemAllocInfo diff --git a/unified-runtime/source/loader/loader.map.in b/unified-runtime/source/loader/loader.map.in index ee485f01fc4d9..42ea70f291c97 100644 --- a/unified-runtime/source/loader/loader.map.in +++ b/unified-runtime/source/loader/loader.map.in @@ -489,6 +489,7 @@ urPrintUsmAdviceFlags; urPrintUsmAllocInfo; urPrintUsmAllocLocationDesc; + urPrintUsmContextMemcpyExpParams; urPrintUsmDesc; urPrintUsmDeviceAllocParams; urPrintUsmDeviceDesc; @@ -565,6 +566,7 @@ urSamplerGetNativeHandle; urSamplerRelease; urSamplerRetain; + urUSMContextMemcpyExp; urUSMDeviceAlloc; urUSMFree; urUSMGetMemAllocInfo; diff --git a/unified-runtime/source/loader/ur_ldrddi.cpp b/unified-runtime/source/loader/ur_ldrddi.cpp index 4446c02902186..2ffd194b00766 100644 --- a/unified-runtime/source/loader/ur_ldrddi.cpp +++ b/unified-runtime/source/loader/ur_ldrddi.cpp @@ -5500,6 +5500,29 @@ __urdlllocal ur_result_t UR_APICALL urProgramLinkExp( pOptions, phProgram); } +/////////////////////////////////////////////////////////////////////////////// +/// @brief Intercept function for urUSMContextMemcpyExp +__urdlllocal ur_result_t UR_APICALL urUSMContextMemcpyExp( + /// [in] Context associated with the device(s) that own the allocations + /// `pSrc` and `pDst`. + ur_context_handle_t hContext, + /// [in] Destination pointer to copy to. + void *pDst, + /// [in] Source pointer to copy from. + const void *pSrc, + /// [in] Size in bytes to be copied. + size_t size) { + + auto *dditable = *reinterpret_cast(hContext); + + auto *pfnContextMemcpyExp = dditable->USMExp.pfnContextMemcpyExp; + if (nullptr == pfnContextMemcpyExp) + return UR_RESULT_ERROR_UNINITIALIZED; + + // forward to device-platform + return pfnContextMemcpyExp(hContext, pDst, pSrc, size); +} + /////////////////////////////////////////////////////////////////////////////// /// @brief Intercept function for urUSMImportExp __urdlllocal ur_result_t UR_APICALL urUSMImportExp( @@ -6797,6 +6820,7 @@ UR_DLLEXPORT ur_result_t UR_APICALL urGetUSMExpProcAddrTable( pDdiTable->pfnPoolGetDevicePoolExp = ur_loader::urUSMPoolGetDevicePoolExp; pDdiTable->pfnPoolTrimToExp = ur_loader::urUSMPoolTrimToExp; pDdiTable->pfnPitchedAllocExp = ur_loader::urUSMPitchedAllocExp; + pDdiTable->pfnContextMemcpyExp = ur_loader::urUSMContextMemcpyExp; pDdiTable->pfnImportExp = ur_loader::urUSMImportExp; pDdiTable->pfnReleaseExp = ur_loader::urUSMReleaseExp; } else { diff --git a/unified-runtime/source/loader/ur_libapi.cpp b/unified-runtime/source/loader/ur_libapi.cpp index 521f1bd016faa..ed14b29cd8393 100644 --- a/unified-runtime/source/loader/ur_libapi.cpp +++ b/unified-runtime/source/loader/ur_libapi.cpp @@ -926,7 +926,7 @@ ur_result_t UR_APICALL urDeviceGetSelected( /// - ::UR_RESULT_ERROR_INVALID_NULL_HANDLE /// + `NULL == hDevice` /// - ::UR_RESULT_ERROR_INVALID_ENUMERATION -/// + `::UR_DEVICE_INFO_MULTI_DEVICE_COMPILE_SUPPORT_EXP < propName` +/// + `::UR_DEVICE_INFO_USM_CONTEXT_MEMCPY_SUPPORT_EXP < propName` /// - ::UR_RESULT_ERROR_UNSUPPORTED_ENUMERATION /// + If `propName` is not supported by the adapter. /// - ::UR_RESULT_ERROR_INVALID_SIZE @@ -10105,6 +10105,45 @@ ur_result_t UR_APICALL urProgramLinkExp( return exceptionToResult(std::current_exception()); } +/////////////////////////////////////////////////////////////////////////////// +/// @brief Perform a synchronous, blocking memcpy operation between USM +/// allocations. +/// +/// @returns +/// - ::UR_RESULT_SUCCESS +/// - ::UR_RESULT_ERROR_UNINITIALIZED +/// - ::UR_RESULT_ERROR_DEVICE_LOST +/// - ::UR_RESULT_ERROR_ADAPTER_SPECIFIC +/// - ::UR_RESULT_ERROR_INVALID_NULL_HANDLE +/// + `NULL == hContext` +/// - ::UR_RESULT_ERROR_INVALID_NULL_POINTER +/// + `NULL == pDst` +/// + `NULL == pSrc` +/// - ::UR_RESULT_SUCCESS +/// - ::UR_RESULT_ERROR_ADAPTER_SPECIFIC +/// - ::UR_RESULT_ERROR_INVALID_SIZE +/// + `size == 0` +/// + If `size` is higher than the allocation size of `pSrc` or `pDst` +ur_result_t UR_APICALL urUSMContextMemcpyExp( + /// [in] Context associated with the device(s) that own the allocations + /// `pSrc` and `pDst`. + ur_context_handle_t hContext, + /// [in] Destination pointer to copy to. + void *pDst, + /// [in] Source pointer to copy from. + const void *pSrc, + /// [in] Size in bytes to be copied. + size_t size) try { + auto pfnContextMemcpyExp = + ur_lib::getContext()->urDdiTable.USMExp.pfnContextMemcpyExp; + if (nullptr == pfnContextMemcpyExp) + return UR_RESULT_ERROR_UNINITIALIZED; + + return pfnContextMemcpyExp(hContext, pDst, pSrc, size); +} catch (...) { + return exceptionToResult(std::current_exception()); +} + /////////////////////////////////////////////////////////////////////////////// /// @brief Import memory into USM /// diff --git a/unified-runtime/source/loader/ur_print.cpp b/unified-runtime/source/loader/ur_print.cpp index ec488519251e6..64a559ac36dbb 100644 --- a/unified-runtime/source/loader/ur_print.cpp +++ b/unified-runtime/source/loader/ur_print.cpp @@ -2788,6 +2788,14 @@ ur_result_t urPrintUsmPitchedAllocExpParams( return str_copy(&ss, buffer, buff_size, out_size); } +ur_result_t urPrintUsmContextMemcpyExpParams( + const struct ur_usm_context_memcpy_exp_params_t *params, char *buffer, + const size_t buff_size, size_t *out_size) { + std::stringstream ss; + ss << params; + return str_copy(&ss, buffer, buff_size, out_size); +} + ur_result_t urPrintUsmImportExpParams(const struct ur_usm_import_exp_params_t *params, char *buffer, const size_t buff_size, diff --git a/unified-runtime/source/ur_api.cpp b/unified-runtime/source/ur_api.cpp index f50417580244d..549bd97e87f6b 100644 --- a/unified-runtime/source/ur_api.cpp +++ b/unified-runtime/source/ur_api.cpp @@ -833,7 +833,7 @@ ur_result_t UR_APICALL urDeviceGetSelected( /// - ::UR_RESULT_ERROR_INVALID_NULL_HANDLE /// + `NULL == hDevice` /// - ::UR_RESULT_ERROR_INVALID_ENUMERATION -/// + `::UR_DEVICE_INFO_MULTI_DEVICE_COMPILE_SUPPORT_EXP < propName` +/// + `::UR_DEVICE_INFO_USM_CONTEXT_MEMCPY_SUPPORT_EXP < propName` /// - ::UR_RESULT_ERROR_UNSUPPORTED_ENUMERATION /// + If `propName` is not supported by the adapter. /// - ::UR_RESULT_ERROR_INVALID_SIZE @@ -8800,6 +8800,39 @@ ur_result_t UR_APICALL urProgramLinkExp( return result; } +/////////////////////////////////////////////////////////////////////////////// +/// @brief Perform a synchronous, blocking memcpy operation between USM +/// allocations. +/// +/// @returns +/// - ::UR_RESULT_SUCCESS +/// - ::UR_RESULT_ERROR_UNINITIALIZED +/// - ::UR_RESULT_ERROR_DEVICE_LOST +/// - ::UR_RESULT_ERROR_ADAPTER_SPECIFIC +/// - ::UR_RESULT_ERROR_INVALID_NULL_HANDLE +/// + `NULL == hContext` +/// - ::UR_RESULT_ERROR_INVALID_NULL_POINTER +/// + `NULL == pDst` +/// + `NULL == pSrc` +/// - ::UR_RESULT_SUCCESS +/// - ::UR_RESULT_ERROR_ADAPTER_SPECIFIC +/// - ::UR_RESULT_ERROR_INVALID_SIZE +/// + `size == 0` +/// + If `size` is higher than the allocation size of `pSrc` or `pDst` +ur_result_t UR_APICALL urUSMContextMemcpyExp( + /// [in] Context associated with the device(s) that own the allocations + /// `pSrc` and `pDst`. + ur_context_handle_t hContext, + /// [in] Destination pointer to copy to. + void *pDst, + /// [in] Source pointer to copy from. + const void *pSrc, + /// [in] Size in bytes to be copied. + size_t size) { + ur_result_t result = UR_RESULT_SUCCESS; + return result; +} + /////////////////////////////////////////////////////////////////////////////// /// @brief Import memory into USM /// diff --git a/unified-runtime/test/conformance/CMakeLists.txt b/unified-runtime/test/conformance/CMakeLists.txt index 35e133143cf11..5d579dbbf506b 100644 --- a/unified-runtime/test/conformance/CMakeLists.txt +++ b/unified-runtime/test/conformance/CMakeLists.txt @@ -69,6 +69,7 @@ add_subdirectory(event) add_subdirectory(queue) add_subdirectory(sampler) add_subdirectory(virtual_memory) +add_subdirectory(exp_usm_context_memcpy) set(TEST_SUBDIRECTORIES_DPCXX "device_code" diff --git a/unified-runtime/test/conformance/exp_usm_context_memcpy/CMakeLists.txt b/unified-runtime/test/conformance/exp_usm_context_memcpy/CMakeLists.txt new file mode 100644 index 0000000000000..cd01ed84cef9c --- /dev/null +++ b/unified-runtime/test/conformance/exp_usm_context_memcpy/CMakeLists.txt @@ -0,0 +1,9 @@ +# Copyright (C) 2025 Intel Corporation +# Part of the Unified-Runtime Project, under the Apache License v2.0 with LLVM Exceptions. +# See LICENSE.TXT +# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +add_conformance_devices_test( + exp_usm_context_memcpy + urUSMContextMemcpyExp.cpp +) diff --git a/unified-runtime/test/conformance/exp_usm_context_memcpy/urUSMContextMemcpyExp.cpp b/unified-runtime/test/conformance/exp_usm_context_memcpy/urUSMContextMemcpyExp.cpp new file mode 100644 index 0000000000000..3c776cebc06de --- /dev/null +++ b/unified-runtime/test/conformance/exp_usm_context_memcpy/urUSMContextMemcpyExp.cpp @@ -0,0 +1,164 @@ +// Copyright (C) 2025 Intel Corporation +// Part of the Unified-Runtime Project, under the Apache License v2.0 with LLVM +// Exceptions. See LICENSE.TXT +// +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +#include "uur/utils.h" +#include + +struct urUSMContextMemcpyExpTest : uur::urQueueTest { + void SetUp() override { + UUR_RETURN_ON_FATAL_FAILURE(urQueueTest::SetUp()); + + bool context_memcpy_support = false; + ASSERT_SUCCESS( + uur::GetUSMContextMemcpyExpSupport(device, context_memcpy_support)); + if (!context_memcpy_support) { + GTEST_SKIP() << "urUSMContextMemcpyExp is not supported"; + } + } + + void TearDown() override { + if (src_ptr) { + EXPECT_SUCCESS(urUSMFree(context, src_ptr)); + } + if (dst_ptr) { + EXPECT_SUCCESS(urUSMFree(context, dst_ptr)); + } + + UUR_RETURN_ON_FATAL_FAILURE(urQueueTest::TearDown()); + } + + void initAllocations() { + ASSERT_SUCCESS(urEnqueueUSMFill(queue, src_ptr, sizeof(memset_src_value), + &memset_src_value, allocation_size, 0, + nullptr, nullptr)); + ASSERT_SUCCESS(urEnqueueUSMFill(queue, dst_ptr, sizeof(memset_dst_value), + &memset_dst_value, allocation_size, 0, + nullptr, nullptr)); + ASSERT_SUCCESS(urQueueFinish(queue)); + } + + void verifyData() { + ASSERT_SUCCESS(urEnqueueUSMMemcpy(queue, true, &host_mem, dst_ptr, + allocation_size, 0, nullptr, nullptr)); + ASSERT_EQ(host_mem, memset_src_value); + } + + static constexpr size_t memset_src_value = 42; + static constexpr uint8_t memset_dst_value = 0; + static constexpr uint32_t allocation_size = sizeof(memset_src_value); + size_t host_mem = 0; + + void *src_ptr{nullptr}; + void *dst_ptr{nullptr}; +}; + +struct urUSMContextMemcpyExpTestDevice : urUSMContextMemcpyExpTest { + void SetUp() override { + UUR_RETURN_ON_FATAL_FAILURE(urUSMContextMemcpyExpTest::SetUp()); + + ur_device_usm_access_capability_flags_t device_usm = 0; + ASSERT_SUCCESS(uur::GetDeviceUSMDeviceSupport(device, device_usm)); + if (!device_usm) { + GTEST_SKIP() << "Device USM is not supported"; + } + + ASSERT_SUCCESS(urUSMDeviceAlloc(context, device, nullptr, nullptr, + allocation_size, + reinterpret_cast(&src_ptr))); + ASSERT_SUCCESS(urUSMDeviceAlloc(context, device, nullptr, nullptr, + allocation_size, + reinterpret_cast(&dst_ptr))); + initAllocations(); + } +}; + +UUR_INSTANTIATE_DEVICE_TEST_SUITE(urUSMContextMemcpyExpTestDevice); + +TEST_P(urUSMContextMemcpyExpTestDevice, Success) { + ASSERT_SUCCESS( + urUSMContextMemcpyExp(context, dst_ptr, src_ptr, allocation_size)); + verifyData(); +} + +// Arbitrarily do the negative tests with device allocations. These are mostly a +// test of the loader and validation layer anyway so no big deal if they don't +// run on all devices due to lack of support. +TEST_P(urUSMContextMemcpyExpTestDevice, InvalidNullContext) { + ASSERT_EQ_RESULT( + UR_RESULT_ERROR_INVALID_NULL_HANDLE, + urUSMContextMemcpyExp(nullptr, dst_ptr, src_ptr, allocation_size)); +} + +TEST_P(urUSMContextMemcpyExpTestDevice, InvalidNullPtrs) { + ASSERT_EQ_RESULT( + UR_RESULT_ERROR_INVALID_NULL_POINTER, + urUSMContextMemcpyExp(context, nullptr, src_ptr, allocation_size)); + ASSERT_EQ_RESULT( + UR_RESULT_ERROR_INVALID_NULL_POINTER, + urUSMContextMemcpyExp(context, dst_ptr, nullptr, allocation_size)); +} + +TEST_P(urUSMContextMemcpyExpTestDevice, InvalidZeroSize) { + ASSERT_EQ_RESULT(UR_RESULT_ERROR_INVALID_SIZE, + urUSMContextMemcpyExp(context, dst_ptr, src_ptr, 0)); +} + +struct urUSMContextMemcpyExpTestHost : urUSMContextMemcpyExpTest { + void SetUp() override { + UUR_RETURN_ON_FATAL_FAILURE(urUSMContextMemcpyExpTest::SetUp()); + + ur_device_usm_access_capability_flags_t host_usm = 0; + ASSERT_SUCCESS(uur::GetDeviceUSMHostSupport(device, host_usm)); + if (!host_usm) { + GTEST_SKIP() << "Host USM is not supported"; + } + + ASSERT_SUCCESS(urUSMHostAlloc(context, nullptr, nullptr, allocation_size, + reinterpret_cast(&src_ptr))); + ASSERT_SUCCESS(urUSMHostAlloc(context, nullptr, nullptr, allocation_size, + reinterpret_cast(&dst_ptr))); + initAllocations(); + } +}; + +UUR_INSTANTIATE_DEVICE_TEST_SUITE(urUSMContextMemcpyExpTestHost); + +TEST_P(urUSMContextMemcpyExpTestHost, Success) { + ASSERT_SUCCESS( + urUSMContextMemcpyExp(context, dst_ptr, src_ptr, allocation_size)); + verifyData(); +} + +struct urUSMContextMemcpyExpTestShared : urUSMContextMemcpyExpTest { + void SetUp() override { + UUR_RETURN_ON_FATAL_FAILURE(urUSMContextMemcpyExpTest::SetUp()); + + ur_device_usm_access_capability_flags_t shared_usm_single = 0; + + ASSERT_SUCCESS( + uur::GetDeviceUSMSingleSharedSupport(device, shared_usm_single)); + + if (!shared_usm_single) { + GTEST_SKIP() << "Shared USM is not supported by the device."; + } + + ASSERT_SUCCESS(urUSMSharedAlloc(context, device, nullptr, nullptr, + allocation_size, + reinterpret_cast(&src_ptr))); + ASSERT_SUCCESS(urUSMSharedAlloc(context, device, nullptr, nullptr, + allocation_size, + reinterpret_cast(&dst_ptr))); + initAllocations(); + } +}; + +UUR_INSTANTIATE_DEVICE_TEST_SUITE(urUSMContextMemcpyExpTestShared); + +TEST_P(urUSMContextMemcpyExpTestShared, Success) { + ASSERT_SUCCESS( + urUSMContextMemcpyExp(context, dst_ptr, src_ptr, allocation_size)); + verifyData(); +} diff --git a/unified-runtime/test/conformance/testing/include/uur/utils.h b/unified-runtime/test/conformance/testing/include/uur/utils.h index 9710dd948e28e..e6b5ef604ec04 100644 --- a/unified-runtime/test/conformance/testing/include/uur/utils.h +++ b/unified-runtime/test/conformance/testing/include/uur/utils.h @@ -414,6 +414,8 @@ ur_result_t GetDeviceHostPipeRWSupported(ur_device_handle_t device, bool &support); ur_result_t GetTimestampRecordingSupport(ur_device_handle_t device, bool &support); +ur_result_t GetUSMContextMemcpyExpSupport(ur_device_handle_t device, + bool &support); ur_device_partition_property_t makePartitionByCountsDesc(uint32_t count); diff --git a/unified-runtime/test/conformance/testing/source/utils.cpp b/unified-runtime/test/conformance/testing/source/utils.cpp index 7ff55963a3161..51fc6374f7a48 100644 --- a/unified-runtime/test/conformance/testing/source/utils.cpp +++ b/unified-runtime/test/conformance/testing/source/utils.cpp @@ -643,6 +643,12 @@ ur_result_t GetTimestampRecordingSupport(ur_device_handle_t device, device, UR_DEVICE_INFO_TIMESTAMP_RECORDING_SUPPORT_EXP, support); } +ur_result_t GetUSMContextMemcpyExpSupport(ur_device_handle_t device, + bool &support) { + return GetDeviceInfo( + device, UR_DEVICE_INFO_USM_CONTEXT_MEMCPY_SUPPORT_EXP, support); +} + ur_device_partition_property_t makePartitionByCountsDesc(uint32_t count) { ur_device_partition_property_t desc; desc.type = UR_DEVICE_PARTITION_BY_COUNTS; diff --git a/unified-runtime/tools/urinfo/urinfo.hpp b/unified-runtime/tools/urinfo/urinfo.hpp index 49618c4fe82e8..d099c6365b623 100644 --- a/unified-runtime/tools/urinfo/urinfo.hpp +++ b/unified-runtime/tools/urinfo/urinfo.hpp @@ -448,5 +448,8 @@ inline void printDeviceInfos(ur_device_handle_t hDevice, std::cout << prefix; printDeviceInfo(hDevice, UR_DEVICE_INFO_MULTI_DEVICE_COMPILE_SUPPORT_EXP); + std::cout << prefix; + printDeviceInfo(hDevice, + UR_DEVICE_INFO_USM_CONTEXT_MEMCPY_SUPPORT_EXP); } } // namespace urinfo