From db41224a76b58cdc65137c8c33edbc4dd95508d9 Mon Sep 17 00:00:00 2001 From: Aaron Greig Date: Mon, 3 Mar 2025 10:15:33 +0000 Subject: [PATCH 01/14] [UR] Add urUSMContextMemcpyExp API and basic l0 implementation. --- unified-runtime/include/ur_api.h | 50 ++++++++++++++- unified-runtime/include/ur_api_funcs.def | 1 + unified-runtime/include/ur_ddi.h | 6 ++ unified-runtime/include/ur_print.h | 10 +++ unified-runtime/include/ur_print.hpp | 52 +++++++++++++++ .../scripts/core/EXP-USM-CONTEXT-MEMCPY.rst | 63 +++++++++++++++++++ .../scripts/core/exp-usm-context-memcpy.yml | 45 +++++++++++++ unified-runtime/scripts/core/registry.yml | 3 + .../source/adapters/cuda/device.cpp | 2 + .../adapters/cuda/ur_interface_loader.cpp | 1 + unified-runtime/source/adapters/cuda/usm.cpp | 6 ++ .../source/adapters/hip/device.cpp | 5 +- .../adapters/hip/ur_interface_loader.cpp | 1 + unified-runtime/source/adapters/hip/usm.cpp | 6 ++ .../source/adapters/level_zero/device.cpp | 2 + .../level_zero/ur_interface_loader.cpp | 1 + .../level_zero/ur_interface_loader.hpp | 2 + .../source/adapters/level_zero/usm.cpp | 10 +++ .../source/adapters/mock/ur_mockddi.cpp | 51 +++++++++++++++ .../source/adapters/native_cpu/device.cpp | 2 + .../native_cpu/ur_interface_loader.cpp | 1 + .../source/adapters/native_cpu/usm.cpp | 6 ++ .../source/adapters/opencl/device.cpp | 1 + .../adapters/opencl/ur_interface_loader.cpp | 1 + .../source/adapters/opencl/usm.cpp | 6 ++ .../loader/layers/tracing/ur_trcddi.cpp | 44 +++++++++++++ .../loader/layers/validation/ur_valddi.cpp | 45 ++++++++++++- unified-runtime/source/loader/loader.def.in | 2 + unified-runtime/source/loader/loader.map.in | 2 + unified-runtime/source/loader/ur_ldrddi.cpp | 32 ++++++++++ unified-runtime/source/loader/ur_libapi.cpp | 37 ++++++++++- unified-runtime/source/loader/ur_print.cpp | 8 +++ unified-runtime/source/ur_api.cpp | 31 ++++++++- unified-runtime/tools/urinfo/urinfo.hpp | 3 + 34 files changed, 532 insertions(+), 6 deletions(-) create mode 100644 unified-runtime/scripts/core/EXP-USM-CONTEXT-MEMCPY.rst create mode 100644 unified-runtime/scripts/core/exp-usm-context-memcpy.yml diff --git a/unified-runtime/include/ur_api.h b/unified-runtime/include/ur_api.h index c4216f9891f3b..ad47e66be1df2 100644 --- a/unified-runtime/include/ur_api.h +++ b/unified-runtime/include/ur_api.h @@ -453,6 +453,8 @@ typedef enum ur_function_t { UR_FUNCTION_USM_POOL_TRIM_TO_EXP = 261, /// Enumerator for ::urUSMPoolGetInfoExp UR_FUNCTION_USM_POOL_GET_INFO_EXP = 262, + /// Enumerator for ::urUSMContextMemcpyExp + UR_FUNCTION_USM_CONTEXT_MEMCPY_EXP = 264, /// @cond UR_FUNCTION_FORCE_UINT32 = 0x7fffffff /// @endcond @@ -2320,6 +2322,8 @@ typedef enum ur_device_info_t { /// [::ur_bool_t] returns true if the device supports enqueueing of /// allocations and frees. UR_DEVICE_INFO_ASYNC_USM_ALLOCATIONS_EXP = 0x2050, + /// [::ur_bool_t] returns true if the device supports ::urUSMContextMemcpy + UR_DEVICE_INFO_USM_CONTEXT_MEMCPY_SUPPORT_EXP = 0x7000, /// @cond UR_DEVICE_INFO_FORCE_UINT32 = 0x7fffffff /// @endcond @@ -2345,7 +2349,7 @@ typedef enum ur_device_info_t { /// - ::UR_RESULT_ERROR_INVALID_NULL_HANDLE /// + `NULL == hDevice` /// - ::UR_RESULT_ERROR_INVALID_ENUMERATION -/// + `::UR_DEVICE_INFO_ASYNC_USM_ALLOCATIONS_EXP < propName` +/// + `::UR_DEVICE_INFO_USM_CONTEXT_MEMCPY_SUPPORT_EXP < propName` /// - ::UR_RESULT_ERROR_UNSUPPORTED_ENUMERATION /// + If `propName` is not supported by the adapter. /// - ::UR_RESULT_ERROR_INVALID_SIZE @@ -12153,6 +12157,39 @@ UR_APIEXPORT ur_result_t UR_APICALL urProgramLinkExp( /// [out][alloc] pointer to handle of program object created. ur_program_handle_t *phProgram); +#if !defined(__GNUC__) +#pragma endregion +#endif +// Intel 'oneAPI' Unified Runtime Experimental APIs for USM Context Memcpy +#if !defined(__GNUC__) +#pragma region usm_context_memcpy_(experimental) +#endif +/////////////////////////////////////////////////////////////////////////////// +/// @brief Enable access to peer device memory +/// +/// @returns +/// - ::UR_RESULT_SUCCESS +/// - ::UR_RESULT_ERROR_UNINITIALIZED +/// - ::UR_RESULT_ERROR_DEVICE_LOST +/// - ::UR_RESULT_ERROR_ADAPTER_SPECIFIC +/// - ::UR_RESULT_ERROR_INVALID_NULL_HANDLE +/// + `NULL == hContext` +/// - ::UR_RESULT_ERROR_INVALID_NULL_POINTER +/// + `NULL == pDst` +/// + `NULL == pSrc` +/// - ::UR_RESULT_SUCCESS +/// - ::UR_RESULT_ERROR_ADAPTER_SPECIFIC +UR_APIEXPORT ur_result_t UR_APICALL urUSMContextMemcpyExp( + /// [in] Context associated with the device(s) that own the allocations + /// `pSrc` and `pDst`. + ur_context_handle_t hContext, + /// [in] Destination pointer to copy to. + void *pDst, + /// [in] Source pointer to copy from. + const void *pSrc, + /// [in] Size in bytes to be copied. + size_t size); + #if !defined(__GNUC__) #pragma endregion #endif @@ -14282,6 +14319,17 @@ typedef struct ur_usm_pitched_alloc_exp_params_t { size_t **ppResultPitch; } ur_usm_pitched_alloc_exp_params_t; +/////////////////////////////////////////////////////////////////////////////// +/// @brief Function parameters for urUSMContextMemcpyExp +/// @details Each entry is a pointer to the parameter passed to the function; +/// allowing the callback the ability to modify the parameter's value +typedef struct ur_usm_context_memcpy_exp_params_t { + ur_context_handle_t *phContext; + void **ppDst; + const void **ppSrc; + size_t *psize; +} ur_usm_context_memcpy_exp_params_t; + /////////////////////////////////////////////////////////////////////////////// /// @brief Function parameters for urUSMImportExp /// @details Each entry is a pointer to the parameter passed to the function; diff --git a/unified-runtime/include/ur_api_funcs.def b/unified-runtime/include/ur_api_funcs.def index 97adbfcf29c85..def1fb95a8e57 100644 --- a/unified-runtime/include/ur_api_funcs.def +++ b/unified-runtime/include/ur_api_funcs.def @@ -157,6 +157,7 @@ _UR_API(urUSMPoolSetDevicePoolExp) _UR_API(urUSMPoolGetDevicePoolExp) _UR_API(urUSMPoolTrimToExp) _UR_API(urUSMPitchedAllocExp) +_UR_API(urUSMContextMemcpyExp) _UR_API(urUSMImportExp) _UR_API(urUSMReleaseExp) _UR_API(urBindlessImagesUnsampledImageHandleDestroyExp) diff --git a/unified-runtime/include/ur_ddi.h b/unified-runtime/include/ur_ddi.h index 0f4a6d685fea6..cc7632b2edd95 100644 --- a/unified-runtime/include/ur_ddi.h +++ b/unified-runtime/include/ur_ddi.h @@ -1338,6 +1338,11 @@ typedef ur_result_t(UR_APICALL *ur_pfnUSMPitchedAllocExp_t)( ur_context_handle_t, ur_device_handle_t, const ur_usm_desc_t *, ur_usm_pool_handle_t, size_t, size_t, size_t, void **, size_t *); +/////////////////////////////////////////////////////////////////////////////// +/// @brief Function-pointer for urUSMContextMemcpyExp +typedef ur_result_t(UR_APICALL *ur_pfnUSMContextMemcpyExp_t)( + ur_context_handle_t, void *, const void *, size_t); + /////////////////////////////////////////////////////////////////////////////// /// @brief Function-pointer for urUSMImportExp typedef ur_result_t(UR_APICALL *ur_pfnUSMImportExp_t)(ur_context_handle_t, @@ -1360,6 +1365,7 @@ typedef struct ur_usm_exp_dditable_t { ur_pfnUSMPoolGetDevicePoolExp_t pfnPoolGetDevicePoolExp; ur_pfnUSMPoolTrimToExp_t pfnPoolTrimToExp; ur_pfnUSMPitchedAllocExp_t pfnPitchedAllocExp; + ur_pfnUSMContextMemcpyExp_t pfnContextMemcpyExp; ur_pfnUSMImportExp_t pfnImportExp; ur_pfnUSMReleaseExp_t pfnReleaseExp; } ur_usm_exp_dditable_t; diff --git a/unified-runtime/include/ur_print.h b/unified-runtime/include/ur_print.h index 1b4f625b99925..05970fd5a3374 100644 --- a/unified-runtime/include/ur_print.h +++ b/unified-runtime/include/ur_print.h @@ -2897,6 +2897,16 @@ UR_APIEXPORT ur_result_t UR_APICALL urPrintUsmPitchedAllocExpParams( const struct ur_usm_pitched_alloc_exp_params_t *params, char *buffer, const size_t buff_size, size_t *out_size); +/////////////////////////////////////////////////////////////////////////////// +/// @brief Print ur_usm_context_memcpy_exp_params_t struct +/// @returns +/// - ::UR_RESULT_SUCCESS +/// - ::UR_RESULT_ERROR_INVALID_SIZE +/// - `buff_size < out_size` +UR_APIEXPORT ur_result_t UR_APICALL urPrintUsmContextMemcpyExpParams( + const struct ur_usm_context_memcpy_exp_params_t *params, char *buffer, + const size_t buff_size, size_t *out_size); + /////////////////////////////////////////////////////////////////////////////// /// @brief Print ur_usm_import_exp_params_t struct /// @returns diff --git a/unified-runtime/include/ur_print.hpp b/unified-runtime/include/ur_print.hpp index d35d42a987f5e..2b49547ea67a0 100644 --- a/unified-runtime/include/ur_print.hpp +++ b/unified-runtime/include/ur_print.hpp @@ -1222,6 +1222,9 @@ inline std::ostream &operator<<(std::ostream &os, enum ur_function_t value) { case UR_FUNCTION_USM_POOL_GET_INFO_EXP: os << "UR_FUNCTION_USM_POOL_GET_INFO_EXP"; break; + case UR_FUNCTION_USM_CONTEXT_MEMCPY_EXP: + os << "UR_FUNCTION_USM_CONTEXT_MEMCPY_EXP"; + break; default: os << "unknown enumerator"; break; @@ -3032,6 +3035,9 @@ inline std::ostream &operator<<(std::ostream &os, enum ur_device_info_t value) { case UR_DEVICE_INFO_ASYNC_USM_ALLOCATIONS_EXP: os << "UR_DEVICE_INFO_ASYNC_USM_ALLOCATIONS_EXP"; break; + case UR_DEVICE_INFO_USM_CONTEXT_MEMCPY_SUPPORT_EXP: + os << "UR_DEVICE_INFO_USM_CONTEXT_MEMCPY_SUPPORT_EXP"; + break; default: os << "unknown enumerator"; break; @@ -5036,6 +5042,19 @@ inline ur_result_t printTagged(std::ostream &os, const void *ptr, os << ")"; } break; + case UR_DEVICE_INFO_USM_CONTEXT_MEMCPY_SUPPORT_EXP: { + const ur_bool_t *tptr = (const ur_bool_t *)ptr; + if (sizeof(ur_bool_t) > size) { + os << "invalid size (is: " << size + << ", expected: >=" << sizeof(ur_bool_t) << ")"; + return UR_RESULT_ERROR_INVALID_SIZE; + } + os << (const void *)(tptr) << " ("; + + os << *tptr; + + os << ")"; + } break; default: os << "unknown enumerator"; return UR_RESULT_ERROR_INVALID_ENUMERATION; @@ -17459,6 +17478,36 @@ inline std::ostream &operator<<( return os; } +/////////////////////////////////////////////////////////////////////////////// +/// @brief Print operator for the ur_usm_context_memcpy_exp_params_t type +/// @returns +/// std::ostream & +inline std::ostream &operator<<( + std::ostream &os, + [[maybe_unused]] const struct ur_usm_context_memcpy_exp_params_t *params) { + + os << ".hContext = "; + + ur::details::printPtr(os, *(params->phContext)); + + os << ", "; + os << ".pDst = "; + + ur::details::printPtr(os, *(params->ppDst)); + + os << ", "; + os << ".pSrc = "; + + ur::details::printPtr(os, *(params->ppSrc)); + + os << ", "; + os << ".size = "; + + os << *(params->psize); + + return os; +} + /////////////////////////////////////////////////////////////////////////////// /// @brief Print operator for the ur_usm_import_exp_params_t type /// @returns @@ -20708,6 +20757,9 @@ inline ur_result_t UR_APICALL printFunctionParams(std::ostream &os, case UR_FUNCTION_USM_PITCHED_ALLOC_EXP: { os << (const struct ur_usm_pitched_alloc_exp_params_t *)params; } break; + case UR_FUNCTION_USM_CONTEXT_MEMCPY_EXP: { + os << (const struct ur_usm_context_memcpy_exp_params_t *)params; + } break; case UR_FUNCTION_USM_IMPORT_EXP: { os << (const struct ur_usm_import_exp_params_t *)params; } break; diff --git a/unified-runtime/scripts/core/EXP-USM-CONTEXT-MEMCPY.rst b/unified-runtime/scripts/core/EXP-USM-CONTEXT-MEMCPY.rst new file mode 100644 index 0000000000000..9af1999deeb25 --- /dev/null +++ b/unified-runtime/scripts/core/EXP-USM-CONTEXT-MEMCPY.rst @@ -0,0 +1,63 @@ +<% + OneApi=tags['$OneApi'] + x=tags['$x'] + X=x.upper() +%> + +.. _experimental-usm-context-memcpy: + +================================================================================ +USM Context Memcpy +================================================================================ + +.. warning:: + + Experimental features: + + * May be replaced, updated, or removed at any time. + * Do not require maintaining API/ABI stability of their own additions over + time. + * Do not require conformance testing of their own additions. + + +Motivation +-------------------------------------------------------------------------------- + +In order to support device globals there's a need for a blocking USM write +operation that doesn't need a queue. This is to facilitate fast initialization +of the device global memory via native APIs that enable this kind of operation. + +API +-------------------------------------------------------------------------------- + +Enums +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +* ${x}_device_info_t + * ${X}_DEVICE_INFO_USM_CONTEXT_MEMCPY_SUPPORT_EXP + +Functions +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +* ${x}USMContextMemcpyExp + +Changelog +-------------------------------------------------------------------------------- + ++-----------+---------------------------+ +| Revision | Changes | ++===========+===========================+ +| 1.0 | Initial Draft | ++-----------+---------------------------+ + + +Support +-------------------------------------------------------------------------------- + +Adapters which support this experimental feature *must* return true for the new +``${X}_DEVICE_INFO_USM_CONTEXT_MEMCPY_SUPPORT_EXP`` device info query. + + +Contributors +-------------------------------------------------------------------------------- + +* Aaron Greig `aaron.greig@codeplay.com ` diff --git a/unified-runtime/scripts/core/exp-usm-context-memcpy.yml b/unified-runtime/scripts/core/exp-usm-context-memcpy.yml new file mode 100644 index 0000000000000..fd80416afa22f --- /dev/null +++ b/unified-runtime/scripts/core/exp-usm-context-memcpy.yml @@ -0,0 +1,45 @@ +# +# Copyright (C) 2025 Intel Corporation +# +# Part of the Unified-Runtime Project, under the Apache License v2.0 with LLVM Exceptions. +# See LICENSE.TXT +# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +# +# See YaML.md for syntax definition +# +--- #-------------------------------------------------------------------------- +type: header +desc: "Intel $OneApi Unified Runtime Experimental APIs for USM Context Memcpy" +ordinal: "99" +--- #-------------------------------------------------------------------------- +type: enum +extend: true +typed_etors: true +desc: "Extension enums to $x_device_info_t to support $xUSMContextMemcpy" +name: $x_device_info_t +etors: + - name: USM_CONTEXT_MEMCPY_SUPPORT_EXP + value: "0x7000" + desc: "[$x_bool_t] returns true if the device supports $xUSMContextMemcpyExp" +--- #-------------------------------------------------------------------------- +type: function +desc: "Perform a synchronous, blocking memcpy operation between USM allocations." +class: $xUSM +name: ContextMemcpyExp +ordinal: "0" +params: + - type: $x_context_handle_t + name: hContext + desc: "[in] Context associated with the device(s) that own the allocations `pSrc` and `pDst`." + - type: void* + name: pDst + desc: "[in] Destination pointer to copy to." + - type: const void* + name: pSrc + desc: "[in] Source pointer to copy from." + - type: size_t + name: size + desc: "[in] Size in bytes to be copied." +returns: + - $X_RESULT_SUCCESS + - $X_RESULT_ERROR_ADAPTER_SPECIFIC diff --git a/unified-runtime/scripts/core/registry.yml b/unified-runtime/scripts/core/registry.yml index e37d7fef27092..716a403b3d9a1 100644 --- a/unified-runtime/scripts/core/registry.yml +++ b/unified-runtime/scripts/core/registry.yml @@ -637,6 +637,9 @@ etors: - name: USM_POOL_GET_INFO_EXP desc: Enumerator for $xUSMPoolGetInfoExp value: '262' +- name: USM_CONTEXT_MEMCPY_EXP + desc: Enumerator for $xUSMContextMemcpyExp + value: '264' --- type: enum desc: Defines structure types diff --git a/unified-runtime/source/adapters/cuda/device.cpp b/unified-runtime/source/adapters/cuda/device.cpp index 9ce931a77748a..997a93385a446 100644 --- a/unified-runtime/source/adapters/cuda/device.cpp +++ b/unified-runtime/source/adapters/cuda/device.cpp @@ -1118,6 +1118,8 @@ UR_APIEXPORT ur_result_t UR_APICALL urDeviceGetInfo(ur_device_handle_t hDevice, } case UR_DEVICE_INFO_LOW_POWER_EVENTS_EXP: return ReturnValue(false); + case UR_DEVICE_INFO_USM_CONTEXT_MEMCPY_SUPPORT_EXP: + return ReturnValue(false); default: break; } diff --git a/unified-runtime/source/adapters/cuda/ur_interface_loader.cpp b/unified-runtime/source/adapters/cuda/ur_interface_loader.cpp index 1e375ecc20273..0bac655bae06e 100644 --- a/unified-runtime/source/adapters/cuda/ur_interface_loader.cpp +++ b/unified-runtime/source/adapters/cuda/ur_interface_loader.cpp @@ -369,6 +369,7 @@ UR_DLLEXPORT ur_result_t UR_APICALL urGetUSMExpProcAddrTable( pDdiTable->pfnPoolSetDevicePoolExp = urUSMPoolSetDevicePoolExp; pDdiTable->pfnPoolGetDevicePoolExp = urUSMPoolGetDevicePoolExp; pDdiTable->pfnPoolTrimToExp = urUSMPoolTrimToExp; + pDdiTable->pfnContextMemcpyExp = urUSMContextMemcpyExp; return UR_RESULT_SUCCESS; } diff --git a/unified-runtime/source/adapters/cuda/usm.cpp b/unified-runtime/source/adapters/cuda/usm.cpp index 5ee1ee5aecbae..3e6ac66414a2e 100644 --- a/unified-runtime/source/adapters/cuda/usm.cpp +++ b/unified-runtime/source/adapters/cuda/usm.cpp @@ -459,3 +459,9 @@ UR_APIEXPORT ur_result_t UR_APICALL urUSMPoolTrimToExp(ur_context_handle_t, size_t) { return UR_RESULT_ERROR_UNSUPPORTED_FEATURE; } + +UR_APIEXPORT ur_result_t UR_APICALL urUSMContextMemcpyExp(ur_context_handle_t, + void *, const void *, + size_t) { + return UR_RESULT_ERROR_UNSUPPORTED_FEATURE; +} diff --git a/unified-runtime/source/adapters/hip/device.cpp b/unified-runtime/source/adapters/hip/device.cpp index d8b60e186649f..f027c57b89c1e 100644 --- a/unified-runtime/source/adapters/hip/device.cpp +++ b/unified-runtime/source/adapters/hip/device.cpp @@ -1104,9 +1104,10 @@ UR_APIEXPORT ur_result_t UR_APICALL urDeviceGetInfo(ur_device_handle_t hDevice, } case UR_DEVICE_INFO_COMMAND_BUFFER_EVENT_SUPPORT_EXP: return ReturnValue(false); - case UR_DEVICE_INFO_LOW_POWER_EVENTS_EXP: { + case UR_DEVICE_INFO_LOW_POWER_EVENTS_EXP: + return ReturnValue(false); + case UR_DEVICE_INFO_USM_CONTEXT_MEMCPY_SUPPORT_EXP: return ReturnValue(false); - } default: break; } diff --git a/unified-runtime/source/adapters/hip/ur_interface_loader.cpp b/unified-runtime/source/adapters/hip/ur_interface_loader.cpp index fc2549c4d9ea2..41c8f54010784 100644 --- a/unified-runtime/source/adapters/hip/ur_interface_loader.cpp +++ b/unified-runtime/source/adapters/hip/ur_interface_loader.cpp @@ -367,6 +367,7 @@ UR_DLLEXPORT ur_result_t UR_APICALL urGetUSMExpProcAddrTable( pDdiTable->pfnPoolSetDevicePoolExp = urUSMPoolSetDevicePoolExp; pDdiTable->pfnPoolGetDevicePoolExp = urUSMPoolGetDevicePoolExp; pDdiTable->pfnPoolTrimToExp = urUSMPoolTrimToExp; + pDdiTable->pfnContextMemcpyExp = urUSMContextMemcpyExp; return UR_RESULT_SUCCESS; } diff --git a/unified-runtime/source/adapters/hip/usm.cpp b/unified-runtime/source/adapters/hip/usm.cpp index 7412d4b1eb8b2..fd3ffa72e2886 100644 --- a/unified-runtime/source/adapters/hip/usm.cpp +++ b/unified-runtime/source/adapters/hip/usm.cpp @@ -524,3 +524,9 @@ UR_APIEXPORT ur_result_t UR_APICALL urUSMPoolTrimToExp(ur_context_handle_t, size_t) { return UR_RESULT_ERROR_UNSUPPORTED_FEATURE; } + +UR_APIEXPORT ur_result_t UR_APICALL urUSMContextMemcpyExp(ur_context_handle_t, + void *, const void *, + size_t) { + return UR_RESULT_ERROR_UNSUPPORTED_FEATURE; +} diff --git a/unified-runtime/source/adapters/level_zero/device.cpp b/unified-runtime/source/adapters/level_zero/device.cpp index bd637dcc45678..a635f6de8b0a9 100644 --- a/unified-runtime/source/adapters/level_zero/device.cpp +++ b/unified-runtime/source/adapters/level_zero/device.cpp @@ -1210,6 +1210,8 @@ ur_result_t urDeviceGetInfo( return ReturnValue(false); case UR_DEVICE_INFO_HOST_PIPE_READ_WRITE_SUPPORTED: return ReturnValue(false); + case UR_DEVICE_INFO_USM_CONTEXT_MEMCPY_SUPPORT_EXP: + return ReturnValue(true); default: logger::error("Unsupported ParamName in urGetDeviceInfo"); logger::error("ParamNameParamName={}(0x{})", ParamName, diff --git a/unified-runtime/source/adapters/level_zero/ur_interface_loader.cpp b/unified-runtime/source/adapters/level_zero/ur_interface_loader.cpp index c791027efdbc8..beadb45306256 100644 --- a/unified-runtime/source/adapters/level_zero/ur_interface_loader.cpp +++ b/unified-runtime/source/adapters/level_zero/ur_interface_loader.cpp @@ -462,6 +462,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urGetUSMExpProcAddrTable( ur::level_zero::urUSMPoolGetDevicePoolExp; pDdiTable->pfnPoolTrimToExp = ur::level_zero::urUSMPoolTrimToExp; pDdiTable->pfnPitchedAllocExp = ur::level_zero::urUSMPitchedAllocExp; + pDdiTable->pfnContextMemcpyExp = ur::level_zero::urUSMContextMemcpyExp; pDdiTable->pfnImportExp = ur::level_zero::urUSMImportExp; pDdiTable->pfnReleaseExp = ur::level_zero::urUSMReleaseExp; diff --git a/unified-runtime/source/adapters/level_zero/ur_interface_loader.hpp b/unified-runtime/source/adapters/level_zero/ur_interface_loader.hpp index e794973d84958..5ecc8af1c2e93 100644 --- a/unified-runtime/source/adapters/level_zero/ur_interface_loader.hpp +++ b/unified-runtime/source/adapters/level_zero/ur_interface_loader.hpp @@ -756,6 +756,8 @@ ur_result_t urProgramLinkExp(ur_context_handle_t hContext, uint32_t numDevices, const ur_program_handle_t *phPrograms, const char *pOptions, ur_program_handle_t *phProgram); +ur_result_t urUSMContextMemcpyExp(ur_context_handle_t hContext, void *pDst, + const void *pSrc, size_t size); ur_result_t urUSMImportExp(ur_context_handle_t hContext, void *pMem, size_t size); ur_result_t urUSMReleaseExp(ur_context_handle_t hContext, void *pMem); diff --git a/unified-runtime/source/adapters/level_zero/usm.cpp b/unified-runtime/source/adapters/level_zero/usm.cpp index ae3a693d20981..775f3a0686e03 100644 --- a/unified-runtime/source/adapters/level_zero/usm.cpp +++ b/unified-runtime/source/adapters/level_zero/usm.cpp @@ -851,6 +851,16 @@ ur_result_t UR_APICALL urUSMPoolTrimToExp(ur_context_handle_t, ur_usm_pool_handle_t, size_t) { return UR_RESULT_ERROR_UNSUPPORTED_FEATURE; } + +UR_APIEXPORT ur_result_t UR_APICALL urUSMContextMemcpyExp( + ur_context_handle_t Context, void *pDst, const void *pSrc, size_t Size) { + // zeCommandListAppendMemoryCopy must not be called from simultaneous + // threads with the same command list handle, so we need exclusive lock. + std::scoped_lock Lock(Context->ImmediateCommandListMutex); + ZE2UR_CALL(zeCommandListAppendMemoryCopy, (Context->ZeCommandListInit, pDst, + pSrc, Size, nullptr, 0, nullptr)); + return UR_RESULT_SUCCESS; +} } // namespace ur::level_zero static ur_result_t USMFreeImpl(ur_context_handle_t Context, void *Ptr) { diff --git a/unified-runtime/source/adapters/mock/ur_mockddi.cpp b/unified-runtime/source/adapters/mock/ur_mockddi.cpp index eb85b9ed09d54..0dd087fc3adf7 100644 --- a/unified-runtime/source/adapters/mock/ur_mockddi.cpp +++ b/unified-runtime/source/adapters/mock/ur_mockddi.cpp @@ -11136,6 +11136,55 @@ __urdlllocal ur_result_t UR_APICALL urProgramLinkExp( return exceptionToResult(std::current_exception()); } +/////////////////////////////////////////////////////////////////////////////// +/// @brief Intercept function for urUSMContextMemcpyExp +__urdlllocal ur_result_t UR_APICALL urUSMContextMemcpyExp( + /// [in] Context associated with the device(s) that own the allocations + /// `pSrc` and `pDst`. + ur_context_handle_t hContext, + /// [in] Destination pointer to copy to. + void *pDst, + /// [in] Source pointer to copy from. + const void *pSrc, + /// [in] Size in bytes to be copied. + size_t size) try { + ur_result_t result = UR_RESULT_SUCCESS; + + ur_usm_context_memcpy_exp_params_t params = {&hContext, &pDst, &pSrc, &size}; + + auto beforeCallback = reinterpret_cast( + mock::getCallbacks().get_before_callback("urUSMContextMemcpyExp")); + if (beforeCallback) { + result = beforeCallback(¶ms); + if (result != UR_RESULT_SUCCESS) { + return result; + } + } + + auto replaceCallback = reinterpret_cast( + mock::getCallbacks().get_replace_callback("urUSMContextMemcpyExp")); + if (replaceCallback) { + result = replaceCallback(¶ms); + } else { + + result = UR_RESULT_SUCCESS; + } + + if (result != UR_RESULT_SUCCESS) { + return result; + } + + auto afterCallback = reinterpret_cast( + mock::getCallbacks().get_after_callback("urUSMContextMemcpyExp")); + if (afterCallback) { + return afterCallback(¶ms); + } + + return result; +} catch (...) { + return exceptionToResult(std::current_exception()); +} + /////////////////////////////////////////////////////////////////////////////// /// @brief Intercept function for urUSMImportExp __urdlllocal ur_result_t UR_APICALL urUSMImportExp( @@ -12390,6 +12439,8 @@ UR_DLLEXPORT ur_result_t UR_APICALL urGetUSMExpProcAddrTable( pDdiTable->pfnPitchedAllocExp = driver::urUSMPitchedAllocExp; + pDdiTable->pfnContextMemcpyExp = driver::urUSMContextMemcpyExp; + pDdiTable->pfnImportExp = driver::urUSMImportExp; pDdiTable->pfnReleaseExp = driver::urUSMReleaseExp; diff --git a/unified-runtime/source/adapters/native_cpu/device.cpp b/unified-runtime/source/adapters/native_cpu/device.cpp index de579301954e7..fba7fb8cb6cc8 100644 --- a/unified-runtime/source/adapters/native_cpu/device.cpp +++ b/unified-runtime/source/adapters/native_cpu/device.cpp @@ -437,6 +437,8 @@ UR_APIEXPORT ur_result_t UR_APICALL urDeviceGetInfo(ur_device_handle_t hDevice, case UR_DEVICE_INFO_GLOBAL_VARIABLE_SUPPORT: return ReturnValue(false); + case UR_DEVICE_INFO_USM_CONTEXT_MEMCPY_SUPPORT_EXP: + return ReturnValue(false); default: DIE_NO_IMPLEMENTATION; } diff --git a/unified-runtime/source/adapters/native_cpu/ur_interface_loader.cpp b/unified-runtime/source/adapters/native_cpu/ur_interface_loader.cpp index d94380e40572d..4288c0a6490c1 100644 --- a/unified-runtime/source/adapters/native_cpu/ur_interface_loader.cpp +++ b/unified-runtime/source/adapters/native_cpu/ur_interface_loader.cpp @@ -367,6 +367,7 @@ UR_DLLEXPORT ur_result_t UR_APICALL urGetUSMExpProcAddrTable( return result; } pDdiTable->pfnPitchedAllocExp = urUSMPitchedAllocExp; + pDdiTable->pfnContextMemcpyExp = urUSMContextMemcpyExp; return UR_RESULT_SUCCESS; } diff --git a/unified-runtime/source/adapters/native_cpu/usm.cpp b/unified-runtime/source/adapters/native_cpu/usm.cpp index 6185178df321c..d6ad200a190bb 100644 --- a/unified-runtime/source/adapters/native_cpu/usm.cpp +++ b/unified-runtime/source/adapters/native_cpu/usm.cpp @@ -201,3 +201,9 @@ UR_APIEXPORT ur_result_t UR_APICALL urUSMPoolTrimToExp(ur_context_handle_t, size_t) { DIE_NO_IMPLEMENTATION; } + +UR_APIEXPORT ur_result_t UR_APICALL urUSMContextMemcpyExp(ur_context_handle_t, + void *, const void *, + size_t) { + DIE_NO_IMPLEMENTATION; +} diff --git a/unified-runtime/source/adapters/opencl/device.cpp b/unified-runtime/source/adapters/opencl/device.cpp index a71b0068d3908..2d1feb17c6f0b 100644 --- a/unified-runtime/source/adapters/opencl/device.cpp +++ b/unified-runtime/source/adapters/opencl/device.cpp @@ -1587,6 +1587,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urDeviceGetInfo(ur_device_handle_t hDevice, case UR_DEVICE_INFO_BINDLESS_UNIQUE_ADDRESSING_PER_DIM_EXP: case UR_DEVICE_INFO_BINDLESS_SAMPLE_1D_USM_EXP: case UR_DEVICE_INFO_BINDLESS_SAMPLE_2D_USM_EXP: + case UR_DEVICE_INFO_USM_CONTEXT_MEMCPY_SUPPORT_EXP: return ReturnValue(false); case UR_DEVICE_INFO_IMAGE_PITCH_ALIGN_EXP: case UR_DEVICE_INFO_MAX_IMAGE_LINEAR_WIDTH_EXP: diff --git a/unified-runtime/source/adapters/opencl/ur_interface_loader.cpp b/unified-runtime/source/adapters/opencl/ur_interface_loader.cpp index 44ba406e009fd..64233a41149cc 100644 --- a/unified-runtime/source/adapters/opencl/ur_interface_loader.cpp +++ b/unified-runtime/source/adapters/opencl/ur_interface_loader.cpp @@ -258,6 +258,7 @@ UR_DLLEXPORT ur_result_t UR_APICALL urGetUSMExpProcAddrTable( pDdiTable->pfnImportExp = urUSMImportExp; pDdiTable->pfnReleaseExp = urUSMReleaseExp; + pDdiTable->pfnContextMemcpyExp = urUSMContextMemcpyExp; return UR_RESULT_SUCCESS; } diff --git a/unified-runtime/source/adapters/opencl/usm.cpp b/unified-runtime/source/adapters/opencl/usm.cpp index e708a19b73dda..f9cda1c1d4691 100644 --- a/unified-runtime/source/adapters/opencl/usm.cpp +++ b/unified-runtime/source/adapters/opencl/usm.cpp @@ -774,3 +774,9 @@ UR_APIEXPORT ur_result_t UR_APICALL urUSMPoolTrimToExp(ur_context_handle_t, size_t) { return UR_RESULT_ERROR_UNSUPPORTED_FEATURE; } + +UR_APIEXPORT ur_result_t UR_APICALL urUSMContextMemcpyExp(ur_context_handle_t, + const void *, void *, + size_t) { + return UR_RESULT_ERROR_UNSUPPORTED_FEATURE; +} diff --git a/unified-runtime/source/loader/layers/tracing/ur_trcddi.cpp b/unified-runtime/source/loader/layers/tracing/ur_trcddi.cpp index 653db12cf10d6..8dfb9a150e79e 100644 --- a/unified-runtime/source/loader/layers/tracing/ur_trcddi.cpp +++ b/unified-runtime/source/loader/layers/tracing/ur_trcddi.cpp @@ -9311,6 +9311,47 @@ __urdlllocal ur_result_t UR_APICALL urProgramLinkExp( return result; } +/////////////////////////////////////////////////////////////////////////////// +/// @brief Intercept function for urUSMContextMemcpyExp +__urdlllocal ur_result_t UR_APICALL urUSMContextMemcpyExp( + /// [in] Context associated with the device(s) that own the allocations + /// `pSrc` and `pDst`. + ur_context_handle_t hContext, + /// [in] Destination pointer to copy to. + void *pDst, + /// [in] Source pointer to copy from. + const void *pSrc, + /// [in] Size in bytes to be copied. + size_t size) { + auto pfnContextMemcpyExp = + getContext()->urDdiTable.USMExp.pfnContextMemcpyExp; + + if (nullptr == pfnContextMemcpyExp) + return UR_RESULT_ERROR_UNSUPPORTED_FEATURE; + + ur_usm_context_memcpy_exp_params_t params = {&hContext, &pDst, &pSrc, &size}; + uint64_t instance = getContext()->notify_begin( + UR_FUNCTION_USM_CONTEXT_MEMCPY_EXP, "urUSMContextMemcpyExp", ¶ms); + + auto &logger = getContext()->logger; + logger.info(" ---> urUSMContextMemcpyExp\n"); + + ur_result_t result = pfnContextMemcpyExp(hContext, pDst, pSrc, size); + + getContext()->notify_end(UR_FUNCTION_USM_CONTEXT_MEMCPY_EXP, + "urUSMContextMemcpyExp", ¶ms, &result, instance); + + if (logger.getLevel() <= logger::Level::INFO) { + std::ostringstream args_str; + ur::extras::printFunctionParams( + args_str, UR_FUNCTION_USM_CONTEXT_MEMCPY_EXP, ¶ms); + logger.info(" <--- urUSMContextMemcpyExp({}) -> {};\n", args_str.str(), + result); + } + + return result; +} + /////////////////////////////////////////////////////////////////////////////// /// @brief Intercept function for urUSMImportExp __urdlllocal ur_result_t UR_APICALL urUSMImportExp( @@ -10758,6 +10799,9 @@ __urdlllocal ur_result_t UR_APICALL urGetUSMExpProcAddrTable( dditable.pfnPitchedAllocExp = pDdiTable->pfnPitchedAllocExp; pDdiTable->pfnPitchedAllocExp = ur_tracing_layer::urUSMPitchedAllocExp; + dditable.pfnContextMemcpyExp = pDdiTable->pfnContextMemcpyExp; + pDdiTable->pfnContextMemcpyExp = ur_tracing_layer::urUSMContextMemcpyExp; + dditable.pfnImportExp = pDdiTable->pfnImportExp; pDdiTable->pfnImportExp = ur_tracing_layer::urUSMImportExp; diff --git a/unified-runtime/source/loader/layers/validation/ur_valddi.cpp b/unified-runtime/source/loader/layers/validation/ur_valddi.cpp index ffcd6e35e5359..aef744339ed09 100644 --- a/unified-runtime/source/loader/layers/validation/ur_valddi.cpp +++ b/unified-runtime/source/loader/layers/validation/ur_valddi.cpp @@ -496,7 +496,7 @@ __urdlllocal ur_result_t UR_APICALL urDeviceGetInfo( if (pPropValue == NULL && pPropSizeRet == NULL) return UR_RESULT_ERROR_INVALID_NULL_POINTER; - if (UR_DEVICE_INFO_ASYNC_USM_ALLOCATIONS_EXP < propName) + if (UR_DEVICE_INFO_USM_CONTEXT_MEMCPY_SUPPORT_EXP < propName) return UR_RESULT_ERROR_INVALID_ENUMERATION; if (propSize == 0 && pPropValue != NULL) @@ -10031,6 +10031,46 @@ __urdlllocal ur_result_t UR_APICALL urProgramLinkExp( return result; } +/////////////////////////////////////////////////////////////////////////////// +/// @brief Intercept function for urUSMContextMemcpyExp +__urdlllocal ur_result_t UR_APICALL urUSMContextMemcpyExp( + /// [in] Context associated with the device(s) that own the allocations + /// `pSrc` and `pDst`. + ur_context_handle_t hContext, + /// [in] Destination pointer to copy to. + void *pDst, + /// [in] Source pointer to copy from. + const void *pSrc, + /// [in] Size in bytes to be copied. + size_t size) { + auto pfnContextMemcpyExp = + getContext()->urDdiTable.USMExp.pfnContextMemcpyExp; + + if (nullptr == pfnContextMemcpyExp) { + return UR_RESULT_ERROR_UNINITIALIZED; + } + + if (getContext()->enableParameterValidation) { + if (NULL == hContext) + return UR_RESULT_ERROR_INVALID_NULL_HANDLE; + + if (NULL == pDst) + return UR_RESULT_ERROR_INVALID_NULL_POINTER; + + if (NULL == pSrc) + return UR_RESULT_ERROR_INVALID_NULL_POINTER; + } + + if (getContext()->enableLifetimeValidation && + !getContext()->refCountContext->isReferenceValid(hContext)) { + getContext()->refCountContext->logInvalidReference(hContext); + } + + ur_result_t result = pfnContextMemcpyExp(hContext, pDst, pSrc, size); + + return result; +} + /////////////////////////////////////////////////////////////////////////////// /// @brief Intercept function for urUSMImportExp __urdlllocal ur_result_t UR_APICALL urUSMImportExp( @@ -11510,6 +11550,9 @@ UR_DLLEXPORT ur_result_t UR_APICALL urGetUSMExpProcAddrTable( dditable.pfnPitchedAllocExp = pDdiTable->pfnPitchedAllocExp; pDdiTable->pfnPitchedAllocExp = ur_validation_layer::urUSMPitchedAllocExp; + dditable.pfnContextMemcpyExp = pDdiTable->pfnContextMemcpyExp; + pDdiTable->pfnContextMemcpyExp = ur_validation_layer::urUSMContextMemcpyExp; + dditable.pfnImportExp = pDdiTable->pfnImportExp; pDdiTable->pfnImportExp = ur_validation_layer::urUSMImportExp; diff --git a/unified-runtime/source/loader/loader.def.in b/unified-runtime/source/loader/loader.def.in index 6f1d487a17f1f..88af20dd03c3d 100644 --- a/unified-runtime/source/loader/loader.def.in +++ b/unified-runtime/source/loader/loader.def.in @@ -475,6 +475,7 @@ EXPORTS urPrintUsmAdviceFlags urPrintUsmAllocInfo urPrintUsmAllocLocationDesc + urPrintUsmContextMemcpyExpParams urPrintUsmDesc urPrintUsmDeviceAllocParams urPrintUsmDeviceDesc @@ -550,6 +551,7 @@ EXPORTS urSamplerGetNativeHandle urSamplerRelease urSamplerRetain + urUSMContextMemcpyExp urUSMDeviceAlloc urUSMFree urUSMGetMemAllocInfo diff --git a/unified-runtime/source/loader/loader.map.in b/unified-runtime/source/loader/loader.map.in index 5ba9b9123d50b..7017a894c9cb7 100644 --- a/unified-runtime/source/loader/loader.map.in +++ b/unified-runtime/source/loader/loader.map.in @@ -475,6 +475,7 @@ urPrintUsmAdviceFlags; urPrintUsmAllocInfo; urPrintUsmAllocLocationDesc; + urPrintUsmContextMemcpyExpParams; urPrintUsmDesc; urPrintUsmDeviceAllocParams; urPrintUsmDeviceDesc; @@ -550,6 +551,7 @@ urSamplerGetNativeHandle; urSamplerRelease; urSamplerRetain; + urUSMContextMemcpyExp; urUSMDeviceAlloc; urUSMFree; urUSMGetMemAllocInfo; diff --git a/unified-runtime/source/loader/ur_ldrddi.cpp b/unified-runtime/source/loader/ur_ldrddi.cpp index 07d9dcc25c586..19adf2a42efcf 100644 --- a/unified-runtime/source/loader/ur_ldrddi.cpp +++ b/unified-runtime/source/loader/ur_ldrddi.cpp @@ -9536,6 +9536,37 @@ __urdlllocal ur_result_t UR_APICALL urProgramLinkExp( return result; } +/////////////////////////////////////////////////////////////////////////////// +/// @brief Intercept function for urUSMContextMemcpyExp +__urdlllocal ur_result_t UR_APICALL urUSMContextMemcpyExp( + /// [in] Context associated with the device(s) that own the allocations + /// `pSrc` and `pDst`. + ur_context_handle_t hContext, + /// [in] Destination pointer to copy to. + void *pDst, + /// [in] Source pointer to copy from. + const void *pSrc, + /// [in] Size in bytes to be copied. + size_t size) { + ur_result_t result = UR_RESULT_SUCCESS; + + [[maybe_unused]] auto context = getContext(); + + // extract platform's function pointer table + auto dditable = reinterpret_cast(hContext)->dditable; + auto pfnContextMemcpyExp = dditable->ur.USMExp.pfnContextMemcpyExp; + if (nullptr == pfnContextMemcpyExp) + return UR_RESULT_ERROR_UNINITIALIZED; + + // convert loader handle to platform handle + hContext = reinterpret_cast(hContext)->handle; + + // forward to device-platform + result = pfnContextMemcpyExp(hContext, pDst, pSrc, size); + + return result; +} + /////////////////////////////////////////////////////////////////////////////// /// @brief Intercept function for urUSMImportExp __urdlllocal ur_result_t UR_APICALL urUSMImportExp( @@ -10997,6 +11028,7 @@ UR_DLLEXPORT ur_result_t UR_APICALL urGetUSMExpProcAddrTable( pDdiTable->pfnPoolGetDevicePoolExp = ur_loader::urUSMPoolGetDevicePoolExp; pDdiTable->pfnPoolTrimToExp = ur_loader::urUSMPoolTrimToExp; pDdiTable->pfnPitchedAllocExp = ur_loader::urUSMPitchedAllocExp; + pDdiTable->pfnContextMemcpyExp = ur_loader::urUSMContextMemcpyExp; pDdiTable->pfnImportExp = ur_loader::urUSMImportExp; pDdiTable->pfnReleaseExp = ur_loader::urUSMReleaseExp; } else { diff --git a/unified-runtime/source/loader/ur_libapi.cpp b/unified-runtime/source/loader/ur_libapi.cpp index bca1a7abb01f8..d82ba7573de11 100644 --- a/unified-runtime/source/loader/ur_libapi.cpp +++ b/unified-runtime/source/loader/ur_libapi.cpp @@ -869,7 +869,7 @@ ur_result_t UR_APICALL urDeviceGetSelected( /// - ::UR_RESULT_ERROR_INVALID_NULL_HANDLE /// + `NULL == hDevice` /// - ::UR_RESULT_ERROR_INVALID_ENUMERATION -/// + `::UR_DEVICE_INFO_ASYNC_USM_ALLOCATIONS_EXP < propName` +/// + `::UR_DEVICE_INFO_USM_CONTEXT_MEMCPY_SUPPORT_EXP < propName` /// - ::UR_RESULT_ERROR_UNSUPPORTED_ENUMERATION /// + If `propName` is not supported by the adapter. /// - ::UR_RESULT_ERROR_INVALID_SIZE @@ -9892,6 +9892,41 @@ ur_result_t UR_APICALL urProgramLinkExp( return exceptionToResult(std::current_exception()); } +/////////////////////////////////////////////////////////////////////////////// +/// @brief Enable access to peer device memory +/// +/// @returns +/// - ::UR_RESULT_SUCCESS +/// - ::UR_RESULT_ERROR_UNINITIALIZED +/// - ::UR_RESULT_ERROR_DEVICE_LOST +/// - ::UR_RESULT_ERROR_ADAPTER_SPECIFIC +/// - ::UR_RESULT_ERROR_INVALID_NULL_HANDLE +/// + `NULL == hContext` +/// - ::UR_RESULT_ERROR_INVALID_NULL_POINTER +/// + `NULL == pDst` +/// + `NULL == pSrc` +/// - ::UR_RESULT_SUCCESS +/// - ::UR_RESULT_ERROR_ADAPTER_SPECIFIC +ur_result_t UR_APICALL urUSMContextMemcpyExp( + /// [in] Context associated with the device(s) that own the allocations + /// `pSrc` and `pDst`. + ur_context_handle_t hContext, + /// [in] Destination pointer to copy to. + void *pDst, + /// [in] Source pointer to copy from. + const void *pSrc, + /// [in] Size in bytes to be copied. + size_t size) try { + auto pfnContextMemcpyExp = + ur_lib::getContext()->urDdiTable.USMExp.pfnContextMemcpyExp; + if (nullptr == pfnContextMemcpyExp) + return UR_RESULT_ERROR_UNINITIALIZED; + + return pfnContextMemcpyExp(hContext, pDst, pSrc, size); +} catch (...) { + return exceptionToResult(std::current_exception()); +} + /////////////////////////////////////////////////////////////////////////////// /// @brief Import memory into USM /// diff --git a/unified-runtime/source/loader/ur_print.cpp b/unified-runtime/source/loader/ur_print.cpp index 569adc51e6fe1..4be4038176a0d 100644 --- a/unified-runtime/source/loader/ur_print.cpp +++ b/unified-runtime/source/loader/ur_print.cpp @@ -2702,6 +2702,14 @@ ur_result_t urPrintUsmPitchedAllocExpParams( return str_copy(&ss, buffer, buff_size, out_size); } +ur_result_t urPrintUsmContextMemcpyExpParams( + const struct ur_usm_context_memcpy_exp_params_t *params, char *buffer, + const size_t buff_size, size_t *out_size) { + std::stringstream ss; + ss << params; + return str_copy(&ss, buffer, buff_size, out_size); +} + ur_result_t urPrintUsmImportExpParams(const struct ur_usm_import_exp_params_t *params, char *buffer, const size_t buff_size, diff --git a/unified-runtime/source/ur_api.cpp b/unified-runtime/source/ur_api.cpp index d2c136f7d7737..870199696b2de 100644 --- a/unified-runtime/source/ur_api.cpp +++ b/unified-runtime/source/ur_api.cpp @@ -783,7 +783,7 @@ ur_result_t UR_APICALL urDeviceGetSelected( /// - ::UR_RESULT_ERROR_INVALID_NULL_HANDLE /// + `NULL == hDevice` /// - ::UR_RESULT_ERROR_INVALID_ENUMERATION -/// + `::UR_DEVICE_INFO_ASYNC_USM_ALLOCATIONS_EXP < propName` +/// + `::UR_DEVICE_INFO_USM_CONTEXT_MEMCPY_SUPPORT_EXP < propName` /// - ::UR_RESULT_ERROR_UNSUPPORTED_ENUMERATION /// + If `propName` is not supported by the adapter. /// - ::UR_RESULT_ERROR_INVALID_SIZE @@ -8628,6 +8628,35 @@ ur_result_t UR_APICALL urProgramLinkExp( return result; } +/////////////////////////////////////////////////////////////////////////////// +/// @brief Enable access to peer device memory +/// +/// @returns +/// - ::UR_RESULT_SUCCESS +/// - ::UR_RESULT_ERROR_UNINITIALIZED +/// - ::UR_RESULT_ERROR_DEVICE_LOST +/// - ::UR_RESULT_ERROR_ADAPTER_SPECIFIC +/// - ::UR_RESULT_ERROR_INVALID_NULL_HANDLE +/// + `NULL == hContext` +/// - ::UR_RESULT_ERROR_INVALID_NULL_POINTER +/// + `NULL == pDst` +/// + `NULL == pSrc` +/// - ::UR_RESULT_SUCCESS +/// - ::UR_RESULT_ERROR_ADAPTER_SPECIFIC +ur_result_t UR_APICALL urUSMContextMemcpyExp( + /// [in] Context associated with the device(s) that own the allocations + /// `pSrc` and `pDst`. + ur_context_handle_t hContext, + /// [in] Destination pointer to copy to. + void *pDst, + /// [in] Source pointer to copy from. + const void *pSrc, + /// [in] Size in bytes to be copied. + size_t size) { + ur_result_t result = UR_RESULT_SUCCESS; + return result; +} + /////////////////////////////////////////////////////////////////////////////// /// @brief Import memory into USM /// diff --git a/unified-runtime/tools/urinfo/urinfo.hpp b/unified-runtime/tools/urinfo/urinfo.hpp index 5457b5736c795..851cfcfb3caf3 100644 --- a/unified-runtime/tools/urinfo/urinfo.hpp +++ b/unified-runtime/tools/urinfo/urinfo.hpp @@ -422,5 +422,8 @@ inline void printDeviceInfos(ur_device_handle_t hDevice, hDevice, UR_DEVICE_INFO_2D_BLOCK_ARRAY_CAPABILITIES_EXP); std::cout << prefix; printDeviceInfo(hDevice, UR_DEVICE_INFO_ASYNC_USM_ALLOCATIONS_EXP); + std::cout << prefix; + printDeviceInfo(hDevice, + UR_DEVICE_INFO_USM_CONTEXT_MEMCPY_SUPPORT_EXP); } } // namespace urinfo From 6dd837257fca96f0a94923d377222407abb084a5 Mon Sep 17 00:00:00 2001 From: Aaron Greig Date: Fri, 7 Mar 2025 11:45:05 +0000 Subject: [PATCH 02/14] Add cts tests. --- unified-runtime/include/ur_api.h | 9 +- .../scripts/core/exp-usm-context-memcpy.yml | 3 + .../source/adapters/opencl/usm.cpp | 2 +- .../loader/layers/validation/ur_valddi.cpp | 3 + unified-runtime/source/loader/ur_libapi.cpp | 6 +- unified-runtime/source/ur_api.cpp | 6 +- .../test/conformance/CMakeLists.txt | 1 + .../exp_usm_context_memcpy/CMakeLists.txt | 9 + .../urUSMContextMemcpyExp.cpp | 164 ++++++++++++++++++ .../conformance/testing/include/uur/utils.h | 2 + .../test/conformance/testing/source/utils.cpp | 6 + 11 files changed, 206 insertions(+), 5 deletions(-) create mode 100644 unified-runtime/test/conformance/exp_usm_context_memcpy/CMakeLists.txt create mode 100644 unified-runtime/test/conformance/exp_usm_context_memcpy/urUSMContextMemcpyExp.cpp diff --git a/unified-runtime/include/ur_api.h b/unified-runtime/include/ur_api.h index ad47e66be1df2..9cf07c59e7a46 100644 --- a/unified-runtime/include/ur_api.h +++ b/unified-runtime/include/ur_api.h @@ -2322,7 +2322,8 @@ typedef enum ur_device_info_t { /// [::ur_bool_t] returns true if the device supports enqueueing of /// allocations and frees. UR_DEVICE_INFO_ASYNC_USM_ALLOCATIONS_EXP = 0x2050, - /// [::ur_bool_t] returns true if the device supports ::urUSMContextMemcpy + /// [::ur_bool_t] returns true if the device supports + /// ::urUSMContextMemcpyExp UR_DEVICE_INFO_USM_CONTEXT_MEMCPY_SUPPORT_EXP = 0x7000, /// @cond UR_DEVICE_INFO_FORCE_UINT32 = 0x7fffffff @@ -12165,7 +12166,8 @@ UR_APIEXPORT ur_result_t UR_APICALL urProgramLinkExp( #pragma region usm_context_memcpy_(experimental) #endif /////////////////////////////////////////////////////////////////////////////// -/// @brief Enable access to peer device memory +/// @brief Perform a synchronous, blocking memcpy operation between USM +/// allocations. /// /// @returns /// - ::UR_RESULT_SUCCESS @@ -12179,6 +12181,9 @@ UR_APIEXPORT ur_result_t UR_APICALL urProgramLinkExp( /// + `NULL == pSrc` /// - ::UR_RESULT_SUCCESS /// - ::UR_RESULT_ERROR_ADAPTER_SPECIFIC +/// - ::UR_RESULT_ERROR_INVALID_SIZE +/// + `size == 0` +/// + If `size` is higher than the allocation size of `pSrc` or `pDst` UR_APIEXPORT ur_result_t UR_APICALL urUSMContextMemcpyExp( /// [in] Context associated with the device(s) that own the allocations /// `pSrc` and `pDst`. diff --git a/unified-runtime/scripts/core/exp-usm-context-memcpy.yml b/unified-runtime/scripts/core/exp-usm-context-memcpy.yml index fd80416afa22f..40c4328e28832 100644 --- a/unified-runtime/scripts/core/exp-usm-context-memcpy.yml +++ b/unified-runtime/scripts/core/exp-usm-context-memcpy.yml @@ -43,3 +43,6 @@ params: returns: - $X_RESULT_SUCCESS - $X_RESULT_ERROR_ADAPTER_SPECIFIC + - $X_RESULT_ERROR_INVALID_SIZE: + - "`size == 0`" + - "If `size` is higher than the allocation size of `pSrc` or `pDst`" diff --git a/unified-runtime/source/adapters/opencl/usm.cpp b/unified-runtime/source/adapters/opencl/usm.cpp index f9cda1c1d4691..70195d07fe9bc 100644 --- a/unified-runtime/source/adapters/opencl/usm.cpp +++ b/unified-runtime/source/adapters/opencl/usm.cpp @@ -776,7 +776,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urUSMPoolTrimToExp(ur_context_handle_t, } UR_APIEXPORT ur_result_t UR_APICALL urUSMContextMemcpyExp(ur_context_handle_t, - const void *, void *, + void *, const void *, size_t) { return UR_RESULT_ERROR_UNSUPPORTED_FEATURE; } diff --git a/unified-runtime/source/loader/layers/validation/ur_valddi.cpp b/unified-runtime/source/loader/layers/validation/ur_valddi.cpp index aef744339ed09..e8079c2d0fc6e 100644 --- a/unified-runtime/source/loader/layers/validation/ur_valddi.cpp +++ b/unified-runtime/source/loader/layers/validation/ur_valddi.cpp @@ -10059,6 +10059,9 @@ __urdlllocal ur_result_t UR_APICALL urUSMContextMemcpyExp( if (NULL == pSrc) return UR_RESULT_ERROR_INVALID_NULL_POINTER; + + if (size == 0) + return UR_RESULT_ERROR_INVALID_SIZE; } if (getContext()->enableLifetimeValidation && diff --git a/unified-runtime/source/loader/ur_libapi.cpp b/unified-runtime/source/loader/ur_libapi.cpp index d82ba7573de11..08dd8a76b31ba 100644 --- a/unified-runtime/source/loader/ur_libapi.cpp +++ b/unified-runtime/source/loader/ur_libapi.cpp @@ -9893,7 +9893,8 @@ ur_result_t UR_APICALL urProgramLinkExp( } /////////////////////////////////////////////////////////////////////////////// -/// @brief Enable access to peer device memory +/// @brief Perform a synchronous, blocking memcpy operation between USM +/// allocations. /// /// @returns /// - ::UR_RESULT_SUCCESS @@ -9907,6 +9908,9 @@ ur_result_t UR_APICALL urProgramLinkExp( /// + `NULL == pSrc` /// - ::UR_RESULT_SUCCESS /// - ::UR_RESULT_ERROR_ADAPTER_SPECIFIC +/// - ::UR_RESULT_ERROR_INVALID_SIZE +/// + `size == 0` +/// + If `size` is higher than the allocation size of `pSrc` or `pDst` ur_result_t UR_APICALL urUSMContextMemcpyExp( /// [in] Context associated with the device(s) that own the allocations /// `pSrc` and `pDst`. diff --git a/unified-runtime/source/ur_api.cpp b/unified-runtime/source/ur_api.cpp index 870199696b2de..e30b80ffbf61b 100644 --- a/unified-runtime/source/ur_api.cpp +++ b/unified-runtime/source/ur_api.cpp @@ -8629,7 +8629,8 @@ ur_result_t UR_APICALL urProgramLinkExp( } /////////////////////////////////////////////////////////////////////////////// -/// @brief Enable access to peer device memory +/// @brief Perform a synchronous, blocking memcpy operation between USM +/// allocations. /// /// @returns /// - ::UR_RESULT_SUCCESS @@ -8643,6 +8644,9 @@ ur_result_t UR_APICALL urProgramLinkExp( /// + `NULL == pSrc` /// - ::UR_RESULT_SUCCESS /// - ::UR_RESULT_ERROR_ADAPTER_SPECIFIC +/// - ::UR_RESULT_ERROR_INVALID_SIZE +/// + `size == 0` +/// + If `size` is higher than the allocation size of `pSrc` or `pDst` ur_result_t UR_APICALL urUSMContextMemcpyExp( /// [in] Context associated with the device(s) that own the allocations /// `pSrc` and `pDst`. diff --git a/unified-runtime/test/conformance/CMakeLists.txt b/unified-runtime/test/conformance/CMakeLists.txt index 0ae75ad667103..1bfd78abdb1c5 100644 --- a/unified-runtime/test/conformance/CMakeLists.txt +++ b/unified-runtime/test/conformance/CMakeLists.txt @@ -103,6 +103,7 @@ add_subdirectory(event) add_subdirectory(queue) add_subdirectory(sampler) add_subdirectory(virtual_memory) +add_subdirectory(exp_usm_context_memcpy) set(TEST_SUBDIRECTORIES_DPCXX "device_code" diff --git a/unified-runtime/test/conformance/exp_usm_context_memcpy/CMakeLists.txt b/unified-runtime/test/conformance/exp_usm_context_memcpy/CMakeLists.txt new file mode 100644 index 0000000000000..cc43090ee8ed0 --- /dev/null +++ b/unified-runtime/test/conformance/exp_usm_context_memcpy/CMakeLists.txt @@ -0,0 +1,9 @@ +# Copyright (C) 2025 Intel Corporation +# Part of the Unified-Runtime Project, under the Apache License v2.0 with LLVM Exceptions. +# See LICENSE.TXT +# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +add_conformance_test_with_devices_environment( + exp_usm_context_memcpy + urUSMContextMemcpyExp.cpp +) diff --git a/unified-runtime/test/conformance/exp_usm_context_memcpy/urUSMContextMemcpyExp.cpp b/unified-runtime/test/conformance/exp_usm_context_memcpy/urUSMContextMemcpyExp.cpp new file mode 100644 index 0000000000000..0e4427a9151e3 --- /dev/null +++ b/unified-runtime/test/conformance/exp_usm_context_memcpy/urUSMContextMemcpyExp.cpp @@ -0,0 +1,164 @@ +// Copyright (C) 2025 Intel Corporation +// Part of the Unified-Runtime Project, under the Apache License v2.0 with LLVM +// Exceptions. See LICENSE.TXT +// +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +#include "uur/utils.h" +#include + +struct urUSMContextMemcpyExpTest : uur::urQueueTest { + void SetUp() override { + UUR_RETURN_ON_FATAL_FAILURE(urQueueTest::SetUp()); + + bool context_memcpy_support = false; + ASSERT_SUCCESS( + uur::GetUSMContextMemcpyExpSupport(device, context_memcpy_support)); + if (!context_memcpy_support) { + GTEST_SKIP() << "urUSMContextMemcpyExp is not supported"; + } + } + + void TearDown() override { + if (src_ptr) { + EXPECT_SUCCESS(urUSMFree(context, src_ptr)); + } + if (dst_ptr) { + EXPECT_SUCCESS(urUSMFree(context, dst_ptr)); + } + + UUR_RETURN_ON_FATAL_FAILURE(urQueueTest::TearDown()); + } + + void initAllocations() { + ASSERT_SUCCESS(urEnqueueUSMFill(queue, src_ptr, sizeof(memset_src_value), + &memset_src_value, allocation_size, 0, + nullptr, nullptr)); + ASSERT_SUCCESS(urEnqueueUSMFill(queue, dst_ptr, sizeof(memset_dst_value), + &memset_dst_value, allocation_size, 0, + nullptr, nullptr)); + ASSERT_SUCCESS(urQueueFinish(queue)); + } + + void verifyData() { + ASSERT_SUCCESS(urEnqueueUSMMemcpy(queue, true, &host_mem, dst_ptr, + allocation_size, 0, nullptr, nullptr)); + ASSERT_EQ(host_mem, memset_src_value); + } + + static constexpr size_t memset_src_value = 42; + static constexpr uint8_t memset_dst_value = 0; + static constexpr uint32_t allocation_size = sizeof(memset_src_value); + size_t host_mem = 0; + + void *src_ptr{nullptr}; + void *dst_ptr{nullptr}; +}; + +struct urUSMContextMemcpyExpTestDevice : urUSMContextMemcpyExpTest { + void SetUp() override { + UUR_RETURN_ON_FATAL_FAILURE(urUSMContextMemcpyExpTest::SetUp()); + + ur_device_usm_access_capability_flags_t device_usm = 0; + ASSERT_SUCCESS(uur::GetDeviceUSMDeviceSupport(device, device_usm)); + if (!device_usm) { + GTEST_SKIP() << "Device USM is not supported"; + } + + ASSERT_SUCCESS(urUSMDeviceAlloc(context, device, nullptr, nullptr, + allocation_size, + reinterpret_cast(&src_ptr))); + ASSERT_SUCCESS(urUSMDeviceAlloc(context, device, nullptr, nullptr, + allocation_size, + reinterpret_cast(&dst_ptr))); + initAllocations(); + } +}; + +UUR_INSTANTIATE_DEVICE_TEST_SUITE(urUSMContextMemcpyExpTestDevice); + +TEST_P(urUSMContextMemcpyExpTestDevice, Success) { + ASSERT_SUCCESS( + urUSMContextMemcpyExp(context, dst_ptr, src_ptr, allocation_size)); + verifyData(); +} + +// Arbitrarily do the negative tests with device allocations. These are mostly a +// test of the loader and validation layer anyway so no big deal if they don't +// run on all devices due to lack of support. +TEST_P(urUSMContextMemcpyExpTestDevice, InvalidNullContext) { + ASSERT_EQ_RESULT( + UR_RESULT_ERROR_INVALID_NULL_HANDLE, + urUSMContextMemcpyExp(nullptr, dst_ptr, src_ptr, allocation_size)); +} + +TEST_P(urUSMContextMemcpyExpTestDevice, InvalidNullPtrs) { + ASSERT_EQ_RESULT( + UR_RESULT_ERROR_INVALID_NULL_POINTER, + urUSMContextMemcpyExp(context, nullptr, src_ptr, allocation_size)); + ASSERT_EQ_RESULT( + UR_RESULT_ERROR_INVALID_NULL_POINTER, + urUSMContextMemcpyExp(context, dst_ptr, nullptr, allocation_size)); +} + +TEST_P(urUSMContextMemcpyExpTestDevice, InvalidZeroSize) { + ASSERT_EQ_RESULT(UR_RESULT_ERROR_INVALID_SIZE, + urUSMContextMemcpyExp(context, dst_ptr, src_ptr, 0)); +} + +struct urUSMContextMemcpyExpTestHost : urUSMContextMemcpyExpTest { + void SetUp() override { + UUR_RETURN_ON_FATAL_FAILURE(urUSMContextMemcpyExpTest::SetUp()); + + ur_device_usm_access_capability_flags_t host_usm = 0; + ASSERT_SUCCESS(uur::GetDeviceUSMHostSupport(device, host_usm)); + if (!host_usm) { + GTEST_SKIP() << "Device USM is not supported"; + } + + ASSERT_SUCCESS(urUSMHostAlloc(context, nullptr, nullptr, allocation_size, + reinterpret_cast(&src_ptr))); + ASSERT_SUCCESS(urUSMHostAlloc(context, nullptr, nullptr, allocation_size, + reinterpret_cast(&dst_ptr))); + initAllocations(); + } +}; + +UUR_INSTANTIATE_DEVICE_TEST_SUITE(urUSMContextMemcpyExpTestHost); + +TEST_P(urUSMContextMemcpyExpTestHost, Success) { + ASSERT_SUCCESS( + urUSMContextMemcpyExp(context, dst_ptr, src_ptr, allocation_size)); + verifyData(); +} + +struct urUSMContextMemcpyExpTestShared : urUSMContextMemcpyExpTest { + void SetUp() override { + UUR_RETURN_ON_FATAL_FAILURE(urUSMContextMemcpyExpTest::SetUp()); + + ur_device_usm_access_capability_flags_t shared_usm_single = 0; + + ASSERT_SUCCESS( + uur::GetDeviceUSMSingleSharedSupport(device, shared_usm_single)); + + if (!shared_usm_single) { + GTEST_SKIP() << "Shared USM is not supported by the device."; + } + + ASSERT_SUCCESS(urUSMSharedAlloc(context, device, nullptr, nullptr, + allocation_size, + reinterpret_cast(&src_ptr))); + ASSERT_SUCCESS(urUSMSharedAlloc(context, device, nullptr, nullptr, + allocation_size, + reinterpret_cast(&dst_ptr))); + initAllocations(); + } +}; + +UUR_INSTANTIATE_DEVICE_TEST_SUITE(urUSMContextMemcpyExpTestShared); + +TEST_P(urUSMContextMemcpyExpTestShared, Success) { + ASSERT_SUCCESS( + urUSMContextMemcpyExp(context, dst_ptr, src_ptr, allocation_size)); + verifyData(); +} diff --git a/unified-runtime/test/conformance/testing/include/uur/utils.h b/unified-runtime/test/conformance/testing/include/uur/utils.h index 13f9cdc985cb0..7bfa03985acae 100644 --- a/unified-runtime/test/conformance/testing/include/uur/utils.h +++ b/unified-runtime/test/conformance/testing/include/uur/utils.h @@ -412,6 +412,8 @@ ur_result_t GetDeviceHostPipeRWSupported(ur_device_handle_t device, bool &support); ur_result_t GetTimestampRecordingSupport(ur_device_handle_t device, bool &support); +ur_result_t GetUSMContextMemcpyExpSupport(ur_device_handle_t device, + bool &support); ur_device_partition_property_t makePartitionByCountsDesc(uint32_t count); diff --git a/unified-runtime/test/conformance/testing/source/utils.cpp b/unified-runtime/test/conformance/testing/source/utils.cpp index bdebcae5e1707..95fead2a1afdf 100644 --- a/unified-runtime/test/conformance/testing/source/utils.cpp +++ b/unified-runtime/test/conformance/testing/source/utils.cpp @@ -647,6 +647,12 @@ ur_result_t GetTimestampRecordingSupport(ur_device_handle_t device, device, UR_DEVICE_INFO_TIMESTAMP_RECORDING_SUPPORT_EXP, support); } +ur_result_t GetUSMContextMemcpyExpSupport(ur_device_handle_t device, + bool &support) { + return GetDeviceInfo( + device, UR_DEVICE_INFO_USM_CONTEXT_MEMCPY_SUPPORT_EXP, support); +} + ur_device_partition_property_t makePartitionByCountsDesc(uint32_t count) { ur_device_partition_property_t desc; desc.type = UR_DEVICE_PARTITION_BY_COUNTS; From acf53ada5bfd44df74eb1637653f3d38e039f73e Mon Sep 17 00:00:00 2001 From: Aaron Greig Date: Fri, 7 Mar 2025 12:07:58 +0000 Subject: [PATCH 03/14] Fix typo --- .../exp_usm_context_memcpy/urUSMContextMemcpyExp.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/unified-runtime/test/conformance/exp_usm_context_memcpy/urUSMContextMemcpyExp.cpp b/unified-runtime/test/conformance/exp_usm_context_memcpy/urUSMContextMemcpyExp.cpp index 0e4427a9151e3..3c776cebc06de 100644 --- a/unified-runtime/test/conformance/exp_usm_context_memcpy/urUSMContextMemcpyExp.cpp +++ b/unified-runtime/test/conformance/exp_usm_context_memcpy/urUSMContextMemcpyExp.cpp @@ -113,7 +113,7 @@ struct urUSMContextMemcpyExpTestHost : urUSMContextMemcpyExpTest { ur_device_usm_access_capability_flags_t host_usm = 0; ASSERT_SUCCESS(uur::GetDeviceUSMHostSupport(device, host_usm)); if (!host_usm) { - GTEST_SKIP() << "Device USM is not supported"; + GTEST_SKIP() << "Host USM is not supported"; } ASSERT_SUCCESS(urUSMHostAlloc(context, nullptr, nullptr, allocation_size, From 43a3123d10ebb798cf58bfad8ad6d53844ce0e3c Mon Sep 17 00:00:00 2001 From: Aaron Greig Date: Fri, 7 Mar 2025 12:26:56 +0000 Subject: [PATCH 04/14] Add missing newline --- unified-runtime/source/adapters/native_cpu/device.cpp | 1 + 1 file changed, 1 insertion(+) diff --git a/unified-runtime/source/adapters/native_cpu/device.cpp b/unified-runtime/source/adapters/native_cpu/device.cpp index 43232f5cbe828..9f2fd0620da16 100644 --- a/unified-runtime/source/adapters/native_cpu/device.cpp +++ b/unified-runtime/source/adapters/native_cpu/device.cpp @@ -451,6 +451,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urDeviceGetInfo(ur_device_handle_t hDevice, case UR_DEVICE_INFO_USM_CONTEXT_MEMCPY_SUPPORT_EXP: return ReturnValue(false); + default: DIE_NO_IMPLEMENTATION; } From 653c5b3599e1294cf53d870e36e2987d4f8cb841 Mon Sep 17 00:00:00 2001 From: Aaron Greig Date: Fri, 7 Mar 2025 12:48:26 +0000 Subject: [PATCH 05/14] Correct linkage of l0 implementation. --- unified-runtime/source/adapters/level_zero/usm.cpp | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/unified-runtime/source/adapters/level_zero/usm.cpp b/unified-runtime/source/adapters/level_zero/usm.cpp index 775f3a0686e03..2fc5c83e5bbe0 100644 --- a/unified-runtime/source/adapters/level_zero/usm.cpp +++ b/unified-runtime/source/adapters/level_zero/usm.cpp @@ -852,8 +852,9 @@ ur_result_t UR_APICALL urUSMPoolTrimToExp(ur_context_handle_t, return UR_RESULT_ERROR_UNSUPPORTED_FEATURE; } -UR_APIEXPORT ur_result_t UR_APICALL urUSMContextMemcpyExp( - ur_context_handle_t Context, void *pDst, const void *pSrc, size_t Size) { +ur_result_t UR_APICALL urUSMContextMemcpyExp(ur_context_handle_t Context, + void *pDst, const void *pSrc, + size_t Size) { // zeCommandListAppendMemoryCopy must not be called from simultaneous // threads with the same command list handle, so we need exclusive lock. std::scoped_lock Lock(Context->ImmediateCommandListMutex); From 3802c4fe0b36ee3907dc67f7abe2b504182cc683 Mon Sep 17 00:00:00 2001 From: Aaron Greig Date: Thu, 13 Mar 2025 15:12:22 +0000 Subject: [PATCH 06/14] Add missing entry for l0v2. --- unified-runtime/source/adapters/level_zero/v2/api.cpp | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/unified-runtime/source/adapters/level_zero/v2/api.cpp b/unified-runtime/source/adapters/level_zero/v2/api.cpp index 8fe4e245d0b71..d2a5c8ac285ad 100644 --- a/unified-runtime/source/adapters/level_zero/v2/api.cpp +++ b/unified-runtime/source/adapters/level_zero/v2/api.cpp @@ -252,4 +252,9 @@ ur_result_t UR_APICALL urUSMPoolTrimToExp(ur_context_handle_t hContext, return UR_RESULT_ERROR_UNSUPPORTED_FEATURE; } +ur_result_t UR_APICALL urUSMContextMemcpyExp(ur_context_handle_t, void *, + const void *, size_t) { + logger::error("{} function not implemented!", __FUNCTION__); + return UR_RESULT_ERROR_UNSUPPORTED_FEATURE; +} } // namespace ur::level_zero From 1a4e90cbe270f635cfedb5afbff3c312d343b17c Mon Sep 17 00:00:00 2001 From: Aaron Greig Date: Fri, 14 Mar 2025 11:49:14 +0000 Subject: [PATCH 07/14] Use new API to avoid temporary queue in ext_oneapi_get_device_global_address --- .../source/detail/device_global_map_entry.cpp | 42 +++++++++++++++++++ .../source/detail/device_global_map_entry.hpp | 5 +++ sycl/source/detail/kernel_bundle_impl.hpp | 22 ++++++---- sycl/source/detail/memory_manager.cpp | 10 +++++ sycl/source/detail/memory_manager.hpp | 3 ++ 5 files changed, 75 insertions(+), 7 deletions(-) diff --git a/sycl/source/detail/device_global_map_entry.cpp b/sycl/source/detail/device_global_map_entry.cpp index a097c778f034c..00dc0be91f3d6 100644 --- a/sycl/source/detail/device_global_map_entry.cpp +++ b/sycl/source/detail/device_global_map_entry.cpp @@ -90,6 +90,48 @@ DeviceGlobalUSMMem &DeviceGlobalMapEntry::getOrAllocateDeviceGlobalUSM( return NewAlloc; } +DeviceGlobalUSMMem & +DeviceGlobalMapEntry::getOrAllocateDeviceGlobalUSM(const context &Context) { + assert(!MIsDeviceImageScopeDecorated && + "USM allocations should not be acquired for device_global with " + "device_image_scope property."); + const std::shared_ptr &CtxImpl = getSyclObjImpl(Context); + const std::shared_ptr &DevImpl = + getSyclObjImpl(CtxImpl->getDevices().front()); + std::lock_guard Lock(MDeviceToUSMPtrMapMutex); + + auto DGUSMPtr = MDeviceToUSMPtrMap.find({DevImpl.get(), CtxImpl.get()}); + if (DGUSMPtr != MDeviceToUSMPtrMap.end()) + return DGUSMPtr->second; + + void *NewDGUSMPtr = detail::usm::alignedAllocInternal( + 0, MDeviceGlobalTSize, CtxImpl.get(), DevImpl.get(), + sycl::usm::alloc::device); + + auto NewAllocIt = MDeviceToUSMPtrMap.emplace( + std::piecewise_construct, + std::forward_as_tuple(DevImpl.get(), CtxImpl.get()), + std::forward_as_tuple(NewDGUSMPtr)); + assert(NewAllocIt.second && + "USM allocation for device and context already happened."); + DeviceGlobalUSMMem &NewAlloc = NewAllocIt.first->second; + + // C++ guarantees members appear in memory in the order they are declared, + // so since the member variable that contains the initial contents of the + // device_global is right after the usm_ptr member variable we can do + // some pointer arithmetic to memcopy over this value to the usm_ptr. This + // value inside of the device_global will be zero-initialized if it was not + // given a value on construction. + MemoryManager::context_copy_usm( + reinterpret_cast( + reinterpret_cast(MDeviceGlobalPtr) + + sizeof(MDeviceGlobalPtr)), + CtxImpl, MDeviceGlobalTSize, NewAlloc.MPtr); + + CtxImpl->addAssociatedDeviceGlobal(MDeviceGlobalPtr); + return NewAlloc; +} + void DeviceGlobalMapEntry::removeAssociatedResources( const context_impl *CtxImpl) { std::lock_guard Lock{MDeviceToUSMPtrMapMutex}; diff --git a/sycl/source/detail/device_global_map_entry.hpp b/sycl/source/detail/device_global_map_entry.hpp index fc4bae13161ed..ac43d4a724dcd 100644 --- a/sycl/source/detail/device_global_map_entry.hpp +++ b/sycl/source/detail/device_global_map_entry.hpp @@ -112,6 +112,11 @@ struct DeviceGlobalMapEntry { DeviceGlobalUSMMem & getOrAllocateDeviceGlobalUSM(const std::shared_ptr &QueueImpl); + // This overload allows the allocation to be initialized without a queue. The + // UR adapter in use must report true for + // UR_DEVICE_INFO_USM_CONTEXT_MEMCPY_SUPPORT_EXP to take advantage of this. + DeviceGlobalUSMMem &getOrAllocateDeviceGlobalUSM(const context &Context); + // Removes resources for device_globals associated with the context. void removeAssociatedResources(const context_impl *CtxImpl); diff --git a/sycl/source/detail/kernel_bundle_impl.hpp b/sycl/source/detail/kernel_bundle_impl.hpp index ffb2273c4a75c..5f515949f0d1e 100644 --- a/sycl/source/detail/kernel_bundle_impl.hpp +++ b/sycl/source/detail/kernel_bundle_impl.hpp @@ -891,13 +891,21 @@ class kernel_bundle_impl { "'device_image_scope' property"); } - // TODO: Add context-only initialization via `urUSMContextMemcpyExp` instead - // of using a throw-away queue. - queue InitQueue{MContext, Dev}; - auto &USMMem = - Entry->getOrAllocateDeviceGlobalUSM(getSyclObjImpl(InitQueue)); - InitQueue.wait_and_throw(); - return USMMem.getPtr(); + auto DeviceImpl = getSyclObjImpl(Dev); + bool SupportContextMemcpy = false; + DeviceImpl->getAdapter()->call( + DeviceImpl->getHandleRef(), + UR_DEVICE_INFO_USM_CONTEXT_MEMCPY_SUPPORT_EXP, + sizeof(SupportContextMemcpy), &SupportContextMemcpy, nullptr); + if (SupportContextMemcpy) { + return Entry->getOrAllocateDeviceGlobalUSM(MContext).getPtr(); + } else { + queue InitQueue{MContext, Dev}; + auto &USMMem = + Entry->getOrAllocateDeviceGlobalUSM(getSyclObjImpl(InitQueue)); + InitQueue.wait_and_throw(); + return USMMem.getPtr(); + } } size_t ext_oneapi_get_device_global_size(const std::string &Name) { diff --git a/sycl/source/detail/memory_manager.cpp b/sycl/source/detail/memory_manager.cpp index 261b97966ae40..3a3d14c9556b4 100644 --- a/sycl/source/detail/memory_manager.cpp +++ b/sycl/source/detail/memory_manager.cpp @@ -937,6 +937,16 @@ void MemoryManager::copy_usm(const void *SrcMem, QueueImplPtr SrcQueue, DepEvents.data(), OutEvent); } +void MemoryManager::context_copy_usm(const void *SrcMem, ContextImplPtr Context, + size_t Len, void *DstMem) { + if (!SrcMem || !DstMem) + throw exception(make_error_code(errc::invalid), + "NULL pointer argument in memory copy operation."); + const AdapterPtr &Adapter = Context->getAdapter(); + Adapter->call(Context->getHandleRef(), + DstMem, SrcMem, Len); +} + void MemoryManager::fill_usm(void *Mem, QueueImplPtr Queue, size_t Length, const std::vector &Pattern, std::vector DepEvents, diff --git a/sycl/source/detail/memory_manager.hpp b/sycl/source/detail/memory_manager.hpp index cc573abc62ddb..636a18f863545 100644 --- a/sycl/source/detail/memory_manager.hpp +++ b/sycl/source/detail/memory_manager.hpp @@ -143,6 +143,9 @@ class MemoryManager { ur_event_handle_t *OutEvent, const detail::EventImplPtr &OutEventImpl); + static void context_copy_usm(const void *SrcMem, ContextImplPtr Context, + size_t Len, void *DstMem); + static void fill_usm(void *DstMem, QueueImplPtr Queue, size_t Len, const std::vector &Pattern, std::vector DepEvents, From 874df408f113920bf92e71c723d41e98d65fb5bb Mon Sep 17 00:00:00 2001 From: Aaron Greig Date: Fri, 14 Mar 2025 15:46:32 +0000 Subject: [PATCH 08/14] Add l0 v2 implementation. --- .../source/adapters/level_zero/v2/api.cpp | 6 ------ .../source/adapters/level_zero/v2/usm.cpp | 16 ++++++++++++++++ 2 files changed, 16 insertions(+), 6 deletions(-) diff --git a/unified-runtime/source/adapters/level_zero/v2/api.cpp b/unified-runtime/source/adapters/level_zero/v2/api.cpp index d2a5c8ac285ad..445e2441f0e47 100644 --- a/unified-runtime/source/adapters/level_zero/v2/api.cpp +++ b/unified-runtime/source/adapters/level_zero/v2/api.cpp @@ -251,10 +251,4 @@ ur_result_t UR_APICALL urUSMPoolTrimToExp(ur_context_handle_t hContext, logger::error("{} function not implemented!", __FUNCTION__); return UR_RESULT_ERROR_UNSUPPORTED_FEATURE; } - -ur_result_t UR_APICALL urUSMContextMemcpyExp(ur_context_handle_t, void *, - const void *, size_t) { - logger::error("{} function not implemented!", __FUNCTION__); - return UR_RESULT_ERROR_UNSUPPORTED_FEATURE; -} } // namespace ur::level_zero diff --git a/unified-runtime/source/adapters/level_zero/v2/usm.cpp b/unified-runtime/source/adapters/level_zero/v2/usm.cpp index 1a2ca4e73e37d..a0eee5be8c915 100644 --- a/unified-runtime/source/adapters/level_zero/v2/usm.cpp +++ b/unified-runtime/source/adapters/level_zero/v2/usm.cpp @@ -515,4 +515,20 @@ ur_result_t urUSMReleaseExp(ur_context_handle_t hContext, void *hostPtr) { return UR_RESULT_SUCCESS; } +ur_result_t UR_APICALL urUSMContextMemcpyExp(ur_context_handle_t hContext, + void *pDst, const void *pSrc, + size_t size) { + ur_device_handle_t hDevice = hContext->getDevices()[0]; + auto commandList = hContext->getCommandListCache().getImmediateCommandList( + hDevice->ZeDevice, true, + hDevice + ->QueueGroup[ur_device_handle_t_::queue_group_info_t::type::Compute] + .ZeOrdinal, + true, ZE_COMMAND_QUEUE_MODE_SYNCHRONOUS, ZE_COMMAND_QUEUE_PRIORITY_NORMAL, + std::nullopt); + ZE2UR_CALL(zeCommandListAppendMemoryCopy, + (commandList.get(), pDst, pSrc, size, nullptr, 0, nullptr)); + return UR_RESULT_SUCCESS; +} + } // namespace ur::level_zero From 2fb988f731abe0dd59b6f7a5f899a51b8a162eea Mon Sep 17 00:00:00 2001 From: Aaron Greig Date: Fri, 14 Mar 2025 15:47:32 +0000 Subject: [PATCH 09/14] Add back deleted newline. --- unified-runtime/source/adapters/level_zero/v2/api.cpp | 1 + 1 file changed, 1 insertion(+) diff --git a/unified-runtime/source/adapters/level_zero/v2/api.cpp b/unified-runtime/source/adapters/level_zero/v2/api.cpp index 445e2441f0e47..8fe4e245d0b71 100644 --- a/unified-runtime/source/adapters/level_zero/v2/api.cpp +++ b/unified-runtime/source/adapters/level_zero/v2/api.cpp @@ -251,4 +251,5 @@ ur_result_t UR_APICALL urUSMPoolTrimToExp(ur_context_handle_t hContext, logger::error("{} function not implemented!", __FUNCTION__); return UR_RESULT_ERROR_UNSUPPORTED_FEATURE; } + } // namespace ur::level_zero From ed29b772dd48f68902b9ba0eb02197f707905049 Mon Sep 17 00:00:00 2001 From: Aaron Greig Date: Mon, 17 Mar 2025 14:25:45 +0000 Subject: [PATCH 10/14] Address review feedback. --- sycl/source/detail/kernel_bundle_impl.hpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sycl/source/detail/kernel_bundle_impl.hpp b/sycl/source/detail/kernel_bundle_impl.hpp index db68802518ef3..dcaa55515254d 100644 --- a/sycl/source/detail/kernel_bundle_impl.hpp +++ b/sycl/source/detail/kernel_bundle_impl.hpp @@ -891,7 +891,7 @@ class kernel_bundle_impl { "'device_image_scope' property"); } - auto DeviceImpl = getSyclObjImpl(Dev); + const auto &DeviceImpl = getSyclObjImpl(Dev); bool SupportContextMemcpy = false; DeviceImpl->getAdapter()->call( DeviceImpl->getHandleRef(), From e0340230dc0af930321a76816db85b188e7926ac Mon Sep 17 00:00:00 2001 From: Aaron Greig Date: Mon, 17 Mar 2025 14:33:18 +0000 Subject: [PATCH 11/14] Fix bad merge. --- unified-runtime/source/adapters/hip/device.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/unified-runtime/source/adapters/hip/device.cpp b/unified-runtime/source/adapters/hip/device.cpp index 0768434109269..25f6b7bdf6330 100644 --- a/unified-runtime/source/adapters/hip/device.cpp +++ b/unified-runtime/source/adapters/hip/device.cpp @@ -1076,7 +1076,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urDeviceGetInfo(ur_device_handle_t hDevice, return ReturnValue(false); case UR_DEVICE_INFO_COMMAND_BUFFER_SUBGRAPH_SUPPORT_EXP: return ReturnValue(true); - case UR_DEVICE_INFO_LOW_POWER_EVENTS_EXP: { + case UR_DEVICE_INFO_LOW_POWER_EVENTS_EXP: return ReturnValue(false); case UR_DEVICE_INFO_USE_NATIVE_ASSERT: return ReturnValue(true); From 27c9cb6b7f1e9efd5eecd6b751d7fc769efdb9b2 Mon Sep 17 00:00:00 2001 From: Aaron Greig Date: Mon, 17 Mar 2025 14:58:06 +0000 Subject: [PATCH 12/14] actually fix hip this time --- unified-runtime/source/adapters/hip/device.cpp | 2 -- 1 file changed, 2 deletions(-) diff --git a/unified-runtime/source/adapters/hip/device.cpp b/unified-runtime/source/adapters/hip/device.cpp index 25f6b7bdf6330..92b0144db609d 100644 --- a/unified-runtime/source/adapters/hip/device.cpp +++ b/unified-runtime/source/adapters/hip/device.cpp @@ -1070,8 +1070,6 @@ UR_APIEXPORT ur_result_t UR_APICALL urDeviceGetInfo(ur_device_handle_t hDevice, } case UR_DEVICE_INFO_COMMAND_BUFFER_EVENT_SUPPORT_EXP: return ReturnValue(false); - case UR_DEVICE_INFO_LOW_POWER_EVENTS_EXP: - return ReturnValue(false); case UR_DEVICE_INFO_USM_CONTEXT_MEMCPY_SUPPORT_EXP: return ReturnValue(false); case UR_DEVICE_INFO_COMMAND_BUFFER_SUBGRAPH_SUPPORT_EXP: From 6c2eb44a2f4e40b14417d39c58a5b4fc99e646c8 Mon Sep 17 00:00:00 2001 From: Aaron Greig Date: Tue, 3 Jun 2025 16:31:28 +0100 Subject: [PATCH 13/14] Correct for some recent updates. --- sycl/source/detail/kernel_bundle_impl.hpp | 2 +- unified-runtime/source/adapters/level_zero/v2/usm.cpp | 9 +++++---- 2 files changed, 6 insertions(+), 5 deletions(-) diff --git a/sycl/source/detail/kernel_bundle_impl.hpp b/sycl/source/detail/kernel_bundle_impl.hpp index bb5df0b1f4530..b0747f22c725d 100644 --- a/sycl/source/detail/kernel_bundle_impl.hpp +++ b/sycl/source/detail/kernel_bundle_impl.hpp @@ -676,7 +676,7 @@ class kernel_bundle_impl { } else { queue InitQueue{MContext, Dev}; auto &USMMem = - Entry->getOrAllocateDeviceGlobalUSM(getSyclObjImpl(InitQueue)); + Entry->getOrAllocateDeviceGlobalUSM(*getSyclObjImpl(InitQueue)); InitQueue.wait_and_throw(); return USMMem.getPtr(); } diff --git a/unified-runtime/source/adapters/level_zero/v2/usm.cpp b/unified-runtime/source/adapters/level_zero/v2/usm.cpp index 9b2079dd1373a..59073cfb2db3a 100644 --- a/unified-runtime/source/adapters/level_zero/v2/usm.cpp +++ b/unified-runtime/source/adapters/level_zero/v2/usm.cpp @@ -569,12 +569,13 @@ ur_result_t UR_APICALL urUSMContextMemcpyExp(ur_context_handle_t hContext, void *pDst, const void *pSrc, size_t size) { ur_device_handle_t hDevice = hContext->getDevices()[0]; - auto commandList = hContext->getCommandListCache().getImmediateCommandList( - hDevice->ZeDevice, true, + auto Ordinal = static_cast( hDevice ->QueueGroup[ur_device_handle_t_::queue_group_info_t::type::Compute] - .ZeOrdinal, - true, ZE_COMMAND_QUEUE_MODE_SYNCHRONOUS, ZE_COMMAND_QUEUE_PRIORITY_NORMAL, + .ZeOrdinal); + auto commandList = hContext->getCommandListCache().getImmediateCommandList( + hDevice->ZeDevice, {true, Ordinal, true}, + ZE_COMMAND_QUEUE_MODE_SYNCHRONOUS, ZE_COMMAND_QUEUE_PRIORITY_NORMAL, std::nullopt); ZE2UR_CALL(zeCommandListAppendMemoryCopy, (commandList.get(), pDst, pSrc, size, nullptr, 0, nullptr)); From bb64e3a1c1f9dfa3ee8f99743f07f3387d8499fc Mon Sep 17 00:00:00 2001 From: Aaron Greig Date: Wed, 4 Jun 2025 10:32:00 +0100 Subject: [PATCH 14/14] Fix test cmake. --- .../test/conformance/exp_usm_context_memcpy/CMakeLists.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/unified-runtime/test/conformance/exp_usm_context_memcpy/CMakeLists.txt b/unified-runtime/test/conformance/exp_usm_context_memcpy/CMakeLists.txt index e11b47e5367af..cd01ed84cef9c 100644 --- a/unified-runtime/test/conformance/exp_usm_context_memcpy/CMakeLists.txt +++ b/unified-runtime/test/conformance/exp_usm_context_memcpy/CMakeLists.txt @@ -3,7 +3,7 @@ # See LICENSE.TXT # SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -add_conformance_kernels_test( +add_conformance_devices_test( exp_usm_context_memcpy urUSMContextMemcpyExp.cpp )