From d7a18c187d087a83f6b7caccfca9379225694b06 Mon Sep 17 00:00:00 2001 From: Hugh Delaney Date: Wed, 22 May 2024 10:51:08 +0100 Subject: [PATCH 01/15] Add experimental interface for native enqueue Add initial API spec for experimental native enqueue. --- include/ur_api.h | 92 ++++++++++- include/ur_ddi.h | 12 ++ include/ur_print.h | 24 +++ include/ur_print.hpp | 156 ++++++++++++++++++ scripts/core/EXP-NATIVE-ENQUEUE.rst | 79 +++++++++ scripts/core/exp-native-enqueue.yml | 101 ++++++++++++ scripts/core/registry.yml | 3 + source/adapters/cuda/CMakeLists.txt | 1 + source/adapters/cuda/device.cpp | 4 + source/adapters/cuda/enqueue_native.cpp | 18 ++ source/adapters/null/ur_nullddi.cpp | 41 +++++ source/loader/layers/tracing/ur_trcddi.cpp | 56 +++++++ source/loader/layers/validation/ur_valddi.cpp | 79 ++++++++- source/loader/loader.def.in | 4 + source/loader/loader.map.in | 4 + source/loader/ur_ldrddi.cpp | 62 +++++++ source/loader/ur_libapi.cpp | 50 +++++- source/loader/ur_print.cpp | 24 +++ source/ur_api.cpp | 42 ++++- .../exp_enqueue_native/urEnqueueNative.cpp | 11 ++ tools/urinfo/urinfo.hpp | 3 + 21 files changed, 862 insertions(+), 4 deletions(-) create mode 100644 scripts/core/EXP-NATIVE-ENQUEUE.rst create mode 100644 scripts/core/exp-native-enqueue.yml create mode 100644 source/adapters/cuda/enqueue_native.cpp create mode 100644 test/conformance/exp_enqueue_native/urEnqueueNative.cpp diff --git a/include/ur_api.h b/include/ur_api.h index bd69372aa7..3c1b8f2052 100644 --- a/include/ur_api.h +++ b/include/ur_api.h @@ -225,6 +225,7 @@ typedef enum ur_function_t { UR_FUNCTION_KERNEL_GET_SUGGESTED_LOCAL_WORK_SIZE = 225, ///< Enumerator for ::urKernelGetSuggestedLocalWorkSize UR_FUNCTION_BINDLESS_IMAGES_IMPORT_EXTERNAL_MEMORY_EXP = 226, ///< Enumerator for ::urBindlessImagesImportExternalMemoryExp UR_FUNCTION_BINDLESS_IMAGES_IMPORT_EXTERNAL_SEMAPHORE_EXP = 227, ///< Enumerator for ::urBindlessImagesImportExternalSemaphoreExp + UR_FUNCTION_ENQUEUE_NATIVE_COMMAND_EXP = 
228, ///< Enumerator for ::urEnqueueNativeCommandExp /// @cond UR_FUNCTION_FORCE_UINT32 = 0x7fffffff /// @endcond @@ -1648,6 +1649,8 @@ typedef enum ur_device_info_t { UR_DEVICE_INFO_BINDLESS_SAMPLED_IMAGE_FETCH_3D_EXP = 0x2017, ///< [::ur_bool_t] returns true if the device is capable of fetching ///< non-USM backed 3D sampled image data. UR_DEVICE_INFO_TIMESTAMP_RECORDING_SUPPORT_EXP = 0x2018, ///< [::ur_bool_t] returns true if the device supports timestamp recording + UR_DEVICE_INFO_ENQUEUE_NATIVE_COMMAND_SUPPORT_EXP = 0x2020, ///< [::ur_bool_t] returns true if the device supports enqueueing of native + ///< work /// @cond UR_DEVICE_INFO_FORCE_UINT32 = 0x7fffffff /// @endcond @@ -1673,7 +1676,7 @@ typedef enum ur_device_info_t { /// - ::UR_RESULT_ERROR_INVALID_NULL_HANDLE /// + `NULL == hDevice` /// - ::UR_RESULT_ERROR_INVALID_ENUMERATION -/// + `::UR_DEVICE_INFO_TIMESTAMP_RECORDING_SUPPORT_EXP < propName` +/// + `::UR_DEVICE_INFO_ENQUEUE_NATIVE_COMMAND_SUPPORT_EXP < propName` /// - ::UR_RESULT_ERROR_UNSUPPORTED_ENUMERATION /// + If `propName` is not supported by the adapter. /// - ::UR_RESULT_ERROR_INVALID_SIZE @@ -5683,6 +5686,7 @@ typedef enum ur_command_t { UR_COMMAND_INTEROP_SEMAPHORE_WAIT_EXP = 0x2000, ///< Event created by ::urBindlessImagesWaitExternalSemaphoreExp UR_COMMAND_INTEROP_SEMAPHORE_SIGNAL_EXP = 0x2001, ///< Event created by ::urBindlessImagesSignalExternalSemaphoreExp UR_COMMAND_TIMESTAMP_RECORDING_EXP = 0x2002, ///< Event created by ::urEnqueueTimestampRecordingExp + UR_COMMAND_ENQUEUE_NATIVE_EXP = 0x2004, ///< Event created by ::urEnqueueNativeCommandExp /// @cond UR_COMMAND_FORCE_UINT32 = 0x7fffffff /// @endcond @@ -9497,6 +9501,78 @@ urUsmP2PPeerAccessGetInfoExp( size_t *pPropSizeRet ///< [out][optional] pointer to the actual size in bytes of the queried propName.
); +#if !defined(__GNUC__) +#pragma endregion +#endif +// Intel 'oneAPI' Unified Runtime Experimental API for enqueuing work through native APIs +#if !defined(__GNUC__) +#pragma region native enqueue(experimental) +#endif +/////////////////////////////////////////////////////////////////////////////// +/// @brief Native enqueue properties +typedef uint32_t ur_exp_enqueue_native_command_flags_t; +typedef enum ur_exp_enqueue_native_command_flag_t { + UR_EXP_ENQUEUE_NATIVE_COMMAND_FLAG_TBD = UR_BIT(0), ///< reserved for future use. + /// @cond + UR_EXP_ENQUEUE_NATIVE_COMMAND_FLAG_FORCE_UINT32 = 0x7fffffff + /// @endcond + +} ur_exp_enqueue_native_command_flag_t; +/// @brief Bit Mask for validating ur_exp_enqueue_native_command_flags_t +#define UR_EXP_ENQUEUE_NATIVE_COMMAND_FLAGS_MASK 0xfffffffe + +/////////////////////////////////////////////////////////////////////////////// +/// @brief Native enqueue properties +typedef struct ur_exp_enqueue_native_command_properties_t { + ur_structure_type_t stype; ///< [in] type of this structure, must be + ///< ::UR_STRUCTURE_TYPE_EXP_ENQUEUE_NATIVE_COMMAND_PROPERTIES + void *pNext; ///< [in,out][optional] pointer to extension-specific structure + ur_exp_enqueue_native_command_flags_t flags; ///< [in] native enqueue flags + +} ur_exp_enqueue_native_command_properties_t; + +/////////////////////////////////////////////////////////////////////////////// +/// @brief Function enqueueing work through the native API to be executed +/// immediately. 
+typedef void (*ur_exp_enqueue_native_command_function_t)( + ur_queue_handle_t hQueue, ///< [in] handle of the queue object + void *pUserData ///< [in][out] pointer to data to be passed to callback +); + +/////////////////////////////////////////////////////////////////////////////// +/// @brief Immediately enqueue work through a native backend API +/// +/// @returns +/// - ::UR_RESULT_SUCCESS +/// - ::UR_RESULT_ERROR_UNINITIALIZED +/// - ::UR_RESULT_ERROR_DEVICE_LOST +/// - ::UR_RESULT_ERROR_ADAPTER_SPECIFIC +/// - ::UR_RESULT_ERROR_INVALID_NULL_HANDLE +/// + `NULL == hQueue` +/// - ::UR_RESULT_ERROR_INVALID_NULL_POINTER +/// + `NULL == pfnNativeEnqueue` +/// + `NULL == data` +/// + `NULL == pProperties` +/// + `NULL == phEvent` +/// - ::UR_RESULT_ERROR_INVALID_ENUMERATION +/// + `::UR_EXP_ENQUEUE_NATIVE_COMMAND_FLAGS_MASK & pProperties->flags` +/// - ::UR_RESULT_ERROR_INVALID_EVENT_WAIT_LIST +UR_APIEXPORT ur_result_t UR_APICALL +urEnqueueNativeCommandExp( + ur_queue_handle_t hQueue, ///< [in] handle of the queue object + ur_exp_enqueue_native_command_function_t pfnNativeEnqueue, ///< [in] function calling the native underlying API, to be executed + ///< immediately. + void *data, ///< [in] data used by pfnNativeEnqueue + const ur_exp_enqueue_native_command_properties_t *pProperties, ///< [in] pointer to the native enqueue properties + uint32_t numEventsInWaitList, ///< [in] size of the event wait list + const ur_event_handle_t *phEventWaitList, ///< [in][optional][range(0, numEventsInWaitList)] pointer to a list of + ///< events that must be complete before the kernel execution. + ///< If nullptr, the numEventsInWaitList must be 0, indicating no wait + ///< events. + ur_event_handle_t *phEvent ///< [in,out] return an event object that identifies the work that has + ///< been enqueued in nativeEnqueueFunc. 
+); + #if !defined(__GNUC__) #pragma endregion #endif @@ -10916,6 +10992,20 @@ typedef struct ur_enqueue_timestamp_recording_exp_params_t { ur_event_handle_t **pphEvent; } ur_enqueue_timestamp_recording_exp_params_t; +/////////////////////////////////////////////////////////////////////////////// +/// @brief Function parameters for urEnqueueNativeCommandExp +/// @details Each entry is a pointer to the parameter passed to the function; +/// allowing the callback the ability to modify the parameter's value +typedef struct ur_enqueue_native_command_exp_params_t { + ur_queue_handle_t *phQueue; + ur_exp_enqueue_native_command_function_t *ppfnNativeEnqueue; + void **pdata; + const ur_exp_enqueue_native_command_properties_t **ppProperties; + uint32_t *pnumEventsInWaitList; + const ur_event_handle_t **pphEventWaitList; + ur_event_handle_t **pphEvent; +} ur_enqueue_native_command_exp_params_t; + /////////////////////////////////////////////////////////////////////////////// /// @brief Function parameters for urBindlessImagesUnsampledImageHandleDestroyExp /// @details Each entry is a pointer to the parameter passed to the function; diff --git a/include/ur_ddi.h b/include/ur_ddi.h index d429b02d68..ec8a879e0d 100644 --- a/include/ur_ddi.h +++ b/include/ur_ddi.h @@ -1483,12 +1483,24 @@ typedef ur_result_t(UR_APICALL *ur_pfnEnqueueTimestampRecordingExp_t)( const ur_event_handle_t *, ur_event_handle_t *); +/////////////////////////////////////////////////////////////////////////////// +/// @brief Function-pointer for urEnqueueNativeCommandExp +typedef ur_result_t(UR_APICALL *ur_pfnEnqueueNativeCommandExp_t)( + ur_queue_handle_t, + ur_exp_enqueue_native_command_function_t, + void *, + const ur_exp_enqueue_native_command_properties_t *, + uint32_t, + const ur_event_handle_t *, + ur_event_handle_t *); + /////////////////////////////////////////////////////////////////////////////// /// @brief Table of EnqueueExp functions pointers typedef struct ur_enqueue_exp_dditable_t { 
ur_pfnEnqueueKernelLaunchCustomExp_t pfnKernelLaunchCustomExp; ur_pfnEnqueueCooperativeKernelLaunchExp_t pfnCooperativeKernelLaunchExp; ur_pfnEnqueueTimestampRecordingExp_t pfnTimestampRecordingExp; + ur_pfnEnqueueNativeCommandExp_t pfnNativeCommandExp; } ur_enqueue_exp_dditable_t; /////////////////////////////////////////////////////////////////////////////// diff --git a/include/ur_print.h b/include/ur_print.h index b72e939f05..60aa71f03b 100644 --- a/include/ur_print.h +++ b/include/ur_print.h @@ -1042,6 +1042,22 @@ UR_APIEXPORT ur_result_t UR_APICALL urPrintExpLaunchProperty(const struct ur_exp /// - `buff_size < out_size` UR_APIEXPORT ur_result_t UR_APICALL urPrintExpPeerInfo(enum ur_exp_peer_info_t value, char *buffer, const size_t buff_size, size_t *out_size); +/////////////////////////////////////////////////////////////////////////////// +/// @brief Print ur_exp_enqueue_native_command_flag_t enum +/// @returns +/// - ::UR_RESULT_SUCCESS +/// - ::UR_RESULT_ERROR_INVALID_SIZE +/// - `buff_size < out_size` +UR_APIEXPORT ur_result_t UR_APICALL urPrintExpEnqueueNativeCommandFlags(enum ur_exp_enqueue_native_command_flag_t value, char *buffer, const size_t buff_size, size_t *out_size); + +/////////////////////////////////////////////////////////////////////////////// +/// @brief Print ur_exp_enqueue_native_command_properties_t struct +/// @returns +/// - ::UR_RESULT_SUCCESS +/// - ::UR_RESULT_ERROR_INVALID_SIZE +/// - `buff_size < out_size` +UR_APIEXPORT ur_result_t UR_APICALL urPrintExpEnqueueNativeCommandProperties(const struct ur_exp_enqueue_native_command_properties_t params, char *buffer, const size_t buff_size, size_t *out_size); + /////////////////////////////////////////////////////////////////////////////// /// @brief Print ur_loader_config_create_params_t struct /// @returns @@ -2010,6 +2026,14 @@ UR_APIEXPORT ur_result_t UR_APICALL urPrintEnqueueCooperativeKernelLaunchExpPara /// - `buff_size < out_size` UR_APIEXPORT ur_result_t UR_APICALL 
urPrintEnqueueTimestampRecordingExpParams(const struct ur_enqueue_timestamp_recording_exp_params_t *params, char *buffer, const size_t buff_size, size_t *out_size); +/////////////////////////////////////////////////////////////////////////////// +/// @brief Print ur_enqueue_native_command_exp_params_t struct +/// @returns +/// - ::UR_RESULT_SUCCESS +/// - ::UR_RESULT_ERROR_INVALID_SIZE +/// - `buff_size < out_size` +UR_APIEXPORT ur_result_t UR_APICALL urPrintEnqueueNativeCommandExpParams(const struct ur_enqueue_native_command_exp_params_t *params, char *buffer, const size_t buff_size, size_t *out_size); + /////////////////////////////////////////////////////////////////////////////// /// @brief Print ur_bindless_images_unsampled_image_handle_destroy_exp_params_t struct /// @returns diff --git a/include/ur_print.hpp b/include/ur_print.hpp index f4e886e36b..e6eea16c04 100644 --- a/include/ur_print.hpp +++ b/include/ur_print.hpp @@ -215,6 +215,9 @@ inline ur_result_t printUnion( template <> inline ur_result_t printTagged(std::ostream &os, const void *ptr, ur_exp_peer_info_t value, size_t size); +template <> +inline ur_result_t printFlag<ur_exp_enqueue_native_command_flag_t>(std::ostream &os, uint32_t flag); + } // namespace ur::details inline std::ostream &operator<<(std::ostream &os, enum ur_function_t value); @@ -345,6 +348,8 @@ inline std::ostream &operator<<(std::ostream &os, [[maybe_unused]] const struct inline std::ostream &operator<<(std::ostream &os, enum ur_exp_launch_property_id_t value); inline std::ostream &operator<<(std::ostream &os, [[maybe_unused]] const struct ur_exp_launch_property_t params); inline std::ostream &operator<<(std::ostream &os, enum ur_exp_peer_info_t value); +inline std::ostream &operator<<(std::ostream &os, enum ur_exp_enqueue_native_command_flag_t value); +inline std::ostream &operator<<(std::ostream &os, [[maybe_unused]] const struct ur_exp_enqueue_native_command_properties_t params); /////////////////////////////////////////////////////////////////////////////// ///
@brief Print operator for the ur_function_t type @@ -934,6 +939,9 @@ inline std::ostream &operator<<(std::ostream &os, enum ur_function_t value) { case UR_FUNCTION_BINDLESS_IMAGES_IMPORT_EXTERNAL_SEMAPHORE_EXP: os << "UR_FUNCTION_BINDLESS_IMAGES_IMPORT_EXTERNAL_SEMAPHORE_EXP"; break; + case UR_FUNCTION_ENQUEUE_NATIVE_COMMAND_EXP: + os << "UR_FUNCTION_ENQUEUE_NATIVE_COMMAND_EXP"; + break; default: os << "unknown enumerator"; break; @@ -2589,6 +2597,9 @@ inline std::ostream &operator<<(std::ostream &os, enum ur_device_info_t value) { case UR_DEVICE_INFO_TIMESTAMP_RECORDING_SUPPORT_EXP: os << "UR_DEVICE_INFO_TIMESTAMP_RECORDING_SUPPORT_EXP"; break; + case UR_DEVICE_INFO_ENQUEUE_NATIVE_COMMAND_SUPPORT_EXP: + os << "UR_DEVICE_INFO_ENQUEUE_NATIVE_COMMAND_SUPPORT_EXP"; + break; default: os << "unknown enumerator"; break; @@ -4310,6 +4321,18 @@ inline ur_result_t printTagged(std::ostream &os, const void *ptr, ur_device_info os << ")"; } break; + case UR_DEVICE_INFO_ENQUEUE_NATIVE_COMMAND_SUPPORT_EXP: { + const ur_bool_t *tptr = (const ur_bool_t *)ptr; + if (sizeof(ur_bool_t) > size) { + os << "invalid size (is: " << size << ", expected: >=" << sizeof(ur_bool_t) << ")"; + return UR_RESULT_ERROR_INVALID_SIZE; + } + os << (const void *)(tptr) << " ("; + + os << *tptr; + + os << ")"; + } break; default: os << "unknown enumerator"; return UR_RESULT_ERROR_INVALID_ENUMERATION; @@ -8815,6 +8838,9 @@ inline std::ostream &operator<<(std::ostream &os, enum ur_command_t value) { case UR_COMMAND_TIMESTAMP_RECORDING_EXP: os << "UR_COMMAND_TIMESTAMP_RECORDING_EXP"; break; + case UR_COMMAND_ENQUEUE_NATIVE_EXP: + os << "UR_COMMAND_ENQUEUE_NATIVE_EXP"; + break; default: os << "unknown enumerator"; break; @@ -10023,6 +10049,78 @@ inline ur_result_t printTagged(std::ostream &os, const void *ptr, ur_exp_peer_in } } // namespace ur::details +/////////////////////////////////////////////////////////////////////////////// +/// @brief Print operator for the ur_exp_enqueue_native_command_flag_t 
type +/// @returns +/// std::ostream & +inline std::ostream &operator<<(std::ostream &os, enum ur_exp_enqueue_native_command_flag_t value) { + switch (value) { + case UR_EXP_ENQUEUE_NATIVE_COMMAND_FLAG_TBD: + os << "UR_EXP_ENQUEUE_NATIVE_COMMAND_FLAG_TBD"; + break; + default: + os << "unknown enumerator"; + break; + } + return os; +} + +namespace ur::details { +/////////////////////////////////////////////////////////////////////////////// +/// @brief Print ur_exp_enqueue_native_command_flag_t flag +template <> +inline ur_result_t printFlag<ur_exp_enqueue_native_command_flag_t>(std::ostream &os, uint32_t flag) { + uint32_t val = flag; + bool first = true; + + if ((val & UR_EXP_ENQUEUE_NATIVE_COMMAND_FLAG_TBD) == (uint32_t)UR_EXP_ENQUEUE_NATIVE_COMMAND_FLAG_TBD) { + val ^= (uint32_t)UR_EXP_ENQUEUE_NATIVE_COMMAND_FLAG_TBD; + if (!first) { + os << " | "; + } else { + first = false; + } + os << UR_EXP_ENQUEUE_NATIVE_COMMAND_FLAG_TBD; + } + if (val != 0) { + std::bitset<32> bits(val); + if (!first) { + os << " | "; + } + os << "unknown bit flags " << bits; + } else if (first) { + os << "0"; + } + return UR_RESULT_SUCCESS; +} +} // namespace ur::details +/////////////////////////////////////////////////////////////////////////////// +/// @brief Print operator for the ur_exp_enqueue_native_command_properties_t type +/// @returns +/// std::ostream & +inline std::ostream &operator<<(std::ostream &os, const struct ur_exp_enqueue_native_command_properties_t params) { + os << "(struct ur_exp_enqueue_native_command_properties_t){"; + + os << ".stype = "; + + os << (params.stype); + + os << ", "; + os << ".pNext = "; + + ur::details::printStruct(os, + (params.pNext)); + + os << ", "; + os << ".flags = "; + + ur::details::printFlag<ur_exp_enqueue_native_command_flag_t>(os, + (params.flags)); + + os << "}"; + return os; +} + /////////////////////////////////////////////////////////////////////////////// /// @brief Print operator for the ur_loader_config_create_params_t type /// @returns @@ -14418,6 +14516,61 @@ inline std::ostream &operator<<(std::ostream
&os, [[maybe_unused]] const struct return os; } +/////////////////////////////////////////////////////////////////////////////// +/// @brief Print operator for the ur_enqueue_native_command_exp_params_t type +/// @returns +/// std::ostream & +inline std::ostream &operator<<(std::ostream &os, [[maybe_unused]] const struct ur_enqueue_native_command_exp_params_t *params) { + + os << ".hQueue = "; + + ur::details::printPtr(os, + *(params->phQueue)); + + os << ", "; + os << ".pfnNativeEnqueue = "; + + os << reinterpret_cast<void *>( + *(params->ppfnNativeEnqueue)); + + os << ", "; + os << ".data = "; + + ur::details::printPtr(os, + *(params->pdata)); + + os << ", "; + os << ".pProperties = "; + + ur::details::printPtr(os, + *(params->ppProperties)); + + os << ", "; + os << ".numEventsInWaitList = "; + + os << *(params->pnumEventsInWaitList); + + os << ", "; + os << ".phEventWaitList = {"; + for (size_t i = 0; *(params->pphEventWaitList) != NULL && i < *params->pnumEventsInWaitList; ++i) { + if (i != 0) { + os << ", "; + } + + ur::details::printPtr(os, + (*(params->pphEventWaitList))[i]); + } + os << "}"; + + os << ", "; + os << ".phEvent = "; + + ur::details::printPtr(os, + *(params->pphEvent)); + + return os; +} + /////////////////////////////////////////////////////////////////////////////// /// @brief Print operator for the ur_bindless_images_unsampled_image_handle_destroy_exp_params_t type /// @returns @@ -17467,6 +17620,9 @@ inline ur_result_t UR_APICALL printFunctionParams(std::ostream &os, ur_function_ case UR_FUNCTION_ENQUEUE_TIMESTAMP_RECORDING_EXP: { os << (const struct ur_enqueue_timestamp_recording_exp_params_t *)params; } break; + case UR_FUNCTION_ENQUEUE_NATIVE_COMMAND_EXP: { + os << (const struct ur_enqueue_native_command_exp_params_t *)params; + } break; case UR_FUNCTION_BINDLESS_IMAGES_UNSAMPLED_IMAGE_HANDLE_DESTROY_EXP: { os << (const struct ur_bindless_images_unsampled_image_handle_destroy_exp_params_t *)params; } break; diff --git
a/scripts/core/EXP-NATIVE-ENQUEUE.rst b/scripts/core/EXP-NATIVE-ENQUEUE.rst new file mode 100644 index 0000000000..ca5c96f72c --- /dev/null +++ b/scripts/core/EXP-NATIVE-ENQUEUE.rst @@ -0,0 +1,79 @@ +<% + OneApi=tags['$OneApi'] + x=tags['$x'] + X=x.upper() +%> + +.. _experimental-enqueue-native-command: + +================================================================================ +Enqueue Native Command +================================================================================ + +.. warning:: + + Experimental features: + + * May be replaced, updated, or removed at any time. + * Do not require maintaining API/ABI stability of their own additions over + time. + * Do not require conformance testing of their own additions. + + +Motivation +-------------------------------------------------------------------------------- +Interop is an important use case for many programming APIs. Through +${x}EnqueueNativeCommandExp the user can immediately invoke some native API +calls in a way that the UR is aware of. In doing so, the UR adapter can +integrate its own scheduling of UR commands with native commands. + +In order for UR to guarantee correct synchronization of commands enqueued +within the native API through the function passed to +${x}EnqueueNativeCommandExp, the function argument must only use the native +queue accessed through ${x}QueueGetNativeHandle. Use of a native queue that is +not the native queue returned by ${x}QueueGetNativeHandle results in undefined +behaviour.
+ +API +-------------------------------------------------------------------------------- + +Enums +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +* ${x}_device_info_t + * ${X}_DEVICE_INFO_ENQUEUE_NATIVE_COMMAND_SUPPORT_EXP +* ${x}_command_t + * ${X}_COMMAND_ENQUEUE_NATIVE_EXP +* ${x}_exp_enqueue_native_command_flags_t + +Types +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +* ${x}_exp_enqueue_native_command_properties_t +* ${x}_exp_enqueue_native_command_function_t + +Functions +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +* ${x}EnqueueNativeCommandExp + +Changelog +-------------------------------------------------------------------------------- + ++-----------+------------------------+ +| Revision | Changes | ++===========+========================+ +| 1.0 | Initial Draft | ++-----------+------------------------+ + + +Support +-------------------------------------------------------------------------------- + +Adapters which support this experimental feature *must* return true for the new +`${X}_DEVICE_INFO_ENQUEUE_NATIVE_COMMAND_SUPPORT_EXP` device info query. + + +Contributors +-------------------------------------------------------------------------------- + +* Hugh Delaney `hugh.delaney@codeplay.com <hugh.delaney@codeplay.com>`_ diff --git a/scripts/core/exp-native-enqueue.yml b/scripts/core/exp-native-enqueue.yml new file mode 100644 index 0000000000..59ee49e183 --- /dev/null +++ b/scripts/core/exp-native-enqueue.yml @@ -0,0 +1,101 @@ +# +# Copyright (C) 2024 Intel Corporation +# +# Part of the Unified-Runtime Project, under the Apache License v2.0 with LLVM Exceptions.
+# See LICENSE.TXT +# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +# +# See YaML.md for syntax definition +# +--- #-------------------------------------------------------------------------- +type: header +desc: "Intel $OneApi Unified Runtime Experimental API for enqueuing work through native APIs" +ordinal: "100" + +--- #-------------------------------------------------------------------------- +type: enum +extend: true +typed_etors: true +desc: "Extension enums to $x_device_info_t to support native enqueue." +name: $x_device_info_t +etors: + - name: ENQUEUE_NATIVE_COMMAND_SUPPORT_EXP + value: "0x2020" + desc: "[$x_bool_t] returns true if the device supports enqueueing of native work" + +--- #-------------------------------------------------------------------------- +type: enum +extend: true +desc: "Command Type experimental enumerations." +name: $x_command_t +etors: + - name: ENQUEUE_NATIVE_EXP + value: "0x2004" + desc: Event created by $xEnqueueNativeCommandExp + +--- #-------------------------------------------------------------------------- +type: enum +desc: "Native enqueue flags" +name: $x_exp_enqueue_native_command_flags_t +etors: + - name: TBD + desc: "reserved for future use." + +--- #-------------------------------------------------------------------------- +type: struct +desc: "Native enqueue properties" +name: $x_exp_enqueue_native_command_properties_t +base: $x_base_properties_t +members: + - type: $x_exp_enqueue_native_command_flags_t + name: flags + desc: "[in] native enqueue flags" + +--- #-------------------------------------------------------------------------- +type: fptr_typedef +desc: "Function enqueueing work through the native API to be executed immediately."
+name: $x_exp_enqueue_native_command_function_t +return: void +params: + - type: $x_queue_handle_t + name: hQueue + desc: "[in] handle of the queue object" + - type: void* + name: pUserData + desc: "[in,out] pointer to data to be passed to callback" + +--- #-------------------------------------------------------------------------- +type: function +desc: "Immediately enqueue work through a native backend API" +class: $xEnqueue +name: NativeCommandExp +params: + - type: $x_queue_handle_t + name: hQueue + desc: "[in] handle of the queue object" + - type: $x_exp_enqueue_native_command_function_t + desc: "[in] function calling the native underlying API, to be executed immediately." + name: pfnNativeEnqueue + - type: void* + name: data + desc: "[in] data used by pfnNativeEnqueue" + - type: const $x_exp_enqueue_native_command_properties_t* + name: pProperties + desc: "[in] pointer to the native enqueue properties" + - type: uint32_t + name: numEventsInWaitList + desc: "[in] size of the event wait list" + - type: const $x_event_handle_t* + name: phEventWaitList + desc: | + [in][optional][range(0, numEventsInWaitList)] pointer to a list of events that must be complete before the native command execution. + If nullptr, the numEventsInWaitList must be 0, indicating no wait events. + - type: $x_event_handle_t* + name: phEvent + desc: | + [in,out] return an event object that identifies the work that has + been enqueued in pfnNativeEnqueue.
+returns: + - $X_RESULT_ERROR_INVALID_NULL_HANDLE + - $X_RESULT_ERROR_INVALID_NULL_POINTER + - $X_RESULT_ERROR_INVALID_EVENT_WAIT_LIST diff --git a/scripts/core/registry.yml b/scripts/core/registry.yml index 73f1a107d0..8157bbb08a 100644 --- a/scripts/core/registry.yml +++ b/scripts/core/registry.yml @@ -589,6 +589,9 @@ etors: - name: BINDLESS_IMAGES_IMPORT_EXTERNAL_SEMAPHORE_EXP desc: Enumerator for $xBindlessImagesImportExternalSemaphoreExp value: '227' +- name: ENQUEUE_NATIVE_COMMAND_EXP + desc: Enumerator for $xEnqueueNativeCommandExp + value: '228' --- type: enum desc: Defines structure types diff --git a/source/adapters/cuda/CMakeLists.txt b/source/adapters/cuda/CMakeLists.txt index baa67e5961..b3afb74329 100644 --- a/source/adapters/cuda/CMakeLists.txt +++ b/source/adapters/cuda/CMakeLists.txt @@ -19,6 +19,7 @@ add_ur_adapter(${TARGET_NAME} ${CMAKE_CURRENT_SOURCE_DIR}/device.hpp ${CMAKE_CURRENT_SOURCE_DIR}/device.cpp ${CMAKE_CURRENT_SOURCE_DIR}/enqueue.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/enqueue_native.cpp ${CMAKE_CURRENT_SOURCE_DIR}/event.hpp ${CMAKE_CURRENT_SOURCE_DIR}/event.cpp ${CMAKE_CURRENT_SOURCE_DIR}/image.hpp diff --git a/source/adapters/cuda/device.cpp b/source/adapters/cuda/device.cpp index b0e0a8b2d0..bd15a62504 100644 --- a/source/adapters/cuda/device.cpp +++ b/source/adapters/cuda/device.cpp @@ -951,6 +951,10 @@ UR_APIEXPORT ur_result_t UR_APICALL urDeviceGetInfo(ur_device_handle_t hDevice, // CUDA supports recording timestamp events. 
return ReturnValue(true); } + case UR_DEVICE_INFO_ENQUEUE_NATIVE_COMMAND_SUPPORT_EXP: { + // CUDA supports enqueueing native work through urEnqueueNativeCommandExp + return ReturnValue(true); + } case UR_DEVICE_INFO_DEVICE_ID: { int Value = 0; UR_CHECK_ERROR(cuDeviceGetAttribute( diff --git a/source/adapters/cuda/enqueue_native.cpp b/source/adapters/cuda/enqueue_native.cpp new file mode 100644 index 0000000000..65a7e73caf --- /dev/null +++ b/source/adapters/cuda/enqueue_native.cpp @@ -0,0 +1,18 @@ +//===--------- enqueue_native.cpp - CUDA Adapter --------------------------===// +// +// Copyright (C) 2024 Intel Corporation +// +// Part of the Unified-Runtime Project, under the Apache License v2.0 with LLVM +// Exceptions. See LICENSE.TXT +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include <ur_api.h> + +ur_result_t urNativeEnqueueExp(ur_queue_handle_t, + ur_exp_enqueue_native_command_function_t, void *, + uint32_t, const ur_event_handle_t *, + ur_event_handle_t *) { + return UR_RESULT_ERROR_UNSUPPORTED_FEATURE; +} diff --git a/source/adapters/null/ur_nullddi.cpp b/source/adapters/null/ur_nullddi.cpp index 4a87ecdfce..c57ddca619 100644 --- a/source/adapters/null/ur_nullddi.cpp +++ b/source/adapters/null/ur_nullddi.cpp @@ -5860,6 +5860,45 @@ __urdlllocal ur_result_t UR_APICALL urUsmP2PPeerAccessGetInfoExp( return exceptionToResult(std::current_exception()); } +/////////////////////////////////////////////////////////////////////////////// +/// @brief Intercept function for urEnqueueNativeCommandExp +__urdlllocal ur_result_t UR_APICALL urEnqueueNativeCommandExp( + ur_queue_handle_t hQueue, ///< [in] handle of the queue object + ur_exp_enqueue_native_command_function_t + pfnNativeEnqueue, ///< [in] function calling the native underlying API, to be executed + ///< immediately.
+ void *data, ///< [in] data used by pfnNativeEnqueue + const ur_exp_enqueue_native_command_properties_t + *pProperties, ///< [in] pointer to the native enqueue properties + uint32_t numEventsInWaitList, ///< [in] size of the event wait list + const ur_event_handle_t * + phEventWaitList, ///< [in][optional][range(0, numEventsInWaitList)] pointer to a list of + ///< events that must be complete before the kernel execution. + ///< If nullptr, the numEventsInWaitList must be 0, indicating no wait + ///< events. + ur_event_handle_t * + phEvent ///< [in,out] return an event object that identifies the work that has + ///< been enqueued in nativeEnqueueFunc. + ) try { + ur_result_t result = UR_RESULT_SUCCESS; + + // if the driver has created a custom function, then call it instead of using the generic path + auto pfnNativeCommandExp = + d_context.urDdiTable.EnqueueExp.pfnNativeCommandExp; + if (nullptr != pfnNativeCommandExp) { + result = + pfnNativeCommandExp(hQueue, pfnNativeEnqueue, data, pProperties, + numEventsInWaitList, phEventWaitList, phEvent); + } else { + // generic implementation + *phEvent = reinterpret_cast<ur_event_handle_t>(d_context.get()); + } + + return result; +} catch (...) { + return exceptionToResult(std::current_exception()); +} + } // namespace driver #if defined(__cplusplus) @@ -6216,6 +6255,8 @@ UR_DLLEXPORT ur_result_t UR_APICALL urGetEnqueueExpProcAddrTable( pDdiTable->pfnTimestampRecordingExp = driver::urEnqueueTimestampRecordingExp; + pDdiTable->pfnNativeCommandExp = driver::urEnqueueNativeCommandExp; + return result; } catch (...)
{ return exceptionToResult(std::current_exception()); diff --git a/source/loader/layers/tracing/ur_trcddi.cpp b/source/loader/layers/tracing/ur_trcddi.cpp index 00d25a1d56..74b0344c7d 100644 --- a/source/loader/layers/tracing/ur_trcddi.cpp +++ b/source/loader/layers/tracing/ur_trcddi.cpp @@ -7844,6 +7844,58 @@ __urdlllocal ur_result_t UR_APICALL urUsmP2PPeerAccessGetInfoExp( return result; } +/////////////////////////////////////////////////////////////////////////////// +/// @brief Intercept function for urEnqueueNativeCommandExp +__urdlllocal ur_result_t UR_APICALL urEnqueueNativeCommandExp( + ur_queue_handle_t hQueue, ///< [in] handle of the queue object + ur_exp_enqueue_native_command_function_t + pfnNativeEnqueue, ///< [in] function calling the native underlying API, to be executed + ///< immediately. + void *data, ///< [in] data used by pfnNativeEnqueue + const ur_exp_enqueue_native_command_properties_t + *pProperties, ///< [in] pointer to the native enqueue properties + uint32_t numEventsInWaitList, ///< [in] size of the event wait list + const ur_event_handle_t * + phEventWaitList, ///< [in][optional][range(0, numEventsInWaitList)] pointer to a list of + ///< events that must be complete before the kernel execution. + ///< If nullptr, the numEventsInWaitList must be 0, indicating no wait + ///< events. + ur_event_handle_t * + phEvent ///< [in,out] return an event object that identifies the work that has + ///< been enqueued in nativeEnqueueFunc. 
+) { + auto pfnNativeCommandExp = + context.urDdiTable.EnqueueExp.pfnNativeCommandExp; + + if (nullptr == pfnNativeCommandExp) { + return UR_RESULT_ERROR_UNSUPPORTED_FEATURE; + } + + ur_enqueue_native_command_exp_params_t params = { + &hQueue, &pfnNativeEnqueue, &data, + &pProperties, &numEventsInWaitList, &phEventWaitList, + &phEvent}; + uint64_t instance = + context.notify_begin(UR_FUNCTION_ENQUEUE_NATIVE_COMMAND_EXP, + "urEnqueueNativeCommandExp", &params); + + context.logger.info("---> urEnqueueNativeCommandExp"); + + ur_result_t result = + pfnNativeCommandExp(hQueue, pfnNativeEnqueue, data, pProperties, + numEventsInWaitList, phEventWaitList, phEvent); + + context.notify_end(UR_FUNCTION_ENQUEUE_NATIVE_COMMAND_EXP, + "urEnqueueNativeCommandExp", &params, &result, instance); + + std::ostringstream args_str; + ur::extras::printFunctionParams( + args_str, UR_FUNCTION_ENQUEUE_NATIVE_COMMAND_EXP, &params); + context.logger.info("({}) -> {};\n", args_str.str(), result); + + return result; +} + /////////////////////////////////////////////////////////////////////////////// /// @brief Exported function for filling application's Global table /// with current process' addresses @@ -8313,6 +8365,10 @@ __urdlllocal ur_result_t UR_APICALL urGetEnqueueExpProcAddrTable( pDdiTable->pfnTimestampRecordingExp = ur_tracing_layer::urEnqueueTimestampRecordingExp; + dditable.pfnNativeCommandExp = pDdiTable->pfnNativeCommandExp; + pDdiTable->pfnNativeCommandExp = + ur_tracing_layer::urEnqueueNativeCommandExp; + return result; } /////////////////////////////////////////////////////////////////////////////// diff --git a/source/loader/layers/validation/ur_valddi.cpp b/source/loader/layers/validation/ur_valddi.cpp index 043ad1a634..f678c5d3a3 100644 --- a/source/loader/layers/validation/ur_valddi.cpp +++ b/source/loader/layers/validation/ur_valddi.cpp @@ -507,7 +507,7 @@ __urdlllocal ur_result_t UR_APICALL urDeviceGetInfo( return UR_RESULT_ERROR_INVALID_NULL_POINTER; } - if
(UR_DEVICE_INFO_TIMESTAMP_RECORDING_SUPPORT_EXP < propName) { + if (UR_DEVICE_INFO_ENQUEUE_NATIVE_COMMAND_SUPPORT_EXP < propName) { return UR_RESULT_ERROR_INVALID_ENUMERATION; } @@ -9515,6 +9515,79 @@ __urdlllocal ur_result_t UR_APICALL urUsmP2PPeerAccessGetInfoExp( return result; } +/////////////////////////////////////////////////////////////////////////////// +/// @brief Intercept function for urEnqueueNativeCommandExp +__urdlllocal ur_result_t UR_APICALL urEnqueueNativeCommandExp( + ur_queue_handle_t hQueue, ///< [in] handle of the queue object + ur_exp_enqueue_native_command_function_t + pfnNativeEnqueue, ///< [in] function calling the native underlying API, to be executed + ///< immediately. + void *data, ///< [in] data used by pfnNativeEnqueue + const ur_exp_enqueue_native_command_properties_t + *pProperties, ///< [in] pointer to the native enqueue properties + uint32_t numEventsInWaitList, ///< [in] size of the event wait list + const ur_event_handle_t * + phEventWaitList, ///< [in][optional][range(0, numEventsInWaitList)] pointer to a list of + ///< events that must be complete before the kernel execution. + ///< If nullptr, the numEventsInWaitList must be 0, indicating no wait + ///< events. + ur_event_handle_t * + phEvent ///< [in,out] return an event object that identifies the work that has + ///< been enqueued in nativeEnqueueFunc. 
+) { + auto pfnNativeCommandExp = + context.urDdiTable.EnqueueExp.pfnNativeCommandExp; + + if (nullptr == pfnNativeCommandExp) { + return UR_RESULT_ERROR_UNINITIALIZED; + } + + if (context.enableParameterValidation) { + if (NULL == hQueue) { + return UR_RESULT_ERROR_INVALID_NULL_HANDLE; + } + + if (NULL == pfnNativeEnqueue) { + return UR_RESULT_ERROR_INVALID_NULL_POINTER; + } + + if (NULL == data) { + return UR_RESULT_ERROR_INVALID_NULL_POINTER; + } + + if (NULL == pProperties) { + return UR_RESULT_ERROR_INVALID_NULL_POINTER; + } + + if (NULL == phEvent) { + return UR_RESULT_ERROR_INVALID_NULL_POINTER; + } + + if (UR_EXP_ENQUEUE_NATIVE_COMMAND_FLAGS_MASK & pProperties->flags) { + return UR_RESULT_ERROR_INVALID_ENUMERATION; + } + + if (phEventWaitList != NULL && numEventsInWaitList > 0) { + for (uint32_t i = 0; i < numEventsInWaitList; ++i) { + if (phEventWaitList[i] == NULL) { + return UR_RESULT_ERROR_INVALID_EVENT_WAIT_LIST; + } + } + } + } + + if (context.enableLifetimeValidation && + !refCountContext.isReferenceValid(hQueue)) { + refCountContext.logInvalidReference(hQueue); + } + + ur_result_t result = + pfnNativeCommandExp(hQueue, pfnNativeEnqueue, data, pProperties, + numEventsInWaitList, phEventWaitList, phEvent); + + return result; +} + /////////////////////////////////////////////////////////////////////////////// /// @brief Exported function for filling application's Global table /// with current process' addresses @@ -9993,6 +10066,10 @@ UR_DLLEXPORT ur_result_t UR_APICALL urGetEnqueueExpProcAddrTable( pDdiTable->pfnTimestampRecordingExp = ur_validation_layer::urEnqueueTimestampRecordingExp; + dditable.pfnNativeCommandExp = pDdiTable->pfnNativeCommandExp; + pDdiTable->pfnNativeCommandExp = + ur_validation_layer::urEnqueueNativeCommandExp; + return result; } diff --git a/source/loader/loader.def.in b/source/loader/loader.def.in index 41ce8f369c..45ab3b1caf 100644 --- a/source/loader/loader.def.in +++ b/source/loader/loader.def.in @@ -80,6 +80,7 @@ EXPORTS 
urEnqueueMemImageRead urEnqueueMemImageWrite urEnqueueMemUnmap + urEnqueueNativeCommandExp urEnqueueReadHostPipe urEnqueueTimestampRecordingExp urEnqueueUSMAdvise @@ -272,6 +273,7 @@ EXPORTS urPrintEnqueueMemImageReadParams urPrintEnqueueMemImageWriteParams urPrintEnqueueMemUnmapParams + urPrintEnqueueNativeCommandExpParams urPrintEnqueueReadHostPipeParams urPrintEnqueueTimestampRecordingExpParams urPrintEnqueueUsmAdviseParams @@ -300,6 +302,8 @@ EXPORTS urPrintExpCommandBufferUpdateMemobjArgDesc urPrintExpCommandBufferUpdatePointerArgDesc urPrintExpCommandBufferUpdateValueArgDesc + urPrintExpEnqueueNativeCommandFlags + urPrintExpEnqueueNativeCommandProperties urPrintExpExternalMemType urPrintExpExternalSemaphoreType urPrintExpFileDescriptor diff --git a/source/loader/loader.map.in b/source/loader/loader.map.in index 0cf180ab65..170365ac4b 100644 --- a/source/loader/loader.map.in +++ b/source/loader/loader.map.in @@ -80,6 +80,7 @@ urEnqueueMemImageRead; urEnqueueMemImageWrite; urEnqueueMemUnmap; + urEnqueueNativeCommandExp; urEnqueueReadHostPipe; urEnqueueTimestampRecordingExp; urEnqueueUSMAdvise; @@ -272,6 +273,7 @@ urPrintEnqueueMemImageReadParams; urPrintEnqueueMemImageWriteParams; urPrintEnqueueMemUnmapParams; + urPrintEnqueueNativeCommandExpParams; urPrintEnqueueReadHostPipeParams; urPrintEnqueueTimestampRecordingExpParams; urPrintEnqueueUsmAdviseParams; @@ -300,6 +302,8 @@ urPrintExpCommandBufferUpdateMemobjArgDesc; urPrintExpCommandBufferUpdatePointerArgDesc; urPrintExpCommandBufferUpdateValueArgDesc; + urPrintExpEnqueueNativeCommandFlags; + urPrintExpEnqueueNativeCommandProperties; urPrintExpExternalMemType; urPrintExpExternalSemaphoreType; urPrintExpFileDescriptor; diff --git a/source/loader/ur_ldrddi.cpp b/source/loader/ur_ldrddi.cpp index d464c8782f..fdce5a40b7 100644 --- a/source/loader/ur_ldrddi.cpp +++ b/source/loader/ur_ldrddi.cpp @@ -8096,6 +8096,66 @@ __urdlllocal ur_result_t UR_APICALL urUsmP2PPeerAccessGetInfoExp( return result; } 
+/////////////////////////////////////////////////////////////////////////////// +/// @brief Intercept function for urEnqueueNativeCommandExp +__urdlllocal ur_result_t UR_APICALL urEnqueueNativeCommandExp( + ur_queue_handle_t hQueue, ///< [in] handle of the queue object + ur_exp_enqueue_native_command_function_t + pfnNativeEnqueue, ///< [in] function calling the native underlying API, to be executed + ///< immediately. + void *data, ///< [in] data used by pfnNativeEnqueue + const ur_exp_enqueue_native_command_properties_t + *pProperties, ///< [in] pointer to the native enqueue properties + uint32_t numEventsInWaitList, ///< [in] size of the event wait list + const ur_event_handle_t * + phEventWaitList, ///< [in][optional][range(0, numEventsInWaitList)] pointer to a list of + ///< events that must be complete before the kernel execution. + ///< If nullptr, the numEventsInWaitList must be 0, indicating no wait + ///< events. + ur_event_handle_t * + phEvent ///< [in,out] return an event object that identifies the work that has + ///< been enqueued in nativeEnqueueFunc. 
+) { + ur_result_t result = UR_RESULT_SUCCESS; + + // extract platform's function pointer table + auto dditable = reinterpret_cast<ur_queue_object_t *>(hQueue)->dditable; + auto pfnNativeCommandExp = dditable->ur.EnqueueExp.pfnNativeCommandExp; + if (nullptr == pfnNativeCommandExp) { + return UR_RESULT_ERROR_UNINITIALIZED; + } + + // convert loader handle to platform handle + hQueue = reinterpret_cast<ur_queue_object_t *>(hQueue)->handle; + + // convert loader handles to platform handles + auto phEventWaitListLocal = + std::vector<ur_event_handle_t>(numEventsInWaitList); + for (size_t i = 0; i < numEventsInWaitList; ++i) { + phEventWaitListLocal[i] = + reinterpret_cast<ur_event_object_t *>(phEventWaitList[i])->handle; + } + + // forward to device-platform + result = pfnNativeCommandExp(hQueue, pfnNativeEnqueue, data, pProperties, + numEventsInWaitList, + phEventWaitListLocal.data(), phEvent); + + if (UR_RESULT_SUCCESS != result) { + return result; + } + + try { + // convert platform handle to loader handle + *phEvent = reinterpret_cast<ur_event_handle_t>( + ur_event_factory.getInstance(*phEvent, dditable)); + } catch (std::bad_alloc &) { + result = UR_RESULT_ERROR_OUT_OF_HOST_MEMORY; + } + + return result; +} + } // namespace ur_loader #if defined(__cplusplus) @@ -8535,6 +8595,8 @@ UR_DLLEXPORT ur_result_t UR_APICALL urGetEnqueueExpProcAddrTable( ur_loader::urEnqueueCooperativeKernelLaunchExp; pDdiTable->pfnTimestampRecordingExp = ur_loader::urEnqueueTimestampRecordingExp; + pDdiTable->pfnNativeCommandExp = + ur_loader::urEnqueueNativeCommandExp; } else { // return pointers directly to platform's DDIs *pDdiTable = diff --git a/source/loader/ur_libapi.cpp b/source/loader/ur_libapi.cpp index 1a69f86ccb..b6744eb6ae 100644 --- a/source/loader/ur_libapi.cpp +++ b/source/loader/ur_libapi.cpp @@ -847,7 +847,7 @@ ur_result_t UR_APICALL urDeviceGetSelected( /// - ::UR_RESULT_ERROR_INVALID_NULL_HANDLE /// + `NULL == hDevice` /// - ::UR_RESULT_ERROR_INVALID_ENUMERATION -/// + `::UR_DEVICE_INFO_TIMESTAMP_RECORDING_SUPPORT_EXP < propName` +/// +
`::UR_DEVICE_INFO_ENQUEUE_NATIVE_COMMAND_SUPPORT_EXP < propName` /// - ::UR_RESULT_ERROR_UNSUPPORTED_ENUMERATION /// + If `propName` is not supported by the adapter. /// - ::UR_RESULT_ERROR_INVALID_SIZE @@ -8918,4 +8918,52 @@ ur_result_t UR_APICALL urUsmP2PPeerAccessGetInfoExp( return exceptionToResult(std::current_exception()); } +/////////////////////////////////////////////////////////////////////////////// +/// @brief Immediately enqueue work through a native backend API +/// +/// @returns +/// - ::UR_RESULT_SUCCESS +/// - ::UR_RESULT_ERROR_UNINITIALIZED +/// - ::UR_RESULT_ERROR_DEVICE_LOST +/// - ::UR_RESULT_ERROR_ADAPTER_SPECIFIC +/// - ::UR_RESULT_ERROR_INVALID_NULL_HANDLE +/// + `NULL == hQueue` +/// - ::UR_RESULT_ERROR_INVALID_NULL_POINTER +/// + `NULL == pfnNativeEnqueue` +/// + `NULL == data` +/// + `NULL == pProperties` +/// + `NULL == phEvent` +/// - ::UR_RESULT_ERROR_INVALID_ENUMERATION +/// + `::UR_EXP_ENQUEUE_NATIVE_COMMAND_FLAGS_MASK & pProperties->flags` +/// - ::UR_RESULT_ERROR_INVALID_EVENT_WAIT_LIST +ur_result_t UR_APICALL urEnqueueNativeCommandExp( + ur_queue_handle_t hQueue, ///< [in] handle of the queue object + ur_exp_enqueue_native_command_function_t + pfnNativeEnqueue, ///< [in] function calling the native underlying API, to be executed + ///< immediately. + void *data, ///< [in] data used by pfnNativeEnqueue + const ur_exp_enqueue_native_command_properties_t + *pProperties, ///< [in] pointer to the native enqueue properties + uint32_t numEventsInWaitList, ///< [in] size of the event wait list + const ur_event_handle_t * + phEventWaitList, ///< [in][optional][range(0, numEventsInWaitList)] pointer to a list of + ///< events that must be complete before the kernel execution. + ///< If nullptr, the numEventsInWaitList must be 0, indicating no wait + ///< events. + ur_event_handle_t * + phEvent ///< [in,out] return an event object that identifies the work that has + ///< been enqueued in nativeEnqueueFunc. 
+ ) try { + auto pfnNativeCommandExp = + ur_lib::context->urDdiTable.EnqueueExp.pfnNativeCommandExp; + if (nullptr == pfnNativeCommandExp) { + return UR_RESULT_ERROR_UNINITIALIZED; + } + + return pfnNativeCommandExp(hQueue, pfnNativeEnqueue, data, pProperties, + numEventsInWaitList, phEventWaitList, phEvent); +} catch (...) { + return exceptionToResult(std::current_exception()); +} + } // extern "C" diff --git a/source/loader/ur_print.cpp b/source/loader/ur_print.cpp index 718a65cf72..3f2d017a89 100644 --- a/source/loader/ur_print.cpp +++ b/source/loader/ur_print.cpp @@ -1052,6 +1052,22 @@ ur_result_t urPrintExpPeerInfo(enum ur_exp_peer_info_t value, char *buffer, return str_copy(&ss, buffer, buff_size, out_size); } +ur_result_t urPrintExpEnqueueNativeCommandFlags( + enum ur_exp_enqueue_native_command_flag_t value, char *buffer, + const size_t buff_size, size_t *out_size) { + std::stringstream ss; + ss << value; + return str_copy(&ss, buffer, buff_size, out_size); +} + +ur_result_t urPrintExpEnqueueNativeCommandProperties( + const struct ur_exp_enqueue_native_command_properties_t params, + char *buffer, const size_t buff_size, size_t *out_size) { + std::stringstream ss; + ss << params; + return str_copy(&ss, buffer, buff_size, out_size); +} + ur_result_t urPrintAdapterGetParams(const struct ur_adapter_get_params_t *params, char *buffer, const size_t buff_size, @@ -1703,6 +1719,14 @@ ur_result_t urPrintEnqueueTimestampRecordingExpParams( return str_copy(&ss, buffer, buff_size, out_size); } +ur_result_t urPrintEnqueueNativeCommandExpParams( + const struct ur_enqueue_native_command_exp_params_t *params, char *buffer, + const size_t buff_size, size_t *out_size) { + std::stringstream ss; + ss << params; + return str_copy(&ss, buffer, buff_size, out_size); +} + ur_result_t urPrintEventGetInfoParams(const struct ur_event_get_info_params_t *params, char *buffer, const size_t buff_size, diff --git a/source/ur_api.cpp b/source/ur_api.cpp index 793c9c2f8a..30ae6b490f 100644 
--- a/source/ur_api.cpp +++ b/source/ur_api.cpp @@ -740,7 +740,7 @@ ur_result_t UR_APICALL urDeviceGetSelected( /// - ::UR_RESULT_ERROR_INVALID_NULL_HANDLE /// + `NULL == hDevice` /// - ::UR_RESULT_ERROR_INVALID_ENUMERATION -/// + `::UR_DEVICE_INFO_TIMESTAMP_RECORDING_SUPPORT_EXP < propName` +/// + `::UR_DEVICE_INFO_ENQUEUE_NATIVE_COMMAND_SUPPORT_EXP < propName` /// - ::UR_RESULT_ERROR_UNSUPPORTED_ENUMERATION /// + If `propName` is not supported by the adapter. /// - ::UR_RESULT_ERROR_INVALID_SIZE @@ -7551,3 +7551,43 @@ ur_result_t UR_APICALL urUsmP2PPeerAccessGetInfoExp( ur_result_t result = UR_RESULT_SUCCESS; return result; } + +/////////////////////////////////////////////////////////////////////////////// +/// @brief Immediately enqueue work through a native backend API +/// +/// @returns +/// - ::UR_RESULT_SUCCESS +/// - ::UR_RESULT_ERROR_UNINITIALIZED +/// - ::UR_RESULT_ERROR_DEVICE_LOST +/// - ::UR_RESULT_ERROR_ADAPTER_SPECIFIC +/// - ::UR_RESULT_ERROR_INVALID_NULL_HANDLE +/// + `NULL == hQueue` +/// - ::UR_RESULT_ERROR_INVALID_NULL_POINTER +/// + `NULL == pfnNativeEnqueue` +/// + `NULL == data` +/// + `NULL == pProperties` +/// + `NULL == phEvent` +/// - ::UR_RESULT_ERROR_INVALID_ENUMERATION +/// + `::UR_EXP_ENQUEUE_NATIVE_COMMAND_FLAGS_MASK & pProperties->flags` +/// - ::UR_RESULT_ERROR_INVALID_EVENT_WAIT_LIST +ur_result_t UR_APICALL urEnqueueNativeCommandExp( + ur_queue_handle_t hQueue, ///< [in] handle of the queue object + ur_exp_enqueue_native_command_function_t + pfnNativeEnqueue, ///< [in] function calling the native underlying API, to be executed + ///< immediately. 
+ void *data, ///< [in] data used by pfnNativeEnqueue + const ur_exp_enqueue_native_command_properties_t + *pProperties, ///< [in] pointer to the native enqueue properties + uint32_t numEventsInWaitList, ///< [in] size of the event wait list + const ur_event_handle_t * + phEventWaitList, ///< [in][optional][range(0, numEventsInWaitList)] pointer to a list of + ///< events that must be complete before the kernel execution. + ///< If nullptr, the numEventsInWaitList must be 0, indicating no wait + ///< events. + ur_event_handle_t * + phEvent ///< [in,out] return an event object that identifies the work that has + ///< been enqueued in nativeEnqueueFunc. +) { + ur_result_t result = UR_RESULT_SUCCESS; + return result; +} diff --git a/test/conformance/exp_enqueue_native/urEnqueueNative.cpp b/test/conformance/exp_enqueue_native/urEnqueueNative.cpp new file mode 100644 index 0000000000..b5f15ddd3e --- /dev/null +++ b/test/conformance/exp_enqueue_native/urEnqueueNative.cpp @@ -0,0 +1,11 @@ +// Copyright (C) 2023 Intel Corporation +// Part of the Unified-Runtime Project, under the Apache License v2.0 with LLVM Exceptions. +// See LICENSE.TXT +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +#include + +// What am I checking? +// +// 1. That I can do a basic thing that in native CUDA or HIP sets some memory +// with cuMemFill or something. 
diff --git a/tools/urinfo/urinfo.hpp b/tools/urinfo/urinfo.hpp index caf61e86f6..2b1f8c89a7 100644 --- a/tools/urinfo/urinfo.hpp +++ b/tools/urinfo/urinfo.hpp @@ -404,5 +404,8 @@ inline void printDeviceInfos(ur_device_handle_t hDevice, std::cout << prefix; printDeviceInfo(hDevice, UR_DEVICE_INFO_TIMESTAMP_RECORDING_SUPPORT_EXP); + std::cout << prefix; + printDeviceInfo( + hDevice, UR_DEVICE_INFO_ENQUEUE_NATIVE_COMMAND_SUPPORT_EXP); } } // namespace urinfo From 2acb53628a5bd52d9b87d3ad25536c6e8fdc4c5e Mon Sep 17 00:00:00 2001 From: Hugh Delaney Date: Tue, 28 May 2024 12:31:32 +0100 Subject: [PATCH 02/15] Add basic CUDA impl using ScopedStream Use ScopedStream to return the same stream during the lifetime of the RAII object. This allows us to create events outside a user submitted func, and submit work within the user submitted func, since the stream given to the user from urQueueGetNativeHandle is guaranteed to be the same stream that we record events on. --- source/adapters/cuda/enqueue_native.cpp | 47 +++++++++++++++++++++---- source/adapters/cuda/queue.cpp | 6 ++++ source/adapters/cuda/queue.hpp | 28 +++++++++++++++ 3 files changed, 75 insertions(+), 6 deletions(-) diff --git a/source/adapters/cuda/enqueue_native.cpp b/source/adapters/cuda/enqueue_native.cpp index 65a7e73caf..0e21a67730 100644 --- a/source/adapters/cuda/enqueue_native.cpp +++ b/source/adapters/cuda/enqueue_native.cpp @@ -1,4 +1,4 @@ -//===--------- native_enqueue.cpp - CUDA Adapter --------------------------===// +//===--------- enqueue_native.cpp - CUDA Adapter --------------------------===// // // Copyright (C) 2024 Intel Corporation // @@ -10,9 +10,44 @@ #include -ur_result_t urNativeEnqueueExp(ur_queue_handle_t, - ur_exp_enqueue_native_command_function_t, void *, - uint32_t, const ur_event_handle_t *, - ur_event_handle_t *) { - return UR_RESULT_ERROR_UNSUPPORTED_FEATURE; +#include "context.hpp" +#include "event.hpp" +#include "queue.hpp" + +UR_APIEXPORT ur_result_t UR_APICALL 
urEnqueueNativeCommandExp( + ur_queue_handle_t hQueue, + ur_exp_enqueue_native_command_function_t pfnNativeEnqueue, void *data, + const ur_exp_enqueue_native_command_properties_t *, + uint32_t NumEventsInWaitList, const ur_event_handle_t *phEventWaitList, + ur_event_handle_t *phEvent) { + // TODO: how should mem migration work across a context here? + // Perhaps we will need to add a phMemObjArgs so that we are able to make + // sure memory migration happens across devices in the same context + + try { + ScopedContext ActiveContext(hQueue->getDevice()); + ScopedStream ActiveStream(hQueue, NumEventsInWaitList, phEventWaitList); + std::unique_ptr RetImplEvent{nullptr}; + + if (phEvent) { + RetImplEvent = + std::unique_ptr(ur_event_handle_t_::makeNative( + UR_COMMAND_ENQUEUE_NATIVE_EXP, hQueue, ActiveStream.getStream())); + UR_CHECK_ERROR(RetImplEvent->start()); + } + + pfnNativeEnqueue(hQueue, data); // This is using urQueueGetNativeHandle to + // get the CUDA stream. It must be the + // same stream as is used before and after + if (phEvent) { + UR_CHECK_ERROR(RetImplEvent->record()); + *phEvent = RetImplEvent.release(); + } + + } catch (ur_result_t Err) { + return Err; + } catch (CUresult CuErr) { + return mapErrorUR(CuErr); + } + return UR_RESULT_SUCCESS; } diff --git a/source/adapters/cuda/queue.cpp b/source/adapters/cuda/queue.cpp index b52d21e57c..bd92a01400 100644 --- a/source/adapters/cuda/queue.cpp +++ b/source/adapters/cuda/queue.cpp @@ -33,6 +33,8 @@ void ur_queue_handle_t_::transferStreamWaitForBarrierIfNeeded( } CUstream ur_queue_handle_t_::getNextComputeStream(uint32_t *StreamToken) { + if (getThreadLocalStream() != CUstream{0}) + return getThreadLocalStream(); uint32_t StreamI; uint32_t Token; while (true) { @@ -68,6 +70,8 @@ CUstream ur_queue_handle_t_::getNextComputeStream(uint32_t *StreamToken) { CUstream ur_queue_handle_t_::getNextComputeStream( uint32_t NumEventsInWaitList, const ur_event_handle_t *EventWaitList, ur_stream_guard_ &Guard, uint32_t 
*StreamToken) { + if (getThreadLocalStream() != CUstream{0}) + return getThreadLocalStream(); for (uint32_t i = 0; i < NumEventsInWaitList; i++) { uint32_t Token = EventWaitList[i]->getComputeStreamToken(); if (reinterpret_cast(EventWaitList[i]->getQueue()) == @@ -94,6 +98,8 @@ CUstream ur_queue_handle_t_::getNextComputeStream( } CUstream ur_queue_handle_t_::getNextTransferStream() { + if (getThreadLocalStream() != CUstream{0}) + return getThreadLocalStream(); if (TransferStreams.empty()) { // for example in in-order queue return getNextComputeStream(); } diff --git a/source/adapters/cuda/queue.hpp b/source/adapters/cuda/queue.hpp index 727df68573..f59947b958 100644 --- a/source/adapters/cuda/queue.hpp +++ b/source/adapters/cuda/queue.hpp @@ -101,6 +101,13 @@ struct ur_queue_handle_t_ { const ur_event_handle_t *EventWaitList, ur_stream_guard_ &Guard, uint32_t *StreamToken = nullptr); + + // Thread local stream will be used if ScopedStream is active + static CUstream &getThreadLocalStream() { + static thread_local CUstream stream{0}; + return stream; + } + native_type getNextTransferStream(); native_type get() { return getNextComputeStream(); }; ur_device_handle_t getDevice() const noexcept { return Device; }; @@ -265,3 +272,24 @@ struct ur_queue_handle_t_ { bool backendHasOwnership() const noexcept { return HasOwnership; } }; + +// RAII object to make hQueue stream getter methods all return the same stream +// within the lifetime of this object. +// +// This is useful for urEnqueueNativeCommandExp where we want guarantees that +// the user submitted native calls will be dispatched to a known stream, which +// must be "got" within the user submitted function.
+class ScopedStream { + ur_queue_handle_t hQueue; + +public: + ScopedStream(ur_queue_handle_t hQueue, uint32_t NumEventsInWaitList, + const ur_event_handle_t *EventWaitList) + : hQueue{hQueue} { + ur_stream_guard_ Guard; + hQueue->getThreadLocalStream() = + hQueue->getNextComputeStream(NumEventsInWaitList, EventWaitList, Guard); + } + CUstream getStream() { return hQueue->getThreadLocalStream(); } + ~ScopedStream() { hQueue->getThreadLocalStream() = CUstream{0}; } +}; From 78393469181c758cb38937c5617a50770871012c Mon Sep 17 00:00:00 2001 From: Hugh Delaney Date: Wed, 29 May 2024 12:09:47 +0100 Subject: [PATCH 03/15] Add entry points for L0, HIP and openCL adapters All entry points unsupported for now. Add Entry Point to UR interface loaders --- source/adapters/cuda/ur_interface_loader.cpp | 1 + source/adapters/hip/CMakeLists.txt | 1 + source/adapters/hip/enqueue_native.cpp | 19 +++++++++++++++++++ source/adapters/hip/ur_interface_loader.cpp | 1 + source/adapters/level_zero/CMakeLists.txt | 1 + source/adapters/level_zero/enqueue_native.cpp | 18 ++++++++++++++++++ .../level_zero/ur_interface_loader.cpp | 1 + source/adapters/opencl/CMakeLists.txt | 1 + source/adapters/opencl/enqueue_native.cpp | 19 +++++++++++++++++++ .../adapters/opencl/ur_interface_loader.cpp | 1 + 10 files changed, 63 insertions(+) create mode 100644 source/adapters/hip/enqueue_native.cpp create mode 100644 source/adapters/level_zero/enqueue_native.cpp create mode 100644 source/adapters/opencl/enqueue_native.cpp diff --git a/source/adapters/cuda/ur_interface_loader.cpp b/source/adapters/cuda/ur_interface_loader.cpp index 2b8b132373..8eab4514ac 100644 --- a/source/adapters/cuda/ur_interface_loader.cpp +++ b/source/adapters/cuda/ur_interface_loader.cpp @@ -411,6 +411,7 @@ UR_DLLEXPORT ur_result_t UR_APICALL urGetEnqueueExpProcAddrTable( urEnqueueCooperativeKernelLaunchExp; pDdiTable->pfnTimestampRecordingExp = urEnqueueTimestampRecordingExp; pDdiTable->pfnKernelLaunchCustomExp = 
urEnqueueKernelLaunchCustomExp; + pDdiTable->pfnNativeCommandExp = urEnqueueNativeCommandExp; return UR_RESULT_SUCCESS; } diff --git a/source/adapters/hip/CMakeLists.txt b/source/adapters/hip/CMakeLists.txt index 764cfeedf9..09c60a8e71 100644 --- a/source/adapters/hip/CMakeLists.txt +++ b/source/adapters/hip/CMakeLists.txt @@ -63,6 +63,7 @@ add_ur_adapter(${TARGET_NAME} ${CMAKE_CURRENT_SOURCE_DIR}/device.hpp ${CMAKE_CURRENT_SOURCE_DIR}/device.cpp ${CMAKE_CURRENT_SOURCE_DIR}/enqueue.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/enqueue_native.cpp ${CMAKE_CURRENT_SOURCE_DIR}/event.hpp ${CMAKE_CURRENT_SOURCE_DIR}/event.cpp ${CMAKE_CURRENT_SOURCE_DIR}/image.cpp diff --git a/source/adapters/hip/enqueue_native.cpp b/source/adapters/hip/enqueue_native.cpp new file mode 100644 index 0000000000..796848b9e4 --- /dev/null +++ b/source/adapters/hip/enqueue_native.cpp @@ -0,0 +1,19 @@ +//===--------- enqueue_native.cpp - HIP Adapter ---------------------------===// +// +// Copyright (C) 2024 Intel Corporation +// +// Part of the Unified-Runtime Project, under the Apache License v2.0 with LLVM +// Exceptions. 
See LICENSE.TXT +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include + +UR_APICALL UR_APIEXPORT ur_result_t urEnqueueNativeCommandExp( + ur_queue_handle_t, ur_exp_enqueue_native_command_function_t, void *, + const ur_exp_enqueue_native_command_properties_t *, + uint32_t, const ur_event_handle_t *, ur_event_handle_t *) { + return UR_RESULT_ERROR_UNSUPPORTED_FEATURE; +} + diff --git a/source/adapters/hip/ur_interface_loader.cpp b/source/adapters/hip/ur_interface_loader.cpp index 7a28623e0b..af9b8fa9c3 100644 --- a/source/adapters/hip/ur_interface_loader.cpp +++ b/source/adapters/hip/ur_interface_loader.cpp @@ -379,6 +379,7 @@ UR_DLLEXPORT ur_result_t UR_APICALL urGetEnqueueExpProcAddrTable( pDdiTable->pfnCooperativeKernelLaunchExp = urEnqueueCooperativeKernelLaunchExp; pDdiTable->pfnTimestampRecordingExp = urEnqueueTimestampRecordingExp; + pDdiTable->pfnNativeCommandExp = urEnqueueNativeCommandExp; return UR_RESULT_SUCCESS; } diff --git a/source/adapters/level_zero/CMakeLists.txt b/source/adapters/level_zero/CMakeLists.txt index 27de753eb8..5827452e01 100644 --- a/source/adapters/level_zero/CMakeLists.txt +++ b/source/adapters/level_zero/CMakeLists.txt @@ -116,6 +116,7 @@ add_ur_adapter(${TARGET_NAME} ${CMAKE_CURRENT_SOURCE_DIR}/common.cpp ${CMAKE_CURRENT_SOURCE_DIR}/context.cpp ${CMAKE_CURRENT_SOURCE_DIR}/device.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/enqueue_native.cpp ${CMAKE_CURRENT_SOURCE_DIR}/event.cpp ${CMAKE_CURRENT_SOURCE_DIR}/usm.cpp ${CMAKE_CURRENT_SOURCE_DIR}/usm_p2p.cpp diff --git a/source/adapters/level_zero/enqueue_native.cpp b/source/adapters/level_zero/enqueue_native.cpp new file mode 100644 index 0000000000..706410c2dd --- /dev/null +++ b/source/adapters/level_zero/enqueue_native.cpp @@ -0,0 +1,18 @@ +//===--------- enqueue_native.cpp - LevelZero Adapter ---------------------===// +// +// Copyright (C) 2024 Intel Corporation +// +// Part of the 
Unified-Runtime Project, under the Apache License v2.0 with LLVM +// Exceptions. See LICENSE.TXT +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include + +UR_APICALL UR_APIEXPORT ur_result_t urEnqueueNativeCommandExp( + ur_queue_handle_t, ur_exp_enqueue_native_command_function_t, void *, + const ur_exp_enqueue_native_command_properties_t *, + uint32_t, const ur_event_handle_t *, ur_event_handle_t *) { + return UR_RESULT_ERROR_UNSUPPORTED_FEATURE; +} diff --git a/source/adapters/level_zero/ur_interface_loader.cpp b/source/adapters/level_zero/ur_interface_loader.cpp index b481ba0c6a..fb03d23c00 100644 --- a/source/adapters/level_zero/ur_interface_loader.cpp +++ b/source/adapters/level_zero/ur_interface_loader.cpp @@ -459,6 +459,7 @@ UR_DLLEXPORT ur_result_t UR_APICALL urGetEnqueueExpProcAddrTable( pDdiTable->pfnCooperativeKernelLaunchExp = urEnqueueCooperativeKernelLaunchExp; pDdiTable->pfnTimestampRecordingExp = urEnqueueTimestampRecordingExp; + pDdiTable->pfnNativeCommandExp = urEnqueueNativeCommandExp; return UR_RESULT_SUCCESS; } diff --git a/source/adapters/opencl/CMakeLists.txt b/source/adapters/opencl/CMakeLists.txt index 38f8e9a523..65cbba9df6 100644 --- a/source/adapters/opencl/CMakeLists.txt +++ b/source/adapters/opencl/CMakeLists.txt @@ -25,6 +25,7 @@ add_ur_adapter(${TARGET_NAME} SHARED ${CMAKE_CURRENT_SOURCE_DIR}/device.hpp ${CMAKE_CURRENT_SOURCE_DIR}/device.cpp ${CMAKE_CURRENT_SOURCE_DIR}/enqueue.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/enqueue_native.cpp ${CMAKE_CURRENT_SOURCE_DIR}/event.cpp ${CMAKE_CURRENT_SOURCE_DIR}/image.cpp ${CMAKE_CURRENT_SOURCE_DIR}/kernel.cpp diff --git a/source/adapters/opencl/enqueue_native.cpp b/source/adapters/opencl/enqueue_native.cpp new file mode 100644 index 0000000000..4691545a16 --- /dev/null +++ b/source/adapters/opencl/enqueue_native.cpp @@ -0,0 +1,19 @@ +//===--------- enqueue_native.cpp - OpenCL Adapter 
------------------------===// +// +// Copyright (C) 2024 Intel Corporation +// +// Part of the Unified-Runtime Project, under the Apache License v2.0 with LLVM +// Exceptions. See LICENSE.TXT +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include + +UR_APICALL UR_APIEXPORT ur_result_t urEnqueueNativeCommandExp( + ur_queue_handle_t, ur_exp_enqueue_native_command_function_t, void *, + const ur_exp_enqueue_native_command_properties_t *, + uint32_t, const ur_event_handle_t *, ur_event_handle_t *) { + return UR_RESULT_ERROR_UNSUPPORTED_FEATURE; +} + diff --git a/source/adapters/opencl/ur_interface_loader.cpp b/source/adapters/opencl/ur_interface_loader.cpp index 961b04f521..f542cf2192 100644 --- a/source/adapters/opencl/ur_interface_loader.cpp +++ b/source/adapters/opencl/ur_interface_loader.cpp @@ -401,6 +401,7 @@ UR_DLLEXPORT ur_result_t UR_APICALL urGetEnqueueExpProcAddrTable( pDdiTable->pfnCooperativeKernelLaunchExp = urEnqueueCooperativeKernelLaunchExp; pDdiTable->pfnTimestampRecordingExp = urEnqueueTimestampRecordingExp; + pDdiTable->pfnNativeCommandExp = urEnqueueNativeCommandExp; return UR_RESULT_SUCCESS; } From 69268a3db98c60f107d7edfbbe034e6fed2f543a Mon Sep 17 00:00:00 2001 From: Hugh Delaney Date: Thu, 30 May 2024 14:20:07 +0100 Subject: [PATCH 04/15] Add CUDA testing --- test/conformance/CMakeLists.txt | 1 + .../exp_enqueue_native/CMakeLists.txt | 18 ++++ .../enqueue_native_cuda.cpp | 87 +++++++++++++++++++ ...exp_enqueue_native_cuda_adapter_cuda.match | 0 .../exp_enqueue_native/urEnqueueNative.cpp | 11 --- 5 files changed, 106 insertions(+), 11 deletions(-) create mode 100644 test/conformance/exp_enqueue_native/CMakeLists.txt create mode 100644 test/conformance/exp_enqueue_native/enqueue_native_cuda.cpp create mode 100644 test/conformance/exp_enqueue_native/exp_enqueue_native_cuda_adapter_cuda.match delete mode 100644 
test/conformance/exp_enqueue_native/urEnqueueNative.cpp diff --git a/test/conformance/CMakeLists.txt b/test/conformance/CMakeLists.txt index 439b9b7a79..a5c1c43379 100644 --- a/test/conformance/CMakeLists.txt +++ b/test/conformance/CMakeLists.txt @@ -142,6 +142,7 @@ if(UR_DPCXX) add_subdirectory(enqueue) add_subdirectory(integration) add_subdirectory(exp_command_buffer) + add_subdirectory(exp_enqueue_native) add_subdirectory(exp_usm_p2p) add_subdirectory(exp_launch_properties) add_subdirectory(memory-migrate) diff --git a/test/conformance/exp_enqueue_native/CMakeLists.txt b/test/conformance/exp_enqueue_native/CMakeLists.txt new file mode 100644 index 0000000000..f65975ea71 --- /dev/null +++ b/test/conformance/exp_enqueue_native/CMakeLists.txt @@ -0,0 +1,18 @@ +# Copyright (C) 2024 Intel Corporation +# Part of the Unified-Runtime Project, under the Apache License v2.0 with LLVM Exceptions. +# See LICENSE.TXT +# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +if (UR_BUILD_ADAPTER_CUDA) + add_conformance_test_with_kernels_environment( + exp_enqueue_native_cuda + enqueue_native_cuda.cpp + ) + target_include_directories(test-exp_enqueue_native_cuda PRIVATE + ${PROJECT_SOURCE_DIR}/source + ${PROJECT_SOURCE_DIR}/source/adapters/cuda + ) + target_link_libraries(test-exp_enqueue_native_cuda PRIVATE cudadrv) +endif() + +# TODO: Add more tests for different triples diff --git a/test/conformance/exp_enqueue_native/enqueue_native_cuda.cpp b/test/conformance/exp_enqueue_native/enqueue_native_cuda.cpp new file mode 100644 index 0000000000..aa91347d92 --- /dev/null +++ b/test/conformance/exp_enqueue_native/enqueue_native_cuda.cpp @@ -0,0 +1,87 @@ +// Copyright (C) 2024 Intel Corporation +// Part of the Unified-Runtime Project, under the Apache License v2.0 with LLVM Exceptions. 
+// See LICENSE.TXT +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +#include +#include +#include + +using T = uint32_t; + +struct urCudaEnqueueNativeCommandTest : uur::urQueueTest { + void SetUp() { + UUR_RETURN_ON_FATAL_FAILURE(uur::urQueueTest::SetUp()); + + host_vec = std::vector(global_size, 0); + ASSERT_EQ(host_vec.size(), global_size); + ASSERT_SUCCESS(urUSMDeviceAlloc(context, device, nullptr, nullptr, + allocation_size, &device_ptr)); + ASSERT_NE(device_ptr, nullptr); + } + static constexpr T val = 42; + static constexpr uint32_t global_size = 1e7; + std::vector host_vec; + void *device_ptr = nullptr; + static constexpr size_t allocation_size = sizeof(val) * global_size; +}; + +UUR_INSTANTIATE_DEVICE_TEST_SUITE_P(urCudaEnqueueNativeCommandTest); + +struct InteropData1 { + void *fill_ptr; +}; + +// Fill a device ptr with the pattern val +void interop_func_1(ur_queue_handle_t hQueue, void *data) { + CUstream stream; + ASSERT_SUCCESS( + urQueueGetNativeHandle(hQueue, nullptr, (ur_native_handle_t *)&stream)); + InteropData1 *func_data = reinterpret_cast(data); + + ASSERT_EQ(cuMemsetD32Async((CUdeviceptr)func_data->fill_ptr, + urCudaEnqueueNativeCommandTest::val, + urCudaEnqueueNativeCommandTest::global_size, + stream), + CUDA_SUCCESS); +} + +struct InteropData2 { + void *from, *to; +}; + +// Read from device ptr to host ptr +void interop_func_2(ur_queue_handle_t hQueue, void *data) { + CUstream stream; + ASSERT_SUCCESS( + urQueueGetNativeHandle(hQueue, nullptr, (ur_native_handle_t *)&stream)); + InteropData2 *func_data = reinterpret_cast(data); + + ASSERT_EQ(cuMemcpyDtoHAsync(func_data->to, (CUdeviceptr)func_data->from, + urCudaEnqueueNativeCommandTest::allocation_size, + stream), + CUDA_SUCCESS); +} + +TEST_P(urCudaEnqueueNativeCommandTest, Success) { + InteropData1 data_1{device_ptr}; + ur_event_handle_t event_1; + ASSERT_SUCCESS(urEnqueueNativeCommandExp(queue, &interop_func_1, &data_1, + nullptr, 0, nullptr, &event_1)); +} + 
+TEST_P(urCudaEnqueueNativeCommandTest, Dependencies) { + ur_event_handle_t event_1, event_2; + + InteropData1 data_1{device_ptr}; + ASSERT_SUCCESS(urEnqueueNativeCommandExp(queue, &interop_func_1, &data_1, + nullptr, 0, nullptr, &event_1)); + + InteropData2 data_2{device_ptr, host_vec.data()}; + ASSERT_SUCCESS(urEnqueueNativeCommandExp(queue, &interop_func_2, &data_2, + nullptr, 1, &event_1, &event_2)); + urQueueFinish(queue); + for (auto &i : host_vec) { + ASSERT_EQ(i, val); + } +} diff --git a/test/conformance/exp_enqueue_native/exp_enqueue_native_cuda_adapter_cuda.match b/test/conformance/exp_enqueue_native/exp_enqueue_native_cuda_adapter_cuda.match new file mode 100644 index 0000000000..e69de29bb2 diff --git a/test/conformance/exp_enqueue_native/urEnqueueNative.cpp b/test/conformance/exp_enqueue_native/urEnqueueNative.cpp deleted file mode 100644 index b5f15ddd3e..0000000000 --- a/test/conformance/exp_enqueue_native/urEnqueueNative.cpp +++ /dev/null @@ -1,11 +0,0 @@ -// Copyright (C) 2023 Intel Corporation -// Part of the Unified-Runtime Project, under the Apache License v2.0 with LLVM Exceptions. -// See LICENSE.TXT -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception - -#include - -// What am I checking? -// -// 1. That I can do a basic thing that in native CUDA or HIP sets some memory -// with cuMemFill or something. From 86a2db950813e1135173c9e149813d40a91a3094 Mon Sep 17 00:00:00 2001 From: Hugh Delaney Date: Thu, 30 May 2024 16:40:06 +0100 Subject: [PATCH 05/15] Make some params optional Allow some params to be nullptr. Also update docs to add an entry in ur_structure_type_t. 
And update Native CPU DDI table --- include/ur_api.h | 14 ++++++------- include/ur_print.hpp | 8 ++++++++ scripts/core/EXP-NATIVE-ENQUEUE.rst | 5 ++--- scripts/core/exp-native-enqueue.yml | 18 +++++++++++++---- source/adapters/hip/enqueue_native.cpp | 5 ++--- source/adapters/level_zero/enqueue_native.cpp | 4 ++-- .../native_cpu/ur_interface_loader.cpp | 1 + source/adapters/null/ur_nullddi.cpp | 9 ++++----- source/adapters/opencl/enqueue_native.cpp | 5 ++--- source/common/stype_map_helpers.def | 2 ++ source/loader/layers/tracing/ur_trcddi.cpp | 9 ++++----- source/loader/layers/validation/ur_valddi.cpp | 20 ++++++------------- source/loader/ur_ldrddi.cpp | 9 ++++----- source/loader/ur_libapi.cpp | 13 +++++------- source/ur_api.cpp | 13 +++++------- 15 files changed, 67 insertions(+), 68 deletions(-) diff --git a/include/ur_api.h b/include/ur_api.h index 3c1b8f2052..9d46106f88 100644 --- a/include/ur_api.h +++ b/include/ur_api.h @@ -282,6 +282,7 @@ typedef enum ur_structure_type_t { UR_STRUCTURE_TYPE_EXP_WIN32_HANDLE = 0x2004, ///< ::ur_exp_win32_handle_t UR_STRUCTURE_TYPE_EXP_SAMPLER_ADDR_MODES = 0x2005, ///< ::ur_exp_sampler_addr_modes_t UR_STRUCTURE_TYPE_EXP_SAMPLER_CUBEMAP_PROPERTIES = 0x2006, ///< ::ur_exp_sampler_cubemap_properties_t + UR_STRUCTURE_TYPE_EXP_ENQUEUE_NATIVE_COMMAND_PROPERTIES = 0x3000, ///< ::ur_exp_enqueue_native_command_properties_t /// @cond UR_STRUCTURE_TYPE_FORCE_UINT32 = 0x7fffffff /// @endcond @@ -5686,7 +5687,7 @@ typedef enum ur_command_t { UR_COMMAND_INTEROP_SEMAPHORE_WAIT_EXP = 0x2000, ///< Event created by ::urBindlessImagesWaitExternalSemaphoreExp UR_COMMAND_INTEROP_SEMAPHORE_SIGNAL_EXP = 0x2001, ///< Event created by ::urBindlessImagesSignalExternalSemaphoreExp UR_COMMAND_TIMESTAMP_RECORDING_EXP = 0x2002, ///< Event created by ::urEnqueueTimestampRecordingExp - UR_COMMAND_ENQUEUE_NATIVE_EXP = 0x2004, ///< Event created by ::urNativeEnqueueExp + UR_COMMAND_ENQUEUE_NATIVE_EXP = 0x2004, ///< Event created by ::urEnqueueNativeCommandExp 
/// @cond UR_COMMAND_FORCE_UINT32 = 0x7fffffff /// @endcond @@ -9551,24 +9552,21 @@ typedef void (*ur_exp_enqueue_native_command_function_t)( /// + `NULL == hQueue` /// - ::UR_RESULT_ERROR_INVALID_NULL_POINTER /// + `NULL == pfnNativeEnqueue` -/// + `NULL == data` -/// + `NULL == pProperties` /// + `NULL == phEvent` /// - ::UR_RESULT_ERROR_INVALID_ENUMERATION -/// + `::UR_EXP_ENQUEUE_NATIVE_COMMAND_FLAGS_MASK & pProperties->flags` +/// + `NULL != pProperties && ::UR_EXP_ENQUEUE_NATIVE_COMMAND_FLAGS_MASK & pProperties->flags` /// - ::UR_RESULT_ERROR_INVALID_EVENT_WAIT_LIST UR_APIEXPORT ur_result_t UR_APICALL urEnqueueNativeCommandExp( ur_queue_handle_t hQueue, ///< [in] handle of the queue object ur_exp_enqueue_native_command_function_t pfnNativeEnqueue, ///< [in] function calling the native underlying API, to be executed ///< immediately. - void *data, ///< [in] data used by pfnNativeEnqueue - const ur_exp_enqueue_native_command_properties_t *pProperties, ///< [in] pointer to the native enqueue properties + void *data, ///< [in][optional] data used by pfnNativeEnqueue + const ur_exp_enqueue_native_command_properties_t *pProperties, ///< [in][optional] pointer to the native enqueue properties uint32_t numEventsInWaitList, ///< [in] size of the event wait list const ur_event_handle_t *phEventWaitList, ///< [in][optional][range(0, numEventsInWaitList)] pointer to a list of ///< events that must be complete before the kernel execution. - ///< If nullptr, the numEventsInWaitList must be 0, indicating no wait - ///< events. + ///< If nullptr, the numEventsInWaitList must be 0, indicating no wait events. ur_event_handle_t *phEvent ///< [in,out] return an event object that identifies the work that has ///< been enqueued in nativeEnqueueFunc. 
); diff --git a/include/ur_print.hpp b/include/ur_print.hpp index e6eea16c04..5eed8edf96 100644 --- a/include/ur_print.hpp +++ b/include/ur_print.hpp @@ -1095,6 +1095,9 @@ inline std::ostream &operator<<(std::ostream &os, enum ur_structure_type_t value case UR_STRUCTURE_TYPE_EXP_SAMPLER_CUBEMAP_PROPERTIES: os << "UR_STRUCTURE_TYPE_EXP_SAMPLER_CUBEMAP_PROPERTIES"; break; + case UR_STRUCTURE_TYPE_EXP_ENQUEUE_NATIVE_COMMAND_PROPERTIES: + os << "UR_STRUCTURE_TYPE_EXP_ENQUEUE_NATIVE_COMMAND_PROPERTIES"; + break; default: os << "unknown enumerator"; break; @@ -1346,6 +1349,11 @@ inline ur_result_t printStruct(std::ostream &os, const void *ptr) { const ur_exp_sampler_cubemap_properties_t *pstruct = (const ur_exp_sampler_cubemap_properties_t *)ptr; printPtr(os, pstruct); } break; + + case UR_STRUCTURE_TYPE_EXP_ENQUEUE_NATIVE_COMMAND_PROPERTIES: { + const ur_exp_enqueue_native_command_properties_t *pstruct = (const ur_exp_enqueue_native_command_properties_t *)ptr; + printPtr(os, pstruct); + } break; default: os << "unknown enumerator"; return UR_RESULT_ERROR_INVALID_ENUMERATION; diff --git a/scripts/core/EXP-NATIVE-ENQUEUE.rst b/scripts/core/EXP-NATIVE-ENQUEUE.rst index ca5c96f72c..029cdcad8c 100644 --- a/scripts/core/EXP-NATIVE-ENQUEUE.rst +++ b/scripts/core/EXP-NATIVE-ENQUEUE.rst @@ -4,10 +4,10 @@ X=x.upper() %> -.. _experimental-enqueue-timestamp-recording: +.. _experimental-enqueue-native-command: ================================================================================ -Enqueue Timestamp Recording +Enqueue Native Command ================================================================================ .. 
warning:: @@ -50,7 +50,6 @@ Types ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ${x}_exp_enqueue_native_command_properties_t -${x}_exp_enqueue_native_command_function_t Functions ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ diff --git a/scripts/core/exp-native-enqueue.yml b/scripts/core/exp-native-enqueue.yml index 59ee49e183..41c2de5035 100644 --- a/scripts/core/exp-native-enqueue.yml +++ b/scripts/core/exp-native-enqueue.yml @@ -31,7 +31,7 @@ name: $x_command_t etors: - name: ENQUEUE_NATIVE_EXP value: "0x2004" - desc: Event created by $xNativeEnqueueExp + desc: Event created by $xEnqueueNativeCommandExp --- #-------------------------------------------------------------------------- type: enum @@ -51,6 +51,16 @@ members: name: flags desc: "[in] native enqueue flags" +--- #-------------------------------------------------------------------------- +type: enum +extend: true +desc: "Structure type experimental enumerations" +name: $x_structure_type_t +etors: + - name: EXP_ENQUEUE_NATIVE_COMMAND_PROPERTIES + desc: $x_exp_enqueue_native_command_properties_t + value: "0x3000" + --- #-------------------------------------------------------------------------- type: fptr_typedef desc: "Function enqueueing work through the native API to be executed immediately." @@ -78,10 +88,10 @@ params: name: pfnNativeEnqueue - type: void* name: data - desc: "[in] data used by pfnNativeEnqueue" + desc: "[in][optional] data used by pfnNativeEnqueue" - type: const $x_exp_enqueue_native_command_properties_t* name: pProperties - desc: "[in] pointer to the native enqueue properties" + desc: "[in][optional] pointer to the native enqueue properties" - type: uint32_t name: numEventsInWaitList desc: "[in] size of the event wait list" @@ -89,7 +99,7 @@ params: name: phEventWaitList desc: | [in][optional][range(0, numEventsInWaitList)] pointer to a list of events that must be complete before the kernel execution. 
- If nullptr, the numEventsInWaitList must be 0, indicating no wait events. + If nullptr, the numEventsInWaitList must be 0, indicating no wait events. - type: $x_event_handle_t* name: phEvent desc: | diff --git a/source/adapters/hip/enqueue_native.cpp b/source/adapters/hip/enqueue_native.cpp index 796848b9e4..720ab43d3e 100644 --- a/source/adapters/hip/enqueue_native.cpp +++ b/source/adapters/hip/enqueue_native.cpp @@ -12,8 +12,7 @@ UR_APICALL UR_APIEXPORT ur_result_t urEnqueueNativeCommandExp( ur_queue_handle_t, ur_exp_enqueue_native_command_function_t, void *, - const ur_exp_enqueue_native_command_properties_t *, - uint32_t, const ur_event_handle_t *, ur_event_handle_t *) { + const ur_exp_enqueue_native_command_properties_t *, uint32_t, + const ur_event_handle_t *, ur_event_handle_t *) { return UR_RESULT_ERROR_UNSUPPORTED_FEATURE; } - diff --git a/source/adapters/level_zero/enqueue_native.cpp b/source/adapters/level_zero/enqueue_native.cpp index 706410c2dd..d89745343d 100644 --- a/source/adapters/level_zero/enqueue_native.cpp +++ b/source/adapters/level_zero/enqueue_native.cpp @@ -12,7 +12,7 @@ UR_APICALL UR_APIEXPORT ur_result_t urEnqueueNativeCommandExp( ur_queue_handle_t, ur_exp_enqueue_native_command_function_t, void *, - const ur_exp_enqueue_native_command_properties_t *, - uint32_t, const ur_event_handle_t *, ur_event_handle_t *) { + const ur_exp_enqueue_native_command_properties_t *, uint32_t, + const ur_event_handle_t *, ur_event_handle_t *) { return UR_RESULT_ERROR_UNSUPPORTED_FEATURE; } diff --git a/source/adapters/native_cpu/ur_interface_loader.cpp b/source/adapters/native_cpu/ur_interface_loader.cpp index e922dece67..7c8042202f 100644 --- a/source/adapters/native_cpu/ur_interface_loader.cpp +++ b/source/adapters/native_cpu/ur_interface_loader.cpp @@ -395,6 +395,7 @@ UR_DLLEXPORT ur_result_t UR_APICALL urGetEnqueueExpProcAddrTable( pDdiTable->pfnCooperativeKernelLaunchExp = nullptr; pDdiTable->pfnTimestampRecordingExp = urEnqueueTimestampRecordingExp; 
+ pDdiTable->pfnNativeCommandExp = urEnqueueNativeCommandExp; return UR_RESULT_SUCCESS; } diff --git a/source/adapters/null/ur_nullddi.cpp b/source/adapters/null/ur_nullddi.cpp index c57ddca619..d0cdc180bb 100644 --- a/source/adapters/null/ur_nullddi.cpp +++ b/source/adapters/null/ur_nullddi.cpp @@ -5867,15 +5867,14 @@ __urdlllocal ur_result_t UR_APICALL urEnqueueNativeCommandExp( ur_exp_enqueue_native_command_function_t pfnNativeEnqueue, ///< [in] function calling the native underlying API, to be executed ///< immediately. - void *data, ///< [in] data used by pfnNativeEnqueue - const ur_exp_enqueue_native_command_properties_t - *pProperties, ///< [in] pointer to the native enqueue properties + void *data, ///< [in][optional] data used by pfnNativeEnqueue + const ur_exp_enqueue_native_command_properties_t * + pProperties, ///< [in][optional] pointer to the native enqueue properties uint32_t numEventsInWaitList, ///< [in] size of the event wait list const ur_event_handle_t * phEventWaitList, ///< [in][optional][range(0, numEventsInWaitList)] pointer to a list of ///< events that must be complete before the kernel execution. - ///< If nullptr, the numEventsInWaitList must be 0, indicating no wait - ///< events. + ///< If nullptr, the numEventsInWaitList must be 0, indicating no wait events. ur_event_handle_t * phEvent ///< [in,out] return an event object that identifies the work that has ///< been enqueued in nativeEnqueueFunc. 
diff --git a/source/adapters/opencl/enqueue_native.cpp b/source/adapters/opencl/enqueue_native.cpp index 4691545a16..4c39e0737a 100644 --- a/source/adapters/opencl/enqueue_native.cpp +++ b/source/adapters/opencl/enqueue_native.cpp @@ -12,8 +12,7 @@ UR_APICALL UR_APIEXPORT ur_result_t urEnqueueNativeCommandExp( ur_queue_handle_t, ur_exp_enqueue_native_command_function_t, void *, - const ur_exp_enqueue_native_command_properties_t *, - uint32_t, const ur_event_handle_t *, ur_event_handle_t *) { + const ur_exp_enqueue_native_command_properties_t *, uint32_t, + const ur_event_handle_t *, ur_event_handle_t *) { return UR_RESULT_ERROR_UNSUPPORTED_FEATURE; } - diff --git a/source/common/stype_map_helpers.def b/source/common/stype_map_helpers.def index 0c3e5b1cc1..a6457e8716 100644 --- a/source/common/stype_map_helpers.def +++ b/source/common/stype_map_helpers.def @@ -95,4 +95,6 @@ template <> struct stype_map : stype_map_impl {}; template <> struct stype_map : stype_map_impl {}; +template <> +struct stype_map : stype_map_impl {}; diff --git a/source/loader/layers/tracing/ur_trcddi.cpp b/source/loader/layers/tracing/ur_trcddi.cpp index 74b0344c7d..6401f4cfdd 100644 --- a/source/loader/layers/tracing/ur_trcddi.cpp +++ b/source/loader/layers/tracing/ur_trcddi.cpp @@ -7851,15 +7851,14 @@ __urdlllocal ur_result_t UR_APICALL urEnqueueNativeCommandExp( ur_exp_enqueue_native_command_function_t pfnNativeEnqueue, ///< [in] function calling the native underlying API, to be executed ///< immediately. 
- void *data, ///< [in] data used by pfnNativeEnqueue - const ur_exp_enqueue_native_command_properties_t - *pProperties, ///< [in] pointer to the native enqueue properties + void *data, ///< [in][optional] data used by pfnNativeEnqueue + const ur_exp_enqueue_native_command_properties_t * + pProperties, ///< [in][optional] pointer to the native enqueue properties uint32_t numEventsInWaitList, ///< [in] size of the event wait list const ur_event_handle_t * phEventWaitList, ///< [in][optional][range(0, numEventsInWaitList)] pointer to a list of ///< events that must be complete before the kernel execution. - ///< If nullptr, the numEventsInWaitList must be 0, indicating no wait - ///< events. + ///< If nullptr, the numEventsInWaitList must be 0, indicating no wait events. ur_event_handle_t * phEvent ///< [in,out] return an event object that identifies the work that has ///< been enqueued in nativeEnqueueFunc. diff --git a/source/loader/layers/validation/ur_valddi.cpp b/source/loader/layers/validation/ur_valddi.cpp index f678c5d3a3..0c615e35d4 100644 --- a/source/loader/layers/validation/ur_valddi.cpp +++ b/source/loader/layers/validation/ur_valddi.cpp @@ -9522,15 +9522,14 @@ __urdlllocal ur_result_t UR_APICALL urEnqueueNativeCommandExp( ur_exp_enqueue_native_command_function_t pfnNativeEnqueue, ///< [in] function calling the native underlying API, to be executed ///< immediately. 
- void *data, ///< [in] data used by pfnNativeEnqueue - const ur_exp_enqueue_native_command_properties_t - *pProperties, ///< [in] pointer to the native enqueue properties + void *data, ///< [in][optional] data used by pfnNativeEnqueue + const ur_exp_enqueue_native_command_properties_t * + pProperties, ///< [in][optional] pointer to the native enqueue properties uint32_t numEventsInWaitList, ///< [in] size of the event wait list const ur_event_handle_t * phEventWaitList, ///< [in][optional][range(0, numEventsInWaitList)] pointer to a list of ///< events that must be complete before the kernel execution. - ///< If nullptr, the numEventsInWaitList must be 0, indicating no wait - ///< events. + ///< If nullptr, the numEventsInWaitList must be 0, indicating no wait events. ur_event_handle_t * phEvent ///< [in,out] return an event object that identifies the work that has ///< been enqueued in nativeEnqueueFunc. @@ -9551,19 +9550,12 @@ __urdlllocal ur_result_t UR_APICALL urEnqueueNativeCommandExp( return UR_RESULT_ERROR_INVALID_NULL_POINTER; } - if (NULL == data) { - return UR_RESULT_ERROR_INVALID_NULL_POINTER; - } - - if (NULL == pProperties) { - return UR_RESULT_ERROR_INVALID_NULL_POINTER; - } - if (NULL == phEvent) { return UR_RESULT_ERROR_INVALID_NULL_POINTER; } - if (UR_EXP_ENQUEUE_NATIVE_COMMAND_FLAGS_MASK & pProperties->flags) { + if (NULL != pProperties && + UR_EXP_ENQUEUE_NATIVE_COMMAND_FLAGS_MASK & pProperties->flags) { return UR_RESULT_ERROR_INVALID_ENUMERATION; } diff --git a/source/loader/ur_ldrddi.cpp b/source/loader/ur_ldrddi.cpp index fdce5a40b7..ff440ab855 100644 --- a/source/loader/ur_ldrddi.cpp +++ b/source/loader/ur_ldrddi.cpp @@ -8103,15 +8103,14 @@ __urdlllocal ur_result_t UR_APICALL urEnqueueNativeCommandExp( ur_exp_enqueue_native_command_function_t pfnNativeEnqueue, ///< [in] function calling the native underlying API, to be executed ///< immediately. 
- void *data, ///< [in] data used by pfnNativeEnqueue - const ur_exp_enqueue_native_command_properties_t - *pProperties, ///< [in] pointer to the native enqueue properties + void *data, ///< [in][optional] data used by pfnNativeEnqueue + const ur_exp_enqueue_native_command_properties_t * + pProperties, ///< [in][optional] pointer to the native enqueue properties uint32_t numEventsInWaitList, ///< [in] size of the event wait list const ur_event_handle_t * phEventWaitList, ///< [in][optional][range(0, numEventsInWaitList)] pointer to a list of ///< events that must be complete before the kernel execution. - ///< If nullptr, the numEventsInWaitList must be 0, indicating no wait - ///< events. + ///< If nullptr, the numEventsInWaitList must be 0, indicating no wait events. ur_event_handle_t * phEvent ///< [in,out] return an event object that identifies the work that has ///< been enqueued in nativeEnqueueFunc. diff --git a/source/loader/ur_libapi.cpp b/source/loader/ur_libapi.cpp index b6744eb6ae..0ca104962a 100644 --- a/source/loader/ur_libapi.cpp +++ b/source/loader/ur_libapi.cpp @@ -8930,26 +8930,23 @@ ur_result_t UR_APICALL urUsmP2PPeerAccessGetInfoExp( /// + `NULL == hQueue` /// - ::UR_RESULT_ERROR_INVALID_NULL_POINTER /// + `NULL == pfnNativeEnqueue` -/// + `NULL == data` -/// + `NULL == pProperties` /// + `NULL == phEvent` /// - ::UR_RESULT_ERROR_INVALID_ENUMERATION -/// + `::UR_EXP_ENQUEUE_NATIVE_COMMAND_FLAGS_MASK & pProperties->flags` +/// + `NULL != pProperties && ::UR_EXP_ENQUEUE_NATIVE_COMMAND_FLAGS_MASK & pProperties->flags` /// - ::UR_RESULT_ERROR_INVALID_EVENT_WAIT_LIST ur_result_t UR_APICALL urEnqueueNativeCommandExp( ur_queue_handle_t hQueue, ///< [in] handle of the queue object ur_exp_enqueue_native_command_function_t pfnNativeEnqueue, ///< [in] function calling the native underlying API, to be executed ///< immediately. 
- void *data, ///< [in] data used by pfnNativeEnqueue - const ur_exp_enqueue_native_command_properties_t - *pProperties, ///< [in] pointer to the native enqueue properties + void *data, ///< [in][optional] data used by pfnNativeEnqueue + const ur_exp_enqueue_native_command_properties_t * + pProperties, ///< [in][optional] pointer to the native enqueue properties uint32_t numEventsInWaitList, ///< [in] size of the event wait list const ur_event_handle_t * phEventWaitList, ///< [in][optional][range(0, numEventsInWaitList)] pointer to a list of ///< events that must be complete before the kernel execution. - ///< If nullptr, the numEventsInWaitList must be 0, indicating no wait - ///< events. + ///< If nullptr, the numEventsInWaitList must be 0, indicating no wait events. ur_event_handle_t * phEvent ///< [in,out] return an event object that identifies the work that has ///< been enqueued in nativeEnqueueFunc. diff --git a/source/ur_api.cpp b/source/ur_api.cpp index 30ae6b490f..93147f2d1f 100644 --- a/source/ur_api.cpp +++ b/source/ur_api.cpp @@ -7564,26 +7564,23 @@ ur_result_t UR_APICALL urUsmP2PPeerAccessGetInfoExp( /// + `NULL == hQueue` /// - ::UR_RESULT_ERROR_INVALID_NULL_POINTER /// + `NULL == pfnNativeEnqueue` -/// + `NULL == data` -/// + `NULL == pProperties` /// + `NULL == phEvent` /// - ::UR_RESULT_ERROR_INVALID_ENUMERATION -/// + `::UR_EXP_ENQUEUE_NATIVE_COMMAND_FLAGS_MASK & pProperties->flags` +/// + `NULL != pProperties && ::UR_EXP_ENQUEUE_NATIVE_COMMAND_FLAGS_MASK & pProperties->flags` /// - ::UR_RESULT_ERROR_INVALID_EVENT_WAIT_LIST ur_result_t UR_APICALL urEnqueueNativeCommandExp( ur_queue_handle_t hQueue, ///< [in] handle of the queue object ur_exp_enqueue_native_command_function_t pfnNativeEnqueue, ///< [in] function calling the native underlying API, to be executed ///< immediately. 
- void *data, ///< [in] data used by pfnNativeEnqueue - const ur_exp_enqueue_native_command_properties_t - *pProperties, ///< [in] pointer to the native enqueue properties + void *data, ///< [in][optional] data used by pfnNativeEnqueue + const ur_exp_enqueue_native_command_properties_t * + pProperties, ///< [in][optional] pointer to the native enqueue properties uint32_t numEventsInWaitList, ///< [in] size of the event wait list const ur_event_handle_t * phEventWaitList, ///< [in][optional][range(0, numEventsInWaitList)] pointer to a list of ///< events that must be complete before the kernel execution. - ///< If nullptr, the numEventsInWaitList must be 0, indicating no wait - ///< events. + ///< If nullptr, the numEventsInWaitList must be 0, indicating no wait events. ur_event_handle_t * phEvent ///< [in,out] return an event object that identifies the work that has ///< been enqueued in nativeEnqueueFunc. From 68328c4618d0a45279207d20fb3ce2117c063a54 Mon Sep 17 00:00:00 2001 From: Hugh Delaney Date: Tue, 4 Jun 2024 13:21:54 +0100 Subject: [PATCH 06/15] Add ur_mem_handle_t * arg to entrypoint In order to manage native memory migration across ur_mem_handle_ts, we need to know which ur_mem_handle_ts are wrapped up in the void * function data. 
--- include/ur_api.h | 7 +++ include/ur_ddi.h | 2 + include/ur_print.hpp | 17 ++++++ scripts/core/EXP-NATIVE-ENQUEUE.rst | 7 +++ scripts/core/exp-native-enqueue.yml | 8 +++ source/adapters/cuda/enqueue_native.cpp | 55 ++++++++++++++++--- source/adapters/level_zero/enqueue_native.cpp | 1 + source/adapters/null/ur_nullddi.cpp | 14 +++-- source/adapters/opencl/enqueue_native.cpp | 1 + source/loader/layers/tracing/ur_trcddi.cpp | 27 ++++++--- source/loader/layers/validation/ur_valddi.cpp | 14 +++-- source/loader/ur_ldrddi.cpp | 21 +++++-- source/loader/ur_libapi.cpp | 13 ++++- source/ur_api.cpp | 8 ++- .../enqueue_native_cuda.cpp | 15 +++-- 15 files changed, 173 insertions(+), 37 deletions(-) diff --git a/include/ur_api.h b/include/ur_api.h index 9d46106f88..84fe704d5f 100644 --- a/include/ur_api.h +++ b/include/ur_api.h @@ -9562,6 +9562,11 @@ urEnqueueNativeCommandExp( ur_exp_enqueue_native_command_function_t pfnNativeEnqueue, ///< [in] function calling the native underlying API, to be executed ///< immediately. void *data, ///< [in][optional] data used by pfnNativeEnqueue + uint32_t numMemsInMemList, ///< [in] size of the mem list + const ur_mem_handle_t *phMemList, ///< [in][optional][range(0, numMemsInMemList)] mems that are used within + ///< pfnNativeEnqueue using ::urMemGetNativeHandle. + ///< If nullptr, the numMemsInMemList must be 0, indicating that no mems + ///< are accessed with ::urMemGetNativeHandle within pfnNativeEnqueue. 
const ur_exp_enqueue_native_command_properties_t *pProperties, ///< [in][optional] pointer to the native enqueue properties uint32_t numEventsInWaitList, ///< [in] size of the event wait list const ur_event_handle_t *phEventWaitList, ///< [in][optional][range(0, numEventsInWaitList)] pointer to a list of @@ -10998,6 +11003,8 @@ typedef struct ur_enqueue_native_command_exp_params_t { ur_queue_handle_t *phQueue; ur_exp_enqueue_native_command_function_t *ppfnNativeEnqueue; void **pdata; + uint32_t *pnumMemsInMemList; + const ur_mem_handle_t **pphMemList; const ur_exp_enqueue_native_command_properties_t **ppProperties; uint32_t *pnumEventsInWaitList; const ur_event_handle_t **pphEventWaitList; diff --git a/include/ur_ddi.h b/include/ur_ddi.h index ec8a879e0d..26e2d403ac 100644 --- a/include/ur_ddi.h +++ b/include/ur_ddi.h @@ -1489,6 +1489,8 @@ typedef ur_result_t(UR_APICALL *ur_pfnEnqueueNativeCommandExp_t)( ur_queue_handle_t, ur_exp_enqueue_native_command_function_t, void *, + uint32_t, + const ur_mem_handle_t *, const ur_exp_enqueue_native_command_properties_t *, uint32_t, const ur_event_handle_t *, diff --git a/include/ur_print.hpp b/include/ur_print.hpp index 5eed8edf96..5919e57019 100644 --- a/include/ur_print.hpp +++ b/include/ur_print.hpp @@ -14547,6 +14547,23 @@ inline std::ostream &operator<<(std::ostream &os, [[maybe_unused]] const struct ur::details::printPtr(os, *(params->pdata)); + os << ", "; + os << ".numMemsInMemList = "; + + os << *(params->pnumMemsInMemList); + + os << ", "; + os << ".phMemList = {"; + for (size_t i = 0; *(params->pphMemList) != NULL && i < *params->pnumMemsInMemList; ++i) { + if (i != 0) { + os << ", "; + } + + ur::details::printPtr(os, + (*(params->pphMemList))[i]); + } + os << "}"; + os << ", "; os << ".pProperties = "; diff --git a/scripts/core/EXP-NATIVE-ENQUEUE.rst b/scripts/core/EXP-NATIVE-ENQUEUE.rst index 029cdcad8c..aba8cb6564 100644 --- a/scripts/core/EXP-NATIVE-ENQUEUE.rst +++ b/scripts/core/EXP-NATIVE-ENQUEUE.rst @@ -34,6 
+34,13 @@ queue accessed through ${x}QueueGetNativeHandle. Use of a native queue that is not the native queue returned by ${x}QueueGetNativeHandle results in undefined behaviour. +Any args that are needed by the func must be passed through a void* and unpacked +within the func. If ${x}_mem_handle_t arguments are to be used within +pfnNativeEnqueue, they must be accessed using ${x}MemGetNativeHandle. +${x}_mem_handle_t arguments must be packed in the void* argument that will be +used in pfnNativeEnqueue, as well as ${x}EnqueueNativeCommandExp's phMemList +argument. + API -------------------------------------------------------------------------------- diff --git a/scripts/core/exp-native-enqueue.yml b/scripts/core/exp-native-enqueue.yml index 41c2de5035..5ebb9ab846 100644 --- a/scripts/core/exp-native-enqueue.yml +++ b/scripts/core/exp-native-enqueue.yml @@ -89,6 +89,14 @@ params: - type: void* name: data desc: "[in][optional] data used by pfnNativeEnqueue" + - type: uint32_t + name: numMemsInMemList + desc: "[in] size of the mem list" + - type: const $x_mem_handle_t* + name: phMemList + desc: | + [in][optional][range(0, numMemsInMemList)] mems that are used within pfnNativeEnqueue using $xMemGetNativeHandle. + If nullptr, the numMemsInMemList must be 0, indicating that no mems are accessed with $xMemGetNativeHandle within pfnNativeEnqueue. 
- type: const $x_exp_enqueue_native_command_properties_t* name: pProperties desc: "[in][optional] pointer to the native enqueue properties" diff --git a/source/adapters/cuda/enqueue_native.cpp b/source/adapters/cuda/enqueue_native.cpp index 0e21a67730..dad7297436 100644 --- a/source/adapters/cuda/enqueue_native.cpp +++ b/source/adapters/cuda/enqueue_native.cpp @@ -12,38 +12,79 @@ #include "context.hpp" #include "event.hpp" +#include "memory.hpp" #include "queue.hpp" UR_APIEXPORT ur_result_t UR_APICALL urEnqueueNativeCommandExp( ur_queue_handle_t hQueue, ur_exp_enqueue_native_command_function_t pfnNativeEnqueue, void *data, + uint32_t NumMemsInMemList, const ur_mem_handle_t *phMemList, const ur_exp_enqueue_native_command_properties_t *, uint32_t NumEventsInWaitList, const ur_event_handle_t *phEventWaitList, ur_event_handle_t *phEvent) { - // TODO: how should mem migration work across a context here? - // Perhaps we will need to add a phMemObjArgs so that we are able to make - // sure memory migration happens across devices in the same context + + std::vector<ur_event_handle_t> MemMigrationEvents; + std::vector<std::pair<ur_mem_handle_t, ur_lock>> MemMigrationLocks; + + // phEventWaitList only contains events that are handed to UR by the SYCL + // runtime. 
However since UR handles memory dependencies within a context + // we may need to add more events to our dependent events list if the UR + // context contains multiple devices + if (NumMemsInMemList > 0 && hQueue->getContext()->Devices.size() > 1) { + for (auto i = 0u; i < NumMemsInMemList; ++i) { + auto Mem = phMemList[i]; + if (auto MemDepEvent = Mem->LastEventWritingToMemObj; + MemDepEvent && + std::find(MemMigrationEvents.begin(), MemMigrationEvents.end(), + MemDepEvent) == MemMigrationEvents.end()) { + MemMigrationEvents.push_back(MemDepEvent); + MemMigrationLocks.emplace_back( + std::pair{Mem, ur_lock{Mem->MemoryMigrationMutex}}); + } + } + } try { ScopedContext ActiveContext(hQueue->getDevice()); ScopedStream ActiveStream(hQueue, NumEventsInWaitList, phEventWaitList); std::unique_ptr<ur_event_handle_t_> RetImplEvent{nullptr}; - if (phEvent) { + if (phEvent || MemMigrationEvents.size()) { RetImplEvent = std::unique_ptr<ur_event_handle_t_>(ur_event_handle_t_::makeNative( UR_COMMAND_ENQUEUE_NATIVE_EXP, hQueue, ActiveStream.getStream())); UR_CHECK_ERROR(RetImplEvent->start()); } + if (MemMigrationEvents.size()) { + UR_CHECK_ERROR( + urEnqueueEventsWaitWithBarrier(hQueue, MemMigrationEvents.size(), + MemMigrationEvents.data(), nullptr)); + for (auto i = 0u; i < NumMemsInMemList; ++i) { + auto Mem = phMemList[i]; + migrateMemoryToDeviceIfNeeded(Mem, hQueue->getDevice()); + Mem->setLastEventWritingToMemObj(RetImplEvent.get()); + } + MemMigrationLocks.clear(); + } + pfnNativeEnqueue(hQueue, data); // This is using urQueueGetNativeHandle to // get the CUDA stream. 
It must be the // same stream as is used before and after - if (phEvent) { + + if (phEvent || MemMigrationEvents.size()) { UR_CHECK_ERROR(RetImplEvent->record()); - *phEvent = RetImplEvent.release(); + if (phEvent) { + *phEvent = RetImplEvent.release(); + } else { + // Give ownership of the event to the mem + for (auto i = 0u; i < NumMemsInMemList; ++i) { + auto Mem = phMemList[i]; + migrateMemoryToDeviceIfNeeded(Mem, hQueue->getDevice()); + Mem->setLastEventWritingToMemObj(RetImplEvent.release()); + } + } } - } catch (ur_result_t Err) { return Err; } catch (CUresult CuErr) { diff --git a/source/adapters/level_zero/enqueue_native.cpp b/source/adapters/level_zero/enqueue_native.cpp index d89745343d..5572bc5bff 100644 --- a/source/adapters/level_zero/enqueue_native.cpp +++ b/source/adapters/level_zero/enqueue_native.cpp @@ -12,6 +12,7 @@ UR_APICALL UR_APIEXPORT ur_result_t urEnqueueNativeCommandExp( ur_queue_handle_t, ur_exp_enqueue_native_command_function_t, void *, + uint32_t, const ur_mem_handle_t *, const ur_exp_enqueue_native_command_properties_t *, uint32_t, const ur_event_handle_t *, ur_event_handle_t *) { return UR_RESULT_ERROR_UNSUPPORTED_FEATURE; diff --git a/source/adapters/null/ur_nullddi.cpp b/source/adapters/null/ur_nullddi.cpp index d0cdc180bb..e726061ee4 100644 --- a/source/adapters/null/ur_nullddi.cpp +++ b/source/adapters/null/ur_nullddi.cpp @@ -5867,7 +5867,13 @@ __urdlllocal ur_result_t UR_APICALL urEnqueueNativeCommandExp( ur_exp_enqueue_native_command_function_t pfnNativeEnqueue, ///< [in] function calling the native underlying API, to be executed ///< immediately. - void *data, ///< [in][optional] data used by pfnNativeEnqueue + void *data, ///< [in][optional] data used by pfnNativeEnqueue + uint32_t numMemsInMemList, ///< [in] size of the mem list + const ur_mem_handle_t * + phMemList, ///< [in][optional][range(0, numMemsInMemList)] mems that are used within + ///< pfnNativeEnqueue using ::urMemGetNativeHandle. 
+ ///< If nullptr, the numMemsInMemList must be 0, indicating that no mems + ///< are accessed with ::urMemGetNativeHandle within pfnNativeEnqueue. const ur_exp_enqueue_native_command_properties_t * pProperties, ///< [in][optional] pointer to the native enqueue properties uint32_t numEventsInWaitList, ///< [in] size of the event wait list @@ -5885,9 +5891,9 @@ __urdlllocal ur_result_t UR_APICALL urEnqueueNativeCommandExp( auto pfnNativeCommandExp = d_context.urDdiTable.EnqueueExp.pfnNativeCommandExp; if (nullptr != pfnNativeCommandExp) { - result = - pfnNativeCommandExp(hQueue, pfnNativeEnqueue, data, pProperties, - numEventsInWaitList, phEventWaitList, phEvent); + result = pfnNativeCommandExp( + hQueue, pfnNativeEnqueue, data, numMemsInMemList, phMemList, + pProperties, numEventsInWaitList, phEventWaitList, phEvent); } else { // generic implementation *phEvent = reinterpret_cast(d_context.get()); diff --git a/source/adapters/opencl/enqueue_native.cpp b/source/adapters/opencl/enqueue_native.cpp index 4c39e0737a..75beba8cbb 100644 --- a/source/adapters/opencl/enqueue_native.cpp +++ b/source/adapters/opencl/enqueue_native.cpp @@ -12,6 +12,7 @@ UR_APICALL UR_APIEXPORT ur_result_t urEnqueueNativeCommandExp( ur_queue_handle_t, ur_exp_enqueue_native_command_function_t, void *, + uint32_t, const ur_mem_handle_t *, const ur_exp_enqueue_native_command_properties_t *, uint32_t, const ur_event_handle_t *, ur_event_handle_t *) { return UR_RESULT_ERROR_UNSUPPORTED_FEATURE; diff --git a/source/loader/layers/tracing/ur_trcddi.cpp b/source/loader/layers/tracing/ur_trcddi.cpp index 6401f4cfdd..bc140f17c2 100644 --- a/source/loader/layers/tracing/ur_trcddi.cpp +++ b/source/loader/layers/tracing/ur_trcddi.cpp @@ -7851,7 +7851,13 @@ __urdlllocal ur_result_t UR_APICALL urEnqueueNativeCommandExp( ur_exp_enqueue_native_command_function_t pfnNativeEnqueue, ///< [in] function calling the native underlying API, to be executed ///< immediately. 
- void *data, ///< [in][optional] data used by pfnNativeEnqueue + void *data, ///< [in][optional] data used by pfnNativeEnqueue + uint32_t numMemsInMemList, ///< [in] size of the mem list + const ur_mem_handle_t * + phMemList, ///< [in][optional][range(0, numMemsInMemList)] mems that are used within + ///< pfnNativeEnqueue using ::urMemGetNativeHandle. + ///< If nullptr, the numMemsInMemList must be 0, indicating that no mems + ///< are accessed with ::urMemGetNativeHandle within pfnNativeEnqueue. const ur_exp_enqueue_native_command_properties_t * pProperties, ///< [in][optional] pointer to the native enqueue properties uint32_t numEventsInWaitList, ///< [in] size of the event wait list @@ -7870,19 +7876,24 @@ __urdlllocal ur_result_t UR_APICALL urEnqueueNativeCommandExp( return UR_RESULT_ERROR_UNSUPPORTED_FEATURE; } - ur_enqueue_native_command_exp_params_t params = { - &hQueue, &pfnNativeEnqueue, &data, - &pProperties, &numEventsInWaitList, &phEventWaitList, - &phEvent}; + ur_enqueue_native_command_exp_params_t params = {&hQueue, + &pfnNativeEnqueue, + &data, + &numMemsInMemList, + &phMemList, + &pProperties, + &numEventsInWaitList, + &phEventWaitList, + &phEvent}; uint64_t instance = context.notify_begin(UR_FUNCTION_ENQUEUE_NATIVE_COMMAND_EXP, "urEnqueueNativeCommandExp", &params); context.logger.info("---> urEnqueueNativeCommandExp"); - ur_result_t result = - pfnNativeCommandExp(hQueue, pfnNativeEnqueue, data, pProperties, - numEventsInWaitList, phEventWaitList, phEvent); + ur_result_t result = pfnNativeCommandExp( + hQueue, pfnNativeEnqueue, data, numMemsInMemList, phMemList, + pProperties, numEventsInWaitList, phEventWaitList, phEvent); context.notify_end(UR_FUNCTION_ENQUEUE_NATIVE_COMMAND_EXP, "urEnqueueNativeCommandExp", &params, &result, instance); diff --git a/source/loader/layers/validation/ur_valddi.cpp b/source/loader/layers/validation/ur_valddi.cpp index 0c615e35d4..ce1374f0c6 100644 --- a/source/loader/layers/validation/ur_valddi.cpp +++
b/source/loader/layers/validation/ur_valddi.cpp @@ -9522,7 +9522,13 @@ __urdlllocal ur_result_t UR_APICALL urEnqueueNativeCommandExp( ur_exp_enqueue_native_command_function_t pfnNativeEnqueue, ///< [in] function calling the native underlying API, to be executed ///< immediately. - void *data, ///< [in][optional] data used by pfnNativeEnqueue + void *data, ///< [in][optional] data used by pfnNativeEnqueue + uint32_t numMemsInMemList, ///< [in] size of the mem list + const ur_mem_handle_t * + phMemList, ///< [in][optional][range(0, numMemsInMemList)] mems that are used within + ///< pfnNativeEnqueue using ::urMemGetNativeHandle. + ///< If nullptr, the numMemsInMemList must be 0, indicating that no mems + ///< are accessed with ::urMemGetNativeHandle within pfnNativeEnqueue. const ur_exp_enqueue_native_command_properties_t * pProperties, ///< [in][optional] pointer to the native enqueue properties uint32_t numEventsInWaitList, ///< [in] size of the event wait list @@ -9573,9 +9579,9 @@ __urdlllocal ur_result_t UR_APICALL urEnqueueNativeCommandExp( refCountContext.logInvalidReference(hQueue); } - ur_result_t result = - pfnNativeCommandExp(hQueue, pfnNativeEnqueue, data, pProperties, - numEventsInWaitList, phEventWaitList, phEvent); + ur_result_t result = pfnNativeCommandExp( + hQueue, pfnNativeEnqueue, data, numMemsInMemList, phMemList, + pProperties, numEventsInWaitList, phEventWaitList, phEvent); return result; } diff --git a/source/loader/ur_ldrddi.cpp b/source/loader/ur_ldrddi.cpp index ff440ab855..fd18dd4361 100644 --- a/source/loader/ur_ldrddi.cpp +++ b/source/loader/ur_ldrddi.cpp @@ -8103,7 +8103,13 @@ __urdlllocal ur_result_t UR_APICALL urEnqueueNativeCommandExp( ur_exp_enqueue_native_command_function_t pfnNativeEnqueue, ///< [in] function calling the native underlying API, to be executed ///< immediately. 
- void *data, ///< [in][optional] data used by pfnNativeEnqueue + void *data, ///< [in][optional] data used by pfnNativeEnqueue + uint32_t numMemsInMemList, ///< [in] size of the mem list + const ur_mem_handle_t * + phMemList, ///< [in][optional][range(0, numMemsInMemList)] mems that are used within + ///< pfnNativeEnqueue using ::urMemGetNativeHandle. + ///< If nullptr, the numMemsInMemList must be 0, indicating that no mems + ///< are accessed with ::urMemGetNativeHandle within pfnNativeEnqueue. const ur_exp_enqueue_native_command_properties_t * pProperties, ///< [in][optional] pointer to the native enqueue properties uint32_t numEventsInWaitList, ///< [in] size of the event wait list @@ -8127,6 +8133,13 @@ __urdlllocal ur_result_t UR_APICALL urEnqueueNativeCommandExp( // convert loader handle to platform handle hQueue = reinterpret_cast(hQueue)->handle; + // convert loader handles to platform handles + auto phMemListLocal = std::vector(numMemsInMemList); + for (size_t i = 0; i < numMemsInMemList; ++i) { + phMemListLocal[i] = + reinterpret_cast(phMemList[i])->handle; + } + // convert loader handles to platform handles auto phEventWaitListLocal = std::vector(numEventsInWaitList); @@ -8136,9 +8149,9 @@ __urdlllocal ur_result_t UR_APICALL urEnqueueNativeCommandExp( } // forward to device-platform - result = pfnNativeCommandExp(hQueue, pfnNativeEnqueue, data, pProperties, - numEventsInWaitList, - phEventWaitListLocal.data(), phEvent); + result = pfnNativeCommandExp( + hQueue, pfnNativeEnqueue, data, numMemsInMemList, phMemListLocal.data(), + pProperties, numEventsInWaitList, phEventWaitListLocal.data(), phEvent); if (UR_RESULT_SUCCESS != result) { return result; diff --git a/source/loader/ur_libapi.cpp b/source/loader/ur_libapi.cpp index 0ca104962a..1c5e288a03 100644 --- a/source/loader/ur_libapi.cpp +++ b/source/loader/ur_libapi.cpp @@ -8939,7 +8939,13 @@ ur_result_t UR_APICALL urEnqueueNativeCommandExp( ur_exp_enqueue_native_command_function_t pfnNativeEnqueue, 
///< [in] function calling the native underlying API, to be executed ///< immediately. - void *data, ///< [in][optional] data used by pfnNativeEnqueue + void *data, ///< [in][optional] data used by pfnNativeEnqueue + uint32_t numMemsInMemList, ///< [in] size of the mem list + const ur_mem_handle_t * + phMemList, ///< [in][optional][range(0, numMemsInMemList)] mems that are used within + ///< pfnNativeEnqueue using ::urMemGetNativeHandle. + ///< If nullptr, the numMemsInMemList must be 0, indicating that no mems + ///< are accessed with ::urMemGetNativeHandle within pfnNativeEnqueue. const ur_exp_enqueue_native_command_properties_t * pProperties, ///< [in][optional] pointer to the native enqueue properties uint32_t numEventsInWaitList, ///< [in] size of the event wait list @@ -8957,8 +8963,9 @@ ur_result_t UR_APICALL urEnqueueNativeCommandExp( return UR_RESULT_ERROR_UNINITIALIZED; } - return pfnNativeCommandExp(hQueue, pfnNativeEnqueue, data, pProperties, - numEventsInWaitList, phEventWaitList, phEvent); + return pfnNativeCommandExp(hQueue, pfnNativeEnqueue, data, numMemsInMemList, + phMemList, pProperties, numEventsInWaitList, + phEventWaitList, phEvent); } catch (...) { return exceptionToResult(std::current_exception()); } diff --git a/source/ur_api.cpp b/source/ur_api.cpp index 93147f2d1f..f33de6539b 100644 --- a/source/ur_api.cpp +++ b/source/ur_api.cpp @@ -7573,7 +7573,13 @@ ur_result_t UR_APICALL urEnqueueNativeCommandExp( ur_exp_enqueue_native_command_function_t pfnNativeEnqueue, ///< [in] function calling the native underlying API, to be executed ///< immediately. - void *data, ///< [in][optional] data used by pfnNativeEnqueue + void *data, ///< [in][optional] data used by pfnNativeEnqueue + uint32_t numMemsInMemList, ///< [in] size of the mem list + const ur_mem_handle_t * + phMemList, ///< [in][optional][range(0, numMemsInMemList)] mems that are used within + ///< pfnNativeEnqueue using ::urMemGetNativeHandle. 
+ ///< If nullptr, the numMemsInMemList must be 0, indicating that no mems + ///< are accessed with ::urMemGetNativeHandle within pfnNativeEnqueue. const ur_exp_enqueue_native_command_properties_t * pProperties, ///< [in][optional] pointer to the native enqueue properties uint32_t numEventsInWaitList, ///< [in] size of the event wait list diff --git a/test/conformance/exp_enqueue_native/enqueue_native_cuda.cpp b/test/conformance/exp_enqueue_native/enqueue_native_cuda.cpp index aa91347d92..e7870789d9 100644 --- a/test/conformance/exp_enqueue_native/enqueue_native_cuda.cpp +++ b/test/conformance/exp_enqueue_native/enqueue_native_cuda.cpp @@ -66,20 +66,23 @@ void interop_func_2(ur_queue_handle_t hQueue, void *data) { TEST_P(urCudaEnqueueNativeCommandTest, Success) { InteropData1 data_1{device_ptr}; ur_event_handle_t event_1; - ASSERT_SUCCESS(urEnqueueNativeCommandExp(queue, &interop_func_1, &data_1, - nullptr, 0, nullptr, &event_1)); + ASSERT_SUCCESS(urEnqueueNativeCommandExp( + queue, &interop_func_1, &data_1, 0, nullptr /*phMemList=*/, + nullptr /*pProperties=*/, 0, nullptr /*phEventWaitList=*/, &event_1)); } TEST_P(urCudaEnqueueNativeCommandTest, Dependencies) { ur_event_handle_t event_1, event_2; InteropData1 data_1{device_ptr}; - ASSERT_SUCCESS(urEnqueueNativeCommandExp(queue, &interop_func_1, &data_1, - nullptr, 0, nullptr, &event_1)); + ASSERT_SUCCESS(urEnqueueNativeCommandExp( + queue, &interop_func_1, &data_1, 0, nullptr /*phMemList=*/, + nullptr /*pProperties=*/, 0, nullptr /*phEventWaitList=*/, &event_1)); InteropData2 data_2{device_ptr, host_vec.data()}; - ASSERT_SUCCESS(urEnqueueNativeCommandExp(queue, &interop_func_2, &data_2, - nullptr, 1, &event_1, &event_2)); + ASSERT_SUCCESS(urEnqueueNativeCommandExp( + queue, &interop_func_2, &data_2, 0, nullptr /*phMemList=*/, + nullptr /*pProperties=*/, 1, &event_1, &event_2)); urQueueFinish(queue); for (auto &i : host_vec) { ASSERT_EQ(i, val); From af586ecf630f5c3f8f418896965644f5bbaf06f3 Mon Sep 17 00:00:00 2001 
From: Hugh Delaney Date: Tue, 11 Jun 2024 10:08:52 +0100 Subject: [PATCH 07/15] Remove some multi dev ctx shenanigans As is done in #1711, we don't need to wait on events that are not given directly to UR by the user through the phEventWaitList param. --- source/adapters/cuda/enqueue_native.cpp | 49 ++++--------------------- 1 file changed, 8 insertions(+), 41 deletions(-) diff --git a/source/adapters/cuda/enqueue_native.cpp b/source/adapters/cuda/enqueue_native.cpp index dad7297436..f9228ec811 100644 --- a/source/adapters/cuda/enqueue_native.cpp +++ b/source/adapters/cuda/enqueue_native.cpp @@ -23,67 +23,34 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueNativeCommandExp( uint32_t NumEventsInWaitList, const ur_event_handle_t *phEventWaitList, ur_event_handle_t *phEvent) { - std::vector MemMigrationEvents; - std::vector> MemMigrationLocks; - - // phEventWaitList only contains events that are handed to UR by the SYCL - // runtime. However since UR handles memory dependencies within a context - // we may need to add more events to our dependent events list if the UR - // context contains multiple devices - if (NumMemsInMemList > 0 && hQueue->getContext()->Devices.size() > 1) { - for (auto i = 0u; i < NumMemsInMemList; ++i) { - auto Mem = phMemList[i]; - if (auto MemDepEvent = Mem->LastEventWritingToMemObj; - MemDepEvent && - std::find(MemMigrationEvents.begin(), MemMigrationEvents.end(), - MemDepEvent) == MemMigrationEvents.end()) { - MemMigrationEvents.push_back(MemDepEvent); - MemMigrationLocks.emplace_back( - std::pair{Mem, ur_lock{Mem->MemoryMigrationMutex}}); - } - } - } - try { ScopedContext ActiveContext(hQueue->getDevice()); ScopedStream ActiveStream(hQueue, NumEventsInWaitList, phEventWaitList); std::unique_ptr RetImplEvent{nullptr}; - if (phEvent || MemMigrationEvents.size()) { + if (phEvent) { RetImplEvent = std::unique_ptr(ur_event_handle_t_::makeNative( UR_COMMAND_ENQUEUE_NATIVE_EXP, hQueue, ActiveStream.getStream())); 
UR_CHECK_ERROR(RetImplEvent->start()); } - if (MemMigrationEvents.size()) { - UR_CHECK_ERROR( - urEnqueueEventsWaitWithBarrier(hQueue, MemMigrationEvents.size(), - MemMigrationEvents.data(), nullptr)); + if (hQueue->getContext()->getDevices().size() > 1) { for (auto i = 0u; i < NumMemsInMemList; ++i) { - auto Mem = phMemList[i]; - migrateMemoryToDeviceIfNeeded(Mem, hQueue->getDevice()); - Mem->setLastEventWritingToMemObj(RetImplEvent.get()); + // FIXME: Update to enqueueMigrateMemory and also using + // setLastQueueWritingToMemObj when #1711 has merged + migrateMemoryToDeviceIfNeeded(phMemList[i], hQueue->getDevice()); + phMemList[i]->setLastEventWritingToMemObj(RetImplEvent.get()); } - MemMigrationLocks.clear(); } pfnNativeEnqueue(hQueue, data); // This is using urQueueGetNativeHandle to // get the CUDA stream. It must be the // same stream as is used before and after - if (phEvent || MemMigrationEvents.size()) { + if (phEvent) { UR_CHECK_ERROR(RetImplEvent->record()); - if (phEvent) { - *phEvent = RetImplEvent.release(); - } else { - // Give ownership of the event to the mem - for (auto i = 0u; i < NumMemsInMemList; ++i) { - auto Mem = phMemList[i]; - migrateMemoryToDeviceIfNeeded(Mem, hQueue->getDevice()); - Mem->setLastEventWritingToMemObj(RetImplEvent.release()); - } - } + *phEvent = RetImplEvent.release(); } } catch (ur_result_t Err) { return Err; From a2bec63617664381a1b46f9527bb217f3a679a5e Mon Sep 17 00:00:00 2001 From: Hugh Delaney Date: Tue, 11 Jun 2024 13:35:21 +0100 Subject: [PATCH 08/15] Add urEnqueueNativeCommand impl for HIP adapter Add urEnqueueNativeCommand impl for HIP adapter. Also change typo 'quard' to 'guard'.
--- source/adapters/hip/command_buffer.cpp | 2 +- source/adapters/hip/enqueue.cpp | 8 ++--- source/adapters/hip/enqueue_native.cpp | 45 +++++++++++++++++++++++--- source/adapters/hip/queue.cpp | 10 ++++-- source/adapters/hip/queue.hpp | 33 +++++++++++++++++-- 5 files changed, 84 insertions(+), 14 deletions(-) diff --git a/source/adapters/hip/command_buffer.cpp b/source/adapters/hip/command_buffer.cpp index 38bb110154..d9438eeb9c 100644 --- a/source/adapters/hip/command_buffer.cpp +++ b/source/adapters/hip/command_buffer.cpp @@ -884,7 +884,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urCommandBufferEnqueueExp( std::unique_ptr RetImplEvent{nullptr}; ScopedContext Active(hQueue->getDevice()); uint32_t StreamToken; - ur_stream_quard Guard; + ur_stream_guard Guard; hipStream_t HIPStream = hQueue->getNextComputeStream( numEventsInWaitList, phEventWaitList, Guard, &StreamToken); diff --git a/source/adapters/hip/enqueue.cpp b/source/adapters/hip/enqueue.cpp index 4fc4f95f75..7f6da7a864 100644 --- a/source/adapters/hip/enqueue.cpp +++ b/source/adapters/hip/enqueue.cpp @@ -293,7 +293,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueKernelLaunch( ScopedContext Active(Dev); uint32_t StreamToken; - ur_stream_quard Guard; + ur_stream_guard Guard; hipStream_t HIPStream = hQueue->getNextComputeStream( numEventsInWaitList, phEventWaitList, Guard, &StreamToken); @@ -380,7 +380,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueEventsWaitWithBarrier( try { ScopedContext Active(hQueue->getDevice()); uint32_t StreamToken; - ur_stream_quard Guard; + ur_stream_guard Guard; hipStream_t HIPStream = hQueue->getNextComputeStream( numEventsInWaitList, reinterpret_cast(phEventWaitList), Guard, @@ -1243,7 +1243,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueUSMFill( try { ScopedContext Active(hQueue->getDevice()); uint32_t StreamToken; - ur_stream_quard Guard; + ur_stream_guard Guard; hipStream_t HIPStream = hQueue->getNextComputeStream( numEventsInWaitList, phEventWaitList, Guard, 
&StreamToken); Result = enqueueEventsWait(hQueue, HIPStream, numEventsInWaitList, @@ -1893,7 +1893,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueTimestampRecordingExp( ScopedContext Active(hQueue->getDevice()); uint32_t StreamToken; - ur_stream_quard Guard; + ur_stream_guard Guard; hipStream_t HIPStream = hQueue->getNextComputeStream( numEventsInWaitList, phEventWaitList, Guard, &StreamToken); UR_CHECK_ERROR(enqueueEventsWait(hQueue, HIPStream, numEventsInWaitList, diff --git a/source/adapters/hip/enqueue_native.cpp b/source/adapters/hip/enqueue_native.cpp index 720ab43d3e..a7d807d176 100644 --- a/source/adapters/hip/enqueue_native.cpp +++ b/source/adapters/hip/enqueue_native.cpp @@ -10,9 +10,44 @@ #include -UR_APICALL UR_APIEXPORT ur_result_t urEnqueueNativeCommandExp( - ur_queue_handle_t, ur_exp_enqueue_native_command_function_t, void *, - const ur_exp_enqueue_native_command_properties_t *, uint32_t, - const ur_event_handle_t *, ur_event_handle_t *) { - return UR_RESULT_ERROR_UNSUPPORTED_FEATURE; +#include "context.hpp" +#include "event.hpp" +#include "queue.hpp" + +UR_APIEXPORT ur_result_t UR_APICALL urEnqueueNativeCommandExp( + ur_queue_handle_t hQueue, + ur_exp_enqueue_native_command_function_t pfnNativeEnqueue, void *data, + const ur_exp_enqueue_native_command_properties_t *, + uint32_t NumEventsInWaitList, const ur_event_handle_t *phEventWaitList, + ur_event_handle_t *phEvent) { + // TODO: how should mem migration work across a context here? 
+ // Perhaps we will need to add a phMemObjArgs so that we are able to make + // sure memory migration happens across devices in the same context + + try { + ScopedContext ActiveContext(hQueue->getDevice()); + ScopedStream ActiveStream(hQueue, NumEventsInWaitList, phEventWaitList); + std::unique_ptr RetImplEvent{nullptr}; + + if (phEvent) { + RetImplEvent = + std::unique_ptr(ur_event_handle_t_::makeNative( + UR_COMMAND_ENQUEUE_NATIVE_EXP, hQueue, ActiveStream.getStream())); + UR_CHECK_ERROR(RetImplEvent->start()); + } + + pfnNativeEnqueue(hQueue, data); // This is using urQueueGetNativeHandle to + // get the HIP stream. It must be the + // same stream as is used before and after + if (phEvent) { + UR_CHECK_ERROR(RetImplEvent->record()); + *phEvent = RetImplEvent.release(); + } + + } catch (ur_result_t Err) { + return Err; + } catch (hipError_t hipErr) { + return mapErrorUR(hipErr); + } + return UR_RESULT_SUCCESS; } diff --git a/source/adapters/hip/queue.cpp b/source/adapters/hip/queue.cpp index 8398249519..c41bc53a08 100644 --- a/source/adapters/hip/queue.cpp +++ b/source/adapters/hip/queue.cpp @@ -29,6 +29,8 @@ void ur_queue_handle_t_::transferStreamWaitForBarrierIfNeeded( } hipStream_t ur_queue_handle_t_::getNextComputeStream(uint32_t *StreamToken) { + if (getThreadLocalStream() != hipStream_t{0}) + return getThreadLocalStream(); uint32_t Stream_i; uint32_t Token; while (true) { @@ -63,7 +65,9 @@ hipStream_t ur_queue_handle_t_::getNextComputeStream(uint32_t *StreamToken) { hipStream_t ur_queue_handle_t_::getNextComputeStream( uint32_t NumEventsInWaitList, const ur_event_handle_t *EventWaitList, - ur_stream_quard &Guard, uint32_t *StreamToken) { + ur_stream_guard &Guard, uint32_t *StreamToken) { + if (getThreadLocalStream() != hipStream_t{0}) + return getThreadLocalStream(); for (uint32_t i = 0; i < NumEventsInWaitList; i++) { uint32_t Token = EventWaitList[i]->getComputeStreamToken(); if (EventWaitList[i]->getQueue() == this && canReuseStream(Token)) { @@ -76,7
+80,7 @@ hipStream_t ur_queue_handle_t_::getNextComputeStream( if (StreamToken) { *StreamToken = Token; } - Guard = ur_stream_quard{std::move(ComputeSyncGuard)}; + Guard = ur_stream_guard{std::move(ComputeSyncGuard)}; hipStream_t Res = EventWaitList[i]->getStream(); computeStreamWaitForBarrierIfNeeded(Res, Stream_i); return Res; @@ -88,6 +92,8 @@ hipStream_t ur_queue_handle_t_::getNextComputeStream( } hipStream_t ur_queue_handle_t_::getNextTransferStream() { + if (getThreadLocalStream() != hipStream_t{0}) + return getThreadLocalStream(); if (TransferStreams.empty()) { // for example in in-order queue return getNextComputeStream(); } diff --git a/source/adapters/hip/queue.hpp b/source/adapters/hip/queue.hpp index cfabd29bf7..26fde57f13 100644 --- a/source/adapters/hip/queue.hpp +++ b/source/adapters/hip/queue.hpp @@ -14,7 +14,7 @@ #include #include -using ur_stream_quard = std::unique_lock; +using ur_stream_guard = std::unique_lock; /// UR queue mapping on to hipStream_t objects. /// @@ -97,7 +97,7 @@ struct ur_queue_handle_t_ { // returns a lock that needs to remain locked as long as the stream is in use native_type getNextComputeStream(uint32_t NumEventsInWaitList, const ur_event_handle_t *EventWaitList, - ur_stream_quard &Guard, + ur_stream_guard &Guard, uint32_t *StreamToken = nullptr); native_type getNextTransferStream(); native_type get() { return getNextComputeStream(); }; @@ -247,6 +247,12 @@ struct ur_queue_handle_t_ { } } + // Thread local stream will be used if ScopedStream is active + static hipStream_t &getThreadLocalStream() { + static thread_local hipStream_t stream{0}; + return stream; + } + ur_context_handle_t getContext() const { return Context; }; ur_device_handle_t getDevice() const { return Device; }; @@ -261,3 +267,26 @@ struct ur_queue_handle_t_ { bool backendHasOwnership() const noexcept { return HasOwnership; } }; + +// RAII object to make hQueue stream getter methods all return the same stream +// within the lifetime of this object. 
+// +// This is useful for urEnqueueNativeCommandExp where we want guarantees that +// the user submitted native calls will be dispatched to a known stream, which +// must be "got" within the user submitted function. +// +// TODO: Add a test that this scoping works +class ScopedStream { + ur_queue_handle_t hQueue; + +public: + ScopedStream(ur_queue_handle_t hQueue, uint32_t NumEventsInWaitList, + const ur_event_handle_t *EventWaitList) + : hQueue{hQueue} { + ur_stream_guard Guard; + hQueue->getThreadLocalStream() = + hQueue->getNextComputeStream(NumEventsInWaitList, EventWaitList, Guard); + } + hipStream_t getStream() { return hQueue->getThreadLocalStream(); } + ~ScopedStream() { hQueue->getThreadLocalStream() = hipStream_t{0}; } +}; From f61602977b3966d94257a49d6aa091c72c41a9a6 Mon Sep 17 00:00:00 2001 From: Hugh Delaney Date: Wed, 12 Jun 2024 15:20:23 +0100 Subject: [PATCH 09/15] Change migrateMemoryToDevice to enqueue* In line with changing from a sync op to async for memory migration across devices in a context. 
--- source/adapters/cuda/enqueue_native.cpp | 17 ++++++++--------- source/adapters/cuda/memory.hpp | 6 +----- source/adapters/hip/enqueue_native.cpp | 10 ++++++++++ source/adapters/hip/memory.hpp | 3 --- 4 files changed, 19 insertions(+), 17 deletions(-) diff --git a/source/adapters/cuda/enqueue_native.cpp b/source/adapters/cuda/enqueue_native.cpp index f9228ec811..cf38d713b5 100644 --- a/source/adapters/cuda/enqueue_native.cpp +++ b/source/adapters/cuda/enqueue_native.cpp @@ -28,6 +28,14 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueNativeCommandExp( ScopedStream ActiveStream(hQueue, NumEventsInWaitList, phEventWaitList); std::unique_ptr RetImplEvent{nullptr}; + if (hQueue->getContext()->getDevices().size() > 1) { + for (auto i = 0u; i < NumMemsInMemList; ++i) { + enqueueMigrateMemoryToDeviceIfNeeded(phMemList[i], hQueue->getDevice(), + ActiveStream.getStream()); + phMemList[i]->setLastQueueWritingToMemObj(hQueue); + } + } + if (phEvent) { RetImplEvent = std::unique_ptr(ur_event_handle_t_::makeNative( @@ -35,15 +43,6 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueNativeCommandExp( UR_CHECK_ERROR(RetImplEvent->start()); } - if (hQueue->getContext()->getDevices().size() > 1) { - for (auto i = 0u; i < NumMemsInMemList; ++i) { - // FIXME: Update to enqueueMigrateMemory and also using - // setLastQueueWritingToMemObj when #1711 has merged - migrateMemoryToDeviceIfNeeded(phMemList[i], hQueue->getDevice()); - phMemList[i]->setLastEventWritingToMemObj(RetImplEvent.get()); - } - } - pfnNativeEnqueue(hQueue, data); // This is using urQueueGetNativeHandle to // get the CUDA stream. 
It must be the // same stream as is used before and after diff --git a/source/adapters/cuda/memory.hpp b/source/adapters/cuda/memory.hpp index 76c7e77753..aa992f44bf 100644 --- a/source/adapters/cuda/memory.hpp +++ b/source/adapters/cuda/memory.hpp @@ -17,8 +17,7 @@ #include "common.hpp" #include "context.hpp" -#include "device.hpp" -#include "event.hpp" +#include "queue.hpp" ur_result_t allocateMemObjOnDeviceIfNeeded(ur_mem_handle_t, const ur_device_handle_t); @@ -443,6 +442,3 @@ struct ur_mem_handle_t_ { } } }; - -ur_result_t migrateMemoryToDeviceIfNeeded(ur_mem_handle_t, - const ur_device_handle_t); diff --git a/source/adapters/hip/enqueue_native.cpp b/source/adapters/hip/enqueue_native.cpp index a7d807d176..1ad6bbe2c0 100644 --- a/source/adapters/hip/enqueue_native.cpp +++ b/source/adapters/hip/enqueue_native.cpp @@ -12,11 +12,13 @@ #include "context.hpp" #include "event.hpp" +#include "memory.hpp" #include "queue.hpp" UR_APIEXPORT ur_result_t UR_APICALL urEnqueueNativeCommandExp( ur_queue_handle_t hQueue, ur_exp_enqueue_native_command_function_t pfnNativeEnqueue, void *data, + uint32_t NumMemsInMemList, const ur_mem_handle_t *phMemList, const ur_exp_enqueue_native_command_properties_t *, uint32_t NumEventsInWaitList, const ur_event_handle_t *phEventWaitList, ur_event_handle_t *phEvent) { @@ -29,6 +31,14 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueNativeCommandExp( ScopedStream ActiveStream(hQueue, NumEventsInWaitList, phEventWaitList); std::unique_ptr RetImplEvent{nullptr}; + if (hQueue->getContext()->getDevices().size() > 1) { + for (auto i = 0u; i < NumMemsInMemList; ++i) { + enqueueMigrateMemoryToDeviceIfNeeded(phMemList[i], hQueue->getDevice(), + ActiveStream.getStream()); + phMemList[i]->setLastQueueWritingToMemObj(hQueue); + } + } + if (phEvent) { RetImplEvent = std::unique_ptr(ur_event_handle_t_::makeNative( diff --git a/source/adapters/hip/memory.hpp b/source/adapters/hip/memory.hpp index 2b37d36391..3ec1e8f4e9 100644 --- 
a/source/adapters/hip/memory.hpp +++ b/source/adapters/hip/memory.hpp @@ -437,6 +437,3 @@ struct ur_mem_handle_t_ { } } }; - -ur_result_t migrateMemoryToDeviceIfNeeded(ur_mem_handle_t, - const ur_device_handle_t); From d2576f8525451ade2d0b7a6d6106a7091e92c6d9 Mon Sep 17 00:00:00 2001 From: Hugh Delaney Date: Wed, 12 Jun 2024 23:26:22 +0100 Subject: [PATCH 10/15] Add no impl for native_cpu --- source/adapters/native_cpu/enqueue.cpp | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/source/adapters/native_cpu/enqueue.cpp b/source/adapters/native_cpu/enqueue.cpp index 7ab9883ca7..34d0b7a4b7 100644 --- a/source/adapters/native_cpu/enqueue.cpp +++ b/source/adapters/native_cpu/enqueue.cpp @@ -710,3 +710,11 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueWriteHostPipe( DIE_NO_IMPLEMENTATION; } + +UR_APIEXPORT ur_result_t UR_APICALL urEnqueueNativeCommandExp( + ur_queue_handle_t, ur_exp_enqueue_native_command_function_t, void *, + uint32_t, const ur_mem_handle_t *, + const ur_exp_enqueue_native_command_properties_t *, uint32_t, + const ur_event_handle_t *, ur_event_handle_t *) { + DIE_NO_IMPLEMENTATION; +} From 378f19cc973d91ecd70ea83dc11a89b8ecd786fa Mon Sep 17 00:00:00 2001 From: Hugh Delaney Date: Wed, 12 Jun 2024 23:40:51 +0100 Subject: [PATCH 11/15] Fix order of attrs --- source/adapters/level_zero/enqueue_native.cpp | 2 +- source/adapters/opencl/enqueue_native.cpp | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/source/adapters/level_zero/enqueue_native.cpp b/source/adapters/level_zero/enqueue_native.cpp index 5572bc5bff..b708333de7 100644 --- a/source/adapters/level_zero/enqueue_native.cpp +++ b/source/adapters/level_zero/enqueue_native.cpp @@ -10,7 +10,7 @@ #include -UR_APICALL UR_APIEXPORT ur_result_t urEnqueueNativeCommandExp( +UR_APIEXPORT ur_result_t UR_APICALL urEnqueueNativeCommandExp( ur_queue_handle_t, ur_exp_enqueue_native_command_function_t, void *, uint32_t, const ur_mem_handle_t *, const 
ur_exp_enqueue_native_command_properties_t *, uint32_t, diff --git a/source/adapters/opencl/enqueue_native.cpp b/source/adapters/opencl/enqueue_native.cpp index 75beba8cbb..8f644971d2 100644 --- a/source/adapters/opencl/enqueue_native.cpp +++ b/source/adapters/opencl/enqueue_native.cpp @@ -10,7 +10,7 @@ #include -UR_APICALL UR_APIEXPORT ur_result_t urEnqueueNativeCommandExp( +UR_APIEXPORT ur_result_t UR_APICALL urEnqueueNativeCommandExp( ur_queue_handle_t, ur_exp_enqueue_native_command_function_t, void *, uint32_t, const ur_mem_handle_t *, const ur_exp_enqueue_native_command_properties_t *, uint32_t, From bfcfdb20ca97be037f468e587d354f97ee4ac678 Mon Sep 17 00:00:00 2001 From: Hugh Delaney Date: Fri, 14 Jun 2024 11:21:18 +0100 Subject: [PATCH 12/15] Add more tests for native enqueue Check that deps work before and after standard UR enqueue calls. --- .../enqueue_native_cuda.cpp | 32 +++++++++++++++++++ 1 file changed, 32 insertions(+) diff --git a/test/conformance/exp_enqueue_native/enqueue_native_cuda.cpp b/test/conformance/exp_enqueue_native/enqueue_native_cuda.cpp index e7870789d9..8029d3ce6f 100644 --- a/test/conformance/exp_enqueue_native/enqueue_native_cuda.cpp +++ b/test/conformance/exp_enqueue_native/enqueue_native_cuda.cpp @@ -88,3 +88,35 @@ TEST_P(urCudaEnqueueNativeCommandTest, Dependencies) { ASSERT_EQ(i, val); } } + +TEST_P(urCudaEnqueueNativeCommandTest, DependenciesURBefore) { + ur_event_handle_t event_1, event_2; + + ASSERT_SUCCESS(urEnqueueUSMFill(queue, device_ptr, sizeof(val), &val, + allocation_size, 0, + nullptr /*phEventWaitList=*/, &event_1)); + + InteropData2 data_2{device_ptr, host_vec.data()}; + ASSERT_SUCCESS(urEnqueueNativeCommandExp( + queue, &interop_func_2, &data_2, 0, nullptr /*phMemList=*/, + nullptr /*pProperties=*/, 1, &event_1, &event_2)); + urQueueFinish(queue); + for (auto &i : host_vec) { + ASSERT_EQ(i, val); + } +} + +TEST_P(urCudaEnqueueNativeCommandTest, DependenciesURAfter) { + ur_event_handle_t event_1; + + InteropData1 
data_1{device_ptr}; + ASSERT_SUCCESS(urEnqueueNativeCommandExp( + queue, &interop_func_1, &data_1, 0, nullptr /*phMemList=*/, + nullptr /*pProperties=*/, 0, nullptr /*phEventWaitList=*/, &event_1)); + + urEnqueueUSMMemcpy(queue, /*blocking*/ true, host_vec.data(), device_ptr, + allocation_size, 1, &event_1, nullptr); + for (auto &i : host_vec) { + ASSERT_EQ(i, val); + } +} From 8b18b373ff829cea02f6a88bf6eef2c63124768d Mon Sep 17 00:00:00 2001 From: Hugh Delaney Date: Fri, 14 Jun 2024 13:15:04 +0100 Subject: [PATCH 13/15] Add device queries for all adapters --- source/adapters/hip/device.cpp | 4 ++++ source/adapters/level_zero/device.cpp | 4 ++++ source/adapters/native_cpu/device.cpp | 4 ++++ source/adapters/opencl/device.cpp | 3 +++ 4 files changed, 15 insertions(+) diff --git a/source/adapters/hip/device.cpp b/source/adapters/hip/device.cpp index 7ab052a320..971a37117b 100644 --- a/source/adapters/hip/device.cpp +++ b/source/adapters/hip/device.cpp @@ -881,6 +881,10 @@ UR_APIEXPORT ur_result_t UR_APICALL urDeviceGetInfo(ur_device_handle_t hDevice, return ReturnValue(false); case UR_DEVICE_INFO_TIMESTAMP_RECORDING_SUPPORT_EXP: return ReturnValue(true); + case UR_DEVICE_INFO_ENQUEUE_NATIVE_COMMAND_SUPPORT_EXP: { + // HIP supports enqueueing native work through the urNativeEnqueueExp + return ReturnValue(true); + } // TODO: Investigate if this information is available on HIP. 
case UR_DEVICE_INFO_COMPONENT_DEVICES: diff --git a/source/adapters/level_zero/device.cpp b/source/adapters/level_zero/device.cpp index 267e53ff11..45eb85dd7a 100644 --- a/source/adapters/level_zero/device.cpp +++ b/source/adapters/level_zero/device.cpp @@ -874,6 +874,10 @@ UR_APIEXPORT ur_result_t UR_APICALL urDeviceGetInfo( case UR_DEVICE_INFO_TIMESTAMP_RECORDING_SUPPORT_EXP: { return ReturnValue(static_cast(true)); } + case UR_DEVICE_INFO_ENQUEUE_NATIVE_COMMAND_SUPPORT_EXP: { + // L0 doesn't support enqueueing native work through the urNativeEnqueueExp + return ReturnValue(static_cast(false)); + } case UR_DEVICE_INFO_ESIMD_SUPPORT: { // ESIMD is only supported by Intel GPUs. diff --git a/source/adapters/native_cpu/device.cpp b/source/adapters/native_cpu/device.cpp index 8571b31bfa..f4b0a3e518 100644 --- a/source/adapters/native_cpu/device.cpp +++ b/source/adapters/native_cpu/device.cpp @@ -321,6 +321,10 @@ UR_APIEXPORT ur_result_t UR_APICALL urDeviceGetInfo(ur_device_handle_t hDevice, case UR_DEVICE_INFO_TIMESTAMP_RECORDING_SUPPORT_EXP: return ReturnValue(false); + + case UR_DEVICE_INFO_ENQUEUE_NATIVE_COMMAND_SUPPORT_EXP: + return ReturnValue(false); + default: DIE_NO_IMPLEMENTATION; } diff --git a/source/adapters/opencl/device.cpp b/source/adapters/opencl/device.cpp index 25622aea22..4445e84260 100644 --- a/source/adapters/opencl/device.cpp +++ b/source/adapters/opencl/device.cpp @@ -802,6 +802,9 @@ UR_APIEXPORT ur_result_t UR_APICALL urDeviceGetInfo(ur_device_handle_t hDevice, case UR_DEVICE_INFO_TIMESTAMP_RECORDING_SUPPORT_EXP: { return ReturnValue(false); } + case UR_DEVICE_INFO_ENQUEUE_NATIVE_COMMAND_SUPPORT_EXP: { + return ReturnValue(false); + } case UR_DEVICE_INFO_HOST_PIPE_READ_WRITE_SUPPORTED: { bool Supported = false; CL_RETURN_ON_FAILURE(cl_adapter::checkDeviceExtensions( From d574d16a09e33e9c8c4b00351736cb00205c166e Mon Sep 17 00:00:00 2001 From: Hugh Delaney Date: Mon, 17 Jun 2024 10:55:50 +0100 Subject: [PATCH 14/15] Don't use 
DIE_NO_IMPLEMENTATION --- source/adapters/native_cpu/enqueue.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/source/adapters/native_cpu/enqueue.cpp b/source/adapters/native_cpu/enqueue.cpp index 34d0b7a4b7..835a7febcf 100644 --- a/source/adapters/native_cpu/enqueue.cpp +++ b/source/adapters/native_cpu/enqueue.cpp @@ -716,5 +716,5 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueNativeCommandExp( uint32_t, const ur_mem_handle_t *, const ur_exp_enqueue_native_command_properties_t *, uint32_t, const ur_event_handle_t *, ur_event_handle_t *) { - DIE_NO_IMPLEMENTATION; + return UR_RESULT_ERROR_UNSUPPORTED_FEATURE; } From 52a7fc69911e5c8785124bd33f16f834b4ff7c46 Mon Sep 17 00:00:00 2001 From: Hugh Delaney Date: Fri, 21 Jun 2024 13:42:48 +0100 Subject: [PATCH 15/15] Add ScopedStream testing for UR CUDA adapter --- test/adapters/cuda/CMakeLists.txt | 4 ++ test/adapters/cuda/urQueueGetNativeHandle.cpp | 67 +++++++++++++++++++ 2 files changed, 71 insertions(+) create mode 100644 test/adapters/cuda/urQueueGetNativeHandle.cpp diff --git a/test/adapters/cuda/CMakeLists.txt b/test/adapters/cuda/CMakeLists.txt index fbc15b47e8..66c1fa4b1e 100644 --- a/test/adapters/cuda/CMakeLists.txt +++ b/test/adapters/cuda/CMakeLists.txt @@ -13,8 +13,12 @@ add_adapter_test(cuda urDeviceCreateWithNativeHandle.cpp urEventGetNativeHandle.cpp urEventCreateWithNativeHandle.cpp + urQueueGetNativeHandle.cpp kernel_tests.cpp memory_tests.cpp + #FIXME: make this cleaner + ${CMAKE_CURRENT_SOURCE_DIR}/../../../source/adapters/cuda/queue.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/../../../source/adapters/cuda/common.cpp ENVIRONMENT "UR_ADAPTERS_FORCE_LOAD=\"$\"" ) diff --git a/test/adapters/cuda/urQueueGetNativeHandle.cpp b/test/adapters/cuda/urQueueGetNativeHandle.cpp new file mode 100644 index 0000000000..f0c68602cc --- /dev/null +++ b/test/adapters/cuda/urQueueGetNativeHandle.cpp @@ -0,0 +1,67 @@ +// Copyright (C) 2022-2024 Intel Corporation +// Part of the Unified-Runtime Project, under 
the Apache License v2.0 with LLVM Exceptions. +// See LICENSE.TXT +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +#include "fixtures.h" +#include "queue.hpp" + +using urCudaQueueGetNativeHandleTest = uur::urQueueTest; +UUR_INSTANTIATE_DEVICE_TEST_SUITE_P(urCudaQueueGetNativeHandleTest); + +TEST_P(urCudaQueueGetNativeHandleTest, Success) { + CUstream Stream; + ASSERT_SUCCESS( + urQueueGetNativeHandle(queue, nullptr, (ur_native_handle_t *)&Stream)); + ASSERT_SUCCESS_CUDA(cuStreamSynchronize(Stream)); +} + +TEST_P(urCudaQueueGetNativeHandleTest, OutOfOrder) { + CUstream Stream; + ur_queue_properties_t props = { + /*.stype =*/UR_STRUCTURE_TYPE_QUEUE_PROPERTIES, + /*.pNext =*/nullptr, + /*.flags =*/UR_QUEUE_FLAG_OUT_OF_ORDER_EXEC_MODE_ENABLE, + }; + ASSERT_SUCCESS(urQueueCreate(context, device, &props, &queue)); + ASSERT_SUCCESS( + urQueueGetNativeHandle(queue, nullptr, (ur_native_handle_t *)&Stream)); + ASSERT_SUCCESS_CUDA(cuStreamSynchronize(Stream)); +} + +TEST_P(urCudaQueueGetNativeHandleTest, ScopedStream) { + CUstream Stream1, Stream2; + ur_queue_properties_t props = { + /*.stype =*/UR_STRUCTURE_TYPE_QUEUE_PROPERTIES, + /*.pNext =*/nullptr, + /*.flags =*/UR_QUEUE_FLAG_OUT_OF_ORDER_EXEC_MODE_ENABLE, + }; + ur_queue_handle_t OutOfOrderQueue; + ASSERT_SUCCESS(urQueueCreate(context, device, &props, &OutOfOrderQueue)); + ASSERT_SUCCESS(urQueueGetNativeHandle(OutOfOrderQueue, nullptr, + (ur_native_handle_t *)&Stream1)); + ASSERT_SUCCESS(urQueueGetNativeHandle(OutOfOrderQueue, nullptr, + (ur_native_handle_t *)&Stream2)); + + // We might want to remove this assertion at some point. 
This is just + // testing currently implemented behaviour: getting the native handle of OutOfOrderQueue + // will call `getNextComputeStream` + ASSERT_NE(Stream1, Stream2); + + { + ScopedStream ActiveStream(OutOfOrderQueue, 0, nullptr); + + ASSERT_SUCCESS(urQueueGetNativeHandle(OutOfOrderQueue, nullptr, + (ur_native_handle_t *)&Stream1)); + ASSERT_SUCCESS(urQueueGetNativeHandle(OutOfOrderQueue, nullptr, + (ur_native_handle_t *)&Stream2)); + ASSERT_EQ(Stream1, Stream2); + } + + // Go back to returning new streams each time + ASSERT_SUCCESS(urQueueGetNativeHandle(OutOfOrderQueue, nullptr, + (ur_native_handle_t *)&Stream1)); + ASSERT_SUCCESS(urQueueGetNativeHandle(OutOfOrderQueue, nullptr, + (ur_native_handle_t *)&Stream2)); + ASSERT_NE(Stream1, Stream2); +}