diff --git a/include/ur_api.h b/include/ur_api.h index bd69372aa7..84fe704d5f 100644 --- a/include/ur_api.h +++ b/include/ur_api.h @@ -225,6 +225,7 @@ typedef enum ur_function_t { UR_FUNCTION_KERNEL_GET_SUGGESTED_LOCAL_WORK_SIZE = 225, ///< Enumerator for ::urKernelGetSuggestedLocalWorkSize UR_FUNCTION_BINDLESS_IMAGES_IMPORT_EXTERNAL_MEMORY_EXP = 226, ///< Enumerator for ::urBindlessImagesImportExternalMemoryExp UR_FUNCTION_BINDLESS_IMAGES_IMPORT_EXTERNAL_SEMAPHORE_EXP = 227, ///< Enumerator for ::urBindlessImagesImportExternalSemaphoreExp + UR_FUNCTION_ENQUEUE_NATIVE_COMMAND_EXP = 228, ///< Enumerator for ::urEnqueueNativeCommandExp /// @cond UR_FUNCTION_FORCE_UINT32 = 0x7fffffff /// @endcond @@ -281,6 +282,7 @@ typedef enum ur_structure_type_t { UR_STRUCTURE_TYPE_EXP_WIN32_HANDLE = 0x2004, ///< ::ur_exp_win32_handle_t UR_STRUCTURE_TYPE_EXP_SAMPLER_ADDR_MODES = 0x2005, ///< ::ur_exp_sampler_addr_modes_t UR_STRUCTURE_TYPE_EXP_SAMPLER_CUBEMAP_PROPERTIES = 0x2006, ///< ::ur_exp_sampler_cubemap_properties_t + UR_STRUCTURE_TYPE_EXP_ENQUEUE_NATIVE_COMMAND_PROPERTIES = 0x3000, ///< ::ur_exp_enqueue_native_command_properties_t /// @cond UR_STRUCTURE_TYPE_FORCE_UINT32 = 0x7fffffff /// @endcond @@ -1648,6 +1650,8 @@ typedef enum ur_device_info_t { UR_DEVICE_INFO_BINDLESS_SAMPLED_IMAGE_FETCH_3D_EXP = 0x2017, ///< [::ur_bool_t] returns true if the device is capable of fetching ///< non-USM backed 3D sampled image data. 
UR_DEVICE_INFO_TIMESTAMP_RECORDING_SUPPORT_EXP = 0x2018, ///< [::ur_bool_t] returns true if the device supports timestamp recording + UR_DEVICE_INFO_ENQUEUE_NATIVE_COMMAND_SUPPORT_EXP = 0x2020, ///< [::ur_bool_t] returns true if the device supports enqueueing of native + ///< work /// @cond UR_DEVICE_INFO_FORCE_UINT32 = 0x7fffffff /// @endcond @@ -1673,7 +1677,7 @@ typedef enum ur_device_info_t { /// - ::UR_RESULT_ERROR_INVALID_NULL_HANDLE /// + `NULL == hDevice` /// - ::UR_RESULT_ERROR_INVALID_ENUMERATION -/// + `::UR_DEVICE_INFO_TIMESTAMP_RECORDING_SUPPORT_EXP < propName` +/// + `::UR_DEVICE_INFO_ENQUEUE_NATIVE_COMMAND_SUPPORT_EXP < propName` /// - ::UR_RESULT_ERROR_UNSUPPORTED_ENUMERATION /// + If `propName` is not supported by the adapter. /// - ::UR_RESULT_ERROR_INVALID_SIZE @@ -5683,6 +5687,7 @@ typedef enum ur_command_t { UR_COMMAND_INTEROP_SEMAPHORE_WAIT_EXP = 0x2000, ///< Event created by ::urBindlessImagesWaitExternalSemaphoreExp UR_COMMAND_INTEROP_SEMAPHORE_SIGNAL_EXP = 0x2001, ///< Event created by ::urBindlessImagesSignalExternalSemaphoreExp UR_COMMAND_TIMESTAMP_RECORDING_EXP = 0x2002, ///< Event created by ::urEnqueueTimestampRecordingExp + UR_COMMAND_ENQUEUE_NATIVE_EXP = 0x2004, ///< Event created by ::urEnqueueNativeCommandExp /// @cond UR_COMMAND_FORCE_UINT32 = 0x7fffffff /// @endcond @@ -9497,6 +9502,80 @@ urUsmP2PPeerAccessGetInfoExp( size_t *pPropSizeRet ///< [out][optional] pointer to the actual size in bytes of the queried propName. 
); +#if !defined(__GNUC__) +#pragma endregion +#endif +// Intel 'oneAPI' Unified Runtime Experimental API for enqueuing work through native APIs +#if !defined(__GNUC__) +#pragma region native enqueue(experimental) +#endif +/////////////////////////////////////////////////////////////////////////////// +/// @brief Native enqueue properties +typedef uint32_t ur_exp_enqueue_native_command_flags_t; +typedef enum ur_exp_enqueue_native_command_flag_t { + UR_EXP_ENQUEUE_NATIVE_COMMAND_FLAG_TBD = UR_BIT(0), ///< reserved for future use. + /// @cond + UR_EXP_ENQUEUE_NATIVE_COMMAND_FLAG_FORCE_UINT32 = 0x7fffffff + /// @endcond + +} ur_exp_enqueue_native_command_flag_t; +/// @brief Bit Mask for validating ur_exp_enqueue_native_command_flags_t +#define UR_EXP_ENQUEUE_NATIVE_COMMAND_FLAGS_MASK 0xfffffffe + +/////////////////////////////////////////////////////////////////////////////// +/// @brief Native enqueue properties +typedef struct ur_exp_enqueue_native_command_properties_t { + ur_structure_type_t stype; ///< [in] type of this structure, must be + ///< ::UR_STRUCTURE_TYPE_EXP_ENQUEUE_NATIVE_COMMAND_PROPERTIES + void *pNext; ///< [in,out][optional] pointer to extension-specific structure + ur_exp_enqueue_native_command_flags_t flags; ///< [in] native enqueue flags + +} ur_exp_enqueue_native_command_properties_t; + +/////////////////////////////////////////////////////////////////////////////// +/// @brief Function enqueueing work through the native API to be executed +/// immediately. 
+typedef void (*ur_exp_enqueue_native_command_function_t)( + ur_queue_handle_t hQueue, ///< [in] handle of the queue object + void *pUserData ///< [in][out] pointer to data to be passed to callback +); + +/////////////////////////////////////////////////////////////////////////////// +/// @brief Immediately enqueue work through a native backend API +/// +/// @returns +/// - ::UR_RESULT_SUCCESS +/// - ::UR_RESULT_ERROR_UNINITIALIZED +/// - ::UR_RESULT_ERROR_DEVICE_LOST +/// - ::UR_RESULT_ERROR_ADAPTER_SPECIFIC +/// - ::UR_RESULT_ERROR_INVALID_NULL_HANDLE +/// + `NULL == hQueue` +/// - ::UR_RESULT_ERROR_INVALID_NULL_POINTER +/// + `NULL == pfnNativeEnqueue` +/// + `NULL == phEvent` +/// - ::UR_RESULT_ERROR_INVALID_ENUMERATION +/// + `NULL != pProperties && ::UR_EXP_ENQUEUE_NATIVE_COMMAND_FLAGS_MASK & pProperties->flags` +/// - ::UR_RESULT_ERROR_INVALID_EVENT_WAIT_LIST +UR_APIEXPORT ur_result_t UR_APICALL +urEnqueueNativeCommandExp( + ur_queue_handle_t hQueue, ///< [in] handle of the queue object + ur_exp_enqueue_native_command_function_t pfnNativeEnqueue, ///< [in] function calling the native underlying API, to be executed + ///< immediately. + void *data, ///< [in][optional] data used by pfnNativeEnqueue + uint32_t numMemsInMemList, ///< [in] size of the mem list + const ur_mem_handle_t *phMemList, ///< [in][optional][range(0, numMemsInMemList)] mems that are used within + ///< pfnNativeEnqueue using ::urMemGetNativeHandle. + ///< If nullptr, the numMemsInMemList must be 0, indicating that no mems + ///< are accessed with ::urMemGetNativeHandle within pfnNativeEnqueue. + const ur_exp_enqueue_native_command_properties_t *pProperties, ///< [in][optional] pointer to the native enqueue properties + uint32_t numEventsInWaitList, ///< [in] size of the event wait list + const ur_event_handle_t *phEventWaitList, ///< [in][optional][range(0, numEventsInWaitList)] pointer to a list of + ///< events that must be complete before the kernel execution. 
+ ///< If nullptr, the numEventsInWaitList must be 0, indicating no wait events. + ur_event_handle_t *phEvent ///< [in,out] return an event object that identifies the work that has + ///< been enqueued in pfnNativeEnqueue. +); + #if !defined(__GNUC__) #pragma endregion #endif @@ -10916,6 +10995,22 @@ typedef struct ur_enqueue_timestamp_recording_exp_params_t { ur_event_handle_t **pphEvent; } ur_enqueue_timestamp_recording_exp_params_t; +/////////////////////////////////////////////////////////////////////////////// +/// @brief Function parameters for urEnqueueNativeCommandExp +/// @details Each entry is a pointer to the parameter passed to the function; +/// allowing the callback the ability to modify the parameter's value +typedef struct ur_enqueue_native_command_exp_params_t { + ur_queue_handle_t *phQueue; + ur_exp_enqueue_native_command_function_t *ppfnNativeEnqueue; + void **pdata; + uint32_t *pnumMemsInMemList; + const ur_mem_handle_t **pphMemList; + const ur_exp_enqueue_native_command_properties_t **ppProperties; + uint32_t *pnumEventsInWaitList; + const ur_event_handle_t **pphEventWaitList; + ur_event_handle_t **pphEvent; +} ur_enqueue_native_command_exp_params_t; + /////////////////////////////////////////////////////////////////////////////// /// @brief Function parameters for urBindlessImagesUnsampledImageHandleDestroyExp /// @details Each entry is a pointer to the parameter passed to the function; diff --git a/include/ur_ddi.h b/include/ur_ddi.h index d429b02d68..26e2d403ac 100644 --- a/include/ur_ddi.h +++ b/include/ur_ddi.h @@ -1483,12 +1483,26 @@ typedef ur_result_t(UR_APICALL *ur_pfnEnqueueTimestampRecordingExp_t)( const ur_event_handle_t *, ur_event_handle_t *); +/////////////////////////////////////////////////////////////////////////////// +/// @brief Function-pointer for urEnqueueNativeCommandExp +typedef ur_result_t(UR_APICALL *ur_pfnEnqueueNativeCommandExp_t)( + ur_queue_handle_t, + ur_exp_enqueue_native_command_function_t, + void *, + 
uint32_t, + const ur_mem_handle_t *, + const ur_exp_enqueue_native_command_properties_t *, + uint32_t, + const ur_event_handle_t *, + ur_event_handle_t *); + /////////////////////////////////////////////////////////////////////////////// /// @brief Table of EnqueueExp functions pointers typedef struct ur_enqueue_exp_dditable_t { ur_pfnEnqueueKernelLaunchCustomExp_t pfnKernelLaunchCustomExp; ur_pfnEnqueueCooperativeKernelLaunchExp_t pfnCooperativeKernelLaunchExp; ur_pfnEnqueueTimestampRecordingExp_t pfnTimestampRecordingExp; + ur_pfnEnqueueNativeCommandExp_t pfnNativeCommandExp; } ur_enqueue_exp_dditable_t; /////////////////////////////////////////////////////////////////////////////// diff --git a/include/ur_print.h b/include/ur_print.h index b72e939f05..60aa71f03b 100644 --- a/include/ur_print.h +++ b/include/ur_print.h @@ -1042,6 +1042,22 @@ UR_APIEXPORT ur_result_t UR_APICALL urPrintExpLaunchProperty(const struct ur_exp /// - `buff_size < out_size` UR_APIEXPORT ur_result_t UR_APICALL urPrintExpPeerInfo(enum ur_exp_peer_info_t value, char *buffer, const size_t buff_size, size_t *out_size); +/////////////////////////////////////////////////////////////////////////////// +/// @brief Print ur_exp_enqueue_native_command_flag_t enum +/// @returns +/// - ::UR_RESULT_SUCCESS +/// - ::UR_RESULT_ERROR_INVALID_SIZE +/// - `buff_size < out_size` +UR_APIEXPORT ur_result_t UR_APICALL urPrintExpEnqueueNativeCommandFlags(enum ur_exp_enqueue_native_command_flag_t value, char *buffer, const size_t buff_size, size_t *out_size); + +/////////////////////////////////////////////////////////////////////////////// +/// @brief Print ur_exp_enqueue_native_command_properties_t struct +/// @returns +/// - ::UR_RESULT_SUCCESS +/// - ::UR_RESULT_ERROR_INVALID_SIZE +/// - `buff_size < out_size` +UR_APIEXPORT ur_result_t UR_APICALL urPrintExpEnqueueNativeCommandProperties(const struct ur_exp_enqueue_native_command_properties_t params, char *buffer, const size_t buff_size, size_t *out_size); + 
/////////////////////////////////////////////////////////////////////////////// /// @brief Print ur_loader_config_create_params_t struct /// @returns @@ -2010,6 +2026,14 @@ UR_APIEXPORT ur_result_t UR_APICALL urPrintEnqueueCooperativeKernelLaunchExpPara /// - `buff_size < out_size` UR_APIEXPORT ur_result_t UR_APICALL urPrintEnqueueTimestampRecordingExpParams(const struct ur_enqueue_timestamp_recording_exp_params_t *params, char *buffer, const size_t buff_size, size_t *out_size); +/////////////////////////////////////////////////////////////////////////////// +/// @brief Print ur_enqueue_native_command_exp_params_t struct +/// @returns +/// - ::UR_RESULT_SUCCESS +/// - ::UR_RESULT_ERROR_INVALID_SIZE +/// - `buff_size < out_size` +UR_APIEXPORT ur_result_t UR_APICALL urPrintEnqueueNativeCommandExpParams(const struct ur_enqueue_native_command_exp_params_t *params, char *buffer, const size_t buff_size, size_t *out_size); + /////////////////////////////////////////////////////////////////////////////// /// @brief Print ur_bindless_images_unsampled_image_handle_destroy_exp_params_t struct /// @returns diff --git a/include/ur_print.hpp b/include/ur_print.hpp index f4e886e36b..5919e57019 100644 --- a/include/ur_print.hpp +++ b/include/ur_print.hpp @@ -215,6 +215,9 @@ inline ur_result_t printUnion( template <> inline ur_result_t printTagged(std::ostream &os, const void *ptr, ur_exp_peer_info_t value, size_t size); +template <> +inline ur_result_t printFlag(std::ostream &os, uint32_t flag); + } // namespace ur::details inline std::ostream &operator<<(std::ostream &os, enum ur_function_t value); @@ -345,6 +348,8 @@ inline std::ostream &operator<<(std::ostream &os, [[maybe_unused]] const struct inline std::ostream &operator<<(std::ostream &os, enum ur_exp_launch_property_id_t value); inline std::ostream &operator<<(std::ostream &os, [[maybe_unused]] const struct ur_exp_launch_property_t params); inline std::ostream &operator<<(std::ostream &os, enum ur_exp_peer_info_t value); 
+inline std::ostream &operator<<(std::ostream &os, enum ur_exp_enqueue_native_command_flag_t value); +inline std::ostream &operator<<(std::ostream &os, [[maybe_unused]] const struct ur_exp_enqueue_native_command_properties_t params); /////////////////////////////////////////////////////////////////////////////// /// @brief Print operator for the ur_function_t type @@ -934,6 +939,9 @@ inline std::ostream &operator<<(std::ostream &os, enum ur_function_t value) { case UR_FUNCTION_BINDLESS_IMAGES_IMPORT_EXTERNAL_SEMAPHORE_EXP: os << "UR_FUNCTION_BINDLESS_IMAGES_IMPORT_EXTERNAL_SEMAPHORE_EXP"; break; + case UR_FUNCTION_ENQUEUE_NATIVE_COMMAND_EXP: + os << "UR_FUNCTION_ENQUEUE_NATIVE_COMMAND_EXP"; + break; default: os << "unknown enumerator"; break; @@ -1087,6 +1095,9 @@ inline std::ostream &operator<<(std::ostream &os, enum ur_structure_type_t value case UR_STRUCTURE_TYPE_EXP_SAMPLER_CUBEMAP_PROPERTIES: os << "UR_STRUCTURE_TYPE_EXP_SAMPLER_CUBEMAP_PROPERTIES"; break; + case UR_STRUCTURE_TYPE_EXP_ENQUEUE_NATIVE_COMMAND_PROPERTIES: + os << "UR_STRUCTURE_TYPE_EXP_ENQUEUE_NATIVE_COMMAND_PROPERTIES"; + break; default: os << "unknown enumerator"; break; @@ -1338,6 +1349,11 @@ inline ur_result_t printStruct(std::ostream &os, const void *ptr) { const ur_exp_sampler_cubemap_properties_t *pstruct = (const ur_exp_sampler_cubemap_properties_t *)ptr; printPtr(os, pstruct); } break; + + case UR_STRUCTURE_TYPE_EXP_ENQUEUE_NATIVE_COMMAND_PROPERTIES: { + const ur_exp_enqueue_native_command_properties_t *pstruct = (const ur_exp_enqueue_native_command_properties_t *)ptr; + printPtr(os, pstruct); + } break; default: os << "unknown enumerator"; return UR_RESULT_ERROR_INVALID_ENUMERATION; @@ -2589,6 +2605,9 @@ inline std::ostream &operator<<(std::ostream &os, enum ur_device_info_t value) { case UR_DEVICE_INFO_TIMESTAMP_RECORDING_SUPPORT_EXP: os << "UR_DEVICE_INFO_TIMESTAMP_RECORDING_SUPPORT_EXP"; break; + case UR_DEVICE_INFO_ENQUEUE_NATIVE_COMMAND_SUPPORT_EXP: + os << 
"UR_DEVICE_INFO_ENQUEUE_NATIVE_COMMAND_SUPPORT_EXP"; + break; default: os << "unknown enumerator"; break; @@ -4310,6 +4329,18 @@ inline ur_result_t printTagged(std::ostream &os, const void *ptr, ur_device_info os << ")"; } break; + case UR_DEVICE_INFO_ENQUEUE_NATIVE_COMMAND_SUPPORT_EXP: { + const ur_bool_t *tptr = (const ur_bool_t *)ptr; + if (sizeof(ur_bool_t) > size) { + os << "invalid size (is: " << size << ", expected: >=" << sizeof(ur_bool_t) << ")"; + return UR_RESULT_ERROR_INVALID_SIZE; + } + os << (const void *)(tptr) << " ("; + + os << *tptr; + + os << ")"; + } break; default: os << "unknown enumerator"; return UR_RESULT_ERROR_INVALID_ENUMERATION; @@ -8815,6 +8846,9 @@ inline std::ostream &operator<<(std::ostream &os, enum ur_command_t value) { case UR_COMMAND_TIMESTAMP_RECORDING_EXP: os << "UR_COMMAND_TIMESTAMP_RECORDING_EXP"; break; + case UR_COMMAND_ENQUEUE_NATIVE_EXP: + os << "UR_COMMAND_ENQUEUE_NATIVE_EXP"; + break; default: os << "unknown enumerator"; break; @@ -10023,6 +10057,78 @@ inline ur_result_t printTagged(std::ostream &os, const void *ptr, ur_exp_peer_in } } // namespace ur::details +/////////////////////////////////////////////////////////////////////////////// +/// @brief Print operator for the ur_exp_enqueue_native_command_flag_t type +/// @returns +/// std::ostream & +inline std::ostream &operator<<(std::ostream &os, enum ur_exp_enqueue_native_command_flag_t value) { + switch (value) { + case UR_EXP_ENQUEUE_NATIVE_COMMAND_FLAG_TBD: + os << "UR_EXP_ENQUEUE_NATIVE_COMMAND_FLAG_TBD"; + break; + default: + os << "unknown enumerator"; + break; + } + return os; +} + +namespace ur::details { +/////////////////////////////////////////////////////////////////////////////// +/// @brief Print ur_exp_enqueue_native_command_flag_t flag +template <> +inline ur_result_t printFlag(std::ostream &os, uint32_t flag) { + uint32_t val = flag; + bool first = true; + + if ((val & UR_EXP_ENQUEUE_NATIVE_COMMAND_FLAG_TBD) == 
(uint32_t)UR_EXP_ENQUEUE_NATIVE_COMMAND_FLAG_TBD) { + val ^= (uint32_t)UR_EXP_ENQUEUE_NATIVE_COMMAND_FLAG_TBD; + if (!first) { + os << " | "; + } else { + first = false; + } + os << UR_EXP_ENQUEUE_NATIVE_COMMAND_FLAG_TBD; + } + if (val != 0) { + std::bitset<32> bits(val); + if (!first) { + os << " | "; + } + os << "unknown bit flags " << bits; + } else if (first) { + os << "0"; + } + return UR_RESULT_SUCCESS; +} +} // namespace ur::details +/////////////////////////////////////////////////////////////////////////////// +/// @brief Print operator for the ur_exp_enqueue_native_command_properties_t type +/// @returns +/// std::ostream & +inline std::ostream &operator<<(std::ostream &os, const struct ur_exp_enqueue_native_command_properties_t params) { + os << "(struct ur_exp_enqueue_native_command_properties_t){"; + + os << ".stype = "; + + os << (params.stype); + + os << ", "; + os << ".pNext = "; + + ur::details::printStruct(os, + (params.pNext)); + + os << ", "; + os << ".flags = "; + + ur::details::printFlag(os, + (params.flags)); + + os << "}"; + return os; +} + /////////////////////////////////////////////////////////////////////////////// /// @brief Print operator for the ur_loader_config_create_params_t type /// @returns @@ -14418,6 +14524,78 @@ inline std::ostream &operator<<(std::ostream &os, [[maybe_unused]] const struct return os; } +/////////////////////////////////////////////////////////////////////////////// +/// @brief Print operator for the ur_enqueue_native_command_exp_params_t type +/// @returns +/// std::ostream & +inline std::ostream &operator<<(std::ostream &os, [[maybe_unused]] const struct ur_enqueue_native_command_exp_params_t *params) { + + os << ".hQueue = "; + + ur::details::printPtr(os, + *(params->phQueue)); + + os << ", "; + os << ".pfnNativeEnqueue = "; + + os << reinterpret_cast( + *(params->ppfnNativeEnqueue)); + + os << ", "; + os << ".data = "; + + ur::details::printPtr(os, + *(params->pdata)); + + os << ", "; + os << 
".numMemsInMemList = "; + + os << *(params->pnumMemsInMemList); + + os << ", "; + os << ".phMemList = {"; + for (size_t i = 0; *(params->pphMemList) != NULL && i < *params->pnumMemsInMemList; ++i) { + if (i != 0) { + os << ", "; + } + + ur::details::printPtr(os, + (*(params->pphMemList))[i]); + } + os << "}"; + + os << ", "; + os << ".pProperties = "; + + ur::details::printPtr(os, + *(params->ppProperties)); + + os << ", "; + os << ".numEventsInWaitList = "; + + os << *(params->pnumEventsInWaitList); + + os << ", "; + os << ".phEventWaitList = {"; + for (size_t i = 0; *(params->pphEventWaitList) != NULL && i < *params->pnumEventsInWaitList; ++i) { + if (i != 0) { + os << ", "; + } + + ur::details::printPtr(os, + (*(params->pphEventWaitList))[i]); + } + os << "}"; + + os << ", "; + os << ".phEvent = "; + + ur::details::printPtr(os, + *(params->pphEvent)); + + return os; +} + /////////////////////////////////////////////////////////////////////////////// /// @brief Print operator for the ur_bindless_images_unsampled_image_handle_destroy_exp_params_t type /// @returns @@ -17467,6 +17645,9 @@ inline ur_result_t UR_APICALL printFunctionParams(std::ostream &os, ur_function_ case UR_FUNCTION_ENQUEUE_TIMESTAMP_RECORDING_EXP: { os << (const struct ur_enqueue_timestamp_recording_exp_params_t *)params; } break; + case UR_FUNCTION_ENQUEUE_NATIVE_COMMAND_EXP: { + os << (const struct ur_enqueue_native_command_exp_params_t *)params; + } break; case UR_FUNCTION_BINDLESS_IMAGES_UNSAMPLED_IMAGE_HANDLE_DESTROY_EXP: { os << (const struct ur_bindless_images_unsampled_image_handle_destroy_exp_params_t *)params; } break; diff --git a/scripts/core/EXP-NATIVE-ENQUEUE.rst b/scripts/core/EXP-NATIVE-ENQUEUE.rst new file mode 100644 index 0000000000..aba8cb6564 --- /dev/null +++ b/scripts/core/EXP-NATIVE-ENQUEUE.rst @@ -0,0 +1,85 @@ +<% + OneApi=tags['$OneApi'] + x=tags['$x'] + X=x.upper() +%> + +.. 
_experimental-enqueue-native-command: + +================================================================================ +Enqueue Native Command +================================================================================ + +.. warning:: + + Experimental features: + + * May be replaced, updated, or removed at any time. + * Do not require maintaining API/ABI stability of their own additions over + time. + * Do not require conformance testing of their own additions. + + +Motivation +-------------------------------------------------------------------------------- +Interop is an important use case for many programming APIs. Through +${x}EnqueueNativeCommandExp the user can immediately invoke some native API +calls in a way that the UR is aware of. In doing so, the UR adapter can +integrate its own scheduling of UR commands with native commands. + +In order for UR to guarantee correct synchronization of commands enqueued +within the native API through the function passed to +${x}EnqueueNativeCommandExp, the function argument must only use the native +queue accessed through ${x}QueueGetNativeHandle. Use of a native queue that is +not the native queue returned by ${x}QueueGetNativeHandle results in undefined +behaviour. + +Any arguments needed by pfnNativeEnqueue must be passed through a void* and +unpacked there. If ${x}_mem_handle_t arguments are to be used within +pfnNativeEnqueue, they must be accessed using ${x}MemGetNativeHandle. +${x}_mem_handle_t arguments must be packed in the void* argument that will be +used in pfnNativeEnqueue and also listed in ${x}EnqueueNativeCommandExp's +phMemList argument. 
+ +API +-------------------------------------------------------------------------------- + +Enums +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +* ${x}_device_info_t + * ${X}_DEVICE_INFO_ENQUEUE_NATIVE_COMMAND_SUPPORT_EXP +* ${x}_command_t + * ${X}_COMMAND_ENQUEUE_NATIVE_EXP +* ${x}_exp_enqueue_native_command_flags_t + +Types +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +${x}_exp_enqueue_native_command_properties_t + +Functions +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +* ${x}EnqueueNativeCommandExp + +Changelog +-------------------------------------------------------------------------------- + ++-----------+------------------------+ +| Revision | Changes | ++===========+========================+ +| 1.0 | Initial Draft | ++-----------+------------------------+ + + +Support +-------------------------------------------------------------------------------- + +Adapters which support this experimental feature *must* return true for the new +`${X}_DEVICE_INFO_ENQUEUE_NATIVE_COMMAND_SUPPORT_EXP` device info query. + + +Contributors +-------------------------------------------------------------------------------- + +* Hugh Delaney `hugh.delaney@codeplay.com `_ diff --git a/scripts/core/exp-native-enqueue.yml b/scripts/core/exp-native-enqueue.yml new file mode 100644 index 0000000000..5ebb9ab846 --- /dev/null +++ b/scripts/core/exp-native-enqueue.yml @@ -0,0 +1,119 @@ +# +# Copyright (C) 2024 Intel Corporation +# +# Part of the Unified-Runtime Project, under the Apache License v2.0 with LLVM Exceptions. 
+# See LICENSE.TXT +# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +# +# See YaML.md for syntax definition +# +--- #-------------------------------------------------------------------------- +type: header +desc: "Intel $OneApi Unified Runtime Experimental API for enqueuing work through native APIs" +ordinal: "100" + +--- #-------------------------------------------------------------------------- +type: enum +extend: true +typed_etors: true +desc: "Extension enums to $x_device_info_t to support native enqueue." +name: $x_device_info_t +etors: + - name: ENQUEUE_NATIVE_COMMAND_SUPPORT_EXP + value: "0x2020" + desc: "[$x_bool_t] returns true if the device supports enqueueing of native work" + +--- #-------------------------------------------------------------------------- +type: enum +extend: true +desc: "Command Type experimental enumerations." +name: $x_command_t +etors: + - name: ENQUEUE_NATIVE_EXP + value: "0x2004" + desc: Event created by $xEnqueueNativeCommandExp + +--- #-------------------------------------------------------------------------- +type: enum +desc: "Native enqueue properties" +name: $x_exp_enqueue_native_command_flags_t +etors: + - name: TBD + desc: "reserved for future use." 
+ +--- #-------------------------------------------------------------------------- +type: struct +desc: "Native enqueue properties" +name: $x_exp_enqueue_native_command_properties_t +base: $x_base_properties_t +members: + - type: $x_exp_enqueue_native_command_flags_t + name: flags + desc: "[in] native enqueue flags" + +--- #-------------------------------------------------------------------------- +type: enum +extend: true +desc: "Structure type experimental enumerations" +name: $x_structure_type_t +etors: + - name: EXP_ENQUEUE_NATIVE_COMMAND_PROPERTIES + desc: $x_exp_enqueue_native_command_properties_t + value: "0x3000" + +--- #-------------------------------------------------------------------------- +type: fptr_typedef +desc: "Function enqueueing work through the native API to be executed immediately." +name: $x_exp_enqueue_native_command_function_t +return: void +params: + - type: $x_queue_handle_t + name: hQueue + desc: "[in] handle of the queue object" + - type: void* + name: pUserData + desc: "[in][out] pointer to data to be passed to callback" + +--- #-------------------------------------------------------------------------- +type: function +desc: "Immediately enqueue work through a native backend API" +class: $xEnqueue +name: NativeCommandExp +params: + - type: $x_queue_handle_t + name: hQueue + desc: "[in] handle of the queue object" + - type: $x_exp_enqueue_native_command_function_t + desc: "[in] function calling the native underlying API, to be executed immediately." + name: pfnNativeEnqueue + - type: void* + name: data + desc: "[in][optional] data used by pfnNativeEnqueue" + - type: uint32_t + name: numMemsInMemList + desc: "[in] size of the mem list" + - type: const $x_mem_handle_t* + name: phMemList + desc: | + [in][optional][range(0, numMemsInMemList)] mems that are used within pfnNativeEnqueue using $xMemGetNativeHandle. 
+ If nullptr, the numMemsInMemList must be 0, indicating that no mems are accessed with $xMemGetNativeHandle within pfnNativeEnqueue. + - type: const $x_exp_enqueue_native_command_properties_t* + name: pProperties + desc: "[in][optional] pointer to the native enqueue properties" + - type: uint32_t + name: numEventsInWaitList + desc: "[in] size of the event wait list" + - type: const $x_event_handle_t* + name: phEventWaitList + desc: | + [in][optional][range(0, numEventsInWaitList)] pointer to a list of events that must be complete before the kernel execution. + If nullptr, the numEventsInWaitList must be 0, indicating no wait events. + - type: $x_event_handle_t* + name: phEvent + desc: | + [in,out] return an event object that identifies the work that has + been enqueued in pfnNativeEnqueue. +returns: + - $X_RESULT_ERROR_INVALID_NULL_HANDLE + - $X_RESULT_ERROR_INVALID_NULL_POINTER + - $X_RESULT_ERROR_INVALID_EVENT_WAIT_LIST diff --git a/scripts/core/registry.yml b/scripts/core/registry.yml index 73f1a107d0..8157bbb08a 100644 --- a/scripts/core/registry.yml +++ b/scripts/core/registry.yml @@ -589,6 +589,9 @@ etors: - name: BINDLESS_IMAGES_IMPORT_EXTERNAL_SEMAPHORE_EXP desc: Enumerator for $xBindlessImagesImportExternalSemaphoreExp value: '227' +- name: ENQUEUE_NATIVE_COMMAND_EXP + desc: Enumerator for $xEnqueueNativeCommandExp + value: '228' --- type: enum desc: Defines structure types diff --git a/source/adapters/cuda/CMakeLists.txt b/source/adapters/cuda/CMakeLists.txt index baa67e5961..b3afb74329 100644 --- a/source/adapters/cuda/CMakeLists.txt +++ b/source/adapters/cuda/CMakeLists.txt @@ -19,6 +19,7 @@ add_ur_adapter(${TARGET_NAME} ${CMAKE_CURRENT_SOURCE_DIR}/device.hpp ${CMAKE_CURRENT_SOURCE_DIR}/device.cpp ${CMAKE_CURRENT_SOURCE_DIR}/enqueue.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/enqueue_native.cpp ${CMAKE_CURRENT_SOURCE_DIR}/event.hpp ${CMAKE_CURRENT_SOURCE_DIR}/event.cpp ${CMAKE_CURRENT_SOURCE_DIR}/image.hpp diff --git a/source/adapters/cuda/device.cpp 
b/source/adapters/cuda/device.cpp index b0e0a8b2d0..bd15a62504 100644 --- a/source/adapters/cuda/device.cpp +++ b/source/adapters/cuda/device.cpp @@ -951,6 +951,10 @@ UR_APIEXPORT ur_result_t UR_APICALL urDeviceGetInfo(ur_device_handle_t hDevice, // CUDA supports recording timestamp events. return ReturnValue(true); } + case UR_DEVICE_INFO_ENQUEUE_NATIVE_COMMAND_SUPPORT_EXP: { + // CUDA supports enqueueing native work through urEnqueueNativeCommandExp + return ReturnValue(true); + } case UR_DEVICE_INFO_DEVICE_ID: { int Value = 0; UR_CHECK_ERROR(cuDeviceGetAttribute( diff --git a/source/adapters/cuda/enqueue_native.cpp b/source/adapters/cuda/enqueue_native.cpp new file mode 100644 index 0000000000..cf38d713b5 --- /dev/null +++ b/source/adapters/cuda/enqueue_native.cpp @@ -0,0 +1,60 @@ +//===--------- enqueue_native.cpp - CUDA Adapter --------------------------===// +// +// Copyright (C) 2024 Intel Corporation +// +// Part of the Unified-Runtime Project, under the Apache License v2.0 with LLVM +// Exceptions. 
See LICENSE.TXT +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include + +#include "context.hpp" +#include "event.hpp" +#include "memory.hpp" +#include "queue.hpp" + +UR_APIEXPORT ur_result_t UR_APICALL urEnqueueNativeCommandExp( + ur_queue_handle_t hQueue, + ur_exp_enqueue_native_command_function_t pfnNativeEnqueue, void *data, + uint32_t NumMemsInMemList, const ur_mem_handle_t *phMemList, + const ur_exp_enqueue_native_command_properties_t *, + uint32_t NumEventsInWaitList, const ur_event_handle_t *phEventWaitList, + ur_event_handle_t *phEvent) { + + try { + ScopedContext ActiveContext(hQueue->getDevice()); + ScopedStream ActiveStream(hQueue, NumEventsInWaitList, phEventWaitList); + std::unique_ptr RetImplEvent{nullptr}; + + if (hQueue->getContext()->getDevices().size() > 1) { + for (auto i = 0u; i < NumMemsInMemList; ++i) { + enqueueMigrateMemoryToDeviceIfNeeded(phMemList[i], hQueue->getDevice(), + ActiveStream.getStream()); + phMemList[i]->setLastQueueWritingToMemObj(hQueue); + } + } + + if (phEvent) { + RetImplEvent = + std::unique_ptr(ur_event_handle_t_::makeNative( + UR_COMMAND_ENQUEUE_NATIVE_EXP, hQueue, ActiveStream.getStream())); + UR_CHECK_ERROR(RetImplEvent->start()); + } + + pfnNativeEnqueue(hQueue, data); // This is using urQueueGetNativeHandle to + // get the CUDA stream. 
It must be the + // same stream as is used before and after + + if (phEvent) { + UR_CHECK_ERROR(RetImplEvent->record()); + *phEvent = RetImplEvent.release(); + } + } catch (ur_result_t Err) { + return Err; + } catch (CUresult CuErr) { + return mapErrorUR(CuErr); + } + return UR_RESULT_SUCCESS; +} diff --git a/source/adapters/cuda/memory.hpp b/source/adapters/cuda/memory.hpp index 76c7e77753..aa992f44bf 100644 --- a/source/adapters/cuda/memory.hpp +++ b/source/adapters/cuda/memory.hpp @@ -17,8 +17,7 @@ #include "common.hpp" #include "context.hpp" -#include "device.hpp" -#include "event.hpp" +#include "queue.hpp" ur_result_t allocateMemObjOnDeviceIfNeeded(ur_mem_handle_t, const ur_device_handle_t); @@ -443,6 +442,3 @@ struct ur_mem_handle_t_ { } } }; - -ur_result_t migrateMemoryToDeviceIfNeeded(ur_mem_handle_t, - const ur_device_handle_t); diff --git a/source/adapters/cuda/queue.cpp b/source/adapters/cuda/queue.cpp index b52d21e57c..bd92a01400 100644 --- a/source/adapters/cuda/queue.cpp +++ b/source/adapters/cuda/queue.cpp @@ -33,6 +33,8 @@ void ur_queue_handle_t_::transferStreamWaitForBarrierIfNeeded( } CUstream ur_queue_handle_t_::getNextComputeStream(uint32_t *StreamToken) { + if (getThreadLocalStream() != CUstream{0}) + return getThreadLocalStream(); uint32_t StreamI; uint32_t Token; while (true) { @@ -68,6 +70,8 @@ CUstream ur_queue_handle_t_::getNextComputeStream(uint32_t *StreamToken) { CUstream ur_queue_handle_t_::getNextComputeStream( uint32_t NumEventsInWaitList, const ur_event_handle_t *EventWaitList, ur_stream_guard_ &Guard, uint32_t *StreamToken) { + if (getThreadLocalStream() != CUstream{0}) + return getThreadLocalStream(); for (uint32_t i = 0; i < NumEventsInWaitList; i++) { uint32_t Token = EventWaitList[i]->getComputeStreamToken(); if (reinterpret_cast(EventWaitList[i]->getQueue()) == @@ -94,6 +98,8 @@ CUstream ur_queue_handle_t_::getNextComputeStream( } CUstream ur_queue_handle_t_::getNextTransferStream() { + if (getThreadLocalStream() != 
CUstream{0}) + return getThreadLocalStream(); if (TransferStreams.empty()) { // for example in in-order queue return getNextComputeStream(); } diff --git a/source/adapters/cuda/queue.hpp b/source/adapters/cuda/queue.hpp index 727df68573..f59947b958 100644 --- a/source/adapters/cuda/queue.hpp +++ b/source/adapters/cuda/queue.hpp @@ -101,6 +101,13 @@ struct ur_queue_handle_t_ { const ur_event_handle_t *EventWaitList, ur_stream_guard_ &Guard, uint32_t *StreamToken = nullptr); + + // Thread local stream will be used if ScopedStream is active + static CUstream &getThreadLocalStream() { + static thread_local CUstream stream{0}; + return stream; + } + native_type getNextTransferStream(); native_type get() { return getNextComputeStream(); }; ur_device_handle_t getDevice() const noexcept { return Device; }; @@ -265,3 +272,24 @@ struct ur_queue_handle_t_ { bool backendHasOwnership() const noexcept { return HasOwnership; } }; + +// RAII object to make hQueue stream getter methods all return the same stream +// within the lifetime of this object. +// +// This is useful for urEnqueueNativeCommandExp where we want guarantees that +// the user submitted native calls will be dispatched to a known stream, which +// must be "got" within the user submitted function.
+class ScopedStream { + ur_queue_handle_t hQueue; + +public: + ScopedStream(ur_queue_handle_t hQueue, uint32_t NumEventsInWaitList, + const ur_event_handle_t *EventWaitList) + : hQueue{hQueue} { + ur_stream_guard_ Guard; + hQueue->getThreadLocalStream() = + hQueue->getNextComputeStream(NumEventsInWaitList, EventWaitList, Guard); + } + CUstream getStream() { return hQueue->getThreadLocalStream(); } + ~ScopedStream() { hQueue->getThreadLocalStream() = CUstream{0}; } +}; diff --git a/source/adapters/cuda/ur_interface_loader.cpp b/source/adapters/cuda/ur_interface_loader.cpp index 2b8b132373..8eab4514ac 100644 --- a/source/adapters/cuda/ur_interface_loader.cpp +++ b/source/adapters/cuda/ur_interface_loader.cpp @@ -411,6 +411,7 @@ UR_DLLEXPORT ur_result_t UR_APICALL urGetEnqueueExpProcAddrTable( urEnqueueCooperativeKernelLaunchExp; pDdiTable->pfnTimestampRecordingExp = urEnqueueTimestampRecordingExp; pDdiTable->pfnKernelLaunchCustomExp = urEnqueueKernelLaunchCustomExp; + pDdiTable->pfnNativeCommandExp = urEnqueueNativeCommandExp; return UR_RESULT_SUCCESS; } diff --git a/source/adapters/hip/CMakeLists.txt b/source/adapters/hip/CMakeLists.txt index 764cfeedf9..09c60a8e71 100644 --- a/source/adapters/hip/CMakeLists.txt +++ b/source/adapters/hip/CMakeLists.txt @@ -63,6 +63,7 @@ add_ur_adapter(${TARGET_NAME} ${CMAKE_CURRENT_SOURCE_DIR}/device.hpp ${CMAKE_CURRENT_SOURCE_DIR}/device.cpp ${CMAKE_CURRENT_SOURCE_DIR}/enqueue.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/enqueue_native.cpp ${CMAKE_CURRENT_SOURCE_DIR}/event.hpp ${CMAKE_CURRENT_SOURCE_DIR}/event.cpp ${CMAKE_CURRENT_SOURCE_DIR}/image.cpp diff --git a/source/adapters/hip/command_buffer.cpp b/source/adapters/hip/command_buffer.cpp index 38bb110154..d9438eeb9c 100644 --- a/source/adapters/hip/command_buffer.cpp +++ b/source/adapters/hip/command_buffer.cpp @@ -884,7 +884,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urCommandBufferEnqueueExp( std::unique_ptr RetImplEvent{nullptr}; ScopedContext Active(hQueue->getDevice()); uint32_t 
StreamToken; - ur_stream_quard Guard; + ur_stream_guard Guard; hipStream_t HIPStream = hQueue->getNextComputeStream( numEventsInWaitList, phEventWaitList, Guard, &StreamToken); diff --git a/source/adapters/hip/device.cpp b/source/adapters/hip/device.cpp index 7ab052a320..971a37117b 100644 --- a/source/adapters/hip/device.cpp +++ b/source/adapters/hip/device.cpp @@ -881,6 +881,10 @@ UR_APIEXPORT ur_result_t UR_APICALL urDeviceGetInfo(ur_device_handle_t hDevice, return ReturnValue(false); case UR_DEVICE_INFO_TIMESTAMP_RECORDING_SUPPORT_EXP: return ReturnValue(true); + case UR_DEVICE_INFO_ENQUEUE_NATIVE_COMMAND_SUPPORT_EXP: { + // HIP supports enqueueing native work through urEnqueueNativeCommandExp + return ReturnValue(true); + } // TODO: Investigate if this information is available on HIP. case UR_DEVICE_INFO_COMPONENT_DEVICES: diff --git a/source/adapters/hip/enqueue.cpp b/source/adapters/hip/enqueue.cpp index 4fc4f95f75..7f6da7a864 100644 --- a/source/adapters/hip/enqueue.cpp +++ b/source/adapters/hip/enqueue.cpp @@ -293,7 +293,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueKernelLaunch( ScopedContext Active(Dev); uint32_t StreamToken; - ur_stream_quard Guard; + ur_stream_guard Guard; hipStream_t HIPStream = hQueue->getNextComputeStream( numEventsInWaitList, phEventWaitList, Guard, &StreamToken); @@ -380,7 +380,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueEventsWaitWithBarrier( try { ScopedContext Active(hQueue->getDevice()); uint32_t StreamToken; - ur_stream_quard Guard; + ur_stream_guard Guard; hipStream_t HIPStream = hQueue->getNextComputeStream( numEventsInWaitList, reinterpret_cast(phEventWaitList), Guard, @@ -1243,7 +1243,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueUSMFill( try { ScopedContext Active(hQueue->getDevice()); uint32_t StreamToken; - ur_stream_quard Guard; + ur_stream_guard Guard; hipStream_t HIPStream = hQueue->getNextComputeStream( numEventsInWaitList, phEventWaitList, Guard, &StreamToken); Result = enqueueEventsWait(hQueue,
HIPStream, numEventsInWaitList, @@ -1893,7 +1893,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueTimestampRecordingExp( ScopedContext Active(hQueue->getDevice()); uint32_t StreamToken; - ur_stream_quard Guard; + ur_stream_guard Guard; hipStream_t HIPStream = hQueue->getNextComputeStream( numEventsInWaitList, phEventWaitList, Guard, &StreamToken); UR_CHECK_ERROR(enqueueEventsWait(hQueue, HIPStream, numEventsInWaitList, diff --git a/source/adapters/hip/enqueue_native.cpp b/source/adapters/hip/enqueue_native.cpp new file mode 100644 index 0000000000..1ad6bbe2c0 --- /dev/null +++ b/source/adapters/hip/enqueue_native.cpp @@ -0,0 +1,63 @@ +//===--------- enqueue_native.cpp - HIP Adapter ---------------------------===// +// +// Copyright (C) 2024 Intel Corporation +// +// Part of the Unified-Runtime Project, under the Apache License v2.0 with LLVM +// Exceptions. See LICENSE.TXT +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include + +#include "context.hpp" +#include "event.hpp" +#include "memory.hpp" +#include "queue.hpp" + +UR_APIEXPORT ur_result_t UR_APICALL urEnqueueNativeCommandExp( + ur_queue_handle_t hQueue, + ur_exp_enqueue_native_command_function_t pfnNativeEnqueue, void *data, + uint32_t NumMemsInMemList, const ur_mem_handle_t *phMemList, + const ur_exp_enqueue_native_command_properties_t *, + uint32_t NumEventsInWaitList, const ur_event_handle_t *phEventWaitList, + ur_event_handle_t *phEvent) { + // TODO: how should mem migration work across a context here? 
+ // Perhaps we will need to add a phMemObjArgs so that we are able to make + // sure memory migration happens across devices in the same context + + try { + ScopedContext ActiveContext(hQueue->getDevice()); + ScopedStream ActiveStream(hQueue, NumEventsInWaitList, phEventWaitList); + std::unique_ptr RetImplEvent{nullptr}; + + if (hQueue->getContext()->getDevices().size() > 1) { + for (auto i = 0u; i < NumMemsInMemList; ++i) { + enqueueMigrateMemoryToDeviceIfNeeded(phMemList[i], hQueue->getDevice(), + ActiveStream.getStream()); + phMemList[i]->setLastQueueWritingToMemObj(hQueue); + } + } + + if (phEvent) { + RetImplEvent = + std::unique_ptr(ur_event_handle_t_::makeNative( + UR_COMMAND_ENQUEUE_NATIVE_EXP, hQueue, ActiveStream.getStream())); + UR_CHECK_ERROR(RetImplEvent->start()); + } + + pfnNativeEnqueue(hQueue, data); // This is using urQueueGetNativeHandle to + // get the HIP stream. It must be the + // same stream as is used before and after + if (phEvent) { + UR_CHECK_ERROR(RetImplEvent->record()); + *phEvent = RetImplEvent.release(); + } + + } catch (ur_result_t Err) { + return Err; + } catch (hipError_t hipErr) { + return mapErrorUR(hipErr); + } + return UR_RESULT_SUCCESS; +} diff --git a/source/adapters/hip/memory.hpp b/source/adapters/hip/memory.hpp index 2b37d36391..3ec1e8f4e9 100644 --- a/source/adapters/hip/memory.hpp +++ b/source/adapters/hip/memory.hpp @@ -437,6 +437,3 @@ struct ur_mem_handle_t_ { } } }; - -ur_result_t migrateMemoryToDeviceIfNeeded(ur_mem_handle_t, - const ur_device_handle_t); diff --git a/source/adapters/hip/queue.cpp b/source/adapters/hip/queue.cpp index 8398249519..c41bc53a08 100644 --- a/source/adapters/hip/queue.cpp +++ b/source/adapters/hip/queue.cpp @@ -29,6 +29,8 @@ void ur_queue_handle_t_::transferStreamWaitForBarrierIfNeeded( } hipStream_t ur_queue_handle_t_::getNextComputeStream(uint32_t *StreamToken) { + if (getThreadLocalStream() != hipStream_t{0}) + return getThreadLocalStream(); uint32_t Stream_i; uint32_t Token; while
(true) { @@ -63,7 +65,9 @@ hipStream_t ur_queue_handle_t_::getNextComputeStream(uint32_t *StreamToken) { hipStream_t ur_queue_handle_t_::getNextComputeStream( uint32_t NumEventsInWaitList, const ur_event_handle_t *EventWaitList, - ur_stream_quard &Guard, uint32_t *StreamToken) { + ur_stream_guard &Guard, uint32_t *StreamToken) { + if (getThreadLocalStream() != hipStream_t{0}) + return getThreadLocalStream(); for (uint32_t i = 0; i < NumEventsInWaitList; i++) { uint32_t Token = EventWaitList[i]->getComputeStreamToken(); if (EventWaitList[i]->getQueue() == this && canReuseStream(Token)) { @@ -76,7 +80,7 @@ hipStream_t ur_queue_handle_t_::getNextComputeStream( if (StreamToken) { *StreamToken = Token; } - Guard = ur_stream_quard{std::move(ComputeSyncGuard)}; + Guard = ur_stream_guard{std::move(ComputeSyncGuard)}; hipStream_t Res = EventWaitList[i]->getStream(); computeStreamWaitForBarrierIfNeeded(Res, Stream_i); return Res; @@ -88,6 +92,8 @@ hipStream_t ur_queue_handle_t_::getNextComputeStream( } hipStream_t ur_queue_handle_t_::getNextTransferStream() { + if (getThreadLocalStream() != hipStream_t{0}) + return getThreadLocalStream(); if (TransferStreams.empty()) { // for example in in-order queue return getNextComputeStream(); } diff --git a/source/adapters/hip/queue.hpp b/source/adapters/hip/queue.hpp index cfabd29bf7..26fde57f13 100644 --- a/source/adapters/hip/queue.hpp +++ b/source/adapters/hip/queue.hpp @@ -14,7 +14,7 @@ #include #include -using ur_stream_quard = std::unique_lock; +using ur_stream_guard = std::unique_lock; /// UR queue mapping on to hipStream_t objects. 
/// @@ -97,7 +97,7 @@ struct ur_queue_handle_t_ { // returns a lock that needs to remain locked as long as the stream is in use native_type getNextComputeStream(uint32_t NumEventsInWaitList, const ur_event_handle_t *EventWaitList, - ur_stream_quard &Guard, + ur_stream_guard &Guard, uint32_t *StreamToken = nullptr); native_type getNextTransferStream(); native_type get() { return getNextComputeStream(); }; @@ -247,6 +247,12 @@ struct ur_queue_handle_t_ { } } + // Thread local stream will be used if ScopedStream is active + static hipStream_t &getThreadLocalStream() { + static thread_local hipStream_t stream{0}; + return stream; + } + ur_context_handle_t getContext() const { return Context; }; ur_device_handle_t getDevice() const { return Device; }; @@ -261,3 +267,26 @@ struct ur_queue_handle_t_ { bool backendHasOwnership() const noexcept { return HasOwnership; } }; + +// RAII object to make hQueue stream getter methods all return the same stream +// within the lifetime of this object. +// +// This is useful for urEnqueueNativeCommandExp where we want guarantees that +// the user submitted native calls will be dispatched to a known stream, which +// must be "got" within the user submitted function. 
+// +// TODO: Add a test that this scoping works +class ScopedStream { + ur_queue_handle_t hQueue; + +public: + ScopedStream(ur_queue_handle_t hQueue, uint32_t NumEventsInWaitList, + const ur_event_handle_t *EventWaitList) + : hQueue{hQueue} { + ur_stream_guard Guard; + hQueue->getThreadLocalStream() = + hQueue->getNextComputeStream(NumEventsInWaitList, EventWaitList, Guard); + } + hipStream_t getStream() { return hQueue->getThreadLocalStream(); } + ~ScopedStream() { hQueue->getThreadLocalStream() = hipStream_t{0}; } +}; diff --git a/source/adapters/hip/ur_interface_loader.cpp b/source/adapters/hip/ur_interface_loader.cpp index 7a28623e0b..af9b8fa9c3 100644 --- a/source/adapters/hip/ur_interface_loader.cpp +++ b/source/adapters/hip/ur_interface_loader.cpp @@ -379,6 +379,7 @@ UR_DLLEXPORT ur_result_t UR_APICALL urGetEnqueueExpProcAddrTable( pDdiTable->pfnCooperativeKernelLaunchExp = urEnqueueCooperativeKernelLaunchExp; pDdiTable->pfnTimestampRecordingExp = urEnqueueTimestampRecordingExp; + pDdiTable->pfnNativeCommandExp = urEnqueueNativeCommandExp; return UR_RESULT_SUCCESS; } diff --git a/source/adapters/level_zero/CMakeLists.txt b/source/adapters/level_zero/CMakeLists.txt index 27de753eb8..5827452e01 100644 --- a/source/adapters/level_zero/CMakeLists.txt +++ b/source/adapters/level_zero/CMakeLists.txt @@ -116,6 +116,7 @@ add_ur_adapter(${TARGET_NAME} ${CMAKE_CURRENT_SOURCE_DIR}/common.cpp ${CMAKE_CURRENT_SOURCE_DIR}/context.cpp ${CMAKE_CURRENT_SOURCE_DIR}/device.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/enqueue_native.cpp ${CMAKE_CURRENT_SOURCE_DIR}/event.cpp ${CMAKE_CURRENT_SOURCE_DIR}/usm.cpp ${CMAKE_CURRENT_SOURCE_DIR}/usm_p2p.cpp diff --git a/source/adapters/level_zero/device.cpp b/source/adapters/level_zero/device.cpp index 267e53ff11..45eb85dd7a 100644 --- a/source/adapters/level_zero/device.cpp +++ b/source/adapters/level_zero/device.cpp @@ -874,6 +874,10 @@ UR_APIEXPORT ur_result_t UR_APICALL urDeviceGetInfo( case UR_DEVICE_INFO_TIMESTAMP_RECORDING_SUPPORT_EXP: { 
return ReturnValue(static_cast(true)); } + case UR_DEVICE_INFO_ENQUEUE_NATIVE_COMMAND_SUPPORT_EXP: { + // L0 doesn't support enqueueing native work via urEnqueueNativeCommandExp + return ReturnValue(static_cast(false)); + } case UR_DEVICE_INFO_ESIMD_SUPPORT: { // ESIMD is only supported by Intel GPUs. diff --git a/source/adapters/level_zero/enqueue_native.cpp b/source/adapters/level_zero/enqueue_native.cpp new file mode 100644 index 0000000000..b708333de7 --- /dev/null +++ b/source/adapters/level_zero/enqueue_native.cpp @@ -0,0 +1,19 @@ +//===--------- enqueue_native.cpp - LevelZero Adapter ---------------------===// +// +// Copyright (C) 2024 Intel Corporation +// +// Part of the Unified-Runtime Project, under the Apache License v2.0 with LLVM +// Exceptions. See LICENSE.TXT +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include + +UR_APIEXPORT ur_result_t UR_APICALL urEnqueueNativeCommandExp( + ur_queue_handle_t, ur_exp_enqueue_native_command_function_t, void *, + uint32_t, const ur_mem_handle_t *, + const ur_exp_enqueue_native_command_properties_t *, uint32_t, + const ur_event_handle_t *, ur_event_handle_t *) { + return UR_RESULT_ERROR_UNSUPPORTED_FEATURE; +} diff --git a/source/adapters/level_zero/ur_interface_loader.cpp b/source/adapters/level_zero/ur_interface_loader.cpp index b481ba0c6a..fb03d23c00 100644 --- a/source/adapters/level_zero/ur_interface_loader.cpp +++ b/source/adapters/level_zero/ur_interface_loader.cpp @@ -459,6 +459,7 @@ UR_DLLEXPORT ur_result_t UR_APICALL urGetEnqueueExpProcAddrTable( pDdiTable->pfnCooperativeKernelLaunchExp = urEnqueueCooperativeKernelLaunchExp; pDdiTable->pfnTimestampRecordingExp = urEnqueueTimestampRecordingExp; + pDdiTable->pfnNativeCommandExp = urEnqueueNativeCommandExp; return UR_RESULT_SUCCESS; } diff --git a/source/adapters/native_cpu/device.cpp b/source/adapters/native_cpu/device.cpp index 8571b31bfa..f4b0a3e518
100644 --- a/source/adapters/native_cpu/device.cpp +++ b/source/adapters/native_cpu/device.cpp @@ -321,6 +321,10 @@ UR_APIEXPORT ur_result_t UR_APICALL urDeviceGetInfo(ur_device_handle_t hDevice, case UR_DEVICE_INFO_TIMESTAMP_RECORDING_SUPPORT_EXP: return ReturnValue(false); + + case UR_DEVICE_INFO_ENQUEUE_NATIVE_COMMAND_SUPPORT_EXP: + return ReturnValue(false); + default: DIE_NO_IMPLEMENTATION; } diff --git a/source/adapters/native_cpu/enqueue.cpp b/source/adapters/native_cpu/enqueue.cpp index 7ab9883ca7..835a7febcf 100644 --- a/source/adapters/native_cpu/enqueue.cpp +++ b/source/adapters/native_cpu/enqueue.cpp @@ -710,3 +710,11 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueWriteHostPipe( DIE_NO_IMPLEMENTATION; } + +UR_APIEXPORT ur_result_t UR_APICALL urEnqueueNativeCommandExp( + ur_queue_handle_t, ur_exp_enqueue_native_command_function_t, void *, + uint32_t, const ur_mem_handle_t *, + const ur_exp_enqueue_native_command_properties_t *, uint32_t, + const ur_event_handle_t *, ur_event_handle_t *) { + return UR_RESULT_ERROR_UNSUPPORTED_FEATURE; +} diff --git a/source/adapters/native_cpu/ur_interface_loader.cpp b/source/adapters/native_cpu/ur_interface_loader.cpp index e922dece67..7c8042202f 100644 --- a/source/adapters/native_cpu/ur_interface_loader.cpp +++ b/source/adapters/native_cpu/ur_interface_loader.cpp @@ -395,6 +395,7 @@ UR_DLLEXPORT ur_result_t UR_APICALL urGetEnqueueExpProcAddrTable( pDdiTable->pfnCooperativeKernelLaunchExp = nullptr; pDdiTable->pfnTimestampRecordingExp = urEnqueueTimestampRecordingExp; + pDdiTable->pfnNativeCommandExp = urEnqueueNativeCommandExp; return UR_RESULT_SUCCESS; } diff --git a/source/adapters/null/ur_nullddi.cpp b/source/adapters/null/ur_nullddi.cpp index 4a87ecdfce..e726061ee4 100644 --- a/source/adapters/null/ur_nullddi.cpp +++ b/source/adapters/null/ur_nullddi.cpp @@ -5860,6 +5860,50 @@ __urdlllocal ur_result_t UR_APICALL urUsmP2PPeerAccessGetInfoExp( return exceptionToResult(std::current_exception()); } 
+/////////////////////////////////////////////////////////////////////////////// +/// @brief Intercept function for urEnqueueNativeCommandExp +__urdlllocal ur_result_t UR_APICALL urEnqueueNativeCommandExp( + ur_queue_handle_t hQueue, ///< [in] handle of the queue object + ur_exp_enqueue_native_command_function_t + pfnNativeEnqueue, ///< [in] function calling the native underlying API, to be executed + ///< immediately. + void *data, ///< [in][optional] data used by pfnNativeEnqueue + uint32_t numMemsInMemList, ///< [in] size of the mem list + const ur_mem_handle_t * + phMemList, ///< [in][optional][range(0, numMemsInMemList)] mems that are used within + ///< pfnNativeEnqueue using ::urMemGetNativeHandle. + ///< If nullptr, the numMemsInMemList must be 0, indicating that no mems + ///< are accessed with ::urMemGetNativeHandle within pfnNativeEnqueue. + const ur_exp_enqueue_native_command_properties_t * + pProperties, ///< [in][optional] pointer to the native enqueue properties + uint32_t numEventsInWaitList, ///< [in] size of the event wait list + const ur_event_handle_t * + phEventWaitList, ///< [in][optional][range(0, numEventsInWaitList)] pointer to a list of + ///< events that must be complete before the kernel execution. + ///< If nullptr, the numEventsInWaitList must be 0, indicating no wait events. + ur_event_handle_t * + phEvent ///< [in,out] return an event object that identifies the work that has + ///< been enqueued in nativeEnqueueFunc. 
+ ) try { + ur_result_t result = UR_RESULT_SUCCESS; + + // if the driver has created a custom function, then call it instead of using the generic path + auto pfnNativeCommandExp = + d_context.urDdiTable.EnqueueExp.pfnNativeCommandExp; + if (nullptr != pfnNativeCommandExp) { + result = pfnNativeCommandExp( + hQueue, pfnNativeEnqueue, data, numMemsInMemList, phMemList, + pProperties, numEventsInWaitList, phEventWaitList, phEvent); + } else { + // generic implementation + *phEvent = reinterpret_cast(d_context.get()); + } + + return result; +} catch (...) { + return exceptionToResult(std::current_exception()); +} + } // namespace driver #if defined(__cplusplus) @@ -6216,6 +6260,8 @@ UR_DLLEXPORT ur_result_t UR_APICALL urGetEnqueueExpProcAddrTable( pDdiTable->pfnTimestampRecordingExp = driver::urEnqueueTimestampRecordingExp; + pDdiTable->pfnNativeCommandExp = driver::urEnqueueNativeCommandExp; + return result; } catch (...) { return exceptionToResult(std::current_exception()); diff --git a/source/adapters/opencl/CMakeLists.txt b/source/adapters/opencl/CMakeLists.txt index 38f8e9a523..65cbba9df6 100644 --- a/source/adapters/opencl/CMakeLists.txt +++ b/source/adapters/opencl/CMakeLists.txt @@ -25,6 +25,7 @@ add_ur_adapter(${TARGET_NAME} SHARED ${CMAKE_CURRENT_SOURCE_DIR}/device.hpp ${CMAKE_CURRENT_SOURCE_DIR}/device.cpp ${CMAKE_CURRENT_SOURCE_DIR}/enqueue.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/enqueue_native.cpp ${CMAKE_CURRENT_SOURCE_DIR}/event.cpp ${CMAKE_CURRENT_SOURCE_DIR}/image.cpp ${CMAKE_CURRENT_SOURCE_DIR}/kernel.cpp diff --git a/source/adapters/opencl/device.cpp b/source/adapters/opencl/device.cpp index 25622aea22..4445e84260 100644 --- a/source/adapters/opencl/device.cpp +++ b/source/adapters/opencl/device.cpp @@ -802,6 +802,9 @@ UR_APIEXPORT ur_result_t UR_APICALL urDeviceGetInfo(ur_device_handle_t hDevice, case UR_DEVICE_INFO_TIMESTAMP_RECORDING_SUPPORT_EXP: { return ReturnValue(false); } + case UR_DEVICE_INFO_ENQUEUE_NATIVE_COMMAND_SUPPORT_EXP: { + return 
ReturnValue(false); + } case UR_DEVICE_INFO_HOST_PIPE_READ_WRITE_SUPPORTED: { bool Supported = false; CL_RETURN_ON_FAILURE(cl_adapter::checkDeviceExtensions( diff --git a/source/adapters/opencl/enqueue_native.cpp b/source/adapters/opencl/enqueue_native.cpp new file mode 100644 index 0000000000..8f644971d2 --- /dev/null +++ b/source/adapters/opencl/enqueue_native.cpp @@ -0,0 +1,19 @@ +//===--------- enqueue_native.cpp - OpenCL Adapter ------------------------===// +// +// Copyright (C) 2024 Intel Corporation +// +// Part of the Unified-Runtime Project, under the Apache License v2.0 with LLVM +// Exceptions. See LICENSE.TXT +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include + +UR_APIEXPORT ur_result_t UR_APICALL urEnqueueNativeCommandExp( + ur_queue_handle_t, ur_exp_enqueue_native_command_function_t, void *, + uint32_t, const ur_mem_handle_t *, + const ur_exp_enqueue_native_command_properties_t *, uint32_t, + const ur_event_handle_t *, ur_event_handle_t *) { + return UR_RESULT_ERROR_UNSUPPORTED_FEATURE; +} diff --git a/source/adapters/opencl/ur_interface_loader.cpp b/source/adapters/opencl/ur_interface_loader.cpp index 961b04f521..f542cf2192 100644 --- a/source/adapters/opencl/ur_interface_loader.cpp +++ b/source/adapters/opencl/ur_interface_loader.cpp @@ -401,6 +401,7 @@ UR_DLLEXPORT ur_result_t UR_APICALL urGetEnqueueExpProcAddrTable( pDdiTable->pfnCooperativeKernelLaunchExp = urEnqueueCooperativeKernelLaunchExp; pDdiTable->pfnTimestampRecordingExp = urEnqueueTimestampRecordingExp; + pDdiTable->pfnNativeCommandExp = urEnqueueNativeCommandExp; return UR_RESULT_SUCCESS; } diff --git a/source/common/stype_map_helpers.def b/source/common/stype_map_helpers.def index 0c3e5b1cc1..a6457e8716 100644 --- a/source/common/stype_map_helpers.def +++ b/source/common/stype_map_helpers.def @@ -95,4 +95,6 @@ template <> struct stype_map : stype_map_impl {}; template <> struct 
stype_map : stype_map_impl {}; +template <> +struct stype_map : stype_map_impl {}; diff --git a/source/loader/layers/tracing/ur_trcddi.cpp b/source/loader/layers/tracing/ur_trcddi.cpp index 00d25a1d56..bc140f17c2 100644 --- a/source/loader/layers/tracing/ur_trcddi.cpp +++ b/source/loader/layers/tracing/ur_trcddi.cpp @@ -7844,6 +7844,68 @@ __urdlllocal ur_result_t UR_APICALL urUsmP2PPeerAccessGetInfoExp( return result; } +/////////////////////////////////////////////////////////////////////////////// +/// @brief Intercept function for urEnqueueNativeCommandExp +__urdlllocal ur_result_t UR_APICALL urEnqueueNativeCommandExp( + ur_queue_handle_t hQueue, ///< [in] handle of the queue object + ur_exp_enqueue_native_command_function_t + pfnNativeEnqueue, ///< [in] function calling the native underlying API, to be executed + ///< immediately. + void *data, ///< [in][optional] data used by pfnNativeEnqueue + uint32_t numMemsInMemList, ///< [in] size of the mem list + const ur_mem_handle_t * + phMemList, ///< [in][optional][range(0, numMemsInMemList)] mems that are used within + ///< pfnNativeEnqueue using ::urMemGetNativeHandle. + ///< If nullptr, the numMemsInMemList must be 0, indicating that no mems + ///< are accessed with ::urMemGetNativeHandle within pfnNativeEnqueue. + const ur_exp_enqueue_native_command_properties_t * + pProperties, ///< [in][optional] pointer to the native enqueue properties + uint32_t numEventsInWaitList, ///< [in] size of the event wait list + const ur_event_handle_t * + phEventWaitList, ///< [in][optional][range(0, numEventsInWaitList)] pointer to a list of + ///< events that must be complete before the kernel execution. + ///< If nullptr, the numEventsInWaitList must be 0, indicating no wait events. + ur_event_handle_t * + phEvent ///< [in,out] return an event object that identifies the work that has + ///< been enqueued in nativeEnqueueFunc. 
+) { + auto pfnNativeCommandExp = + context.urDdiTable.EnqueueExp.pfnNativeCommandExp; + + if (nullptr == pfnNativeCommandExp) { + return UR_RESULT_ERROR_UNSUPPORTED_FEATURE; + } + + ur_enqueue_native_command_exp_params_t params = {&hQueue, + &pfnNativeEnqueue, + &data, + &numMemsInMemList, + &phMemList, + &pProperties, + &numEventsInWaitList, + &phEventWaitList, + &phEvent}; + uint64_t instance = + context.notify_begin(UR_FUNCTION_ENQUEUE_NATIVE_COMMAND_EXP, + "urEnqueueNativeCommandExp", &params); + + context.logger.info("---> urEnqueueNativeCommandExp"); + + ur_result_t result = pfnNativeCommandExp( + hQueue, pfnNativeEnqueue, data, numMemsInMemList, phMemList, + pProperties, numEventsInWaitList, phEventWaitList, phEvent); + + context.notify_end(UR_FUNCTION_ENQUEUE_NATIVE_COMMAND_EXP, + "urEnqueueNativeCommandExp", &params, &result, instance); + + std::ostringstream args_str; + ur::extras::printFunctionParams( + args_str, UR_FUNCTION_ENQUEUE_NATIVE_COMMAND_EXP, &params); + context.logger.info("({}) -> {};\n", args_str.str(), result); + + return result; +} + /////////////////////////////////////////////////////////////////////////////// /// @brief Exported function for filling application's Global table /// with current process' addresses @@ -8313,6 +8375,10 @@ __urdlllocal ur_result_t UR_APICALL urGetEnqueueExpProcAddrTable( pDdiTable->pfnTimestampRecordingExp = ur_tracing_layer::urEnqueueTimestampRecordingExp; + dditable.pfnNativeCommandExp = pDdiTable->pfnNativeCommandExp; + pDdiTable->pfnNativeCommandExp = + ur_tracing_layer::urEnqueueNativeCommandExp; + return result; } /////////////////////////////////////////////////////////////////////////////// diff --git a/source/loader/layers/validation/ur_valddi.cpp b/source/loader/layers/validation/ur_valddi.cpp index 043ad1a634..ce1374f0c6 100644 --- a/source/loader/layers/validation/ur_valddi.cpp +++ b/source/loader/layers/validation/ur_valddi.cpp @@ -507,7 +507,7 @@ __urdlllocal ur_result_t UR_APICALL urDeviceGetInfo( return
UR_RESULT_ERROR_INVALID_NULL_POINTER; } - if (UR_DEVICE_INFO_TIMESTAMP_RECORDING_SUPPORT_EXP < propName) { + if (UR_DEVICE_INFO_ENQUEUE_NATIVE_COMMAND_SUPPORT_EXP < propName) { return UR_RESULT_ERROR_INVALID_ENUMERATION; } @@ -9515,6 +9515,77 @@ __urdlllocal ur_result_t UR_APICALL urUsmP2PPeerAccessGetInfoExp( return result; } +/////////////////////////////////////////////////////////////////////////////// +/// @brief Intercept function for urEnqueueNativeCommandExp +__urdlllocal ur_result_t UR_APICALL urEnqueueNativeCommandExp( + ur_queue_handle_t hQueue, ///< [in] handle of the queue object + ur_exp_enqueue_native_command_function_t + pfnNativeEnqueue, ///< [in] function calling the native underlying API, to be executed + ///< immediately. + void *data, ///< [in][optional] data used by pfnNativeEnqueue + uint32_t numMemsInMemList, ///< [in] size of the mem list + const ur_mem_handle_t * + phMemList, ///< [in][optional][range(0, numMemsInMemList)] mems that are used within + ///< pfnNativeEnqueue using ::urMemGetNativeHandle. + ///< If nullptr, the numMemsInMemList must be 0, indicating that no mems + ///< are accessed with ::urMemGetNativeHandle within pfnNativeEnqueue. + const ur_exp_enqueue_native_command_properties_t * + pProperties, ///< [in][optional] pointer to the native enqueue properties + uint32_t numEventsInWaitList, ///< [in] size of the event wait list + const ur_event_handle_t * + phEventWaitList, ///< [in][optional][range(0, numEventsInWaitList)] pointer to a list of + ///< events that must be complete before the kernel execution. + ///< If nullptr, the numEventsInWaitList must be 0, indicating no wait events. + ur_event_handle_t * + phEvent ///< [in,out] return an event object that identifies the work that has + ///< been enqueued in nativeEnqueueFunc. 
+) { + auto pfnNativeCommandExp = + context.urDdiTable.EnqueueExp.pfnNativeCommandExp; + + if (nullptr == pfnNativeCommandExp) { + return UR_RESULT_ERROR_UNINITIALIZED; + } + + if (context.enableParameterValidation) { + if (NULL == hQueue) { + return UR_RESULT_ERROR_INVALID_NULL_HANDLE; + } + + if (NULL == pfnNativeEnqueue) { + return UR_RESULT_ERROR_INVALID_NULL_POINTER; + } + + if (NULL == phEvent) { + return UR_RESULT_ERROR_INVALID_NULL_POINTER; + } + + if (NULL != pProperties && + UR_EXP_ENQUEUE_NATIVE_COMMAND_FLAGS_MASK & pProperties->flags) { + return UR_RESULT_ERROR_INVALID_ENUMERATION; + } + + if (phEventWaitList != NULL && numEventsInWaitList > 0) { + for (uint32_t i = 0; i < numEventsInWaitList; ++i) { + if (phEventWaitList[i] == NULL) { + return UR_RESULT_ERROR_INVALID_EVENT_WAIT_LIST; + } + } + } + } + + if (context.enableLifetimeValidation && + !refCountContext.isReferenceValid(hQueue)) { + refCountContext.logInvalidReference(hQueue); + } + + ur_result_t result = pfnNativeCommandExp( + hQueue, pfnNativeEnqueue, data, numMemsInMemList, phMemList, + pProperties, numEventsInWaitList, phEventWaitList, phEvent); + + return result; +} + /////////////////////////////////////////////////////////////////////////////// /// @brief Exported function for filling application's Global table /// with current process' addresses @@ -9993,6 +10064,10 @@ UR_DLLEXPORT ur_result_t UR_APICALL urGetEnqueueExpProcAddrTable( pDdiTable->pfnTimestampRecordingExp = ur_validation_layer::urEnqueueTimestampRecordingExp; + dditable.pfnNativeCommandExp = pDdiTable->pfnNativeCommandExp; + pDdiTable->pfnNativeCommandExp = + ur_validation_layer::urEnqueueNativeCommandExp; + return result; } diff --git a/source/loader/loader.def.in b/source/loader/loader.def.in index 41ce8f369c..45ab3b1caf 100644 --- a/source/loader/loader.def.in +++ b/source/loader/loader.def.in @@ -80,6 +80,7 @@ EXPORTS urEnqueueMemImageRead urEnqueueMemImageWrite urEnqueueMemUnmap + urEnqueueNativeCommandExp 
urEnqueueReadHostPipe urEnqueueTimestampRecordingExp urEnqueueUSMAdvise @@ -272,6 +273,7 @@ EXPORTS urPrintEnqueueMemImageReadParams urPrintEnqueueMemImageWriteParams urPrintEnqueueMemUnmapParams + urPrintEnqueueNativeCommandExpParams urPrintEnqueueReadHostPipeParams urPrintEnqueueTimestampRecordingExpParams urPrintEnqueueUsmAdviseParams @@ -300,6 +302,8 @@ EXPORTS urPrintExpCommandBufferUpdateMemobjArgDesc urPrintExpCommandBufferUpdatePointerArgDesc urPrintExpCommandBufferUpdateValueArgDesc + urPrintExpEnqueueNativeCommandFlags + urPrintExpEnqueueNativeCommandProperties urPrintExpExternalMemType urPrintExpExternalSemaphoreType urPrintExpFileDescriptor diff --git a/source/loader/loader.map.in b/source/loader/loader.map.in index 0cf180ab65..170365ac4b 100644 --- a/source/loader/loader.map.in +++ b/source/loader/loader.map.in @@ -80,6 +80,7 @@ urEnqueueMemImageRead; urEnqueueMemImageWrite; urEnqueueMemUnmap; + urEnqueueNativeCommandExp; urEnqueueReadHostPipe; urEnqueueTimestampRecordingExp; urEnqueueUSMAdvise; @@ -272,6 +273,7 @@ urPrintEnqueueMemImageReadParams; urPrintEnqueueMemImageWriteParams; urPrintEnqueueMemUnmapParams; + urPrintEnqueueNativeCommandExpParams; urPrintEnqueueReadHostPipeParams; urPrintEnqueueTimestampRecordingExpParams; urPrintEnqueueUsmAdviseParams; @@ -300,6 +302,8 @@ urPrintExpCommandBufferUpdateMemobjArgDesc; urPrintExpCommandBufferUpdatePointerArgDesc; urPrintExpCommandBufferUpdateValueArgDesc; + urPrintExpEnqueueNativeCommandFlags; + urPrintExpEnqueueNativeCommandProperties; urPrintExpExternalMemType; urPrintExpExternalSemaphoreType; urPrintExpFileDescriptor; diff --git a/source/loader/ur_ldrddi.cpp b/source/loader/ur_ldrddi.cpp index d464c8782f..fd18dd4361 100644 --- a/source/loader/ur_ldrddi.cpp +++ b/source/loader/ur_ldrddi.cpp @@ -8096,6 +8096,78 @@ __urdlllocal ur_result_t UR_APICALL urUsmP2PPeerAccessGetInfoExp( return result; } +/////////////////////////////////////////////////////////////////////////////// +/// @brief Intercept 
function for urEnqueueNativeCommandExp +__urdlllocal ur_result_t UR_APICALL urEnqueueNativeCommandExp( + ur_queue_handle_t hQueue, ///< [in] handle of the queue object + ur_exp_enqueue_native_command_function_t + pfnNativeEnqueue, ///< [in] function calling the native underlying API, to be executed + ///< immediately. + void *data, ///< [in][optional] data used by pfnNativeEnqueue + uint32_t numMemsInMemList, ///< [in] size of the mem list + const ur_mem_handle_t * + phMemList, ///< [in][optional][range(0, numMemsInMemList)] mems that are used within + ///< pfnNativeEnqueue using ::urMemGetNativeHandle. + ///< If nullptr, the numMemsInMemList must be 0, indicating that no mems + ///< are accessed with ::urMemGetNativeHandle within pfnNativeEnqueue. + const ur_exp_enqueue_native_command_properties_t * + pProperties, ///< [in][optional] pointer to the native enqueue properties + uint32_t numEventsInWaitList, ///< [in] size of the event wait list + const ur_event_handle_t * + phEventWaitList, ///< [in][optional][range(0, numEventsInWaitList)] pointer to a list of + ///< events that must be complete before the kernel execution. + ///< If nullptr, the numEventsInWaitList must be 0, indicating no wait events. + ur_event_handle_t * + phEvent ///< [in,out] return an event object that identifies the work that has + ///< been enqueued in nativeEnqueueFunc. 
+) { + ur_result_t result = UR_RESULT_SUCCESS; + + // extract platform's function pointer table + auto dditable = reinterpret_cast(hQueue)->dditable; + auto pfnNativeCommandExp = dditable->ur.EnqueueExp.pfnNativeCommandExp; + if (nullptr == pfnNativeCommandExp) { + return UR_RESULT_ERROR_UNINITIALIZED; + } + + // convert loader handle to platform handle + hQueue = reinterpret_cast(hQueue)->handle; + + // convert loader handles to platform handles + auto phMemListLocal = std::vector(numMemsInMemList); + for (size_t i = 0; i < numMemsInMemList; ++i) { + phMemListLocal[i] = + reinterpret_cast(phMemList[i])->handle; + } + + // convert loader handles to platform handles + auto phEventWaitListLocal = + std::vector(numEventsInWaitList); + for (size_t i = 0; i < numEventsInWaitList; ++i) { + phEventWaitListLocal[i] = + reinterpret_cast(phEventWaitList[i])->handle; + } + + // forward to device-platform + result = pfnNativeCommandExp( + hQueue, pfnNativeEnqueue, data, numMemsInMemList, phMemListLocal.data(), + pProperties, numEventsInWaitList, phEventWaitListLocal.data(), phEvent); + + if (UR_RESULT_SUCCESS != result) { + return result; + } + + try { + // convert platform handle to loader handle + *phEvent = reinterpret_cast( + ur_event_factory.getInstance(*phEvent, dditable)); + } catch (std::bad_alloc &) { + result = UR_RESULT_ERROR_OUT_OF_HOST_MEMORY; + } + + return result; +} + } // namespace ur_loader #if defined(__cplusplus) @@ -8535,6 +8607,8 @@ UR_DLLEXPORT ur_result_t UR_APICALL urGetEnqueueExpProcAddrTable( ur_loader::urEnqueueCooperativeKernelLaunchExp; pDdiTable->pfnTimestampRecordingExp = ur_loader::urEnqueueTimestampRecordingExp; + pDdiTable->pfnNativeCommandExp = + ur_loader::urEnqueueNativeCommandExp; } else { // return pointers directly to platform's DDIs *pDdiTable = diff --git a/source/loader/ur_libapi.cpp b/source/loader/ur_libapi.cpp index 1a69f86ccb..1c5e288a03 100644 --- a/source/loader/ur_libapi.cpp +++ b/source/loader/ur_libapi.cpp @@ -847,7 +847,7 
@@ ur_result_t UR_APICALL urDeviceGetSelected( /// - ::UR_RESULT_ERROR_INVALID_NULL_HANDLE /// + `NULL == hDevice` /// - ::UR_RESULT_ERROR_INVALID_ENUMERATION -/// + `::UR_DEVICE_INFO_TIMESTAMP_RECORDING_SUPPORT_EXP < propName` +/// + `::UR_DEVICE_INFO_ENQUEUE_NATIVE_COMMAND_SUPPORT_EXP < propName` /// - ::UR_RESULT_ERROR_UNSUPPORTED_ENUMERATION /// + If `propName` is not supported by the adapter. /// - ::UR_RESULT_ERROR_INVALID_SIZE @@ -8918,4 +8918,56 @@ ur_result_t UR_APICALL urUsmP2PPeerAccessGetInfoExp( return exceptionToResult(std::current_exception()); } +/////////////////////////////////////////////////////////////////////////////// +/// @brief Immediately enqueue work through a native backend API +/// +/// @returns +/// - ::UR_RESULT_SUCCESS +/// - ::UR_RESULT_ERROR_UNINITIALIZED +/// - ::UR_RESULT_ERROR_DEVICE_LOST +/// - ::UR_RESULT_ERROR_ADAPTER_SPECIFIC +/// - ::UR_RESULT_ERROR_INVALID_NULL_HANDLE +/// + `NULL == hQueue` +/// - ::UR_RESULT_ERROR_INVALID_NULL_POINTER +/// + `NULL == pfnNativeEnqueue` +/// + `NULL == phEvent` +/// - ::UR_RESULT_ERROR_INVALID_ENUMERATION +/// + `NULL != pProperties && ::UR_EXP_ENQUEUE_NATIVE_COMMAND_FLAGS_MASK & pProperties->flags` +/// - ::UR_RESULT_ERROR_INVALID_EVENT_WAIT_LIST +ur_result_t UR_APICALL urEnqueueNativeCommandExp( + ur_queue_handle_t hQueue, ///< [in] handle of the queue object + ur_exp_enqueue_native_command_function_t + pfnNativeEnqueue, ///< [in] function calling the native underlying API, to be executed + ///< immediately. + void *data, ///< [in][optional] data used by pfnNativeEnqueue + uint32_t numMemsInMemList, ///< [in] size of the mem list + const ur_mem_handle_t * + phMemList, ///< [in][optional][range(0, numMemsInMemList)] mems that are used within + ///< pfnNativeEnqueue using ::urMemGetNativeHandle. + ///< If nullptr, the numMemsInMemList must be 0, indicating that no mems + ///< are accessed with ::urMemGetNativeHandle within pfnNativeEnqueue. 
+ const ur_exp_enqueue_native_command_properties_t * + pProperties, ///< [in][optional] pointer to the native enqueue properties + uint32_t numEventsInWaitList, ///< [in] size of the event wait list + const ur_event_handle_t * + phEventWaitList, ///< [in][optional][range(0, numEventsInWaitList)] pointer to a list of + ///< events that must be complete before the kernel execution. + ///< If nullptr, the numEventsInWaitList must be 0, indicating no wait events. + ur_event_handle_t * + phEvent ///< [in,out] return an event object that identifies the work that has + ///< been enqueued in nativeEnqueueFunc. + ) try { + auto pfnNativeCommandExp = + ur_lib::context->urDdiTable.EnqueueExp.pfnNativeCommandExp; + if (nullptr == pfnNativeCommandExp) { + return UR_RESULT_ERROR_UNINITIALIZED; + } + + return pfnNativeCommandExp(hQueue, pfnNativeEnqueue, data, numMemsInMemList, + phMemList, pProperties, numEventsInWaitList, + phEventWaitList, phEvent); +} catch (...) { + return exceptionToResult(std::current_exception()); +} + } // extern "C" diff --git a/source/loader/ur_print.cpp b/source/loader/ur_print.cpp index 718a65cf72..3f2d017a89 100644 --- a/source/loader/ur_print.cpp +++ b/source/loader/ur_print.cpp @@ -1052,6 +1052,22 @@ ur_result_t urPrintExpPeerInfo(enum ur_exp_peer_info_t value, char *buffer, return str_copy(&ss, buffer, buff_size, out_size); } +ur_result_t urPrintExpEnqueueNativeCommandFlags( + enum ur_exp_enqueue_native_command_flag_t value, char *buffer, + const size_t buff_size, size_t *out_size) { + std::stringstream ss; + ss << value; + return str_copy(&ss, buffer, buff_size, out_size); +} + +ur_result_t urPrintExpEnqueueNativeCommandProperties( + const struct ur_exp_enqueue_native_command_properties_t params, + char *buffer, const size_t buff_size, size_t *out_size) { + std::stringstream ss; + ss << params; + return str_copy(&ss, buffer, buff_size, out_size); +} + ur_result_t urPrintAdapterGetParams(const struct ur_adapter_get_params_t *params, char *buffer, 
const size_t buff_size, @@ -1703,6 +1719,14 @@ ur_result_t urPrintEnqueueTimestampRecordingExpParams( return str_copy(&ss, buffer, buff_size, out_size); } +ur_result_t urPrintEnqueueNativeCommandExpParams( + const struct ur_enqueue_native_command_exp_params_t *params, char *buffer, + const size_t buff_size, size_t *out_size) { + std::stringstream ss; + ss << params; + return str_copy(&ss, buffer, buff_size, out_size); +} + ur_result_t urPrintEventGetInfoParams(const struct ur_event_get_info_params_t *params, char *buffer, const size_t buff_size, diff --git a/source/ur_api.cpp b/source/ur_api.cpp index 793c9c2f8a..f33de6539b 100644 --- a/source/ur_api.cpp +++ b/source/ur_api.cpp @@ -740,7 +740,7 @@ ur_result_t UR_APICALL urDeviceGetSelected( /// - ::UR_RESULT_ERROR_INVALID_NULL_HANDLE /// + `NULL == hDevice` /// - ::UR_RESULT_ERROR_INVALID_ENUMERATION -/// + `::UR_DEVICE_INFO_TIMESTAMP_RECORDING_SUPPORT_EXP < propName` +/// + `::UR_DEVICE_INFO_ENQUEUE_NATIVE_COMMAND_SUPPORT_EXP < propName` /// - ::UR_RESULT_ERROR_UNSUPPORTED_ENUMERATION /// + If `propName` is not supported by the adapter. 
/// - ::UR_RESULT_ERROR_INVALID_SIZE @@ -7551,3 +7551,46 @@ ur_result_t UR_APICALL urUsmP2PPeerAccessGetInfoExp( ur_result_t result = UR_RESULT_SUCCESS; return result; } + +/////////////////////////////////////////////////////////////////////////////// +/// @brief Immediately enqueue work through a native backend API +/// +/// @returns +/// - ::UR_RESULT_SUCCESS +/// - ::UR_RESULT_ERROR_UNINITIALIZED +/// - ::UR_RESULT_ERROR_DEVICE_LOST +/// - ::UR_RESULT_ERROR_ADAPTER_SPECIFIC +/// - ::UR_RESULT_ERROR_INVALID_NULL_HANDLE +/// + `NULL == hQueue` +/// - ::UR_RESULT_ERROR_INVALID_NULL_POINTER +/// + `NULL == pfnNativeEnqueue` +/// + `NULL == phEvent` +/// - ::UR_RESULT_ERROR_INVALID_ENUMERATION +/// + `NULL != pProperties && ::UR_EXP_ENQUEUE_NATIVE_COMMAND_FLAGS_MASK & pProperties->flags` +/// - ::UR_RESULT_ERROR_INVALID_EVENT_WAIT_LIST +ur_result_t UR_APICALL urEnqueueNativeCommandExp( + ur_queue_handle_t hQueue, ///< [in] handle of the queue object + ur_exp_enqueue_native_command_function_t + pfnNativeEnqueue, ///< [in] function calling the native underlying API, to be executed + ///< immediately. + void *data, ///< [in][optional] data used by pfnNativeEnqueue + uint32_t numMemsInMemList, ///< [in] size of the mem list + const ur_mem_handle_t * + phMemList, ///< [in][optional][range(0, numMemsInMemList)] mems that are used within + ///< pfnNativeEnqueue using ::urMemGetNativeHandle. + ///< If nullptr, the numMemsInMemList must be 0, indicating that no mems + ///< are accessed with ::urMemGetNativeHandle within pfnNativeEnqueue. + const ur_exp_enqueue_native_command_properties_t * + pProperties, ///< [in][optional] pointer to the native enqueue properties + uint32_t numEventsInWaitList, ///< [in] size of the event wait list + const ur_event_handle_t * + phEventWaitList, ///< [in][optional][range(0, numEventsInWaitList)] pointer to a list of + ///< events that must be complete before the kernel execution. 
+ ///< If nullptr, the numEventsInWaitList must be 0, indicating no wait events. + ur_event_handle_t * + phEvent ///< [in,out] return an event object that identifies the work that has + ///< been enqueued in nativeEnqueueFunc. +) { + ur_result_t result = UR_RESULT_SUCCESS; + return result; +} diff --git a/test/adapters/cuda/CMakeLists.txt b/test/adapters/cuda/CMakeLists.txt index fbc15b47e8..66c1fa4b1e 100644 --- a/test/adapters/cuda/CMakeLists.txt +++ b/test/adapters/cuda/CMakeLists.txt @@ -13,8 +13,12 @@ add_adapter_test(cuda urDeviceCreateWithNativeHandle.cpp urEventGetNativeHandle.cpp urEventCreateWithNativeHandle.cpp + urQueueGetNativeHandle.cpp kernel_tests.cpp memory_tests.cpp + #FIXME: make this cleaner + ${CMAKE_CURRENT_SOURCE_DIR}/../../../source/adapters/cuda/queue.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/../../../source/adapters/cuda/common.cpp ENVIRONMENT "UR_ADAPTERS_FORCE_LOAD=\"$\"" ) diff --git a/test/adapters/cuda/urQueueGetNativeHandle.cpp b/test/adapters/cuda/urQueueGetNativeHandle.cpp new file mode 100644 index 0000000000..f0c68602cc --- /dev/null +++ b/test/adapters/cuda/urQueueGetNativeHandle.cpp @@ -0,0 +1,67 @@ +// Copyright (C) 2022-2024 Intel Corporation +// Part of the Unified-Runtime Project, under the Apache License v2.0 with LLVM Exceptions. 
+// See LICENSE.TXT +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +#include "fixtures.h" +#include "queue.hpp" + +using urCudaQueueGetNativeHandleTest = uur::urQueueTest; +UUR_INSTANTIATE_DEVICE_TEST_SUITE_P(urCudaQueueGetNativeHandleTest); + +TEST_P(urCudaQueueGetNativeHandleTest, Success) { + CUstream Stream; + ASSERT_SUCCESS( + urQueueGetNativeHandle(queue, nullptr, (ur_native_handle_t *)&Stream)); + ASSERT_SUCCESS_CUDA(cuStreamSynchronize(Stream)); +} + +TEST_P(urCudaQueueGetNativeHandleTest, OutOfOrder) { + CUstream Stream; + ur_queue_properties_t props = { + /*.stype =*/UR_STRUCTURE_TYPE_QUEUE_PROPERTIES, + /*.pNext =*/nullptr, + /*.flags =*/UR_QUEUE_FLAG_OUT_OF_ORDER_EXEC_MODE_ENABLE, + }; + ASSERT_SUCCESS(urQueueCreate(context, device, &props, &queue)); + ASSERT_SUCCESS( + urQueueGetNativeHandle(queue, nullptr, (ur_native_handle_t *)&Stream)); + ASSERT_SUCCESS_CUDA(cuStreamSynchronize(Stream)); +} + +TEST_P(urCudaQueueGetNativeHandleTest, ScopedStream) { + CUstream Stream1, Stream2; + ur_queue_properties_t props = { + /*.stype =*/UR_STRUCTURE_TYPE_QUEUE_PROPERTIES, + /*.pNext =*/nullptr, + /*.flags =*/UR_QUEUE_FLAG_OUT_OF_ORDER_EXEC_MODE_ENABLE, + }; + ur_queue_handle_t OutOfOrderQueue; + ASSERT_SUCCESS(urQueueCreate(context, device, &props, &OutOfOrderQueue)); + ASSERT_SUCCESS(urQueueGetNativeHandle(OutOfOrderQueue, nullptr, + (ur_native_handle_t *)&Stream1)); + ASSERT_SUCCESS(urQueueGetNativeHandle(OutOfOrderQueue, nullptr, + (ur_native_handle_t *)&Stream2)); + + // We might want to remove this assertion at some point. 
This is just + // testing currently implemented behaviour that getting the native OutOfOrderQueue + // will call `getNextComputeStream` + ASSERT_NE(Stream1, Stream2); + + { + ScopedStream ActiveStream(OutOfOrderQueue, 0, nullptr); + + ASSERT_SUCCESS(urQueueGetNativeHandle(OutOfOrderQueue, nullptr, + (ur_native_handle_t *)&Stream1)); + ASSERT_SUCCESS(urQueueGetNativeHandle(OutOfOrderQueue, nullptr, + (ur_native_handle_t *)&Stream2)); + ASSERT_EQ(Stream1, Stream2); + } + + // Go back to returning new streams each time + ASSERT_SUCCESS(urQueueGetNativeHandle(OutOfOrderQueue, nullptr, + (ur_native_handle_t *)&Stream1)); + ASSERT_SUCCESS(urQueueGetNativeHandle(OutOfOrderQueue, nullptr, + (ur_native_handle_t *)&Stream2)); + ASSERT_NE(Stream1, Stream2); +} diff --git a/test/conformance/CMakeLists.txt b/test/conformance/CMakeLists.txt index 439b9b7a79..a5c1c43379 100644 --- a/test/conformance/CMakeLists.txt +++ b/test/conformance/CMakeLists.txt @@ -142,6 +142,7 @@ if(UR_DPCXX) add_subdirectory(enqueue) add_subdirectory(integration) add_subdirectory(exp_command_buffer) + add_subdirectory(exp_enqueue_native) add_subdirectory(exp_usm_p2p) add_subdirectory(exp_launch_properties) add_subdirectory(memory-migrate) diff --git a/test/conformance/exp_enqueue_native/CMakeLists.txt b/test/conformance/exp_enqueue_native/CMakeLists.txt new file mode 100644 index 0000000000..f65975ea71 --- /dev/null +++ b/test/conformance/exp_enqueue_native/CMakeLists.txt @@ -0,0 +1,18 @@ +# Copyright (C) 2024 Intel Corporation +# Part of the Unified-Runtime Project, under the Apache License v2.0 with LLVM Exceptions. 
+# See LICENSE.TXT +# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +if (UR_BUILD_ADAPTER_CUDA) + add_conformance_test_with_kernels_environment( + exp_enqueue_native_cuda + enqueue_native_cuda.cpp + ) + target_include_directories(test-exp_enqueue_native_cuda PRIVATE + ${PROJECT_SOURCE_DIR}/source + ${PROJECT_SOURCE_DIR}/source/adapters/cuda + ) + target_link_libraries(test-exp_enqueue_native_cuda PRIVATE cudadrv) +endif() + +# TODO: Add more tests for different triples diff --git a/test/conformance/exp_enqueue_native/enqueue_native_cuda.cpp b/test/conformance/exp_enqueue_native/enqueue_native_cuda.cpp new file mode 100644 index 0000000000..8029d3ce6f --- /dev/null +++ b/test/conformance/exp_enqueue_native/enqueue_native_cuda.cpp @@ -0,0 +1,122 @@ +// Copyright (C) 2024 Intel Corporation +// Part of the Unified-Runtime Project, under the Apache License v2.0 with LLVM Exceptions. +// See LICENSE.TXT +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +#include +#include +#include + +using T = uint32_t; + +struct urCudaEnqueueNativeCommandTest : uur::urQueueTest { + void SetUp() { + UUR_RETURN_ON_FATAL_FAILURE(uur::urQueueTest::SetUp()); + + host_vec = std::vector(global_size, 0); + ASSERT_EQ(host_vec.size(), global_size); + ASSERT_SUCCESS(urUSMDeviceAlloc(context, device, nullptr, nullptr, + allocation_size, &device_ptr)); + ASSERT_NE(device_ptr, nullptr); + } + static constexpr T val = 42; + static constexpr uint32_t global_size = 1e7; + std::vector host_vec; + void *device_ptr = nullptr; + static constexpr size_t allocation_size = sizeof(val) * global_size; +}; + +UUR_INSTANTIATE_DEVICE_TEST_SUITE_P(urCudaEnqueueNativeCommandTest); + +struct InteropData1 { + void *fill_ptr; +}; + +// Fill a device ptr with the pattern val +void interop_func_1(ur_queue_handle_t hQueue, void *data) { + CUstream stream; + ASSERT_SUCCESS( + urQueueGetNativeHandle(hQueue, nullptr, (ur_native_handle_t *)&stream)); + InteropData1 *func_data = reinterpret_cast(data); 
+ + ASSERT_EQ(cuMemsetD32Async((CUdeviceptr)func_data->fill_ptr, + urCudaEnqueueNativeCommandTest::val, + urCudaEnqueueNativeCommandTest::global_size, + stream), + CUDA_SUCCESS); +} + +struct InteropData2 { + void *from, *to; +}; + +// Read from device ptr to host ptr +void interop_func_2(ur_queue_handle_t hQueue, void *data) { + CUstream stream; + ASSERT_SUCCESS( + urQueueGetNativeHandle(hQueue, nullptr, (ur_native_handle_t *)&stream)); + InteropData2 *func_data = reinterpret_cast(data); + + ASSERT_EQ(cuMemcpyDtoHAsync(func_data->to, (CUdeviceptr)func_data->from, + urCudaEnqueueNativeCommandTest::allocation_size, + stream), + CUDA_SUCCESS); +} + +TEST_P(urCudaEnqueueNativeCommandTest, Success) { + InteropData1 data_1{device_ptr}; + ur_event_handle_t event_1; + ASSERT_SUCCESS(urEnqueueNativeCommandExp( + queue, &interop_func_1, &data_1, 0, nullptr /*phMemList=*/, + nullptr /*pProperties=*/, 0, nullptr /*phEventWaitList=*/, &event_1)); +} + +TEST_P(urCudaEnqueueNativeCommandTest, Dependencies) { + ur_event_handle_t event_1, event_2; + + InteropData1 data_1{device_ptr}; + ASSERT_SUCCESS(urEnqueueNativeCommandExp( + queue, &interop_func_1, &data_1, 0, nullptr /*phMemList=*/, + nullptr /*pProperties=*/, 0, nullptr /*phEventWaitList=*/, &event_1)); + + InteropData2 data_2{device_ptr, host_vec.data()}; + ASSERT_SUCCESS(urEnqueueNativeCommandExp( + queue, &interop_func_2, &data_2, 0, nullptr /*phMemList=*/, + nullptr /*pProperties=*/, 1, &event_1, &event_2)); + urQueueFinish(queue); + for (auto &i : host_vec) { + ASSERT_EQ(i, val); + } +} + +TEST_P(urCudaEnqueueNativeCommandTest, DependenciesURBefore) { + ur_event_handle_t event_1, event_2; + + ASSERT_SUCCESS(urEnqueueUSMFill(queue, device_ptr, sizeof(val), &val, + allocation_size, 0, + nullptr /*phEventWaitList=*/, &event_1)); + + InteropData2 data_2{device_ptr, host_vec.data()}; + ASSERT_SUCCESS(urEnqueueNativeCommandExp( + queue, &interop_func_2, &data_2, 0, nullptr /*phMemList=*/, + nullptr /*pProperties=*/, 1, 
&event_1, &event_2)); + urQueueFinish(queue); + for (auto &i : host_vec) { + ASSERT_EQ(i, val); + } +} + +TEST_P(urCudaEnqueueNativeCommandTest, DependenciesURAfter) { + ur_event_handle_t event_1; + + InteropData1 data_1{device_ptr}; + ASSERT_SUCCESS(urEnqueueNativeCommandExp( + queue, &interop_func_1, &data_1, 0, nullptr /*phMemList=*/, + nullptr /*pProperties=*/, 0, nullptr /*phEventWaitList=*/, &event_1)); + + urEnqueueUSMMemcpy(queue, /*blocking*/ true, host_vec.data(), device_ptr, + allocation_size, 1, &event_1, nullptr); + for (auto &i : host_vec) { + ASSERT_EQ(i, val); + } +} diff --git a/test/conformance/exp_enqueue_native/exp_enqueue_native_cuda_adapter_cuda.match b/test/conformance/exp_enqueue_native/exp_enqueue_native_cuda_adapter_cuda.match new file mode 100644 index 0000000000..e69de29bb2 diff --git a/tools/urinfo/urinfo.hpp b/tools/urinfo/urinfo.hpp index caf61e86f6..2b1f8c89a7 100644 --- a/tools/urinfo/urinfo.hpp +++ b/tools/urinfo/urinfo.hpp @@ -404,5 +404,8 @@ inline void printDeviceInfos(ur_device_handle_t hDevice, std::cout << prefix; printDeviceInfo(hDevice, UR_DEVICE_INFO_TIMESTAMP_RECORDING_SUPPORT_EXP); + std::cout << prefix; + printDeviceInfo( + hDevice, UR_DEVICE_INFO_ENQUEUE_NATIVE_COMMAND_SUPPORT_EXP); } } // namespace urinfo