diff --git a/source/adapters/CMakeLists.txt b/source/adapters/CMakeLists.txt index 16bbfbb964..7d21bfec9d 100644 --- a/source/adapters/CMakeLists.txt +++ b/source/adapters/CMakeLists.txt @@ -37,8 +37,6 @@ if(UR_BUILD_ADAPTER_L0) endif() if(UR_BUILD_ADAPTER_CUDA) - set(SYCL_ADAPTER_DIR "${CMAKE_CURRENT_BINARY_DIR}/external/cuda") - FetchSource(https://github.com/intel/llvm.git ${INTEL_LLVM_TAG} "sycl/plugins/unified_runtime/ur" ${SYCL_ADAPTER_DIR}) add_subdirectory(cuda) endif() diff --git a/source/adapters/cuda/.clang-format b/source/adapters/cuda/.clang-format new file mode 100644 index 0000000000..c8daebc205 --- /dev/null +++ b/source/adapters/cuda/.clang-format @@ -0,0 +1,4 @@ +--- +Language: Cpp +BasedOnStyle: LLVM +... diff --git a/source/adapters/cuda/CMakeLists.txt b/source/adapters/cuda/CMakeLists.txt index 1222b17f67..89048bc297 100644 --- a/source/adapters/cuda/CMakeLists.txt +++ b/source/adapters/cuda/CMakeLists.txt @@ -3,43 +3,43 @@ # See LICENSE.TXT # SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -set(CUDA_DIR "${SYCL_ADAPTER_DIR}/sycl/plugins/unified_runtime/ur/adapters/cuda" CACHE PATH "CUDA adapter directory") +set(CUDA_DIR "${CMAKE_CURRENT_SOURCE_DIR}" CACHE PATH "CUDA adapter directory") set(TARGET_NAME ur_adapter_cuda) add_ur_adapter(${TARGET_NAME} SHARED - ${CUDA_DIR}/ur_interface_loader.cpp - ${CUDA_DIR}/adapter.hpp - ${CUDA_DIR}/adapter.cpp - ${CUDA_DIR}/command_buffer.hpp - ${CUDA_DIR}/command_buffer.cpp - ${CUDA_DIR}/common.hpp - ${CUDA_DIR}/common.cpp - ${CUDA_DIR}/context.hpp - ${CUDA_DIR}/context.cpp - ${CUDA_DIR}/device.hpp - ${CUDA_DIR}/device.cpp - ${CUDA_DIR}/enqueue.cpp - ${CUDA_DIR}/event.hpp - ${CUDA_DIR}/event.cpp - ${CUDA_DIR}/kernel.hpp - ${CUDA_DIR}/kernel.cpp - ${CUDA_DIR}/memory.hpp - ${CUDA_DIR}/memory.cpp - ${CUDA_DIR}/platform.hpp - ${CUDA_DIR}/platform.cpp - ${CUDA_DIR}/program.hpp - ${CUDA_DIR}/program.cpp - ${CUDA_DIR}/queue.hpp - ${CUDA_DIR}/queue.cpp - ${CUDA_DIR}/sampler.hpp - ${CUDA_DIR}/sampler.cpp - ${CUDA_DIR}/tracing.cpp - ${CUDA_DIR}/usm.cpp - ${CUDA_DIR}/usm_p2p.cpp - ${CUDA_DIR}/../../ur.cpp - ${CUDA_DIR}/../../ur.hpp + ${CMAKE_CURRENT_SOURCE_DIR}/ur_interface_loader.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/adapter.hpp + ${CMAKE_CURRENT_SOURCE_DIR}/adapter.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/command_buffer.hpp + ${CMAKE_CURRENT_SOURCE_DIR}/command_buffer.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/common.hpp + ${CMAKE_CURRENT_SOURCE_DIR}/common.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/context.hpp + ${CMAKE_CURRENT_SOURCE_DIR}/context.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/device.hpp + ${CMAKE_CURRENT_SOURCE_DIR}/device.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/enqueue.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/event.hpp + ${CMAKE_CURRENT_SOURCE_DIR}/event.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/kernel.hpp + ${CMAKE_CURRENT_SOURCE_DIR}/kernel.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/memory.hpp + ${CMAKE_CURRENT_SOURCE_DIR}/memory.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/platform.hpp + ${CMAKE_CURRENT_SOURCE_DIR}/platform.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/program.hpp + ${CMAKE_CURRENT_SOURCE_DIR}/program.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/queue.hpp + ${CMAKE_CURRENT_SOURCE_DIR}/queue.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/sampler.hpp + ${CMAKE_CURRENT_SOURCE_DIR}/sampler.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/tracing.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/usm.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/usm_p2p.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/../../ur/ur.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/../../ur/ur.hpp ) set_target_properties(${TARGET_NAME} PROPERTIES @@ -67,6 +67,12 @@ else() ) endif() +if(UMF_ENABLE_POOL_TRACKING) 
+  target_compile_definitions("ur_adapter_cuda" PRIVATE UMF_ENABLE_POOL_TRACKING)
+else()
+  message(WARNING "CUDA adapter USM pools are disabled, set UMF_ENABLE_POOL_TRACKING to enable them")
+endif()
+
 target_link_libraries(${TARGET_NAME} PRIVATE
     ${PROJECT_NAME}::headers
     ${PROJECT_NAME}::common
@@ -76,5 +82,5 @@ target_link_libraries(${TARGET_NAME} PRIVATE
 )
 
 target_include_directories(${TARGET_NAME} PRIVATE
-    ${CUDA_DIR}/../../../
+    "${CMAKE_CURRENT_SOURCE_DIR}/../../"
 )
diff --git a/source/adapters/cuda/adapter.cpp b/source/adapters/cuda/adapter.cpp
new file mode 100644
index 0000000000..e1179f487d
--- /dev/null
+++ b/source/adapters/cuda/adapter.cpp
@@ -0,0 +1,89 @@
+//===--------- adapter.cpp - CUDA Adapter ---------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include
+
+#include "common.hpp"
+
+void enableCUDATracing();
+void disableCUDATracing();
+
+struct ur_adapter_handle_t_ {
+  std::atomic<uint32_t> RefCount = 0;
+  std::mutex Mutex;
+};
+
+ur_adapter_handle_t_ adapter{};
+
+UR_APIEXPORT ur_result_t UR_APICALL urInit(ur_device_init_flags_t,
+                                           ur_loader_config_handle_t) {
+  return UR_RESULT_SUCCESS;
+}
+
+UR_APIEXPORT ur_result_t UR_APICALL urTearDown(void *) {
+  return UR_RESULT_SUCCESS;
+}
+
+UR_APIEXPORT ur_result_t UR_APICALL
+urAdapterGet(uint32_t NumEntries, ur_adapter_handle_t *phAdapters,
+             uint32_t *pNumAdapters) {
+  if (NumEntries > 0 && phAdapters) {
+    std::lock_guard<std::mutex> Lock{adapter.Mutex};
+    if (adapter.RefCount++ == 0) {
+      enableCUDATracing();
+    }
+
+    *phAdapters = &adapter;
+  }
+
+  if (pNumAdapters) {
+    *pNumAdapters = 1;
+  }
+
+  return UR_RESULT_SUCCESS;
+}
+
+UR_APIEXPORT ur_result_t UR_APICALL urAdapterRetain(ur_adapter_handle_t) {
+  adapter.RefCount++;
+
+  return UR_RESULT_SUCCESS;
+}
+
+UR_APIEXPORT ur_result_t UR_APICALL urAdapterRelease(ur_adapter_handle_t) {
+  std::lock_guard<std::mutex> Lock{adapter.Mutex};
+  if (--adapter.RefCount == 0) {
+    disableCUDATracing();
+  }
+  return UR_RESULT_SUCCESS;
+}
+
+UR_APIEXPORT ur_result_t UR_APICALL urAdapterGetLastError(
+    ur_adapter_handle_t, const char **ppMessage, int32_t *pError) {
+  std::ignore = pError;
+  *ppMessage = ErrorMessage;
+  return ErrorMessageCode;
+}
+
+UR_APIEXPORT ur_result_t UR_APICALL urAdapterGetInfo(ur_adapter_handle_t,
+                                                     ur_adapter_info_t propName,
+                                                     size_t propSize,
+                                                     void *pPropValue,
+                                                     size_t *pPropSizeRet) {
+  UrReturnHelper ReturnValue(propSize, pPropValue, pPropSizeRet);
+
+  switch (propName) {
+  case UR_ADAPTER_INFO_BACKEND:
+    return ReturnValue(UR_ADAPTER_BACKEND_CUDA);
+  case UR_ADAPTER_INFO_REFERENCE_COUNT:
+    return ReturnValue(adapter.RefCount.load());
+  default:
+    return UR_RESULT_ERROR_INVALID_ENUMERATION;
+  }
+
+  return UR_RESULT_SUCCESS;
+}
diff --git a/source/adapters/cuda/adapter.hpp b/source/adapters/cuda/adapter.hpp
new file mode 100644
index 0000000000..7edf36e636
--- /dev/null
+++ b/source/adapters/cuda/adapter.hpp
@@ -0,0 +1,11 @@
+//===--------- adapter.hpp - CUDA Adapter ---------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +struct ur_adapter_handle_t_; + +extern ur_adapter_handle_t_ adapter; diff --git a/source/adapters/cuda/command_buffer.cpp b/source/adapters/cuda/command_buffer.cpp new file mode 100644 index 0000000000..e2e1784d13 --- /dev/null +++ b/source/adapters/cuda/command_buffer.cpp @@ -0,0 +1,253 @@ +//===--------- command_buffer.cpp - CUDA Adapter --------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "command_buffer.hpp" +#include "common.hpp" + +/// Stub implementations of UR experimental feature command-buffers + +UR_APIEXPORT ur_result_t UR_APICALL urCommandBufferCreateExp( + ur_context_handle_t hContext, ur_device_handle_t hDevice, + const ur_exp_command_buffer_desc_t *pCommandBufferDesc, + ur_exp_command_buffer_handle_t *phCommandBuffer) { + (void)hContext; + (void)hDevice; + (void)pCommandBufferDesc; + (void)phCommandBuffer; + detail::ur::die("Experimental Command-buffer feature is not " + "implemented for CUDA adapter."); + return UR_RESULT_ERROR_UNSUPPORTED_FEATURE; +} + +UR_APIEXPORT ur_result_t UR_APICALL +urCommandBufferRetainExp(ur_exp_command_buffer_handle_t hCommandBuffer) { + (void)hCommandBuffer; + + detail::ur::die("Experimental Command-buffer feature is not " + "implemented for CUDA adapter."); + return UR_RESULT_ERROR_UNSUPPORTED_FEATURE; +} + +UR_APIEXPORT ur_result_t UR_APICALL +urCommandBufferReleaseExp(ur_exp_command_buffer_handle_t hCommandBuffer) { + (void)hCommandBuffer; + + detail::ur::die("Experimental Command-buffer feature is not " + "implemented for CUDA adapter."); + return UR_RESULT_ERROR_UNSUPPORTED_FEATURE; +} + +UR_APIEXPORT ur_result_t UR_APICALL +urCommandBufferFinalizeExp(ur_exp_command_buffer_handle_t hCommandBuffer) { + (void)hCommandBuffer; + + detail::ur::die("Experimental Command-buffer feature is not " + "implemented for CUDA adapter."); + return UR_RESULT_ERROR_UNSUPPORTED_FEATURE; +} + +UR_APIEXPORT ur_result_t UR_APICALL urCommandBufferAppendKernelLaunchExp( + ur_exp_command_buffer_handle_t hCommandBuffer, ur_kernel_handle_t hKernel, + uint32_t workDim, const size_t *pGlobalWorkOffset, + const size_t *pGlobalWorkSize, const size_t *pLocalWorkSize, + uint32_t numSyncPointsInWaitList, + const ur_exp_command_buffer_sync_point_t *pSyncPointWaitList, + ur_exp_command_buffer_sync_point_t *pSyncPoint) { + (void)hCommandBuffer; + (void)hKernel; + (void)workDim; + (void)pGlobalWorkOffset; + (void)pGlobalWorkSize; + (void)pLocalWorkSize; + (void)numSyncPointsInWaitList; + (void)pSyncPointWaitList; + (void)pSyncPoint; + + detail::ur::die("Experimental Command-buffer feature is not " + "implemented for CUDA adapter."); + return UR_RESULT_ERROR_UNSUPPORTED_FEATURE; +} + +UR_APIEXPORT ur_result_t UR_APICALL urCommandBufferAppendMemcpyUSMExp( + ur_exp_command_buffer_handle_t hCommandBuffer, void *pDst, const void *pSrc, + size_t size, uint32_t numSyncPointsInWaitList, + const ur_exp_command_buffer_sync_point_t *pSyncPointWaitList, + ur_exp_command_buffer_sync_point_t *pSyncPoint) { + (void)hCommandBuffer; + (void)pDst; + (void)pSrc; + (void)size; + (void)numSyncPointsInWaitList; + (void)pSyncPointWaitList; + (void)pSyncPoint; + + 
detail::ur::die("Experimental Command-buffer feature is not " + "implemented for CUDA adapter."); + return UR_RESULT_ERROR_UNSUPPORTED_FEATURE; +} + +UR_APIEXPORT ur_result_t UR_APICALL urCommandBufferAppendMembufferCopyExp( + ur_exp_command_buffer_handle_t hCommandBuffer, ur_mem_handle_t hSrcMem, + ur_mem_handle_t hDstMem, size_t srcOffset, size_t dstOffset, size_t size, + uint32_t numSyncPointsInWaitList, + const ur_exp_command_buffer_sync_point_t *pSyncPointWaitList, + ur_exp_command_buffer_sync_point_t *pSyncPoint) { + (void)hCommandBuffer; + (void)hSrcMem; + (void)hDstMem; + (void)srcOffset; + (void)dstOffset; + (void)size; + (void)numSyncPointsInWaitList; + (void)pSyncPointWaitList; + (void)pSyncPoint; + + detail::ur::die("Experimental Command-buffer feature is not " + "implemented for CUDA adapter."); + return UR_RESULT_ERROR_UNSUPPORTED_FEATURE; +} + +UR_APIEXPORT ur_result_t UR_APICALL urCommandBufferAppendMembufferCopyRectExp( + ur_exp_command_buffer_handle_t hCommandBuffer, ur_mem_handle_t hSrcMem, + ur_mem_handle_t hDstMem, ur_rect_offset_t srcOrigin, + ur_rect_offset_t dstOrigin, ur_rect_region_t region, size_t srcRowPitch, + size_t srcSlicePitch, size_t dstRowPitch, size_t dstSlicePitch, + uint32_t numSyncPointsInWaitList, + const ur_exp_command_buffer_sync_point_t *pSyncPointWaitList, + ur_exp_command_buffer_sync_point_t *pSyncPoint) { + (void)hCommandBuffer; + (void)hSrcMem; + (void)hDstMem; + (void)srcOrigin; + (void)dstOrigin; + (void)region; + (void)srcRowPitch; + (void)srcSlicePitch; + (void)dstRowPitch; + (void)dstSlicePitch; + (void)numSyncPointsInWaitList; + (void)pSyncPointWaitList; + (void)pSyncPoint; + + detail::ur::die("Experimental Command-buffer feature is not " + "implemented for CUDA adapter."); + return UR_RESULT_ERROR_UNSUPPORTED_FEATURE; +} + +UR_APIEXPORT +ur_result_t UR_APICALL urCommandBufferAppendMembufferWriteExp( + ur_exp_command_buffer_handle_t hCommandBuffer, ur_mem_handle_t hBuffer, + size_t offset, size_t size, const void *pSrc, + uint32_t numSyncPointsInWaitList, + const ur_exp_command_buffer_sync_point_t *pSyncPointWaitList, + ur_exp_command_buffer_sync_point_t *pSyncPoint) { + (void)hCommandBuffer; + (void)hBuffer; + (void)offset; + (void)size; + (void)pSrc; + (void)numSyncPointsInWaitList; + (void)pSyncPointWaitList; + (void)pSyncPoint; + + detail::ur::die("Experimental Command-buffer feature is not " + "implemented for CUDA adapter."); + return UR_RESULT_ERROR_UNSUPPORTED_FEATURE; +} + +UR_APIEXPORT +ur_result_t UR_APICALL urCommandBufferAppendMembufferReadExp( + ur_exp_command_buffer_handle_t hCommandBuffer, ur_mem_handle_t hBuffer, + size_t offset, size_t size, void *pDst, uint32_t numSyncPointsInWaitList, + const ur_exp_command_buffer_sync_point_t *pSyncPointWaitList, + ur_exp_command_buffer_sync_point_t *pSyncPoint) { + (void)hCommandBuffer; + (void)hBuffer; + (void)offset; + (void)size; + (void)pDst; + (void)numSyncPointsInWaitList; + (void)pSyncPointWaitList; + (void)pSyncPoint; + + detail::ur::die("Experimental Command-buffer feature is not " + "implemented for CUDA adapter."); + return UR_RESULT_ERROR_UNSUPPORTED_FEATURE; +} + +UR_APIEXPORT +ur_result_t UR_APICALL urCommandBufferAppendMembufferWriteRectExp( + ur_exp_command_buffer_handle_t hCommandBuffer, ur_mem_handle_t hBuffer, + ur_rect_offset_t bufferOffset, ur_rect_offset_t hostOffset, + ur_rect_region_t region, size_t bufferRowPitch, size_t bufferSlicePitch, + size_t hostRowPitch, size_t hostSlicePitch, void *pSrc, + uint32_t numSyncPointsInWaitList, + const 
ur_exp_command_buffer_sync_point_t *pSyncPointWaitList, + ur_exp_command_buffer_sync_point_t *pSyncPoint) { + (void)hCommandBuffer; + (void)hBuffer; + (void)bufferOffset; + (void)hostOffset; + (void)region; + (void)bufferRowPitch; + (void)bufferSlicePitch; + (void)hostRowPitch; + (void)hostSlicePitch; + (void)pSrc; + (void)numSyncPointsInWaitList; + (void)pSyncPointWaitList; + (void)pSyncPoint; + + detail::ur::die("Experimental Command-buffer feature is not " + "implemented for CUDA adapter."); + return UR_RESULT_ERROR_UNSUPPORTED_FEATURE; +} + +UR_APIEXPORT +ur_result_t UR_APICALL urCommandBufferAppendMembufferReadRectExp( + ur_exp_command_buffer_handle_t hCommandBuffer, ur_mem_handle_t hBuffer, + ur_rect_offset_t bufferOffset, ur_rect_offset_t hostOffset, + ur_rect_region_t region, size_t bufferRowPitch, size_t bufferSlicePitch, + size_t hostRowPitch, size_t hostSlicePitch, void *pDst, + uint32_t numSyncPointsInWaitList, + const ur_exp_command_buffer_sync_point_t *pSyncPointWaitList, + ur_exp_command_buffer_sync_point_t *pSyncPoint) { + (void)hCommandBuffer; + (void)hBuffer; + (void)bufferOffset; + (void)hostOffset; + (void)region; + (void)bufferRowPitch; + (void)bufferSlicePitch; + (void)hostRowPitch; + (void)hostSlicePitch; + (void)pDst; + + (void)numSyncPointsInWaitList; + (void)pSyncPointWaitList; + (void)pSyncPoint; + + detail::ur::die("Experimental Command-buffer feature is not " + "implemented for CUDA adapter."); + return UR_RESULT_ERROR_UNSUPPORTED_FEATURE; +} + +UR_APIEXPORT ur_result_t UR_APICALL urCommandBufferEnqueueExp( + ur_exp_command_buffer_handle_t hCommandBuffer, ur_queue_handle_t hQueue, + uint32_t numEventsInWaitList, const ur_event_handle_t *phEventWaitList, + ur_event_handle_t *phEvent) { + (void)hCommandBuffer; + (void)hQueue; + (void)numEventsInWaitList; + (void)phEventWaitList; + (void)phEvent; + + detail::ur::die("Experimental Command-buffer feature is not " + "implemented for CUDA adapter."); + return UR_RESULT_ERROR_UNSUPPORTED_FEATURE; +} diff --git a/source/adapters/cuda/command_buffer.hpp b/source/adapters/cuda/command_buffer.hpp new file mode 100644 index 0000000000..31ea4372ea --- /dev/null +++ b/source/adapters/cuda/command_buffer.hpp @@ -0,0 +1,13 @@ +//===--------- command_buffer.hpp - CUDA Adapter --------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include + +/// Stub implementation of command-buffers for CUDA + +struct ur_exp_command_buffer_handle_t_ {}; diff --git a/source/adapters/cuda/common.cpp b/source/adapters/cuda/common.cpp new file mode 100644 index 0000000000..5fcfe5993e --- /dev/null +++ b/source/adapters/cuda/common.cpp @@ -0,0 +1,139 @@ +//===--------- common.cpp - CUDA Adapter ----------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "common.hpp" + +#include + +#include + +ur_result_t mapErrorUR(CUresult Result) { + switch (Result) { + case CUDA_SUCCESS: + return UR_RESULT_SUCCESS; + case CUDA_ERROR_NOT_PERMITTED: + return UR_RESULT_ERROR_INVALID_OPERATION; + case CUDA_ERROR_INVALID_CONTEXT: + return UR_RESULT_ERROR_INVALID_CONTEXT; + case CUDA_ERROR_INVALID_DEVICE: + return UR_RESULT_ERROR_INVALID_DEVICE; + case CUDA_ERROR_INVALID_VALUE: + return UR_RESULT_ERROR_INVALID_VALUE; + case CUDA_ERROR_OUT_OF_MEMORY: + return UR_RESULT_ERROR_OUT_OF_HOST_MEMORY; + case CUDA_ERROR_LAUNCH_OUT_OF_RESOURCES: + return UR_RESULT_ERROR_OUT_OF_RESOURCES; + default: + return UR_RESULT_ERROR_UNKNOWN; + } +} + +void checkErrorUR(CUresult Result, const char *Function, int Line, + const char *File) { + if (Result == CUDA_SUCCESS || Result == CUDA_ERROR_DEINITIALIZED) { + return; + } + + if (std::getenv("SYCL_PI_SUPPRESS_ERROR_MESSAGE") == nullptr && + std::getenv("UR_SUPPRESS_ERROR_MESSAGE") == nullptr) { + const char *ErrorString = nullptr; + const char *ErrorName = nullptr; + cuGetErrorName(Result, &ErrorName); + cuGetErrorString(Result, &ErrorString); + std::stringstream SS; + SS << "\nUR CUDA ERROR:" + << "\n\tValue: " << Result + << "\n\tName: " << ErrorName + << "\n\tDescription: " << ErrorString + << "\n\tFunction: " << Function << "\n\tSource Location: " << File + << ":" << Line << "\n" + << std::endl; + std::cerr << SS.str(); + } + + if (std::getenv("PI_CUDA_ABORT") != nullptr || + std::getenv("UR_CUDA_ABORT") != nullptr) { + std::abort(); + } + + throw mapErrorUR(Result); +} + +void checkErrorUR(ur_result_t Result, const char *Function, int Line, + const char *File) { + if (Result == UR_RESULT_SUCCESS) { + return; + } + + if (std::getenv("SYCL_PI_SUPPRESS_ERROR_MESSAGE") == nullptr && + std::getenv("UR_SUPPRESS_ERROR_MESSAGE") == nullptr) { + std::stringstream SS; + SS << "\nUR ERROR:" + << "\n\tValue: " << Result + << "\n\tFunction: " << Function << "\n\tSource Location: " << File + << ":" << Line << "\n" + << std::endl; + std::cerr << SS.str(); + } + + if (std::getenv("PI_CUDA_ABORT") != nullptr) { + std::abort(); + } + + throw Result; +} + +std::string getCudaVersionString() { + int driver_version = 0; + cuDriverGetVersion(&driver_version); + // The version is returned as (1000 major + 10 minor). + std::stringstream stream; + stream << "CUDA " << driver_version / 1000 << "." 
+ << driver_version % 1000 / 10; + return stream.str(); +} + +void detail::ur::die(const char *Message) { + std::cerr << "ur_die: " << Message << std::endl; + std::terminate(); +} + +void detail::ur::assertion(bool Condition, const char *Message) { + if (!Condition) + die(Message); +} + +void detail::ur::cuPrint(const char *Message) { + std::cerr << "ur_print: " << Message << std::endl; +} + +// Global variables for ZER_EXT_RESULT_ADAPTER_SPECIFIC_ERROR +thread_local ur_result_t ErrorMessageCode = UR_RESULT_SUCCESS; +thread_local char ErrorMessage[MaxMessageSize]; + +// Utility function for setting a message and warning +[[maybe_unused]] void setErrorMessage(const char *pMessage, + ur_result_t ErrorCode) { + assert(strlen(pMessage) <= MaxMessageSize); + strcpy(ErrorMessage, pMessage); + ErrorMessageCode = ErrorCode; +} + +void setPluginSpecificMessage(CUresult cu_res) { + const char *error_string; + const char *error_name; + cuGetErrorName(cu_res, &error_name); + cuGetErrorString(cu_res, &error_string); + char *message = (char *)malloc(strlen(error_string) + strlen(error_name) + 2); + strcpy(message, error_name); + strcat(message, "\n"); + strcat(message, error_string); + + setErrorMessage(message, UR_RESULT_ERROR_ADAPTER_SPECIFIC); + free(message); +} diff --git a/source/adapters/cuda/common.hpp b/source/adapters/cuda/common.hpp new file mode 100644 index 0000000000..1f73a7030e --- /dev/null +++ b/source/adapters/cuda/common.hpp @@ -0,0 +1,59 @@ +//===--------- common.hpp - CUDA Adapter ----------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +#pragma once + +#include +#include + +ur_result_t mapErrorUR(CUresult Result); + +/// Converts CUDA error into UR error codes, and outputs error information +/// to stderr. +/// If PI_CUDA_ABORT env variable is defined, it aborts directly instead of +/// throwing the error. This is intended for debugging purposes. +/// \return UR_RESULT_SUCCESS if \param Result was CUDA_SUCCESS. +/// \throw ur_result_t exception (integer) if input was not success. +/// +void checkErrorUR(CUresult Result, const char *Function, int Line, + const char *File); + +void checkErrorUR(ur_result_t Result, const char *Function, int Line, + const char *File); + +#define UR_CHECK_ERROR(Result) \ + checkErrorUR(Result, __func__, __LINE__, __FILE__) + +std::string getCudaVersionString(); + +constexpr size_t MaxMessageSize = 256; +extern thread_local ur_result_t ErrorMessageCode; +extern thread_local char ErrorMessage[MaxMessageSize]; + +// Utility function for setting a message and warning +[[maybe_unused]] void setErrorMessage(const char *pMessage, + ur_result_t ErrorCode); + +void setPluginSpecificMessage(CUresult cu_res); + +/// ------ Error handling, matching OpenCL plugin semantics. +namespace detail { +namespace ur { + +// Report error and no return (keeps compiler from printing warnings). +// TODO: Probably change that to throw a catchable exception, +// but for now it is useful to see every failure. 
+//
+[[noreturn]] void die(const char *Message);
+
+// Reports error messages
+void cuPrint(const char *Message);
+
+void assertion(bool Condition, const char *Message = nullptr);
+
+} // namespace ur
+} // namespace detail
diff --git a/source/adapters/cuda/context.cpp b/source/adapters/cuda/context.cpp
new file mode 100644
index 0000000000..179902a538
--- /dev/null
+++ b/source/adapters/cuda/context.cpp
@@ -0,0 +1,161 @@
+//===--------- context.cpp - CUDA Adapter ---------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "context.hpp"
+#include "usm.hpp"
+
+#include <cassert>
+
+void ur_context_handle_t_::addPool(ur_usm_pool_handle_t Pool) {
+  std::lock_guard<std::mutex> Lock(Mutex);
+  PoolHandles.insert(Pool);
+}
+
+void ur_context_handle_t_::removePool(ur_usm_pool_handle_t Pool) {
+  std::lock_guard<std::mutex> Lock(Mutex);
+  PoolHandles.erase(Pool);
+}
+
+ur_usm_pool_handle_t
+ur_context_handle_t_::getOwningURPool(umf_memory_pool_t *UMFPool) {
+  std::lock_guard<std::mutex> Lock(Mutex);
+  for (auto &Pool : PoolHandles) {
+    if (Pool->hasUMFPool(UMFPool)) {
+      return Pool;
+    }
+  }
+  return nullptr;
+}
+
+/// Create a UR CUDA context.
+///
+/// By default creates a scoped context and keeps the last active CUDA context
+/// on top of the CUDA context stack.
+/// With the __SYCL_PI_CONTEXT_PROPERTIES_CUDA_PRIMARY key/id and a value of
+/// PI_TRUE creates a primary CUDA context and activates it on the CUDA context
+/// stack.
+///
+UR_APIEXPORT ur_result_t UR_APICALL
+urContextCreate(uint32_t DeviceCount, const ur_device_handle_t *phDevices,
+                const ur_context_properties_t *pProperties,
+                ur_context_handle_t *phContext) {
+  std::ignore = DeviceCount;
+  std::ignore = pProperties;
+
+  assert(DeviceCount == 1);
+  ur_result_t RetErr = UR_RESULT_SUCCESS;
+
+  std::unique_ptr<ur_context_handle_t_> ContextPtr{nullptr};
+  try {
+    ContextPtr = std::unique_ptr<ur_context_handle_t_>(
+        new ur_context_handle_t_{*phDevices});
+    *phContext = ContextPtr.release();
+  } catch (ur_result_t Err) {
+    RetErr = Err;
+  } catch (...) {
+    RetErr = UR_RESULT_ERROR_OUT_OF_RESOURCES;
+  }
+  return RetErr;
+}
+
+UR_APIEXPORT ur_result_t UR_APICALL urContextGetInfo(
+    ur_context_handle_t hContext, ur_context_info_t ContextInfoType,
+    size_t propSize, void *pContextInfo, size_t *pPropSizeRet) {
+  UrReturnHelper ReturnValue(propSize, pContextInfo, pPropSizeRet);
+
+  switch (static_cast<uint32_t>(ContextInfoType)) {
+  case UR_CONTEXT_INFO_NUM_DEVICES:
+    return ReturnValue(1);
+  case UR_CONTEXT_INFO_DEVICES:
+    return ReturnValue(hContext->getDevice());
+  case UR_CONTEXT_INFO_REFERENCE_COUNT:
+    return ReturnValue(hContext->getReferenceCount());
+  case UR_CONTEXT_INFO_ATOMIC_MEMORY_ORDER_CAPABILITIES: {
+    uint32_t Capabilities = UR_MEMORY_ORDER_CAPABILITY_FLAG_RELAXED |
+                            UR_MEMORY_ORDER_CAPABILITY_FLAG_ACQUIRE |
+                            UR_MEMORY_ORDER_CAPABILITY_FLAG_RELEASE |
+                            UR_MEMORY_ORDER_CAPABILITY_FLAG_ACQ_REL;
+    return ReturnValue(Capabilities);
+  }
+  case UR_CONTEXT_INFO_ATOMIC_MEMORY_SCOPE_CAPABILITIES: {
+    int Major = 0;
+    UR_CHECK_ERROR(cuDeviceGetAttribute(
+        &Major, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR,
+        hContext->getDevice()->get()));
+    uint32_t Capabilities =
+        (Major >= 7) ? UR_MEMORY_SCOPE_CAPABILITY_FLAG_WORK_ITEM |
+                           UR_MEMORY_SCOPE_CAPABILITY_FLAG_SUB_GROUP |
+                           UR_MEMORY_SCOPE_CAPABILITY_FLAG_WORK_GROUP |
+                           UR_MEMORY_SCOPE_CAPABILITY_FLAG_DEVICE |
+                           UR_MEMORY_SCOPE_CAPABILITY_FLAG_SYSTEM
+                     : UR_MEMORY_SCOPE_CAPABILITY_FLAG_WORK_ITEM |
+                           UR_MEMORY_SCOPE_CAPABILITY_FLAG_SUB_GROUP |
+                           UR_MEMORY_SCOPE_CAPABILITY_FLAG_WORK_GROUP |
+                           UR_MEMORY_SCOPE_CAPABILITY_FLAG_DEVICE;
+    return ReturnValue(Capabilities);
+  }
+  case UR_CONTEXT_INFO_USM_MEMCPY2D_SUPPORT:
+    // 2D USM memcpy is supported.
+    return ReturnValue(true);
+  case UR_CONTEXT_INFO_USM_FILL2D_SUPPORT:
+    // 2D USM operations currently not supported.
+    return ReturnValue(false);
+
+  default:
+    break;
+  }
+
+  return UR_RESULT_ERROR_INVALID_ENUMERATION;
+}
+
+UR_APIEXPORT ur_result_t UR_APICALL
+urContextRelease(ur_context_handle_t hContext) {
+  if (hContext->decrementReferenceCount() > 0) {
+    return UR_RESULT_SUCCESS;
+  }
+  hContext->invokeExtendedDeleters();
+
+  std::unique_ptr<ur_context_handle_t_> Context{hContext};
+
+  return UR_RESULT_SUCCESS;
+}
+
+UR_APIEXPORT ur_result_t UR_APICALL
+urContextRetain(ur_context_handle_t hContext) {
+  assert(hContext->getReferenceCount() > 0);
+
+  hContext->incrementReferenceCount();
+  return UR_RESULT_SUCCESS;
+}
+
+UR_APIEXPORT ur_result_t UR_APICALL urContextGetNativeHandle(
+    ur_context_handle_t hContext, ur_native_handle_t *phNativeContext) {
+  *phNativeContext = reinterpret_cast<ur_native_handle_t>(hContext->get());
+  return UR_RESULT_SUCCESS;
+}
+
+UR_APIEXPORT ur_result_t UR_APICALL urContextCreateWithNativeHandle(
+    ur_native_handle_t hNativeContext, uint32_t numDevices,
+    const ur_device_handle_t *phDevices,
+    const ur_context_native_properties_t *pProperties,
+    ur_context_handle_t *phContext) {
+  std::ignore = hNativeContext;
+  std::ignore = numDevices;
+  std::ignore = phDevices;
+  std::ignore = pProperties;
+  std::ignore = phContext;
+
+  return UR_RESULT_ERROR_UNSUPPORTED_FEATURE;
+}
+
+UR_APIEXPORT ur_result_t UR_APICALL urContextSetExtendedDeleter(
+    ur_context_handle_t hContext, ur_context_extended_deleter_t pfnDeleter,
+    void *pUserData) {
+  hContext->setExtendedDeleter(pfnDeleter, pUserData);
+  return UR_RESULT_SUCCESS;
+}
diff --git a/source/adapters/cuda/context.hpp b/source/adapters/cuda/context.hpp
new file mode 100644
index 0000000000..a321c14894
--- /dev/null
+++ b/source/adapters/cuda/context.hpp
@@ -0,0 +1,149 @@
+//===--------- context.hpp - CUDA Adapter ---------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+#pragma once
+
+#include <cuda.h>
+#include <ur_api.h>
+
+#include <atomic>
+#include <mutex>
+#include <set>
+#include <vector>
+
+#include "common.hpp"
+#include "device.hpp"
+
+#include <umf/memory_pool.h>
+
+typedef void (*ur_context_extended_deleter_t)(void *user_data);
+
+/// UR context mapping to a CUDA context object.
+///
+/// There is no direct mapping between a CUDA context and a UR context.
+/// The main differences are described below:
+///
+/// CUDA context vs UR context
+///
+/// One of the main differences between the UR API and the CUDA driver API is
+/// that the latter modifies the state of the threads by assigning
+/// `CUcontext` objects to threads. `CUcontext` objects store data associated
+/// with a given device and control access to said device from the user side.
+/// UR API contexts are objects that are passed to functions, and not bound
+/// to threads.
+/// The ur_context_handle_t_ object doesn't implement this behavior. It only
+/// holds the CUDA context data. The RAII object \ref ScopedContext implements
+/// the active context behavior.
+///
+/// Primary vs User-defined context
+///
+/// CUDA has two different types of context, the Primary context,
+/// which is usable by all threads on a given process for a given device, and
+/// the aforementioned custom contexts.
+/// The CUDA documentation, confirmed with performance analysis, suggests using
+/// the Primary context whenever possible.
+/// The Primary context is also used by the CUDA Runtime API.
+/// For UR applications to interop with the CUDA Runtime API, they have to use
+/// the primary context and make it active on the thread.
+/// The `ur_context_handle_t_` object can be constructed with a `kind` parameter
+/// that allows constructing a Primary or `user-defined` context, so that
+/// the UR object interface is always the same.
+///
+/// Destructor callback
+///
+/// As required by CP023 (SYCL Extended Context Destruction), the UR Context
+/// can store a number of callback functions that will be
+/// called upon destruction of the UR Context.
+/// See the proposal for details:
+/// https://github.com/codeplaysoftware/standards-proposals/blob/master/extended-context-destruction/index.md
+///
+struct ur_context_handle_t_ {
+
+  struct deleter_data {
+    ur_context_extended_deleter_t Function;
+    void *UserData;
+
+    void operator()() { Function(UserData); }
+  };
+
+  using native_type = CUcontext;
+
+  native_type CUContext;
+  ur_device_handle_t DeviceID;
+  std::atomic_uint32_t RefCount;
+
+  ur_context_handle_t_(ur_device_handle_t_ *DevID)
+      : CUContext{DevID->getContext()}, DeviceID{DevID}, RefCount{1} {
+    urDeviceRetain(DeviceID);
+  };
+
+  ~ur_context_handle_t_() { urDeviceRelease(DeviceID); }
+
+  void invokeExtendedDeleters() {
+    std::lock_guard<std::mutex> Guard(Mutex);
+    for (auto &Deleter : ExtendedDeleters) {
+      Deleter();
+    }
+  }
+
+  void setExtendedDeleter(ur_context_extended_deleter_t Function,
+                          void *UserData) {
+    std::lock_guard<std::mutex> Guard(Mutex);
+    ExtendedDeleters.emplace_back(deleter_data{Function, UserData});
+  }
+
+  ur_device_handle_t getDevice() const noexcept { return DeviceID; }
+
+  native_type get() const noexcept { return CUContext; }
+
+  uint32_t incrementReferenceCount() noexcept { return ++RefCount; }
+
+  uint32_t decrementReferenceCount() noexcept { return --RefCount; }
+
+  uint32_t getReferenceCount() const noexcept { return RefCount; }
+
+  void addPool(ur_usm_pool_handle_t Pool);
+
+  void removePool(ur_usm_pool_handle_t Pool);
+
+  ur_usm_pool_handle_t getOwningURPool(umf_memory_pool_t *UMFPool);
+
+private:
+  std::mutex Mutex;
+  std::vector<deleter_data> ExtendedDeleters;
+  std::set<ur_usm_pool_handle_t> PoolHandles;
+};
+
+namespace {
+class ScopedContext {
+public:
+  ScopedContext(ur_context_handle_t Context) {
+    if (!Context) {
+      throw UR_RESULT_ERROR_INVALID_CONTEXT;
+    }
+
+    setContext(Context->get());
+  }
+
+  ScopedContext(CUcontext NativeContext) { setContext(NativeContext); }
+
+  ~ScopedContext() {}
+
+private:
+  void setContext(CUcontext Desired) {
+    CUcontext Original = nullptr;
+
+    UR_CHECK_ERROR(cuCtxGetCurrent(&Original));
+
+    // Make sure the desired context is active on the current thread, setting
+    // it if necessary
+    if (Original != Desired) {
+      UR_CHECK_ERROR(cuCtxSetCurrent(Desired));
+    }
+  }
+};
+} // namespace
diff --git a/source/adapters/cuda/device.cpp b/source/adapters/cuda/device.cpp
new
file mode 100644 index 0000000000..ece3dca15a --- /dev/null +++ b/source/adapters/cuda/device.cpp @@ -0,0 +1,1212 @@ +//===--------- device.cpp - CUDA Adapter ----------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include +#include +#include + +#include "adapter.hpp" +#include "context.hpp" +#include "device.hpp" +#include "platform.hpp" + +int getAttribute(ur_device_handle_t device, CUdevice_attribute attribute) { + int value; + + UR_CHECK_ERROR(cuDeviceGetAttribute(&value, attribute, device->get())); + return value; +} + +uint64_t ur_device_handle_t_::getElapsedTime(CUevent ev) const { + float Milliseconds = 0.0f; + + // cuEventSynchronize waits till the event is ready for call to + // cuEventElapsedTime. + UR_CHECK_ERROR(cuEventSynchronize(EvBase)); + UR_CHECK_ERROR(cuEventSynchronize(ev)); + UR_CHECK_ERROR(cuEventElapsedTime(&Milliseconds, EvBase, ev)); + + return static_cast(Milliseconds * 1.0e6); +} + +UR_APIEXPORT ur_result_t UR_APICALL urDeviceGetInfo(ur_device_handle_t hDevice, + ur_device_info_t propName, + size_t propSize, + void *pPropValue, + size_t *pPropSizeRet) { + UrReturnHelper ReturnValue(propSize, pPropValue, pPropSizeRet); + + static constexpr uint32_t MaxWorkItemDimensions = 3u; + + ScopedContext Active(hDevice->getContext()); + + switch ((uint32_t)propName) { + case UR_DEVICE_INFO_TYPE: { + return ReturnValue(UR_DEVICE_TYPE_GPU); + } + case UR_DEVICE_INFO_VENDOR_ID: { + return ReturnValue(4318u); + } + case UR_DEVICE_INFO_MAX_COMPUTE_UNITS: { + int ComputeUnits = 0; + UR_CHECK_ERROR(cuDeviceGetAttribute( + &ComputeUnits, CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT, + hDevice->get())); + detail::ur::assertion(ComputeUnits >= 0); + return ReturnValue(static_cast(ComputeUnits)); + } + case UR_DEVICE_INFO_MAX_WORK_ITEM_DIMENSIONS: { + return ReturnValue(MaxWorkItemDimensions); + } + case UR_DEVICE_INFO_MAX_WORK_ITEM_SIZES: { + struct { + size_t Sizes[MaxWorkItemDimensions]; + } ReturnSizes; + + int MaxX = 0, MaxY = 0, MaxZ = 0; + UR_CHECK_ERROR(cuDeviceGetAttribute( + &MaxX, CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_X, hDevice->get())); + detail::ur::assertion(MaxX >= 0); + + UR_CHECK_ERROR(cuDeviceGetAttribute( + &MaxY, CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_Y, hDevice->get())); + detail::ur::assertion(MaxY >= 0); + + UR_CHECK_ERROR(cuDeviceGetAttribute( + &MaxZ, CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_Z, hDevice->get())); + detail::ur::assertion(MaxZ >= 0); + + ReturnSizes.Sizes[0] = size_t(MaxX); + ReturnSizes.Sizes[1] = size_t(MaxY); + ReturnSizes.Sizes[2] = size_t(MaxZ); + return ReturnValue(ReturnSizes); + } + + case UR_DEVICE_INFO_MAX_WORK_GROUPS_3D: { + struct { + size_t Sizes[MaxWorkItemDimensions]; + } ReturnSizes; + int MaxX = 0, MaxY = 0, MaxZ = 0; + UR_CHECK_ERROR(cuDeviceGetAttribute( + &MaxX, CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_X, hDevice->get())); + detail::ur::assertion(MaxX >= 0); + + UR_CHECK_ERROR(cuDeviceGetAttribute( + &MaxY, CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_Y, hDevice->get())); + detail::ur::assertion(MaxY >= 0); + + UR_CHECK_ERROR(cuDeviceGetAttribute( + &MaxZ, CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_Z, hDevice->get())); + detail::ur::assertion(MaxZ >= 0); + + ReturnSizes.Sizes[0] = size_t(MaxX); + ReturnSizes.Sizes[1] = size_t(MaxY); + ReturnSizes.Sizes[2] = size_t(MaxZ); + return 
ReturnValue(ReturnSizes); + } + + case UR_DEVICE_INFO_MAX_WORK_GROUP_SIZE: { + int MaxWorkGroupSize = 0; + UR_CHECK_ERROR(cuDeviceGetAttribute( + &MaxWorkGroupSize, CU_DEVICE_ATTRIBUTE_MAX_THREADS_PER_BLOCK, + hDevice->get())); + + detail::ur::assertion(MaxWorkGroupSize >= 0); + + return ReturnValue(size_t(MaxWorkGroupSize)); + } + case UR_DEVICE_INFO_PREFERRED_VECTOR_WIDTH_CHAR: { + return ReturnValue(1u); + } + case UR_DEVICE_INFO_PREFERRED_VECTOR_WIDTH_SHORT: { + return ReturnValue(1u); + } + case UR_DEVICE_INFO_PREFERRED_VECTOR_WIDTH_INT: { + return ReturnValue(1u); + } + case UR_DEVICE_INFO_PREFERRED_VECTOR_WIDTH_LONG: { + return ReturnValue(1u); + } + case UR_DEVICE_INFO_PREFERRED_VECTOR_WIDTH_FLOAT: { + return ReturnValue(1u); + } + case UR_DEVICE_INFO_PREFERRED_VECTOR_WIDTH_DOUBLE: { + return ReturnValue(1u); + } + case UR_DEVICE_INFO_PREFERRED_VECTOR_WIDTH_HALF: { + return ReturnValue(0u); + } + case UR_DEVICE_INFO_NATIVE_VECTOR_WIDTH_CHAR: { + return ReturnValue(1u); + } + case UR_DEVICE_INFO_NATIVE_VECTOR_WIDTH_SHORT: { + return ReturnValue(1u); + } + case UR_DEVICE_INFO_NATIVE_VECTOR_WIDTH_INT: { + return ReturnValue(1u); + } + case UR_DEVICE_INFO_NATIVE_VECTOR_WIDTH_LONG: { + return ReturnValue(1u); + } + case UR_DEVICE_INFO_NATIVE_VECTOR_WIDTH_FLOAT: { + return ReturnValue(1u); + } + case UR_DEVICE_INFO_NATIVE_VECTOR_WIDTH_DOUBLE: { + return ReturnValue(1u); + } + case UR_DEVICE_INFO_NATIVE_VECTOR_WIDTH_HALF: { + return ReturnValue(0u); + } + case UR_DEVICE_INFO_MAX_NUM_SUB_GROUPS: { + // Number of sub-groups = max block size / warp size + possible remainder + int MaxThreads = 0; + UR_CHECK_ERROR(cuDeviceGetAttribute( + &MaxThreads, CU_DEVICE_ATTRIBUTE_MAX_THREADS_PER_BLOCK, + hDevice->get())); + int WarpSize = 0; + UR_CHECK_ERROR(cuDeviceGetAttribute( + &WarpSize, CU_DEVICE_ATTRIBUTE_WARP_SIZE, hDevice->get())); + int MaxWarps = (MaxThreads + WarpSize - 1) / WarpSize; + return ReturnValue(MaxWarps); + } + case UR_DEVICE_INFO_SUB_GROUP_INDEPENDENT_FORWARD_PROGRESS: { + // Volta provides independent thread scheduling + // TODO: Revisit for previous generation GPUs + int Major = 0; + UR_CHECK_ERROR(cuDeviceGetAttribute( + &Major, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR, hDevice->get())); + bool IFP = (Major >= 7); + return ReturnValue(IFP); + } + + case UR_DEVICE_INFO_ATOMIC_64: { + int Major = 0; + UR_CHECK_ERROR(cuDeviceGetAttribute( + &Major, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR, hDevice->get())); + + bool Atomic64 = (Major >= 6) ? true : false; + return ReturnValue(Atomic64); + } + case UR_DEVICE_INFO_ATOMIC_MEMORY_ORDER_CAPABILITIES: { + ur_memory_order_capability_flags_t Capabilities = + UR_MEMORY_ORDER_CAPABILITY_FLAG_RELAXED | + UR_MEMORY_ORDER_CAPABILITY_FLAG_ACQUIRE | + UR_MEMORY_ORDER_CAPABILITY_FLAG_RELEASE | + UR_MEMORY_ORDER_CAPABILITY_FLAG_ACQ_REL; + return ReturnValue(Capabilities); + } + case UR_DEVICE_INFO_ATOMIC_MEMORY_SCOPE_CAPABILITIES: { + int Major = 0; + UR_CHECK_ERROR(cuDeviceGetAttribute( + &Major, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR, hDevice->get())); + uint64_t Capabilities = + (Major >= 7) ? 
UR_MEMORY_SCOPE_CAPABILITY_FLAG_WORK_ITEM | + UR_MEMORY_SCOPE_CAPABILITY_FLAG_SUB_GROUP | + UR_MEMORY_SCOPE_CAPABILITY_FLAG_WORK_GROUP | + UR_MEMORY_SCOPE_CAPABILITY_FLAG_DEVICE | + UR_MEMORY_SCOPE_CAPABILITY_FLAG_SYSTEM + : UR_MEMORY_SCOPE_CAPABILITY_FLAG_WORK_ITEM | + UR_MEMORY_SCOPE_CAPABILITY_FLAG_SUB_GROUP | + UR_MEMORY_SCOPE_CAPABILITY_FLAG_WORK_GROUP | + UR_MEMORY_SCOPE_CAPABILITY_FLAG_DEVICE; + return ReturnValue(Capabilities); + } + + case UR_DEVICE_INFO_ATOMIC_FENCE_ORDER_CAPABILITIES: { + // SYCL2020 4.6.4.2 minimum mandated capabilities for + // atomic_fence_order_capabilities. + ur_memory_order_capability_flags_t Capabilities = + UR_MEMORY_ORDER_CAPABILITY_FLAG_RELAXED | + UR_MEMORY_ORDER_CAPABILITY_FLAG_ACQUIRE | + UR_MEMORY_ORDER_CAPABILITY_FLAG_RELEASE | + UR_MEMORY_ORDER_CAPABILITY_FLAG_ACQ_REL; + return ReturnValue(Capabilities); + } + case UR_DEVICE_INFO_ATOMIC_FENCE_SCOPE_CAPABILITIES: { + // SYCL2020 4.6.4.2 minimum mandated capabilities for + // atomic_fence/memory_scope_capabilities. + // Because scopes are hierarchical, wider scopes support all narrower + // scopes. At a minimum, each device must support WORK_ITEM, SUB_GROUP and + // WORK_GROUP. (https://github.com/KhronosGroup/SYCL-Docs/pull/382) + ur_memory_scope_capability_flags_t Capabilities = + UR_MEMORY_SCOPE_CAPABILITY_FLAG_WORK_ITEM | + UR_MEMORY_SCOPE_CAPABILITY_FLAG_SUB_GROUP | + UR_MEMORY_SCOPE_CAPABILITY_FLAG_WORK_GROUP; + return ReturnValue(Capabilities); + } + case UR_DEVICE_INFO_BFLOAT16: { + int Major = 0; + UR_CHECK_ERROR(cuDeviceGetAttribute( + &Major, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR, hDevice->get())); + + bool BFloat16 = (Major >= 8) ? true : false; + return ReturnValue(BFloat16); + } + case UR_DEVICE_INFO_SUB_GROUP_SIZES_INTEL: { + // NVIDIA devices only support one sub-group size (the warp size) + int WarpSize = 0; + UR_CHECK_ERROR(cuDeviceGetAttribute( + &WarpSize, CU_DEVICE_ATTRIBUTE_WARP_SIZE, hDevice->get())); + size_t Sizes[1] = {static_cast(WarpSize)}; + return ReturnValue(Sizes, 1); + } + case UR_DEVICE_INFO_MAX_CLOCK_FREQUENCY: { + int ClockFreq = 0; + UR_CHECK_ERROR(cuDeviceGetAttribute( + &ClockFreq, CU_DEVICE_ATTRIBUTE_CLOCK_RATE, hDevice->get())); + detail::ur::assertion(ClockFreq >= 0); + return ReturnValue(static_cast(ClockFreq) / 1000u); + } + case UR_DEVICE_INFO_ADDRESS_BITS: { + auto Bits = uint32_t{std::numeric_limits::digits}; + return ReturnValue(Bits); + } + case UR_DEVICE_INFO_MAX_MEM_ALLOC_SIZE: { + return ReturnValue(uint64_t{hDevice->getMaxAllocSize()}); + } + case UR_DEVICE_INFO_IMAGE_SUPPORTED: { + bool Enabled = false; + + if (std::getenv("SYCL_PI_CUDA_ENABLE_IMAGE_SUPPORT") != nullptr || + std::getenv("UR_CUDA_ENABLE_IMAGE_SUPPORT") != nullptr) { + Enabled = true; + } else { + detail::ur::cuPrint( + "Images are not fully supported by the CUDA BE, their support is " + "disabled by default. Their partial support can be activated by " + "setting SYCL_PI_CUDA_ENABLE_IMAGE_SUPPORT environment variable at " + "runtime."); + } + + return ReturnValue(Enabled); + } + case UR_DEVICE_INFO_MAX_READ_IMAGE_ARGS: { + // This call doesn't match to CUDA as it doesn't have images, but instead + // surfaces and textures. No clear call in the CUDA API to determine this, + // but some searching found as of SM 2.x 128 are supported. + return ReturnValue(128u); + } + case UR_DEVICE_INFO_MAX_WRITE_IMAGE_ARGS: { + // This call doesn't match to CUDA as it doesn't have images, but instead + // surfaces and textures. 
No clear call in the CUDA API to determine this, + // but some searching found as of SM 2.x 128 are supported. + return ReturnValue(128u); + } + case UR_DEVICE_INFO_IMAGE2D_MAX_HEIGHT: { + // Take the smaller of maximum surface and maximum texture height. + int TexHeight = 0; + UR_CHECK_ERROR(cuDeviceGetAttribute( + &TexHeight, CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_HEIGHT, + hDevice->get())); + detail::ur::assertion(TexHeight >= 0); + int SurfHeight = 0; + UR_CHECK_ERROR(cuDeviceGetAttribute( + &SurfHeight, CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE2D_HEIGHT, + hDevice->get())); + detail::ur::assertion(SurfHeight >= 0); + + int Min = std::min(TexHeight, SurfHeight); + + return ReturnValue(static_cast(Min)); + } + case UR_DEVICE_INFO_IMAGE2D_MAX_WIDTH: { + // Take the smaller of maximum surface and maximum texture width. + int TexWidth = 0; + UR_CHECK_ERROR(cuDeviceGetAttribute( + &TexWidth, CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_WIDTH, + hDevice->get())); + detail::ur::assertion(TexWidth >= 0); + int SurfWidth = 0; + UR_CHECK_ERROR(cuDeviceGetAttribute( + &SurfWidth, CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE2D_WIDTH, + hDevice->get())); + detail::ur::assertion(SurfWidth >= 0); + + int Min = std::min(TexWidth, SurfWidth); + + return ReturnValue(static_cast(Min)); + } + case UR_DEVICE_INFO_IMAGE3D_MAX_HEIGHT: { + // Take the smaller of maximum surface and maximum texture height. + int TexHeight = 0; + UR_CHECK_ERROR(cuDeviceGetAttribute( + &TexHeight, CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE3D_HEIGHT, + hDevice->get())); + detail::ur::assertion(TexHeight >= 0); + int SurfHeight = 0; + UR_CHECK_ERROR(cuDeviceGetAttribute( + &SurfHeight, CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE3D_HEIGHT, + hDevice->get())); + detail::ur::assertion(SurfHeight >= 0); + + int Min = std::min(TexHeight, SurfHeight); + + return ReturnValue(static_cast(Min)); + } + case UR_DEVICE_INFO_IMAGE3D_MAX_WIDTH: { + // Take the smaller of maximum surface and maximum texture width. + int TexWidth = 0; + UR_CHECK_ERROR(cuDeviceGetAttribute( + &TexWidth, CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE3D_WIDTH, + hDevice->get())); + detail::ur::assertion(TexWidth >= 0); + int SurfWidth = 0; + UR_CHECK_ERROR(cuDeviceGetAttribute( + &SurfWidth, CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE3D_WIDTH, + hDevice->get())); + detail::ur::assertion(SurfWidth >= 0); + + int Min = std::min(TexWidth, SurfWidth); + + return ReturnValue(static_cast(Min)); + } + case UR_DEVICE_INFO_IMAGE3D_MAX_DEPTH: { + // Take the smaller of maximum surface and maximum texture depth. + int TexDepth = 0; + UR_CHECK_ERROR(cuDeviceGetAttribute( + &TexDepth, CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE3D_DEPTH, + hDevice->get())); + detail::ur::assertion(TexDepth >= 0); + int SurfDepth = 0; + UR_CHECK_ERROR(cuDeviceGetAttribute( + &SurfDepth, CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE3D_DEPTH, + hDevice->get())); + detail::ur::assertion(SurfDepth >= 0); + + int Min = std::min(TexDepth, SurfDepth); + + return ReturnValue(static_cast(Min)); + } + case UR_DEVICE_INFO_IMAGE_MAX_BUFFER_SIZE: { + // Take the smaller of maximum surface and maximum texture width. 
+ int TexWidth = 0; + UR_CHECK_ERROR(cuDeviceGetAttribute( + &TexWidth, CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE1D_WIDTH, + hDevice->get())); + detail::ur::assertion(TexWidth >= 0); + int SurfWidth = 0; + UR_CHECK_ERROR(cuDeviceGetAttribute( + &SurfWidth, CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE1D_WIDTH, + hDevice->get())); + detail::ur::assertion(SurfWidth >= 0); + + int Min = std::min(TexWidth, SurfWidth); + + return ReturnValue(static_cast(Min)); + } + case UR_DEVICE_INFO_IMAGE_MAX_ARRAY_SIZE: { + return ReturnValue(0lu); + } + case UR_DEVICE_INFO_MAX_SAMPLERS: { + // This call is kind of meaningless for cuda, as samplers don't exist. + // Closest thing is textures, which is 128. + return ReturnValue(128u); + } + case UR_DEVICE_INFO_MAX_PARAMETER_SIZE: { + // https://docs.nvidia.com/cuda/cuda-c-programming-guide/#function-parameters + // __global__ function parameters are passed to the device via constant + // memory and are limited to 4 KB. + return ReturnValue(4000lu); + } + case UR_DEVICE_INFO_MEM_BASE_ADDR_ALIGN: { + int MemBaseAddrAlign = 0; + UR_CHECK_ERROR(cuDeviceGetAttribute(&MemBaseAddrAlign, + CU_DEVICE_ATTRIBUTE_TEXTURE_ALIGNMENT, + hDevice->get())); + // Multiply by 8 as clGetDeviceInfo returns this value in bits + MemBaseAddrAlign *= 8; + return ReturnValue(MemBaseAddrAlign); + } + case UR_DEVICE_INFO_HALF_FP_CONFIG: { + // TODO: is this config consistent across all NVIDIA GPUs? + return ReturnValue(0u); + } + case UR_DEVICE_INFO_SINGLE_FP_CONFIG: { + // TODO: is this config consistent across all NVIDIA GPUs? + ur_device_fp_capability_flags_t Config = + UR_DEVICE_FP_CAPABILITY_FLAG_DENORM | + UR_DEVICE_FP_CAPABILITY_FLAG_INF_NAN | + UR_DEVICE_FP_CAPABILITY_FLAG_ROUND_TO_NEAREST | + UR_DEVICE_FP_CAPABILITY_FLAG_ROUND_TO_ZERO | + UR_DEVICE_FP_CAPABILITY_FLAG_ROUND_TO_INF | + UR_DEVICE_FP_CAPABILITY_FLAG_FMA | + UR_DEVICE_FP_CAPABILITY_FLAG_CORRECTLY_ROUNDED_DIVIDE_SQRT; + return ReturnValue(Config); + } + case UR_DEVICE_INFO_DOUBLE_FP_CONFIG: { + // TODO: is this config consistent across all NVIDIA GPUs? + ur_device_fp_capability_flags_t Config = + UR_DEVICE_FP_CAPABILITY_FLAG_DENORM | + UR_DEVICE_FP_CAPABILITY_FLAG_INF_NAN | + UR_DEVICE_FP_CAPABILITY_FLAG_ROUND_TO_NEAREST | + UR_DEVICE_FP_CAPABILITY_FLAG_ROUND_TO_ZERO | + UR_DEVICE_FP_CAPABILITY_FLAG_ROUND_TO_INF | + UR_DEVICE_FP_CAPABILITY_FLAG_FMA; + return ReturnValue(Config); + } + case UR_DEVICE_INFO_GLOBAL_MEM_CACHE_TYPE: { + // TODO: is this config consistent across all NVIDIA GPUs? + return ReturnValue(UR_DEVICE_MEM_CACHE_TYPE_READ_WRITE_CACHE); + } + case UR_DEVICE_INFO_GLOBAL_MEM_CACHELINE_SIZE: { + // The value is documented for all existing GPUs in the CUDA programming + // guidelines, section "H.3.2. Global Memory". + return ReturnValue(128u); + } + case UR_DEVICE_INFO_GLOBAL_MEM_CACHE_SIZE: { + int CacheSize = 0; + UR_CHECK_ERROR(cuDeviceGetAttribute( + &CacheSize, CU_DEVICE_ATTRIBUTE_L2_CACHE_SIZE, hDevice->get())); + detail::ur::assertion(CacheSize >= 0); + // The L2 cache is global to the GPU. + return ReturnValue(static_cast(CacheSize)); + } + case UR_DEVICE_INFO_GLOBAL_MEM_SIZE: { + size_t Bytes = 0; + // Runtime API has easy access to this value, driver API info is scarse. 
+ detail::ur::assertion(cuDeviceTotalMem(&Bytes, hDevice->get()) == + CUDA_SUCCESS); + return ReturnValue(uint64_t{Bytes}); + } + case UR_DEVICE_INFO_MAX_CONSTANT_BUFFER_SIZE: { + int ConstantMemory = 0; + UR_CHECK_ERROR(cuDeviceGetAttribute( + &ConstantMemory, CU_DEVICE_ATTRIBUTE_TOTAL_CONSTANT_MEMORY, + hDevice->get())); + detail::ur::assertion(ConstantMemory >= 0); + + return ReturnValue(static_cast(ConstantMemory)); + } + case UR_DEVICE_INFO_MAX_CONSTANT_ARGS: { + // TODO: is there a way to retrieve this from CUDA driver API? + // Hard coded to value returned by clinfo for OpenCL 1.2 CUDA | GeForce GTX + // 1060 3GB + return ReturnValue(9u); + } + case UR_DEVICE_INFO_LOCAL_MEM_TYPE: { + return ReturnValue(UR_DEVICE_LOCAL_MEM_TYPE_LOCAL); + } + case UR_DEVICE_INFO_LOCAL_MEM_SIZE: { + // OpenCL's "local memory" maps most closely to CUDA's "shared memory". + // CUDA has its own definition of "local memory", which maps to OpenCL's + // "private memory". + if (hDevice->maxLocalMemSizeChosen()) { + return ReturnValue( + static_cast(hDevice->getMaxChosenLocalMem())); + } else { + int LocalMemSize = 0; + UR_CHECK_ERROR(cuDeviceGetAttribute( + &LocalMemSize, CU_DEVICE_ATTRIBUTE_MAX_SHARED_MEMORY_PER_BLOCK, + hDevice->get())); + detail::ur::assertion(LocalMemSize >= 0); + return ReturnValue(static_cast(LocalMemSize)); + } + } + case UR_DEVICE_INFO_ERROR_CORRECTION_SUPPORT: { + int ECCEnabled = 0; + UR_CHECK_ERROR(cuDeviceGetAttribute( + &ECCEnabled, CU_DEVICE_ATTRIBUTE_ECC_ENABLED, hDevice->get())); + + detail::ur::assertion((ECCEnabled == 0) | (ECCEnabled == 1)); + auto Result = static_cast(ECCEnabled); + return ReturnValue(Result); + } + case UR_DEVICE_INFO_HOST_UNIFIED_MEMORY: { + int IsIntegrated = 0; + UR_CHECK_ERROR(cuDeviceGetAttribute( + &IsIntegrated, CU_DEVICE_ATTRIBUTE_INTEGRATED, hDevice->get())); + + detail::ur::assertion((IsIntegrated == 0) | (IsIntegrated == 1)); + auto result = static_cast(IsIntegrated); + return ReturnValue(result); + } + case UR_DEVICE_INFO_PROFILING_TIMER_RESOLUTION: { + // Hard coded to value returned by clinfo for OpenCL 1.2 CUDA | GeForce GTX + // 1060 3GB + return ReturnValue(1000lu); + } + case UR_DEVICE_INFO_ENDIAN_LITTLE: { + return ReturnValue(true); + } + case UR_DEVICE_INFO_AVAILABLE: { + return ReturnValue(true); + } + case UR_DEVICE_INFO_BUILD_ON_SUBDEVICE: { + return ReturnValue(true); + } + case UR_DEVICE_INFO_COMPILER_AVAILABLE: { + return ReturnValue(true); + } + case UR_DEVICE_INFO_LINKER_AVAILABLE: { + return ReturnValue(true); + } + case UR_DEVICE_INFO_EXECUTION_CAPABILITIES: { + auto Capability = ur_device_exec_capability_flags_t{ + UR_DEVICE_EXEC_CAPABILITY_FLAG_KERNEL}; + return ReturnValue(Capability); + } + case UR_DEVICE_INFO_QUEUE_PROPERTIES: + return ReturnValue( + ur_queue_flag_t(UR_QUEUE_FLAG_OUT_OF_ORDER_EXEC_MODE_ENABLE | + UR_QUEUE_FLAG_PROFILING_ENABLE)); + case UR_DEVICE_INFO_QUEUE_ON_DEVICE_PROPERTIES: { + // The mandated minimum capability: + ur_queue_flags_t Capability = UR_QUEUE_FLAG_PROFILING_ENABLE | + UR_QUEUE_FLAG_OUT_OF_ORDER_EXEC_MODE_ENABLE; + return ReturnValue(Capability); + } + case UR_DEVICE_INFO_QUEUE_ON_HOST_PROPERTIES: { + // The mandated minimum capability: + ur_queue_flags_t Capability = UR_QUEUE_FLAG_PROFILING_ENABLE; + return ReturnValue(Capability); + } + case UR_DEVICE_INFO_BUILT_IN_KERNELS: { + // An empty string is returned if no built-in kernels are supported by the + // device. 
+ return ReturnValue(""); + } + case UR_DEVICE_INFO_PLATFORM: { + return ReturnValue(hDevice->getPlatform()); + } + case UR_DEVICE_INFO_NAME: { + static constexpr size_t MaxDeviceNameLength = 256u; + char Name[MaxDeviceNameLength]; + UR_CHECK_ERROR(cuDeviceGetName(Name, MaxDeviceNameLength, hDevice->get())); + return ReturnValue(Name, strlen(Name) + 1); + } + case UR_DEVICE_INFO_VENDOR: { + return ReturnValue("NVIDIA Corporation"); + } + case UR_DEVICE_INFO_DRIVER_VERSION: { + auto Version = getCudaVersionString(); + return ReturnValue(Version.c_str()); + } + case UR_DEVICE_INFO_PROFILE: { + return ReturnValue("CUDA"); + } + case UR_DEVICE_INFO_REFERENCE_COUNT: { + return ReturnValue(hDevice->getReferenceCount()); + } + case UR_DEVICE_INFO_VERSION: { + std::stringstream SS; + int Major; + UR_CHECK_ERROR(cuDeviceGetAttribute( + &Major, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR, hDevice->get())); + SS << Major; + int Minor; + UR_CHECK_ERROR(cuDeviceGetAttribute( + &Minor, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR, hDevice->get())); + SS << "." << Minor; + return ReturnValue(SS.str().c_str()); + } + case UR_EXT_DEVICE_INFO_OPENCL_C_VERSION: { + return ReturnValue(""); + } + case UR_DEVICE_INFO_EXTENSIONS: { + + std::string SupportedExtensions = "cl_khr_fp64 cl_khr_subgroups "; + SupportedExtensions += "pi_ext_intel_devicelib_assert "; + SupportedExtensions += " "; + + int Major = 0; + int Minor = 0; + + UR_CHECK_ERROR(cuDeviceGetAttribute( + &Major, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR, hDevice->get())); + UR_CHECK_ERROR(cuDeviceGetAttribute( + &Minor, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR, hDevice->get())); + + if ((Major >= 6) || ((Major == 5) && (Minor >= 3))) { + SupportedExtensions += "cl_khr_fp16 "; + } + + return ReturnValue(SupportedExtensions.c_str()); + } + case UR_DEVICE_INFO_PRINTF_BUFFER_SIZE: { + // The minimum value for the FULL profile is 1 MB. + return ReturnValue(1024lu); + } + case UR_DEVICE_INFO_PREFERRED_INTEROP_USER_SYNC: { + return ReturnValue(true); + } + case UR_DEVICE_INFO_PARENT_DEVICE: { + return ReturnValue(nullptr); + } + case UR_DEVICE_INFO_PARTITION_MAX_SUB_DEVICES: { + return ReturnValue(0u); + } + case UR_DEVICE_INFO_SUPPORTED_PARTITIONS: { + if (pPropSizeRet) { + *pPropSizeRet = 0; + } + return UR_RESULT_SUCCESS; + } + + case UR_DEVICE_INFO_PARTITION_AFFINITY_DOMAIN: { + return ReturnValue(0u); + } + case UR_DEVICE_INFO_PARTITION_TYPE: { + if (pPropSizeRet) { + *pPropSizeRet = 0; + } + return UR_RESULT_SUCCESS; + } + + // Intel USM extensions + + case UR_DEVICE_INFO_USM_HOST_SUPPORT: { + // from cl_intel_unified_shared_memory: "The host memory access capabilities + // apply to any host allocation." 
+ // + // query if/how the device can access page-locked host memory, possibly + // through PCIe, using the same pointer as the host + uint32_t Value = {}; + if (getAttribute(hDevice, CU_DEVICE_ATTRIBUTE_UNIFIED_ADDRESSING)) { + // the device shares a unified address space with the host + if (getAttribute(hDevice, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR) >= + 6) { + // compute capability 6.x introduces operations that are atomic with + // respect to other CPUs and GPUs in the system + Value = UR_DEVICE_USM_ACCESS_CAPABILITY_FLAG_ACCESS | + UR_DEVICE_USM_ACCESS_CAPABILITY_FLAG_ATOMIC_ACCESS | + UR_DEVICE_USM_ACCESS_CAPABILITY_FLAG_CONCURRENT_ACCESS | + UR_DEVICE_USM_ACCESS_CAPABILITY_FLAG_ATOMIC_CONCURRENT_ACCESS; + } else { + // on GPU architectures with compute capability lower than 6.x, atomic + // operations from the GPU to CPU memory will not be atomic with respect + // to CPU initiated atomic operations + Value = UR_DEVICE_USM_ACCESS_CAPABILITY_FLAG_ACCESS | + UR_DEVICE_USM_ACCESS_CAPABILITY_FLAG_CONCURRENT_ACCESS; + } + } + return ReturnValue(Value); + } + case UR_DEVICE_INFO_USM_DEVICE_SUPPORT: { + // from cl_intel_unified_shared_memory: + // "The device memory access capabilities apply to any device allocation + // associated with this device." + // + // query how the device can access memory allocated on the device itself (?) + uint32_t Value = + UR_DEVICE_USM_ACCESS_CAPABILITY_FLAG_ACCESS | + UR_DEVICE_USM_ACCESS_CAPABILITY_FLAG_ATOMIC_ACCESS | + UR_DEVICE_USM_ACCESS_CAPABILITY_FLAG_CONCURRENT_ACCESS | + UR_DEVICE_USM_ACCESS_CAPABILITY_FLAG_ATOMIC_CONCURRENT_ACCESS; + return ReturnValue(Value); + } + case UR_DEVICE_INFO_USM_SINGLE_SHARED_SUPPORT: { + // from cl_intel_unified_shared_memory: + // "The single device shared memory access capabilities apply to any shared + // allocation associated with this device." + // + // query if/how the device can access managed memory associated to it + uint32_t Value = {}; + if (getAttribute(hDevice, CU_DEVICE_ATTRIBUTE_MANAGED_MEMORY)) { + // the device can allocate managed memory on this system + Value = UR_DEVICE_USM_ACCESS_CAPABILITY_FLAG_ACCESS | + UR_DEVICE_USM_ACCESS_CAPABILITY_FLAG_ATOMIC_ACCESS; + } + if (getAttribute(hDevice, CU_DEVICE_ATTRIBUTE_CONCURRENT_MANAGED_ACCESS)) { + // the device can coherently access managed memory concurrently with the + // CPU + Value |= UR_DEVICE_USM_ACCESS_CAPABILITY_FLAG_CONCURRENT_ACCESS; + if (getAttribute(hDevice, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR) >= + 6) { + // compute capability 6.x introduces operations that are atomic with + // respect to other CPUs and GPUs in the system + Value |= UR_DEVICE_USM_ACCESS_CAPABILITY_FLAG_ATOMIC_CONCURRENT_ACCESS; + } + } + return ReturnValue(Value); + } + case UR_DEVICE_INFO_USM_CROSS_SHARED_SUPPORT: { + // from cl_intel_unified_shared_memory: + // "The cross-device shared memory access capabilities apply to any shared + // allocation associated with this device, or to any shared memory + // allocation on another device that also supports the same cross-device + // shared memory access capability." 
+    //
+    // query if/how the device can access managed memory associated to other
+    // devices
+    uint32_t Value = {};
+    if (getAttribute(hDevice, CU_DEVICE_ATTRIBUTE_MANAGED_MEMORY)) {
+      // the device can allocate managed memory on this system
+      Value |= UR_DEVICE_USM_ACCESS_CAPABILITY_FLAG_ACCESS;
+    }
+    if (getAttribute(hDevice, CU_DEVICE_ATTRIBUTE_CONCURRENT_MANAGED_ACCESS)) {
+      // all devices with the CU_DEVICE_ATTRIBUTE_CONCURRENT_MANAGED_ACCESS
+      // attribute can coherently access managed memory concurrently with the
+      // CPU
+      Value |= UR_DEVICE_USM_ACCESS_CAPABILITY_FLAG_CONCURRENT_ACCESS;
+    }
+    if (getAttribute(hDevice, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR) >=
+        6) {
+      // compute capability 6.x introduces operations that are atomic with
+      // respect to other CPUs and GPUs in the system
+      if (Value & UR_DEVICE_USM_ACCESS_CAPABILITY_FLAG_ACCESS)
+        Value |= UR_DEVICE_USM_ACCESS_CAPABILITY_FLAG_ATOMIC_ACCESS;
+      if (Value & UR_DEVICE_USM_ACCESS_CAPABILITY_FLAG_CONCURRENT_ACCESS)
+        Value |= UR_DEVICE_USM_ACCESS_CAPABILITY_FLAG_ATOMIC_CONCURRENT_ACCESS;
+    }
+    return ReturnValue(Value);
+  }
+  case UR_DEVICE_INFO_USM_SYSTEM_SHARED_SUPPORT: {
+    // from cl_intel_unified_shared_memory:
+    // "The shared system memory access capabilities apply to any allocations
+    // made by a system allocator, such as malloc or new."
+    //
+    // query if/how the device can access pageable host memory allocated by the
+    // system allocator
+    uint32_t Value = {};
+    if (getAttribute(hDevice, CU_DEVICE_ATTRIBUTE_PAGEABLE_MEMORY_ACCESS)) {
+      // the device supports coherently accessing pageable memory without
+      // calling cuMemHostRegister/cudaHostRegister on it
+      if (getAttribute(hDevice,
+                       CU_DEVICE_ATTRIBUTE_HOST_NATIVE_ATOMIC_SUPPORTED)) {
+        // the link between the device and the host supports native atomic
+        // operations
+        Value = UR_DEVICE_USM_ACCESS_CAPABILITY_FLAG_ACCESS |
+                UR_DEVICE_USM_ACCESS_CAPABILITY_FLAG_ATOMIC_ACCESS |
+                UR_DEVICE_USM_ACCESS_CAPABILITY_FLAG_CONCURRENT_ACCESS |
+                UR_DEVICE_USM_ACCESS_CAPABILITY_FLAG_ATOMIC_CONCURRENT_ACCESS;
+      } else {
+        // the link between the device and the host does not support native
+        // atomic operations
+        Value = UR_DEVICE_USM_ACCESS_CAPABILITY_FLAG_ACCESS |
+                UR_DEVICE_USM_ACCESS_CAPABILITY_FLAG_CONCURRENT_ACCESS;
+      }
+    }
+    return ReturnValue(Value);
+  }
+  case UR_DEVICE_INFO_ASYNC_BARRIER: {
+    int Value = getAttribute(hDevice,
+                             CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR) >= 8;
+    return ReturnValue(static_cast<bool>(Value));
+  }
+  case UR_DEVICE_INFO_BACKEND_RUNTIME_VERSION: {
+    int Major =
+        getAttribute(hDevice, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR);
+    int Minor =
+        getAttribute(hDevice, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR);
+    std::string Result = std::to_string(Major) + "." + std::to_string(Minor);
+    return ReturnValue(Result.c_str());
+  }
+
+  case UR_DEVICE_INFO_GLOBAL_MEM_FREE: {
+    size_t FreeMemory = 0;
+    size_t TotalMemory = 0;
+    detail::ur::assertion(cuMemGetInfo(&FreeMemory, &TotalMemory) ==
+                              CUDA_SUCCESS,
+                          "failed cuMemGetInfo() API.");
+    return ReturnValue(FreeMemory);
+  }
+  case UR_DEVICE_INFO_MEMORY_CLOCK_RATE: {
+    int Value = 0;
+    UR_CHECK_ERROR(cuDeviceGetAttribute(
+        &Value, CU_DEVICE_ATTRIBUTE_MEMORY_CLOCK_RATE, hDevice->get()));
+    detail::ur::assertion(Value >= 0);
+    // Convert kilohertz to megahertz when returning.
+    return ReturnValue(Value / 1000);
+  }
+  case UR_DEVICE_INFO_MEMORY_BUS_WIDTH: {
+    int Value = 0;
+    UR_CHECK_ERROR(cuDeviceGetAttribute(
+        &Value, CU_DEVICE_ATTRIBUTE_GLOBAL_MEMORY_BUS_WIDTH, hDevice->get()));
+    detail::ur::assertion(Value >= 0);
+    return ReturnValue(Value);
+  }
+  case UR_DEVICE_INFO_MAX_COMPUTE_QUEUE_INDICES: {
+    return ReturnValue(int32_t{1});
+  }
+  case UR_DEVICE_INFO_BINDLESS_IMAGES_SUPPORT_EXP: {
+    // On CUDA bindless images are supported.
+    return ReturnValue(true);
+  }
+  case UR_DEVICE_INFO_BINDLESS_IMAGES_SHARED_USM_SUPPORT_EXP: {
+    // On CUDA bindless images can be backed by shared (managed) USM.
+    return ReturnValue(true);
+  }
+  case UR_DEVICE_INFO_BINDLESS_IMAGES_1D_USM_SUPPORT_EXP: {
+    // On CUDA 1D bindless image USM is not supported.
+    // More specifically, linear filtering is not supported.
+    return ReturnValue(false);
+  }
+  case UR_DEVICE_INFO_BINDLESS_IMAGES_2D_USM_SUPPORT_EXP: {
+    // On CUDA 2D bindless image USM is supported.
+    return ReturnValue(true);
+  }
+  case UR_DEVICE_INFO_IMAGE_PITCH_ALIGN_EXP: {
+    int32_t tex_pitch_align = 0;
+    UR_CHECK_ERROR(cuDeviceGetAttribute(
+        &tex_pitch_align, CU_DEVICE_ATTRIBUTE_TEXTURE_PITCH_ALIGNMENT,
+        hDevice->get()));
+    return ReturnValue(tex_pitch_align);
+  }
+  case UR_DEVICE_INFO_MAX_IMAGE_LINEAR_WIDTH_EXP: {
+    int32_t tex_max_linear_width = 0;
+    UR_CHECK_ERROR(cuDeviceGetAttribute(
+        &tex_max_linear_width,
+        CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_LINEAR_WIDTH, hDevice->get()));
+    return ReturnValue(tex_max_linear_width);
+  }
+  case UR_DEVICE_INFO_MAX_IMAGE_LINEAR_HEIGHT_EXP: {
+    int32_t tex_max_linear_height = 0;
+    UR_CHECK_ERROR(cuDeviceGetAttribute(
+        &tex_max_linear_height,
+        CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_LINEAR_HEIGHT, hDevice->get()));
+    return ReturnValue(tex_max_linear_height);
+  }
+  case UR_DEVICE_INFO_MAX_IMAGE_LINEAR_PITCH_EXP: {
+    int32_t tex_max_linear_pitch = 0;
+    UR_CHECK_ERROR(cuDeviceGetAttribute(
+        &tex_max_linear_pitch,
+        CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_LINEAR_PITCH, hDevice->get()));
+    return ReturnValue(tex_max_linear_pitch);
+  }
+  case UR_DEVICE_INFO_MIPMAP_SUPPORT_EXP: {
+    // CUDA supports mipmaps.
+    return ReturnValue(true);
+  }
+  case UR_DEVICE_INFO_MIPMAP_ANISOTROPY_SUPPORT_EXP: {
+    // CUDA supports anisotropic filtering.
+    return ReturnValue(true);
+  }
+  case UR_DEVICE_INFO_MIPMAP_MAX_ANISOTROPY_EXP: {
+    // CUDA has no query for this, but documentation states max value is 16.
+    return ReturnValue(16.f);
+  }
+  case UR_DEVICE_INFO_MIPMAP_LEVEL_REFERENCE_SUPPORT_EXP: {
+    // CUDA supports creation of images from individual mipmap levels.
+    return ReturnValue(true);
+  }
+
+  case UR_DEVICE_INFO_INTEROP_MEMORY_IMPORT_SUPPORT_EXP: {
+    // CUDA supports importing external memory.
+    return ReturnValue(true);
+  }
+  case UR_DEVICE_INFO_INTEROP_MEMORY_EXPORT_SUPPORT_EXP: {
+    // CUDA does not support exporting its own device memory.
+    return ReturnValue(false);
+  }
+  case UR_DEVICE_INFO_INTEROP_SEMAPHORE_IMPORT_SUPPORT_EXP: {
+    // CUDA supports importing external semaphores.
+    return ReturnValue(true);
+  }
+  case UR_DEVICE_INFO_INTEROP_SEMAPHORE_EXPORT_SUPPORT_EXP: {
+    // CUDA does not support exporting semaphores or events.
+    return ReturnValue(false);
+  }
+  case UR_DEVICE_INFO_DEVICE_ID: {
+    int Value = 0;
+    UR_CHECK_ERROR(cuDeviceGetAttribute(
+        &Value, CU_DEVICE_ATTRIBUTE_PCI_DEVICE_ID, hDevice->get()));
+    detail::ur::assertion(Value >= 0);
+    return ReturnValue(Value);
+  }
+  case UR_DEVICE_INFO_UUID: {
+    CUuuid UUID;
+#if (CUDA_VERSION >= 11040)
+    detail::ur::assertion(cuDeviceGetUuid_v2(&UUID, hDevice->get()) ==
+                          CUDA_SUCCESS);
+#else
+    detail::ur::assertion(cuDeviceGetUuid(&UUID, hDevice->get()) ==
+                          CUDA_SUCCESS);
+#endif
+    std::array<unsigned char, 16> Name;
+    std::copy(UUID.bytes, UUID.bytes + 16, Name.begin());
+    return ReturnValue(Name.data(), 16);
+  }
+  case UR_DEVICE_INFO_MAX_MEMORY_BANDWIDTH: {
+    int Major = 0;
+    UR_CHECK_ERROR(cuDeviceGetAttribute(
+        &Major, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR, hDevice->get()));
+
+    int Minor = 0;
+    UR_CHECK_ERROR(cuDeviceGetAttribute(
+        &Minor, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR, hDevice->get()));
+
+    // Some specific devices seem to need special handling. See reference
+    // https://github.com/jeffhammond/HPCInfo/blob/master/cuda/gpu-detect.cu
+    bool IsXavierAGX = Major == 7 && Minor == 2;
+    bool IsOrinAGX = Major == 8 && Minor == 7;
+
+    int MemoryClockKHz = 0;
+    if (IsXavierAGX) {
+      MemoryClockKHz = 2133000;
+    } else if (IsOrinAGX) {
+      MemoryClockKHz = 3200000;
+    } else {
+      UR_CHECK_ERROR(cuDeviceGetAttribute(&MemoryClockKHz,
+                                          CU_DEVICE_ATTRIBUTE_MEMORY_CLOCK_RATE,
+                                          hDevice->get()));
+    }
+
+    int MemoryBusWidth = 0;
+    if (IsOrinAGX) {
+      MemoryBusWidth = 256;
+    } else {
+      UR_CHECK_ERROR(cuDeviceGetAttribute(
+          &MemoryBusWidth, CU_DEVICE_ATTRIBUTE_GLOBAL_MEMORY_BUS_WIDTH,
+          hDevice->get()));
+    }
+
+    uint32_t MemoryBandwidth = MemoryClockKHz * MemoryBusWidth * 250;
+
+    return ReturnValue(MemoryBandwidth);
+  }
+  case UR_DEVICE_INFO_IL_VERSION: {
+    std::string ILVersion = "nvptx-";
+
+    int DriverVersion = 0;
+    cuDriverGetVersion(&DriverVersion);
+    int Major = DriverVersion / 1000;
+    int Minor = DriverVersion % 1000 / 10;
+
+    // We can work out which ptx ISA version we support based on the versioning
+    // table published here
+    // https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#release-notes
+    // Major versions that we support are consistent in how they line up, so we
+    // can derive that easily. The minor versions for version 10 don't line up
+    // the same so it needs a special case. This is not ideal but it does seem
+    // to be the best bet to avoid a maintenance burden here.
+    ILVersion += std::to_string(Major - 4) + ".";
+    if (Major == 10) {
+      ILVersion += std::to_string(Minor + 3);
+    } else if (Major >= 11) {
+      ILVersion += std::to_string(Minor);
+    } else {
+      return UR_RESULT_ERROR_INVALID_VALUE;
+    }
+
+    return ReturnValue(ILVersion.data(), ILVersion.size());
+  }
+  case UR_DEVICE_INFO_MAX_REGISTERS_PER_WORK_GROUP: {
+    // Maximum number of 32-bit registers available to a thread block.
+    // Note: This number is shared by all thread blocks simultaneously resident
+    // on a multiprocessor.
+    int MaxRegisters{-1};
+    UR_CHECK_ERROR(cuDeviceGetAttribute(
+        &MaxRegisters, CU_DEVICE_ATTRIBUTE_MAX_REGISTERS_PER_BLOCK,
+        hDevice->get()));
+
+    detail::ur::assertion(MaxRegisters >= 0);
+
+    return ReturnValue(static_cast<uint32_t>(MaxRegisters));
+  }
+  case UR_DEVICE_INFO_MEM_CHANNEL_SUPPORT:
+    return ReturnValue(false);
+  case UR_DEVICE_INFO_IMAGE_SRGB:
+    return ReturnValue(false);
+  case UR_DEVICE_INFO_PCI_ADDRESS: {
+    constexpr size_t AddressBufferSize = 13;
+    char AddressBuffer[AddressBufferSize];
+    UR_CHECK_ERROR(
+        cuDeviceGetPCIBusId(AddressBuffer, AddressBufferSize, hDevice->get()));
+    // CUDA API (8.x - 12.1) guarantees 12 bytes + \0 are written
+    detail::ur::assertion(strnlen(AddressBuffer, AddressBufferSize) == 12);
+    return ReturnValue(AddressBuffer,
+                       strnlen(AddressBuffer, AddressBufferSize - 1) + 1);
+  }
+  case UR_DEVICE_INFO_KERNEL_SET_SPECIALIZATION_CONSTANTS:
+    return ReturnValue(false);
+  // TODO: Investigate if this information is available on CUDA.
+  case UR_DEVICE_INFO_HOST_PIPE_READ_WRITE_SUPPORTED:
+    return ReturnValue(false);
+  case UR_DEVICE_INFO_ESIMD_SUPPORT:
+    return ReturnValue(false);
+  case UR_DEVICE_INFO_MAX_READ_WRITE_IMAGE_ARGS:
+  case UR_DEVICE_INFO_GPU_EU_COUNT:
+  case UR_DEVICE_INFO_GPU_EU_SIMD_WIDTH:
+  case UR_DEVICE_INFO_GPU_EU_SLICES:
+  case UR_DEVICE_INFO_GPU_SUBSLICES_PER_SLICE:
+  case UR_DEVICE_INFO_GPU_EU_COUNT_PER_SUBSLICE:
+  case UR_DEVICE_INFO_GPU_HW_THREADS_PER_EU:
+  case UR_DEVICE_INFO_VIRTUAL_MEMORY_SUPPORT:
+    return UR_RESULT_ERROR_UNSUPPORTED_ENUMERATION;
+
+  default:
+    break;
+  }
+  return UR_RESULT_ERROR_INVALID_ENUMERATION;
+}
+
+/// \return UR_RESULT_SUCCESS if the function is executed successfully
+/// CUDA devices are always root devices so retain always returns success.
+UR_APIEXPORT ur_result_t UR_APICALL urDeviceRetain(ur_device_handle_t hDevice) {
+  std::ignore = hDevice;
+  return UR_RESULT_SUCCESS;
+}
+
+UR_APIEXPORT ur_result_t UR_APICALL
+urDevicePartition(ur_device_handle_t, const ur_device_partition_properties_t *,
+                  uint32_t, ur_device_handle_t *, uint32_t *) {
+  return UR_RESULT_ERROR_UNSUPPORTED_FEATURE;
+}
+
+/// \return UR_RESULT_SUCCESS always since CUDA devices are always root
+/// devices.
+UR_APIEXPORT ur_result_t UR_APICALL
+urDeviceRelease(ur_device_handle_t hDevice) {
+  std::ignore = hDevice;
+  return UR_RESULT_SUCCESS;
+}
+
+UR_APIEXPORT ur_result_t UR_APICALL urDeviceGet(ur_platform_handle_t hPlatform,
+                                                ur_device_type_t DeviceType,
+                                                uint32_t NumEntries,
+                                                ur_device_handle_t *phDevices,
+                                                uint32_t *pNumDevices) {
+  ur_result_t Result = UR_RESULT_SUCCESS;
+  const bool AskingForAll = DeviceType == UR_DEVICE_TYPE_ALL;
+  const bool AskingForDefault = DeviceType == UR_DEVICE_TYPE_DEFAULT;
+  const bool AskingForGPU = DeviceType == UR_DEVICE_TYPE_GPU;
+  const bool ReturnDevices = AskingForDefault || AskingForAll || AskingForGPU;
+
+  size_t NumDevices = ReturnDevices ? hPlatform->Devices.size() : 0;
+
+  try {
+    if (pNumDevices) {
+      *pNumDevices = NumDevices;
+    }
+
+    if (ReturnDevices && phDevices) {
+      for (size_t i = 0; i < std::min(size_t(NumEntries), NumDevices); ++i) {
+        phDevices[i] = hPlatform->Devices[i].get();
+      }
+    }
+
+    return Result;
+  } catch (ur_result_t Err) {
+    return Err;
+  } catch (...) {
+    return UR_RESULT_ERROR_OUT_OF_RESOURCES;
+  }
+}
+
+/// Gets the native CUDA handle of a UR device object
+///
+/// \param[in] device The UR device to get the native CUDA object of.
+/// \param[out] nativeHandle Set to the native handle of the UR device object.
+///
+/// \return UR_RESULT_SUCCESS
+
+UR_APIEXPORT ur_result_t UR_APICALL urDeviceGetNativeHandle(
+    ur_device_handle_t hDevice, ur_native_handle_t *phNativeHandle) {
+  *phNativeHandle = reinterpret_cast<ur_native_handle_t>(hDevice->get());
+  return UR_RESULT_SUCCESS;
+}
+
+/// Creates a UR device object from a CUDA device handle.
+/// NOTE: The created UR object does not take ownership of the native handle.
+///
+/// \param[in] nativeHandle The native handle to create UR device object from.
+/// \param[in] platform is the UR platform of the device.
+/// \param[out] device Set to the UR device object created from native handle.
+///
+/// \return TBD
+
+UR_APIEXPORT ur_result_t UR_APICALL urDeviceCreateWithNativeHandle(
+    ur_native_handle_t hNativeDevice, ur_platform_handle_t hPlatform,
+    const ur_device_native_properties_t *pProperties,
+    ur_device_handle_t *phDevice) {
+  std::ignore = pProperties;
+
+  // We can't cast between ur_native_handle_t and CUdevice, so memcpy the bits
+  // instead
+  CUdevice CuDevice = 0;
+  memcpy(&CuDevice, &hNativeDevice, sizeof(CUdevice));
+
+  auto IsDevice = [=](std::unique_ptr<ur_device_handle_t_> &Dev) {
+    return Dev->get() == CuDevice;
+  };
+
+  // If a platform is provided just check if the device is in it
+  if (hPlatform) {
+    auto SearchRes = std::find_if(begin(hPlatform->Devices),
+                                  end(hPlatform->Devices), IsDevice);
+    if (SearchRes != end(hPlatform->Devices)) {
+      *phDevice = SearchRes->get();
+      return UR_RESULT_SUCCESS;
+    }
+  }
+
+  // Get list of platforms
+  uint32_t NumPlatforms = 0;
+  ur_adapter_handle_t AdapterHandle = &adapter;
+  ur_result_t Result =
+      urPlatformGet(&AdapterHandle, 1, 0, nullptr, &NumPlatforms);
+  if (Result != UR_RESULT_SUCCESS)
+    return Result;
+
+  ur_platform_handle_t *Plat = static_cast<ur_platform_handle_t *>(
+      malloc(NumPlatforms * sizeof(ur_platform_handle_t)));
+  Result = urPlatformGet(&AdapterHandle, 1, NumPlatforms, Plat, nullptr);
+  if (Result != UR_RESULT_SUCCESS)
+    return Result;
+
+  // Iterate through platforms to find device that matches nativeHandle
+  for (uint32_t j = 0; j < NumPlatforms; ++j) {
+    auto SearchRes =
+        std::find_if(begin(Plat[j]->Devices), end(Plat[j]->Devices), IsDevice);
+    if (SearchRes != end(Plat[j]->Devices)) {
+      *phDevice = static_cast<ur_device_handle_t>((*SearchRes).get());
+      return UR_RESULT_SUCCESS;
+    }
+  }
+
+  // If the provided nativeHandle cannot be matched to an
+  // existing device return error
+  return UR_RESULT_ERROR_INVALID_OPERATION;
+}
+
+ur_result_t UR_APICALL urDeviceGetGlobalTimestamps(ur_device_handle_t hDevice,
+                                                   uint64_t *pDeviceTimestamp,
+                                                   uint64_t *pHostTimestamp) {
+  CUevent Event;
+  ScopedContext Active(hDevice->getContext());
+
+  if (pDeviceTimestamp) {
+    UR_CHECK_ERROR(cuEventCreate(&Event, CU_EVENT_DEFAULT));
+    UR_CHECK_ERROR(cuEventRecord(Event, 0));
+  }
+  if (pHostTimestamp) {
+
+    using namespace std::chrono;
+    *pHostTimestamp =
+        duration_cast<nanoseconds>(steady_clock::now().time_since_epoch())
+            .count();
+  }
+
+  if (pDeviceTimestamp) {
+    UR_CHECK_ERROR(cuEventSynchronize(Event));
+    *pDeviceTimestamp = hDevice->getElapsedTime(Event);
+  }
+
+  return UR_RESULT_SUCCESS;
+}
+
+/// \return If available, the first binary that is PTX
+///
+UR_APIEXPORT ur_result_t UR_APICALL urDeviceSelectBinary(
+    ur_device_handle_t hDevice, const ur_device_binary_t *pBinaries,
+    uint32_t NumBinaries, uint32_t *pSelectedBinary) {
+  std::ignore = hDevice;
+
+  // Look for an image for the NVPTX64 target, and return the first one that is
+  // found
+  for (uint32_t i = 0; i < NumBinaries; i++) {
+    if (strcmp(pBinaries[i].pDeviceTargetSpec,
+               UR_DEVICE_BINARY_TARGET_NVPTX64) == 0) {
+
*pSelectedBinary = i; + return UR_RESULT_SUCCESS; + } + } + + // No image can be loaded for the given device + return UR_RESULT_ERROR_INVALID_BINARY; +} diff --git a/source/adapters/cuda/device.hpp b/source/adapters/cuda/device.hpp new file mode 100644 index 0000000000..696630bd10 --- /dev/null +++ b/source/adapters/cuda/device.hpp @@ -0,0 +1,119 @@ +//===--------- device.hpp - CUDA Adapter ----------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +#pragma once + +#include + +struct ur_device_handle_t_ { +private: + using native_type = CUdevice; + + native_type CuDevice; + CUcontext CuContext; + CUevent EvBase; // CUDA event used as base counter + std::atomic_uint32_t RefCount; + ur_platform_handle_t Platform; + + static constexpr uint32_t MaxWorkItemDimensions = 3u; + size_t MaxWorkItemSizes[MaxWorkItemDimensions]; + size_t MaxWorkGroupSize{0}; + size_t MaxAllocSize{0}; + int MaxBlockDimY{0}; + int MaxBlockDimZ{0}; + int MaxRegsPerBlock{0}; + int MaxCapacityLocalMem{0}; + int MaxChosenLocalMem{0}; + bool MaxLocalMemSizeChosen{false}; + +public: + ur_device_handle_t_(native_type cuDevice, CUcontext cuContext, CUevent evBase, + ur_platform_handle_t platform) + : CuDevice(cuDevice), CuContext(cuContext), EvBase(evBase), RefCount{1}, + Platform(platform) { + + UR_CHECK_ERROR(cuDeviceGetAttribute( + &MaxBlockDimY, CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_Y, cuDevice)); + UR_CHECK_ERROR(cuDeviceGetAttribute( + &MaxBlockDimZ, CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_Z, cuDevice)); + UR_CHECK_ERROR(cuDeviceGetAttribute( + &MaxRegsPerBlock, CU_DEVICE_ATTRIBUTE_MAX_REGISTERS_PER_BLOCK, + cuDevice)); + + // Set local mem max size if env var is present + static const char *LocalMemSizePtrUR = + std::getenv("UR_CUDA_MAX_LOCAL_MEM_SIZE"); + static const char *LocalMemSizePtrPI = + std::getenv("SYCL_PI_CUDA_MAX_LOCAL_MEM_SIZE"); + static const char *LocalMemSizePtr = + LocalMemSizePtrUR ? LocalMemSizePtrUR + : (LocalMemSizePtrPI ? LocalMemSizePtrPI : nullptr); + + if (LocalMemSizePtr) { + cuDeviceGetAttribute( + &MaxCapacityLocalMem, + CU_DEVICE_ATTRIBUTE_MAX_SHARED_MEMORY_PER_BLOCK_OPTIN, cuDevice); + MaxChosenLocalMem = std::atoi(LocalMemSizePtr); + MaxLocalMemSizeChosen = true; + } + + // Max size of memory object allocation in bytes. + // The minimum value is max(min(1024 × 1024 × + // 1024, 1/4th of CL_DEVICE_GLOBAL_MEM_SIZE), + // 32 × 1024 × 1024) for devices that are not of type + // CL_DEVICE_TYPE_CUSTOM. 
+ size_t Global = 0; + UR_CHECK_ERROR(cuDeviceTotalMem(&Global, cuDevice)); + + auto QuarterGlobal = static_cast(Global / 4u); + + MaxAllocSize = std::max(std::min(1024u * 1024u * 1024u, QuarterGlobal), + 32u * 1024u * 1024u); + } + + ~ur_device_handle_t_() { cuDevicePrimaryCtxRelease(CuDevice); } + + native_type get() const noexcept { return CuDevice; }; + + CUcontext getContext() const noexcept { return CuContext; }; + + uint32_t getReferenceCount() const noexcept { return RefCount; } + + ur_platform_handle_t getPlatform() const noexcept { return Platform; }; + + uint64_t getElapsedTime(CUevent) const; + + void saveMaxWorkItemSizes(size_t Size, + size_t *SaveMaxWorkItemSizes) noexcept { + memcpy(MaxWorkItemSizes, SaveMaxWorkItemSizes, Size); + }; + + void saveMaxWorkGroupSize(int Value) noexcept { MaxWorkGroupSize = Value; }; + + void getMaxWorkItemSizes(size_t RetSize, + size_t *RetMaxWorkItemSizes) const noexcept { + memcpy(RetMaxWorkItemSizes, MaxWorkItemSizes, RetSize); + }; + + size_t getMaxWorkGroupSize() const noexcept { return MaxWorkGroupSize; }; + + size_t getMaxBlockDimY() const noexcept { return MaxBlockDimY; }; + + size_t getMaxBlockDimZ() const noexcept { return MaxBlockDimZ; }; + + size_t getMaxRegsPerBlock() const noexcept { return MaxRegsPerBlock; }; + + size_t getMaxAllocSize() const noexcept { return MaxAllocSize; }; + + int getMaxCapacityLocalMem() const noexcept { return MaxCapacityLocalMem; }; + + int getMaxChosenLocalMem() const noexcept { return MaxChosenLocalMem; }; + + bool maxLocalMemSizeChosen() { return MaxLocalMemSizeChosen; }; +}; + +int getAttribute(ur_device_handle_t Device, CUdevice_attribute Attribute); diff --git a/source/adapters/cuda/enqueue.cpp b/source/adapters/cuda/enqueue.cpp new file mode 100644 index 0000000000..ec1adce808 --- /dev/null +++ b/source/adapters/cuda/enqueue.cpp @@ -0,0 +1,1690 @@ +//===--------- enqueue.cpp - CUDA Adapter ---------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "enqueue.hpp" +#include "common.hpp" +#include "context.hpp" +#include "event.hpp" +#include "kernel.hpp" +#include "memory.hpp" +#include "queue.hpp" + +#include +#include + +ur_result_t enqueueEventsWait(ur_queue_handle_t CommandQueue, CUstream Stream, + uint32_t NumEventsInWaitList, + const ur_event_handle_t *EventWaitList) { + UR_ASSERT(EventWaitList, UR_RESULT_SUCCESS); + + try { + ScopedContext Active(CommandQueue->getContext()); + + auto Result = forLatestEvents( + EventWaitList, NumEventsInWaitList, + [Stream](ur_event_handle_t Event) -> ur_result_t { + if (Event->getStream() == Stream) { + return UR_RESULT_SUCCESS; + } else { + UR_CHECK_ERROR(cuStreamWaitEvent(Stream, Event->get(), 0)); + return UR_RESULT_SUCCESS; + } + }); + return Result; + } catch (ur_result_t Err) { + return Err; + } catch (...) { + return UR_RESULT_ERROR_UNKNOWN; + } +} + +template +void getUSMHostOrDevicePtr(PtrT USMPtr, CUmemorytype *OutMemType, + CUdeviceptr *OutDevPtr, PtrT *OutHostPtr) { + // do not throw if cuPointerGetAttribute returns CUDA_ERROR_INVALID_VALUE + // checks with PI_CHECK_ERROR are not suggested + CUresult Ret = cuPointerGetAttribute( + OutMemType, CU_POINTER_ATTRIBUTE_MEMORY_TYPE, (CUdeviceptr)USMPtr); + // ARRAY, UNIFIED types are not supported! 
+ assert(*OutMemType != CU_MEMORYTYPE_ARRAY && + *OutMemType != CU_MEMORYTYPE_UNIFIED); + + // pointer not known to the CUDA subsystem (possibly a system allocated ptr) + if (Ret == CUDA_ERROR_INVALID_VALUE) { + *OutMemType = CU_MEMORYTYPE_HOST; + *OutDevPtr = 0; + *OutHostPtr = USMPtr; + + // todo: resets the above "non-stick" error + } else if (Ret == CUDA_SUCCESS) { + *OutDevPtr = (*OutMemType == CU_MEMORYTYPE_DEVICE) + ? reinterpret_cast(USMPtr) + : 0; + *OutHostPtr = (*OutMemType == CU_MEMORYTYPE_HOST) ? USMPtr : nullptr; + } else { + UR_CHECK_ERROR(Ret); + } +} + +ur_result_t setCuMemAdvise(CUdeviceptr DevPtr, size_t Size, + ur_usm_advice_flags_t URAdviceFlags, + CUdevice Device) { + std::unordered_map + URToCUMemAdviseDeviceFlagsMap = { + {UR_USM_ADVICE_FLAG_SET_READ_MOSTLY, CU_MEM_ADVISE_SET_READ_MOSTLY}, + {UR_USM_ADVICE_FLAG_CLEAR_READ_MOSTLY, + CU_MEM_ADVISE_UNSET_READ_MOSTLY}, + {UR_USM_ADVICE_FLAG_SET_PREFERRED_LOCATION, + CU_MEM_ADVISE_SET_PREFERRED_LOCATION}, + {UR_USM_ADVICE_FLAG_CLEAR_PREFERRED_LOCATION, + CU_MEM_ADVISE_UNSET_PREFERRED_LOCATION}, + {UR_USM_ADVICE_FLAG_SET_ACCESSED_BY_DEVICE, + CU_MEM_ADVISE_SET_ACCESSED_BY}, + {UR_USM_ADVICE_FLAG_CLEAR_ACCESSED_BY_DEVICE, + CU_MEM_ADVISE_UNSET_ACCESSED_BY}, + }; + for (auto &FlagPair : URToCUMemAdviseDeviceFlagsMap) { + if (URAdviceFlags & FlagPair.first) { + UR_CHECK_ERROR(cuMemAdvise(DevPtr, Size, FlagPair.second, Device)); + } + } + + std::unordered_map + URToCUMemAdviseHostFlagsMap = { + {UR_USM_ADVICE_FLAG_SET_PREFERRED_LOCATION_HOST, + CU_MEM_ADVISE_SET_PREFERRED_LOCATION}, + {UR_USM_ADVICE_FLAG_CLEAR_PREFERRED_LOCATION_HOST, + CU_MEM_ADVISE_UNSET_PREFERRED_LOCATION}, + {UR_USM_ADVICE_FLAG_SET_ACCESSED_BY_HOST, + CU_MEM_ADVISE_SET_ACCESSED_BY}, + {UR_USM_ADVICE_FLAG_CLEAR_ACCESSED_BY_HOST, + CU_MEM_ADVISE_UNSET_ACCESSED_BY}, + }; + + for (auto &FlagPair : URToCUMemAdviseHostFlagsMap) { + if (URAdviceFlags & FlagPair.first) { + UR_CHECK_ERROR(cuMemAdvise(DevPtr, Size, FlagPair.second, CU_DEVICE_CPU)); + } + } + + std::array UnmappedMemAdviceFlags = { + UR_USM_ADVICE_FLAG_SET_NON_ATOMIC_MOSTLY, + UR_USM_ADVICE_FLAG_CLEAR_NON_ATOMIC_MOSTLY, + UR_USM_ADVICE_FLAG_BIAS_CACHED, UR_USM_ADVICE_FLAG_BIAS_UNCACHED}; + + for (auto &UnmappedFlag : UnmappedMemAdviceFlags) { + if (URAdviceFlags & UnmappedFlag) { + throw UR_RESULT_ERROR_INVALID_ENUMERATION; + } + } + + return UR_RESULT_SUCCESS; +} + +// Determine local work sizes that result in uniform work groups. +// The default threadsPerBlock only require handling the first work_dim +// dimension. +void guessLocalWorkSize(ur_device_handle_t Device, size_t *ThreadsPerBlock, + const size_t *GlobalWorkSize, const uint32_t WorkDim, + const size_t MaxThreadsPerBlock[3], + ur_kernel_handle_t Kernel, uint32_t LocalSize) { + assert(ThreadsPerBlock != nullptr); + assert(GlobalWorkSize != nullptr); + assert(Kernel != nullptr); + int MinGrid, MaxBlockSize; + size_t MaxBlockDim[3]; + + // The below assumes a three dimensional range but this is not guaranteed by + // UR. 
+ size_t GlobalSizeNormalized[3] = {1, 1, 1}; + for (uint32_t i = 0; i < WorkDim; i++) { + GlobalSizeNormalized[i] = GlobalWorkSize[i]; + } + + MaxBlockDim[1] = Device->getMaxBlockDimY(); + MaxBlockDim[2] = Device->getMaxBlockDimZ(); + + UR_CHECK_ERROR( + cuOccupancyMaxPotentialBlockSize(&MinGrid, &MaxBlockSize, Kernel->get(), + NULL, LocalSize, MaxThreadsPerBlock[0])); + + ThreadsPerBlock[2] = std::min(GlobalSizeNormalized[2], MaxBlockDim[2]); + ThreadsPerBlock[1] = + std::min(GlobalSizeNormalized[1], + std::min(MaxBlockSize / ThreadsPerBlock[2], MaxBlockDim[1])); + MaxBlockDim[0] = MaxBlockSize / (ThreadsPerBlock[1] * ThreadsPerBlock[2]); + ThreadsPerBlock[0] = std::min( + MaxThreadsPerBlock[0], std::min(GlobalSizeNormalized[0], MaxBlockDim[0])); + + static auto IsPowerOf2 = [](size_t Value) -> bool { + return Value && !(Value & (Value - 1)); + }; + + // Find a local work group size that is a divisor of the global + // work group size to produce uniform work groups. + // Additionally, for best compute utilisation, the local size has + // to be a power of two. + while (0u != (GlobalSizeNormalized[0] % ThreadsPerBlock[0]) || + !IsPowerOf2(ThreadsPerBlock[0])) { + --ThreadsPerBlock[0]; + } +} + +// Helper to verify out-of-registers case (exceeded block max registers). +// If the kernel requires a number of registers for the entire thread +// block exceeds the hardware limitations, then the cuLaunchKernel call +// will fail to launch with CUDA_ERROR_LAUNCH_OUT_OF_RESOURCES error. +bool hasExceededMaxRegistersPerBlock(ur_device_handle_t Device, + ur_kernel_handle_t Kernel, + size_t BlockSize) { + return BlockSize * Kernel->getRegsPerThread() > Device->getMaxRegsPerBlock(); +} + +/// Enqueues a wait on the given CUstream for all specified events (See +/// \ref enqueueEventWaitWithBarrier.) If the events list is empty, the enqueued +/// wait will wait on all previous events in the queue. +/// +UR_APIEXPORT ur_result_t UR_APICALL urEnqueueEventsWaitWithBarrier( + ur_queue_handle_t hQueue, uint32_t numEventsInWaitList, + const ur_event_handle_t *phEventWaitList, ur_event_handle_t *phEvent) { + // This function makes one stream work on the previous work (or work + // represented by input events) and then all future work waits on that stream. 
+ try { + ScopedContext Active(hQueue->getContext()); + uint32_t StreamToken; + ur_stream_guard_ Guard; + CUstream CuStream = hQueue->getNextComputeStream( + numEventsInWaitList, phEventWaitList, Guard, &StreamToken); + { + std::lock_guard GuardBarrier(hQueue->BarrierMutex); + if (hQueue->BarrierEvent == nullptr) { + UR_CHECK_ERROR( + cuEventCreate(&hQueue->BarrierEvent, CU_EVENT_DISABLE_TIMING)); + } + if (numEventsInWaitList == 0) { // wait on all work + if (hQueue->BarrierTmpEvent == nullptr) { + UR_CHECK_ERROR( + cuEventCreate(&hQueue->BarrierTmpEvent, CU_EVENT_DISABLE_TIMING)); + } + hQueue->syncStreams( + [CuStream, TmpEvent = hQueue->BarrierTmpEvent](CUstream s) { + if (CuStream != s) { + // record a new CUDA event on every stream and make one stream + // wait for these events + UR_CHECK_ERROR(cuEventRecord(TmpEvent, s)); + UR_CHECK_ERROR(cuStreamWaitEvent(CuStream, TmpEvent, 0)); + } + }); + } else { // wait just on given events + forLatestEvents(phEventWaitList, numEventsInWaitList, + [CuStream](ur_event_handle_t Event) -> ur_result_t { + if (Event->getQueue()->hasBeenSynchronized( + Event->getComputeStreamToken())) { + return UR_RESULT_SUCCESS; + } else { + UR_CHECK_ERROR( + cuStreamWaitEvent(CuStream, Event->get(), 0)); + return UR_RESULT_SUCCESS; + } + }); + } + + UR_CHECK_ERROR(cuEventRecord(hQueue->BarrierEvent, CuStream)); + for (unsigned int i = 0; i < hQueue->ComputeAppliedBarrier.size(); i++) { + hQueue->ComputeAppliedBarrier[i] = false; + } + for (unsigned int i = 0; i < hQueue->TransferAppliedBarrier.size(); i++) { + hQueue->TransferAppliedBarrier[i] = false; + } + } + + if (phEvent) { + *phEvent = ur_event_handle_t_::makeNative( + UR_COMMAND_EVENTS_WAIT_WITH_BARRIER, hQueue, CuStream, StreamToken); + UR_CHECK_ERROR((*phEvent)->start()); + UR_CHECK_ERROR((*phEvent)->record()); + } + + return UR_RESULT_SUCCESS; + } catch (ur_result_t Err) { + return Err; + } catch (...) { + return UR_RESULT_ERROR_UNKNOWN; + } +} + +/// Enqueues a wait on the given CUstream for all events. +/// See \ref enqueueEventWait +/// TODO: Add support for multiple streams once the Event class is properly +/// refactored. 
+/// +UR_APIEXPORT ur_result_t UR_APICALL urEnqueueEventsWait( + ur_queue_handle_t hQueue, uint32_t numEventsInWaitList, + const ur_event_handle_t *phEventWaitList, ur_event_handle_t *phEvent) { + return urEnqueueEventsWaitWithBarrier(hQueue, numEventsInWaitList, + phEventWaitList, phEvent); +} + +UR_APIEXPORT ur_result_t UR_APICALL urEnqueueKernelLaunch( + ur_queue_handle_t hQueue, ur_kernel_handle_t hKernel, uint32_t workDim, + const size_t *pGlobalWorkOffset, const size_t *pGlobalWorkSize, + const size_t *pLocalWorkSize, uint32_t numEventsInWaitList, + const ur_event_handle_t *phEventWaitList, ur_event_handle_t *phEvent) { + // Preconditions + UR_ASSERT(hQueue->getContext() == hKernel->getContext(), + UR_RESULT_ERROR_INVALID_KERNEL); + UR_ASSERT(workDim > 0, UR_RESULT_ERROR_INVALID_WORK_DIMENSION); + UR_ASSERT(workDim < 4, UR_RESULT_ERROR_INVALID_WORK_DIMENSION); + + if (*pGlobalWorkSize == 0) { + return urEnqueueEventsWaitWithBarrier(hQueue, numEventsInWaitList, + phEventWaitList, phEvent); + } + + // Set the number of threads per block to the number of threads per warp + // by default unless user has provided a better number + size_t ThreadsPerBlock[3] = {32u, 1u, 1u}; + size_t MaxWorkGroupSize = 0u; + size_t MaxThreadsPerBlock[3] = {}; + bool ProvidedLocalWorkGroupSize = (pLocalWorkSize != nullptr); + uint32_t LocalSize = hKernel->getLocalSize(); + ur_result_t Result = UR_RESULT_SUCCESS; + + try { + // Set the active context here as guessLocalWorkSize needs an active context + ScopedContext Active(hQueue->getContext()); + { + size_t *ReqdThreadsPerBlock = hKernel->ReqdThreadsPerBlock; + MaxWorkGroupSize = hQueue->Device->getMaxWorkGroupSize(); + hQueue->Device->getMaxWorkItemSizes(sizeof(MaxThreadsPerBlock), + MaxThreadsPerBlock); + + if (ProvidedLocalWorkGroupSize) { + auto IsValid = [&](int Dim) { + if (ReqdThreadsPerBlock[Dim] != 0 && + pLocalWorkSize[Dim] != ReqdThreadsPerBlock[Dim]) + return UR_RESULT_ERROR_INVALID_WORK_GROUP_SIZE; + + if (pLocalWorkSize[Dim] > MaxThreadsPerBlock[Dim]) + return UR_RESULT_ERROR_INVALID_WORK_GROUP_SIZE; + // Checks that local work sizes are a divisor of the global work sizes + // which includes that the local work sizes are neither larger than + // the global work sizes and not 0. + if (0u == pLocalWorkSize[Dim]) + return UR_RESULT_ERROR_INVALID_WORK_GROUP_SIZE; + if (0u != (pGlobalWorkSize[Dim] % pLocalWorkSize[Dim])) + return UR_RESULT_ERROR_INVALID_WORK_GROUP_SIZE; + ThreadsPerBlock[Dim] = pLocalWorkSize[Dim]; + return UR_RESULT_SUCCESS; + }; + + size_t KernelLocalWorkGroupSize = 0; + for (size_t Dim = 0; Dim < workDim; Dim++) { + auto Err = IsValid(Dim); + if (Err != UR_RESULT_SUCCESS) + return Err; + // If no error then sum the total local work size per dim. 
+ KernelLocalWorkGroupSize += pLocalWorkSize[Dim]; + } + + if (hasExceededMaxRegistersPerBlock(hQueue->Device, hKernel, + KernelLocalWorkGroupSize)) { + return UR_RESULT_ERROR_INVALID_WORK_GROUP_SIZE; + } + } else { + guessLocalWorkSize(hQueue->Device, ThreadsPerBlock, pGlobalWorkSize, + workDim, MaxThreadsPerBlock, hKernel, LocalSize); + } + } + + if (MaxWorkGroupSize < + ThreadsPerBlock[0] * ThreadsPerBlock[1] * ThreadsPerBlock[2]) { + return UR_RESULT_ERROR_INVALID_WORK_GROUP_SIZE; + } + + size_t BlocksPerGrid[3] = {1u, 1u, 1u}; + + for (size_t i = 0; i < workDim; i++) { + BlocksPerGrid[i] = + (pGlobalWorkSize[i] + ThreadsPerBlock[i] - 1) / ThreadsPerBlock[i]; + } + + std::unique_ptr RetImplEvent{nullptr}; + + uint32_t StreamToken; + ur_stream_guard_ Guard; + CUstream CuStream = hQueue->getNextComputeStream( + numEventsInWaitList, phEventWaitList, Guard, &StreamToken); + CUfunction CuFunc = hKernel->get(); + + Result = enqueueEventsWait(hQueue, CuStream, numEventsInWaitList, + phEventWaitList); + + // Set the implicit global offset parameter if kernel has offset variant + if (hKernel->get_with_offset_parameter()) { + std::uint32_t CudaImplicitOffset[3] = {0, 0, 0}; + if (pGlobalWorkOffset) { + for (size_t i = 0; i < workDim; i++) { + CudaImplicitOffset[i] = + static_cast(pGlobalWorkOffset[i]); + if (pGlobalWorkOffset[i] != 0) { + CuFunc = hKernel->get_with_offset_parameter(); + } + } + } + hKernel->setImplicitOffsetArg(sizeof(CudaImplicitOffset), + CudaImplicitOffset); + } + + auto &ArgIndices = hKernel->getArgIndices(); + + if (phEvent) { + RetImplEvent = + std::unique_ptr(ur_event_handle_t_::makeNative( + UR_COMMAND_KERNEL_LAUNCH, hQueue, CuStream, StreamToken)); + UR_CHECK_ERROR(RetImplEvent->start()); + } + + if (hQueue->getContext()->getDevice()->maxLocalMemSizeChosen()) { + // Set up local memory requirements for kernel. + auto Device = hQueue->getContext()->getDevice(); + if (Device->getMaxChosenLocalMem() < 0) { + bool EnvVarHasURPrefix = + (std::getenv("UR_CUDA_MAX_LOCAL_MEM_SIZE") != nullptr); + setErrorMessage(EnvVarHasURPrefix ? "Invalid value specified for " + "UR_CUDA_MAX_LOCAL_MEM_SIZE" + : "Invalid value specified for " + "SYCL_PI_CUDA_MAX_LOCAL_MEM_SIZE", + UR_RESULT_ERROR_ADAPTER_SPECIFIC); + return UR_RESULT_ERROR_ADAPTER_SPECIFIC; + } + if (LocalSize > static_cast(Device->getMaxCapacityLocalMem())) { + setErrorMessage("Too much local memory allocated for device", + UR_RESULT_ERROR_ADAPTER_SPECIFIC); + return UR_RESULT_ERROR_ADAPTER_SPECIFIC; + } + if (LocalSize > static_cast(Device->getMaxChosenLocalMem())) { + bool EnvVarHasURPrefix = + (std::getenv("UR_CUDA_MAX_LOCAL_MEM_SIZE") != nullptr); + setErrorMessage( + EnvVarHasURPrefix + ? "Local memory for kernel exceeds the amount requested using " + "UR_CUDA_MAX_LOCAL_MEM_SIZE. Try increasing the value of " + "UR_CUDA_MAX_LOCAL_MEM_SIZE." + : "Local memory for kernel exceeds the amount requested using " + "SYCL_PI_CUDA_MAX_LOCAL_MEM_SIZE. 
Try increasing the the " + "value of SYCL_PI_CUDA_MAX_LOCAL_MEM_SIZE.", + UR_RESULT_ERROR_ADAPTER_SPECIFIC); + return UR_RESULT_ERROR_ADAPTER_SPECIFIC; + } + UR_CHECK_ERROR(cuFuncSetAttribute( + CuFunc, CU_FUNC_ATTRIBUTE_MAX_DYNAMIC_SHARED_SIZE_BYTES, + Device->getMaxChosenLocalMem())); + } + + UR_CHECK_ERROR(cuLaunchKernel( + CuFunc, BlocksPerGrid[0], BlocksPerGrid[1], BlocksPerGrid[2], + ThreadsPerBlock[0], ThreadsPerBlock[1], ThreadsPerBlock[2], LocalSize, + CuStream, const_cast(ArgIndices.data()), nullptr)); + if (LocalSize != 0) + hKernel->clearLocalSize(); + + if (phEvent) { + UR_CHECK_ERROR(RetImplEvent->record()); + *phEvent = RetImplEvent.release(); + } + } catch (ur_result_t Err) { + Result = Err; + } + return Result; +} + +/// General 3D memory copy operation. +/// This function requires the corresponding CUDA context to be at the top of +/// the context stack +/// If the source and/or destination is on the device, SrcPtr and/or DstPtr +/// must be a pointer to a CUdeviceptr +static ur_result_t commonEnqueueMemBufferCopyRect( + CUstream cu_stream, ur_rect_region_t region, const void *SrcPtr, + const CUmemorytype_enum SrcType, ur_rect_offset_t src_offset, + size_t src_row_pitch, size_t src_slice_pitch, void *DstPtr, + const CUmemorytype_enum DstType, ur_rect_offset_t dst_offset, + size_t dst_row_pitch, size_t dst_slice_pitch) { + + UR_ASSERT(SrcType == CU_MEMORYTYPE_DEVICE || SrcType == CU_MEMORYTYPE_HOST, + UR_RESULT_ERROR_INVALID_MEM_OBJECT); + UR_ASSERT(DstType == CU_MEMORYTYPE_DEVICE || DstType == CU_MEMORYTYPE_HOST, + UR_RESULT_ERROR_INVALID_MEM_OBJECT); + + src_row_pitch = + (!src_row_pitch) ? region.width + src_offset.x : src_row_pitch; + src_slice_pitch = (!src_slice_pitch) + ? ((region.height + src_offset.y) * src_row_pitch) + : src_slice_pitch; + dst_row_pitch = + (!dst_row_pitch) ? region.width + dst_offset.x : dst_row_pitch; + dst_slice_pitch = (!dst_slice_pitch) + ? ((region.height + dst_offset.y) * dst_row_pitch) + : dst_slice_pitch; + + CUDA_MEMCPY3D params = {}; + + params.WidthInBytes = region.width; + params.Height = region.height; + params.Depth = region.depth; + + params.srcMemoryType = SrcType; + params.srcDevice = SrcType == CU_MEMORYTYPE_DEVICE + ? *static_cast(SrcPtr) + : 0; + params.srcHost = SrcType == CU_MEMORYTYPE_HOST ? SrcPtr : nullptr; + params.srcXInBytes = src_offset.x; + params.srcY = src_offset.y; + params.srcZ = src_offset.z; + params.srcPitch = src_row_pitch; + params.srcHeight = src_slice_pitch / src_row_pitch; + + params.dstMemoryType = DstType; + params.dstDevice = + DstType == CU_MEMORYTYPE_DEVICE ? *static_cast(DstPtr) : 0; + params.dstHost = DstType == CU_MEMORYTYPE_HOST ? 
DstPtr : nullptr; + params.dstXInBytes = dst_offset.x; + params.dstY = dst_offset.y; + params.dstZ = dst_offset.z; + params.dstPitch = dst_row_pitch; + params.dstHeight = dst_slice_pitch / dst_row_pitch; + + UR_CHECK_ERROR(cuMemcpy3DAsync(¶ms, cu_stream)); + + return UR_RESULT_SUCCESS; +} + +UR_APIEXPORT ur_result_t UR_APICALL urEnqueueMemBufferReadRect( + ur_queue_handle_t hQueue, ur_mem_handle_t hBuffer, bool blockingRead, + ur_rect_offset_t bufferOrigin, ur_rect_offset_t hostOrigin, + ur_rect_region_t region, size_t bufferRowPitch, size_t bufferSlicePitch, + size_t hostRowPitch, size_t hostSlicePitch, void *pDst, + uint32_t numEventsInWaitList, const ur_event_handle_t *phEventWaitList, + ur_event_handle_t *phEvent) { + ur_result_t Result = UR_RESULT_SUCCESS; + CUdeviceptr DevPtr = hBuffer->Mem.BufferMem.get(); + std::unique_ptr RetImplEvent{nullptr}; + + try { + ScopedContext Active(hQueue->getContext()); + CUstream CuStream = hQueue->getNextTransferStream(); + + Result = enqueueEventsWait(hQueue, CuStream, numEventsInWaitList, + phEventWaitList); + + if (phEvent) { + RetImplEvent = + std::unique_ptr(ur_event_handle_t_::makeNative( + UR_COMMAND_MEM_BUFFER_READ_RECT, hQueue, CuStream)); + UR_CHECK_ERROR(RetImplEvent->start()); + } + + Result = commonEnqueueMemBufferCopyRect( + CuStream, region, &DevPtr, CU_MEMORYTYPE_DEVICE, bufferOrigin, + bufferRowPitch, bufferSlicePitch, pDst, CU_MEMORYTYPE_HOST, hostOrigin, + hostRowPitch, hostSlicePitch); + + if (phEvent) { + UR_CHECK_ERROR(RetImplEvent->record()); + } + + if (blockingRead) { + UR_CHECK_ERROR(cuStreamSynchronize(CuStream)); + } + + if (phEvent) { + *phEvent = RetImplEvent.release(); + } + + } catch (ur_result_t Err) { + Result = Err; + } + return Result; +} + +UR_APIEXPORT ur_result_t UR_APICALL urEnqueueMemBufferWriteRect( + ur_queue_handle_t hQueue, ur_mem_handle_t hBuffer, bool blockingWrite, + ur_rect_offset_t bufferOrigin, ur_rect_offset_t hostOrigin, + ur_rect_region_t region, size_t bufferRowPitch, size_t bufferSlicePitch, + size_t hostRowPitch, size_t hostSlicePitch, void *pSrc, + uint32_t numEventsInWaitList, const ur_event_handle_t *phEventWaitList, + ur_event_handle_t *phEvent) { + ur_result_t Result = UR_RESULT_SUCCESS; + CUdeviceptr DevPtr = hBuffer->Mem.BufferMem.get(); + std::unique_ptr RetImplEvent{nullptr}; + + try { + ScopedContext active(hQueue->getContext()); + CUstream cuStream = hQueue->getNextTransferStream(); + Result = enqueueEventsWait(hQueue, cuStream, numEventsInWaitList, + phEventWaitList); + + if (phEvent) { + RetImplEvent = + std::unique_ptr(ur_event_handle_t_::makeNative( + UR_COMMAND_MEM_BUFFER_WRITE_RECT, hQueue, cuStream)); + UR_CHECK_ERROR(RetImplEvent->start()); + } + + Result = commonEnqueueMemBufferCopyRect( + cuStream, region, pSrc, CU_MEMORYTYPE_HOST, hostOrigin, hostRowPitch, + hostSlicePitch, &DevPtr, CU_MEMORYTYPE_DEVICE, bufferOrigin, + bufferRowPitch, bufferSlicePitch); + + if (phEvent) { + UR_CHECK_ERROR(RetImplEvent->record()); + } + + if (blockingWrite) { + UR_CHECK_ERROR(cuStreamSynchronize(cuStream)); + } + + if (phEvent) { + *phEvent = RetImplEvent.release(); + } + + } catch (ur_result_t Err) { + Result = Err; + } + return Result; +} + +UR_APIEXPORT ur_result_t UR_APICALL urEnqueueMemBufferCopy( + ur_queue_handle_t hQueue, ur_mem_handle_t hBufferSrc, + ur_mem_handle_t hBufferDst, size_t srcOffset, size_t dstOffset, size_t size, + uint32_t numEventsInWaitList, const ur_event_handle_t *phEventWaitList, + ur_event_handle_t *phEvent) { + UR_ASSERT(size + dstOffset <= 
hBufferDst->Mem.BufferMem.getSize(), + UR_RESULT_ERROR_INVALID_SIZE); + UR_ASSERT(size + srcOffset <= hBufferSrc->Mem.BufferMem.getSize(), + UR_RESULT_ERROR_INVALID_SIZE); + + std::unique_ptr RetImplEvent{nullptr}; + + try { + ScopedContext Active(hQueue->getContext()); + ur_result_t Result = UR_RESULT_SUCCESS; + + auto Stream = hQueue->getNextTransferStream(); + Result = + enqueueEventsWait(hQueue, Stream, numEventsInWaitList, phEventWaitList); + + if (phEvent) { + RetImplEvent = + std::unique_ptr(ur_event_handle_t_::makeNative( + UR_COMMAND_MEM_BUFFER_COPY, hQueue, Stream)); + UR_CHECK_ERROR(RetImplEvent->start()); + } + + auto Src = hBufferSrc->Mem.BufferMem.get() + srcOffset; + auto Dst = hBufferDst->Mem.BufferMem.get() + dstOffset; + + UR_CHECK_ERROR(cuMemcpyDtoDAsync(Dst, Src, size, Stream)); + + if (phEvent) { + UR_CHECK_ERROR(RetImplEvent->record()); + *phEvent = RetImplEvent.release(); + } + + return Result; + } catch (ur_result_t Err) { + return Err; + } catch (...) { + return UR_RESULT_ERROR_UNKNOWN; + } +} + +UR_APIEXPORT ur_result_t UR_APICALL urEnqueueMemBufferCopyRect( + ur_queue_handle_t hQueue, ur_mem_handle_t hBufferSrc, + ur_mem_handle_t hBufferDst, ur_rect_offset_t srcOrigin, + ur_rect_offset_t dstOrigin, ur_rect_region_t region, size_t srcRowPitch, + size_t srcSlicePitch, size_t dstRowPitch, size_t dstSlicePitch, + uint32_t numEventsInWaitList, const ur_event_handle_t *phEventWaitList, + ur_event_handle_t *phEvent) { + ur_result_t Result = UR_RESULT_SUCCESS; + CUdeviceptr SrcPtr = hBufferSrc->Mem.BufferMem.get(); + CUdeviceptr DstPtr = hBufferDst->Mem.BufferMem.get(); + std::unique_ptr RetImplEvent{nullptr}; + + try { + ScopedContext Active(hQueue->getContext()); + CUstream CuStream = hQueue->getNextTransferStream(); + Result = enqueueEventsWait(hQueue, CuStream, numEventsInWaitList, + phEventWaitList); + + if (phEvent) { + RetImplEvent = + std::unique_ptr(ur_event_handle_t_::makeNative( + UR_COMMAND_MEM_BUFFER_COPY_RECT, hQueue, CuStream)); + UR_CHECK_ERROR(RetImplEvent->start()); + } + + Result = commonEnqueueMemBufferCopyRect( + CuStream, region, &SrcPtr, CU_MEMORYTYPE_DEVICE, srcOrigin, srcRowPitch, + srcSlicePitch, &DstPtr, CU_MEMORYTYPE_DEVICE, dstOrigin, dstRowPitch, + dstSlicePitch); + + if (phEvent) { + UR_CHECK_ERROR(RetImplEvent->record()); + *phEvent = RetImplEvent.release(); + } + + } catch (ur_result_t err) { + Result = err; + } + return Result; +} + +// CUDA has no memset functions that allow setting values more than 4 bytes. UR +// API lets you pass an arbitrary "pattern" to the buffer fill, which can be +// more than 4 bytes. We must break up the pattern into 1 byte values, and set +// the buffer using multiple strided calls. The first 4 patterns are set using +// cuMemsetD32Async then all subsequent 1 byte patterns are set using +// cuMemset2DAsync which is called for each pattern. +ur_result_t commonMemSetLargePattern(CUstream Stream, uint32_t PatternSize, + size_t Size, const void *pPattern, + CUdeviceptr Ptr) { + // Calculate the number of patterns, stride, number of times the pattern + // needs to be applied, and the number of times the first 32 bit pattern + // needs to be applied. 
+  auto NumberOfSteps = PatternSize / sizeof(uint8_t);
+  auto Pitch = NumberOfSteps * sizeof(uint8_t);
+  auto Height = Size / NumberOfSteps;
+  auto Count32 = Size / sizeof(uint32_t);
+
+  // Get 4-byte chunk of the pattern and call cuMemsetD32Async
+  auto Value = *(static_cast<const uint32_t *>(pPattern));
+  UR_CHECK_ERROR(cuMemsetD32Async(Ptr, Value, Count32, Stream));
+  for (auto step = 4u; step < NumberOfSteps; ++step) {
+    // take 1 byte of the pattern
+    Value = *(static_cast<const uint8_t *>(pPattern) + step);
+
+    // offset the pointer to the part of the buffer we want to write to
+    auto OffsetPtr = Ptr + (step * sizeof(uint8_t));
+
+    // set all of the pattern chunks
+    UR_CHECK_ERROR(cuMemsetD2D8Async(OffsetPtr, Pitch, Value, sizeof(uint8_t),
+                                     Height, Stream));
+  }
+  return UR_RESULT_SUCCESS;
+}
+
+UR_APIEXPORT ur_result_t UR_APICALL urEnqueueMemBufferFill(
+    ur_queue_handle_t hQueue, ur_mem_handle_t hBuffer, const void *pPattern,
+    size_t patternSize, size_t offset, size_t size,
+    uint32_t numEventsInWaitList, const ur_event_handle_t *phEventWaitList,
+    ur_event_handle_t *phEvent) {
+  UR_ASSERT(size + offset <= hBuffer->Mem.BufferMem.getSize(),
+            UR_RESULT_ERROR_INVALID_SIZE);
+
+  std::unique_ptr<ur_event_handle_t_> RetImplEvent{nullptr};
+
+  try {
+    ScopedContext Active(hQueue->getContext());
+
+    auto Stream = hQueue->getNextTransferStream();
+    ur_result_t Result =
+        enqueueEventsWait(hQueue, Stream, numEventsInWaitList, phEventWaitList);
+
+    if (phEvent) {
+      RetImplEvent =
+          std::unique_ptr<ur_event_handle_t_>(ur_event_handle_t_::makeNative(
+              UR_COMMAND_MEM_BUFFER_FILL, hQueue, Stream));
+      UR_CHECK_ERROR(RetImplEvent->start());
+    }
+
+    auto DstDevice = hBuffer->Mem.BufferMem.get() + offset;
+    auto N = size / patternSize;
+
+    // pattern size in bytes
+    switch (patternSize) {
+    case 1: {
+      auto Value = *static_cast<const uint8_t *>(pPattern);
+      UR_CHECK_ERROR(cuMemsetD8Async(DstDevice, Value, N, Stream));
+      break;
+    }
+    case 2: {
+      auto Value = *static_cast<const uint16_t *>(pPattern);
+      UR_CHECK_ERROR(cuMemsetD16Async(DstDevice, Value, N, Stream));
+      break;
+    }
+    case 4: {
+      auto Value = *static_cast<const uint32_t *>(pPattern);
+      UR_CHECK_ERROR(cuMemsetD32Async(DstDevice, Value, N, Stream));
+      break;
+    }
+    default: {
+      Result = commonMemSetLargePattern(Stream, patternSize, size, pPattern,
+                                        DstDevice);
+      break;
+    }
+    }
+
+    if (phEvent) {
+      UR_CHECK_ERROR(RetImplEvent->record());
+      *phEvent = RetImplEvent.release();
+    }
+
+    return Result;
+  } catch (ur_result_t Err) {
+    return Err;
+  } catch (...) {
+    return UR_RESULT_ERROR_UNKNOWN;
+  }
+}
+
+static size_t imageElementByteSize(CUDA_ARRAY_DESCRIPTOR ArrayDesc) {
+  switch (ArrayDesc.Format) {
+  case CU_AD_FORMAT_UNSIGNED_INT8:
+  case CU_AD_FORMAT_SIGNED_INT8:
+    return 1;
+  case CU_AD_FORMAT_UNSIGNED_INT16:
+  case CU_AD_FORMAT_SIGNED_INT16:
+  case CU_AD_FORMAT_HALF:
+    return 2;
+  case CU_AD_FORMAT_UNSIGNED_INT32:
+  case CU_AD_FORMAT_SIGNED_INT32:
+  case CU_AD_FORMAT_FLOAT:
+    return 4;
+  default:
+    detail::ur::die("Invalid image format.");
+    return 0;
+  }
+}
+
+/// General ND memory copy operation for images (where N > 1).
+/// This function requires the corresponding CUDA context to be at the top of +/// the context stack +/// If the source and/or destination is an array, SrcPtr and/or DstPtr +/// must be a pointer to a CUarray +static ur_result_t commonEnqueueMemImageNDCopy( + CUstream CuStream, ur_mem_type_t ImgType, const ur_rect_region_t Region, + const void *SrcPtr, const CUmemorytype_enum SrcType, + const ur_rect_offset_t SrcOffset, void *DstPtr, + const CUmemorytype_enum DstType, const ur_rect_offset_t DstOffset) { + UR_ASSERT(SrcType == CU_MEMORYTYPE_ARRAY || SrcType == CU_MEMORYTYPE_HOST, + UR_RESULT_ERROR_INVALID_MEM_OBJECT); + UR_ASSERT(DstType == CU_MEMORYTYPE_ARRAY || DstType == CU_MEMORYTYPE_HOST, + UR_RESULT_ERROR_INVALID_MEM_OBJECT); + + if (ImgType == UR_MEM_TYPE_IMAGE2D) { + CUDA_MEMCPY2D CpyDesc; + memset(&CpyDesc, 0, sizeof(CpyDesc)); + CpyDesc.srcMemoryType = SrcType; + if (SrcType == CU_MEMORYTYPE_ARRAY) { + CpyDesc.srcArray = *static_cast(SrcPtr); + CpyDesc.srcXInBytes = SrcOffset.x; + CpyDesc.srcY = SrcOffset.y; + } else { + CpyDesc.srcHost = SrcPtr; + } + CpyDesc.dstMemoryType = DstType; + if (DstType == CU_MEMORYTYPE_ARRAY) { + CpyDesc.dstArray = *static_cast(DstPtr); + CpyDesc.dstXInBytes = DstOffset.x; + CpyDesc.dstY = DstOffset.y; + } else { + CpyDesc.dstHost = DstPtr; + } + CpyDesc.WidthInBytes = Region.width; + CpyDesc.Height = Region.height; + UR_CHECK_ERROR(cuMemcpy2DAsync(&CpyDesc, CuStream)); + return UR_RESULT_SUCCESS; + } + if (ImgType == UR_MEM_TYPE_IMAGE3D) { + CUDA_MEMCPY3D CpyDesc; + memset(&CpyDesc, 0, sizeof(CpyDesc)); + CpyDesc.srcMemoryType = SrcType; + if (SrcType == CU_MEMORYTYPE_ARRAY) { + CpyDesc.srcArray = *static_cast(SrcPtr); + CpyDesc.srcXInBytes = SrcOffset.x; + CpyDesc.srcY = SrcOffset.y; + CpyDesc.srcZ = SrcOffset.z; + } else { + CpyDesc.srcHost = SrcPtr; + } + CpyDesc.dstMemoryType = DstType; + if (DstType == CU_MEMORYTYPE_ARRAY) { + CpyDesc.dstArray = *static_cast(DstPtr); + CpyDesc.dstXInBytes = DstOffset.x; + CpyDesc.dstY = DstOffset.y; + CpyDesc.dstZ = DstOffset.z; + } else { + CpyDesc.dstHost = DstPtr; + } + CpyDesc.WidthInBytes = Region.width; + CpyDesc.Height = Region.height; + CpyDesc.Depth = Region.depth; + UR_CHECK_ERROR(cuMemcpy3DAsync(&CpyDesc, CuStream)); + return UR_RESULT_SUCCESS; + } + return UR_RESULT_ERROR_INVALID_VALUE; +} + +UR_APIEXPORT ur_result_t UR_APICALL urEnqueueMemImageRead( + ur_queue_handle_t hQueue, ur_mem_handle_t hImage, bool blockingRead, + ur_rect_offset_t origin, ur_rect_region_t region, size_t rowPitch, + size_t slicePitch, void *pDst, uint32_t numEventsInWaitList, + const ur_event_handle_t *phEventWaitList, ur_event_handle_t *phEvent) { + std::ignore = rowPitch; + std::ignore = slicePitch; + + UR_ASSERT(hImage->MemType == ur_mem_handle_t_::Type::Surface, + UR_RESULT_ERROR_INVALID_MEM_OBJECT); + + ur_result_t Result = UR_RESULT_SUCCESS; + + try { + ScopedContext Active(hQueue->getContext()); + CUstream CuStream = hQueue->getNextTransferStream(); + Result = enqueueEventsWait(hQueue, CuStream, numEventsInWaitList, + phEventWaitList); + + CUarray Array = hImage->Mem.SurfaceMem.getArray(); + + CUDA_ARRAY_DESCRIPTOR ArrayDesc; + UR_CHECK_ERROR(cuArrayGetDescriptor(&ArrayDesc, Array)); + + int ElementByteSize = imageElementByteSize(ArrayDesc); + + size_t ByteOffsetX = origin.x * ElementByteSize * ArrayDesc.NumChannels; + size_t BytesToCopy = ElementByteSize * ArrayDesc.NumChannels * region.width; + + ur_mem_type_t ImgType = hImage->Mem.SurfaceMem.getImageType(); + + std::unique_ptr RetImplEvent{nullptr}; + if (phEvent) { 
+ RetImplEvent = + std::unique_ptr(ur_event_handle_t_::makeNative( + UR_COMMAND_MEM_IMAGE_READ, hQueue, CuStream)); + UR_CHECK_ERROR(RetImplEvent->start()); + } + if (ImgType == UR_MEM_TYPE_IMAGE1D) { + UR_CHECK_ERROR( + cuMemcpyAtoHAsync(pDst, Array, ByteOffsetX, BytesToCopy, CuStream)); + } else { + ur_rect_region_t AdjustedRegion = {BytesToCopy, region.height, + region.depth}; + ur_rect_offset_t SrcOffset = {ByteOffsetX, origin.y, origin.z}; + + Result = commonEnqueueMemImageNDCopy( + CuStream, ImgType, AdjustedRegion, &Array, CU_MEMORYTYPE_ARRAY, + SrcOffset, pDst, CU_MEMORYTYPE_HOST, ur_rect_offset_t{}); + if (Result != UR_RESULT_SUCCESS) { + return Result; + } + } + + if (phEvent) { + UR_CHECK_ERROR(RetImplEvent->record()); + *phEvent = RetImplEvent.release(); + } + + if (blockingRead) { + UR_CHECK_ERROR(cuStreamSynchronize(CuStream)); + } + } catch (ur_result_t Err) { + return Err; + } catch (...) { + return UR_RESULT_ERROR_UNKNOWN; + } + + return Result; +} + +UR_APIEXPORT ur_result_t UR_APICALL urEnqueueMemImageWrite( + ur_queue_handle_t hQueue, ur_mem_handle_t hImage, bool blockingWrite, + ur_rect_offset_t origin, ur_rect_region_t region, size_t rowPitch, + size_t slicePitch, void *pSrc, uint32_t numEventsInWaitList, + const ur_event_handle_t *phEventWaitList, ur_event_handle_t *phEvent) { + std::ignore = blockingWrite; + std::ignore = rowPitch; + std::ignore = slicePitch; + + UR_ASSERT(hImage->MemType == ur_mem_handle_t_::Type::Surface, + UR_RESULT_ERROR_INVALID_MEM_OBJECT); + + ur_result_t Result = UR_RESULT_SUCCESS; + + try { + ScopedContext Active(hQueue->getContext()); + CUstream CuStream = hQueue->getNextTransferStream(); + Result = enqueueEventsWait(hQueue, CuStream, numEventsInWaitList, + phEventWaitList); + + CUarray Array = hImage->Mem.SurfaceMem.getArray(); + + CUDA_ARRAY_DESCRIPTOR ArrayDesc; + UR_CHECK_ERROR(cuArrayGetDescriptor(&ArrayDesc, Array)); + + int ElementByteSize = imageElementByteSize(ArrayDesc); + + size_t ByteOffsetX = origin.x * ElementByteSize * ArrayDesc.NumChannels; + size_t BytesToCopy = ElementByteSize * ArrayDesc.NumChannels * region.width; + + std::unique_ptr RetImplEvent{nullptr}; + if (phEvent) { + RetImplEvent = + std::unique_ptr(ur_event_handle_t_::makeNative( + UR_COMMAND_MEM_IMAGE_WRITE, hQueue, CuStream)); + UR_CHECK_ERROR(RetImplEvent->start()); + } + + ur_mem_type_t ImgType = hImage->Mem.SurfaceMem.getImageType(); + if (ImgType == UR_MEM_TYPE_IMAGE1D) { + UR_CHECK_ERROR( + cuMemcpyHtoAAsync(Array, ByteOffsetX, pSrc, BytesToCopy, CuStream)); + } else { + ur_rect_region_t AdjustedRegion = {BytesToCopy, region.height, + region.depth}; + ur_rect_offset_t DstOffset = {ByteOffsetX, origin.y, origin.z}; + + Result = commonEnqueueMemImageNDCopy( + CuStream, ImgType, AdjustedRegion, pSrc, CU_MEMORYTYPE_HOST, + ur_rect_offset_t{}, &Array, CU_MEMORYTYPE_ARRAY, DstOffset); + + if (Result != UR_RESULT_SUCCESS) { + return Result; + } + } + + if (phEvent) { + UR_CHECK_ERROR(RetImplEvent->record()); + *phEvent = RetImplEvent.release(); + } + } catch (ur_result_t Err) { + return Err; + } catch (...) 
{ + return UR_RESULT_ERROR_UNKNOWN; + } + + return Result; +} + +UR_APIEXPORT ur_result_t UR_APICALL urEnqueueMemImageCopy( + ur_queue_handle_t hQueue, ur_mem_handle_t hImageSrc, + ur_mem_handle_t hImageDst, ur_rect_offset_t srcOrigin, + ur_rect_offset_t dstOrigin, ur_rect_region_t region, + uint32_t numEventsInWaitList, const ur_event_handle_t *phEventWaitList, + ur_event_handle_t *phEvent) { + UR_ASSERT(hImageSrc->MemType == ur_mem_handle_t_::Type::Surface, + UR_RESULT_ERROR_INVALID_MEM_OBJECT); + UR_ASSERT(hImageDst->MemType == ur_mem_handle_t_::Type::Surface, + UR_RESULT_ERROR_INVALID_MEM_OBJECT); + UR_ASSERT(hImageSrc->Mem.SurfaceMem.getImageType() == + hImageDst->Mem.SurfaceMem.getImageType(), + UR_RESULT_ERROR_INVALID_MEM_OBJECT); + + ur_result_t Result = UR_RESULT_SUCCESS; + + try { + ScopedContext Active(hQueue->getContext()); + CUstream CuStream = hQueue->getNextTransferStream(); + Result = enqueueEventsWait(hQueue, CuStream, numEventsInWaitList, + phEventWaitList); + + CUarray SrcArray = hImageSrc->Mem.SurfaceMem.getArray(); + CUarray DstArray = hImageDst->Mem.SurfaceMem.getArray(); + + CUDA_ARRAY_DESCRIPTOR SrcArrayDesc; + UR_CHECK_ERROR(cuArrayGetDescriptor(&SrcArrayDesc, SrcArray)); + CUDA_ARRAY_DESCRIPTOR DstArrayDesc; + UR_CHECK_ERROR(cuArrayGetDescriptor(&DstArrayDesc, DstArray)); + + UR_ASSERT(SrcArrayDesc.Format == DstArrayDesc.Format, + UR_RESULT_ERROR_INVALID_MEM_OBJECT); + UR_ASSERT(SrcArrayDesc.NumChannels == DstArrayDesc.NumChannels, + UR_RESULT_ERROR_INVALID_MEM_OBJECT); + + int ElementByteSize = imageElementByteSize(SrcArrayDesc); + + size_t DstByteOffsetX = + dstOrigin.x * ElementByteSize * SrcArrayDesc.NumChannels; + size_t SrcByteOffsetX = + srcOrigin.x * ElementByteSize * DstArrayDesc.NumChannels; + size_t BytesToCopy = + ElementByteSize * SrcArrayDesc.NumChannels * region.width; + + std::unique_ptr RetImplEvent{nullptr}; + if (phEvent) { + RetImplEvent = + std::unique_ptr(ur_event_handle_t_::makeNative( + UR_COMMAND_MEM_IMAGE_COPY, hQueue, CuStream)); + UR_CHECK_ERROR(RetImplEvent->start()); + } + + ur_mem_type_t ImgType = hImageSrc->Mem.SurfaceMem.getImageType(); + if (ImgType == UR_MEM_TYPE_IMAGE1D) { + UR_CHECK_ERROR(cuMemcpyAtoA(DstArray, DstByteOffsetX, SrcArray, + SrcByteOffsetX, BytesToCopy)); + } else { + ur_rect_region_t AdjustedRegion = {BytesToCopy, region.height, + region.depth}; + ur_rect_offset_t SrcOffset = {SrcByteOffsetX, srcOrigin.y, srcOrigin.z}; + ur_rect_offset_t DstOffset = {DstByteOffsetX, dstOrigin.y, dstOrigin.z}; + + Result = commonEnqueueMemImageNDCopy( + CuStream, ImgType, AdjustedRegion, &SrcArray, CU_MEMORYTYPE_ARRAY, + SrcOffset, &DstArray, CU_MEMORYTYPE_ARRAY, DstOffset); + if (Result != UR_RESULT_SUCCESS) { + return Result; + } + } + + if (phEvent) { + UR_CHECK_ERROR(RetImplEvent->record()); + *phEvent = RetImplEvent.release(); + } + } catch (ur_result_t Err) { + return Err; + } catch (...) { + return UR_RESULT_ERROR_UNKNOWN; + } + + return Result; +} + +/// Implements mapping on the host using a BufferRead operation. +/// Mapped pointers are stored in the pi_mem object. +/// If the buffer uses pinned host memory a pointer to that memory is returned +/// and no read operation is done. 
+/// +UR_APIEXPORT ur_result_t UR_APICALL urEnqueueMemBufferMap( + ur_queue_handle_t hQueue, ur_mem_handle_t hBuffer, bool blockingMap, + ur_map_flags_t mapFlags, size_t offset, size_t size, + uint32_t numEventsInWaitList, const ur_event_handle_t *phEventWaitList, + ur_event_handle_t *phEvent, void **ppRetMap) { + UR_ASSERT(hBuffer->MemType == ur_mem_handle_t_::Type::Buffer, + UR_RESULT_ERROR_INVALID_MEM_OBJECT); + UR_ASSERT(offset + size <= hBuffer->Mem.BufferMem.getSize(), + UR_RESULT_ERROR_INVALID_SIZE); + + ur_result_t Result = UR_RESULT_ERROR_INVALID_MEM_OBJECT; + const bool IsPinned = + hBuffer->Mem.BufferMem.MemAllocMode == + ur_mem_handle_t_::MemImpl::BufferMem::AllocMode::AllocHostPtr; + + // Currently no support for overlapping regions + if (hBuffer->Mem.BufferMem.getMapPtr() != nullptr) { + return UR_RESULT_ERROR_UNSUPPORTED_FEATURE; + } + + // Allocate a pointer in the host to store the mapped information + auto HostPtr = hBuffer->Mem.BufferMem.mapToPtr(size, offset, mapFlags); + *ppRetMap = hBuffer->Mem.BufferMem.getMapPtr(); + if (HostPtr) { + Result = UR_RESULT_SUCCESS; + } + + if (!IsPinned && + ((mapFlags & UR_MAP_FLAG_READ) || (mapFlags & UR_MAP_FLAG_WRITE))) { + // Pinned host memory is already on host so it doesn't need to be read. + Result = urEnqueueMemBufferRead(hQueue, hBuffer, blockingMap, offset, size, + HostPtr, numEventsInWaitList, + phEventWaitList, phEvent); + } else { + ScopedContext Active(hQueue->getContext()); + + if (IsPinned) { + Result = urEnqueueEventsWait(hQueue, numEventsInWaitList, phEventWaitList, + nullptr); + } + + if (phEvent) { + try { + *phEvent = ur_event_handle_t_::makeNative( + UR_COMMAND_MEM_BUFFER_MAP, hQueue, hQueue->getNextTransferStream()); + UR_CHECK_ERROR((*phEvent)->start()); + UR_CHECK_ERROR((*phEvent)->record()); + } catch (ur_result_t Err) { + Result = Err; + } + } + } + + return Result; +} + +/// Implements the unmap from the host, using a BufferWrite operation. +/// Requires the mapped pointer to be already registered in the given memobj. +/// If memobj uses pinned host memory, this will not do a write. +/// +UR_APIEXPORT ur_result_t UR_APICALL urEnqueueMemUnmap( + ur_queue_handle_t hQueue, ur_mem_handle_t hMem, void *pMappedPtr, + uint32_t numEventsInWaitList, const ur_event_handle_t *phEventWaitList, + ur_event_handle_t *phEvent) { + ur_result_t Result = UR_RESULT_SUCCESS; + UR_ASSERT(hMem->MemType == ur_mem_handle_t_::Type::Buffer, + UR_RESULT_ERROR_INVALID_MEM_OBJECT); + UR_ASSERT(hMem->Mem.BufferMem.getMapPtr() != nullptr, + UR_RESULT_ERROR_INVALID_MEM_OBJECT); + UR_ASSERT(hMem->Mem.BufferMem.getMapPtr() == pMappedPtr, + UR_RESULT_ERROR_INVALID_MEM_OBJECT); + + const bool IsPinned = + hMem->Mem.BufferMem.MemAllocMode == + ur_mem_handle_t_::MemImpl::BufferMem::AllocMode::AllocHostPtr; + + if (!IsPinned && (hMem->Mem.BufferMem.getMapFlags() & UR_MAP_FLAG_WRITE)) { + // Pinned host memory is only on host so it doesn't need to be written to. 
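+    // For non-pinned buffers that were mapped for writing, flush the host
+    // staging copy back to the device over the offset/size recorded by
+    // mapToPtr() when the buffer was mapped.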
+ Result = urEnqueueMemBufferWrite( + hQueue, hMem, true, hMem->Mem.BufferMem.getMapOffset(), + hMem->Mem.BufferMem.getMapSize(), pMappedPtr, numEventsInWaitList, + phEventWaitList, phEvent); + } else { + ScopedContext Active(hQueue->getContext()); + + if (IsPinned) { + Result = urEnqueueEventsWait(hQueue, numEventsInWaitList, phEventWaitList, + nullptr); + } + + if (phEvent) { + try { + *phEvent = ur_event_handle_t_::makeNative( + UR_COMMAND_MEM_UNMAP, hQueue, hQueue->getNextTransferStream()); + UR_CHECK_ERROR((*phEvent)->start()); + UR_CHECK_ERROR((*phEvent)->record()); + } catch (ur_result_t Err) { + Result = Err; + } + } + } + + hMem->Mem.BufferMem.unmap(pMappedPtr); + return Result; +} + +UR_APIEXPORT ur_result_t UR_APICALL urEnqueueUSMFill( + ur_queue_handle_t hQueue, void *ptr, size_t patternSize, + const void *pPattern, size_t size, uint32_t numEventsInWaitList, + const ur_event_handle_t *phEventWaitList, ur_event_handle_t *phEvent) { + ur_result_t Result = UR_RESULT_SUCCESS; + std::unique_ptr EventPtr{nullptr}; + + try { + ScopedContext Active(hQueue->getContext()); + uint32_t StreamToken; + ur_stream_guard_ Guard; + CUstream CuStream = hQueue->getNextComputeStream( + numEventsInWaitList, phEventWaitList, Guard, &StreamToken); + UR_CHECK_ERROR(enqueueEventsWait(hQueue, CuStream, numEventsInWaitList, + phEventWaitList)); + if (phEvent) { + EventPtr = + std::unique_ptr(ur_event_handle_t_::makeNative( + UR_COMMAND_USM_FILL, hQueue, CuStream, StreamToken)); + UR_CHECK_ERROR(EventPtr->start()); + } + + auto N = size / patternSize; + switch (patternSize) { + case 1: + UR_CHECK_ERROR(cuMemsetD8Async( + (CUdeviceptr)ptr, *((const uint8_t *)pPattern) & 0xFF, N, CuStream)); + break; + case 2: + UR_CHECK_ERROR(cuMemsetD16Async((CUdeviceptr)ptr, + *((const uint16_t *)pPattern) & 0xFFFF, N, + CuStream)); + break; + case 4: + UR_CHECK_ERROR(cuMemsetD32Async( + (CUdeviceptr)ptr, *((const uint32_t *)pPattern) & 0xFFFFFFFF, N, + CuStream)); + break; + default: + commonMemSetLargePattern(CuStream, patternSize, size, pPattern, + (CUdeviceptr)ptr); + break; + } + if (phEvent) { + UR_CHECK_ERROR(EventPtr->record()); + *phEvent = EventPtr.release(); + } + } catch (ur_result_t Err) { + Result = Err; + } + return Result; +} + +UR_APIEXPORT ur_result_t UR_APICALL urEnqueueUSMMemcpy( + ur_queue_handle_t hQueue, bool blocking, void *pDst, const void *pSrc, + size_t size, uint32_t numEventsInWaitList, + const ur_event_handle_t *phEventWaitList, ur_event_handle_t *phEvent) { + ur_result_t Result = UR_RESULT_SUCCESS; + + std::unique_ptr EventPtr{nullptr}; + + try { + ScopedContext Active(hQueue->getContext()); + CUstream CuStream = hQueue->getNextTransferStream(); + Result = enqueueEventsWait(hQueue, CuStream, numEventsInWaitList, + phEventWaitList); + if (phEvent) { + EventPtr = + std::unique_ptr(ur_event_handle_t_::makeNative( + UR_COMMAND_USM_MEMCPY, hQueue, CuStream)); + UR_CHECK_ERROR(EventPtr->start()); + } + UR_CHECK_ERROR( + cuMemcpyAsync((CUdeviceptr)pDst, (CUdeviceptr)pSrc, size, CuStream)); + if (phEvent) { + UR_CHECK_ERROR(EventPtr->record()); + } + if (blocking) { + UR_CHECK_ERROR(cuStreamSynchronize(CuStream)); + } + if (phEvent) { + *phEvent = EventPtr.release(); + } + } catch (ur_result_t Err) { + Result = Err; + } + return Result; +} + +UR_APIEXPORT ur_result_t UR_APICALL urEnqueueUSMPrefetch( + ur_queue_handle_t hQueue, const void *pMem, size_t size, + ur_usm_migration_flags_t flags, uint32_t numEventsInWaitList, + const ur_event_handle_t *phEventWaitList, ur_event_handle_t *phEvent) { + 
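+  // Validate the request before touching the stream: the range must fit
+  // inside the USM allocation, the device must support concurrent managed
+  // access (a cuMemPrefetchAsync requirement), and the pointer must refer to
+  // managed memory. Unsupported cases report an adapter-specific
+  // "hint ignored" message instead of failing hard.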
+  unsigned int PointerRangeSize = 0;
+  UR_CHECK_ERROR(cuPointerGetAttribute(
+      &PointerRangeSize, CU_POINTER_ATTRIBUTE_RANGE_SIZE, (CUdeviceptr)pMem));
+  UR_ASSERT(size <= PointerRangeSize, UR_RESULT_ERROR_INVALID_SIZE);
+  ur_device_handle_t Device = hQueue->getContext()->getDevice();
+
+  // Certain CUDA devices and Windows do not support some Unified Memory
+  // features. cuMemPrefetchAsync requires concurrent memory access for
+  // managed memory. Therefore, ignore the prefetch hint if concurrent managed
+  // memory access is not available.
+  if (!getAttribute(Device, CU_DEVICE_ATTRIBUTE_CONCURRENT_MANAGED_ACCESS)) {
+    setErrorMessage("Prefetch hint ignored as device does not support "
+                    "concurrent managed access",
+                    UR_RESULT_SUCCESS);
+    return UR_RESULT_ERROR_ADAPTER_SPECIFIC;
+  }
+
+  unsigned int IsManaged;
+  UR_CHECK_ERROR(cuPointerGetAttribute(
+      &IsManaged, CU_POINTER_ATTRIBUTE_IS_MANAGED, (CUdeviceptr)pMem));
+  if (!IsManaged) {
+    setErrorMessage("Prefetch hint ignored as prefetch only works with USM",
+                    UR_RESULT_SUCCESS);
+    return UR_RESULT_ERROR_ADAPTER_SPECIFIC;
+  }
+
+  // flags is currently unused, so fail if it is set.
+  if (flags != 0)
+    return UR_RESULT_ERROR_INVALID_VALUE;
+
+  ur_result_t Result = UR_RESULT_SUCCESS;
+  std::unique_ptr<ur_event_handle_t_> EventPtr{nullptr};
+
+  try {
+    ScopedContext Active(hQueue->getContext());
+    CUstream CuStream = hQueue->getNextTransferStream();
+    Result = enqueueEventsWait(hQueue, CuStream, numEventsInWaitList,
+                               phEventWaitList);
+    if (phEvent) {
+      EventPtr =
+          std::unique_ptr<ur_event_handle_t_>(ur_event_handle_t_::makeNative(
+              UR_COMMAND_MEM_BUFFER_COPY, hQueue, CuStream));
+      UR_CHECK_ERROR(EventPtr->start());
+    }
+    UR_CHECK_ERROR(
+        cuMemPrefetchAsync((CUdeviceptr)pMem, size, Device->get(), CuStream));
+    if (phEvent) {
+      UR_CHECK_ERROR(EventPtr->record());
+      *phEvent = EventPtr.release();
+    }
+  } catch (ur_result_t Err) {
+    Result = Err;
+  }
+  return Result;
+}
+
+/// USM: memadvise API to govern behavior of automatic migration mechanisms
+UR_APIEXPORT ur_result_t UR_APICALL
+urEnqueueUSMAdvise(ur_queue_handle_t hQueue, const void *pMem, size_t size,
+                   ur_usm_advice_flags_t advice, ur_event_handle_t *phEvent) {
+  unsigned int PointerRangeSize = 0;
+  UR_CHECK_ERROR(cuPointerGetAttribute(
+      &PointerRangeSize, CU_POINTER_ATTRIBUTE_RANGE_SIZE, (CUdeviceptr)pMem));
+  UR_ASSERT(size <= PointerRangeSize, UR_RESULT_ERROR_INVALID_SIZE);
+
+  // Certain CUDA devices and Windows do not support some Unified Memory
+  // features. Passing CU_MEM_ADVISE_SET/CLEAR_PREFERRED_LOCATION or
+  // CU_MEM_ADVISE_SET/CLEAR_ACCESSED_BY to cuMemAdvise on a GPU device
+  // requires the GPU device to report a non-zero value for
+  // CU_DEVICE_ATTRIBUTE_CONCURRENT_MANAGED_ACCESS. Therefore, ignore the
+  // memory advice if concurrent managed memory access is not available.
+  if ((advice & UR_USM_ADVICE_FLAG_SET_PREFERRED_LOCATION) ||
+      (advice & UR_USM_ADVICE_FLAG_CLEAR_PREFERRED_LOCATION) ||
+      (advice & UR_USM_ADVICE_FLAG_SET_ACCESSED_BY_DEVICE) ||
+      (advice & UR_USM_ADVICE_FLAG_CLEAR_ACCESSED_BY_DEVICE) ||
+      (advice & UR_USM_ADVICE_FLAG_DEFAULT)) {
+    ur_device_handle_t Device = hQueue->getContext()->getDevice();
+    if (!getAttribute(Device, CU_DEVICE_ATTRIBUTE_CONCURRENT_MANAGED_ACCESS)) {
+      setErrorMessage("Mem advise ignored as device does not support "
+                      "concurrent managed access",
+                      UR_RESULT_SUCCESS);
+      return UR_RESULT_ERROR_ADAPTER_SPECIFIC;
+    }
+
+    // TODO: If ptr points to valid system-allocated pageable memory we should
+    // check that the device also has the
+    // CU_DEVICE_ATTRIBUTE_PAGEABLE_MEMORY_ACCESS property.
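+    // A possible shape for that extra check (sketch only, not enabled here):
+    //
+    //   int PageableAccess = 0;
+    //   cuDeviceGetAttribute(&PageableAccess,
+    //                        CU_DEVICE_ATTRIBUTE_PAGEABLE_MEMORY_ACCESS,
+    //                        Device->get());
+    //   if (!PageableAccess)
+    //     return UR_RESULT_ERROR_ADAPTER_SPECIFIC;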
+ } + + unsigned int IsManaged; + UR_CHECK_ERROR(cuPointerGetAttribute( + &IsManaged, CU_POINTER_ATTRIBUTE_IS_MANAGED, (CUdeviceptr)pMem)); + if (!IsManaged) { + setErrorMessage( + "Memory advice ignored as memory advices only works with USM", + UR_RESULT_SUCCESS); + return UR_RESULT_ERROR_ADAPTER_SPECIFIC; + } + + ur_result_t Result = UR_RESULT_SUCCESS; + std::unique_ptr EventPtr{nullptr}; + + try { + ScopedContext Active(hQueue->getContext()); + + if (phEvent) { + EventPtr = + std::unique_ptr(ur_event_handle_t_::makeNative( + UR_COMMAND_USM_ADVISE, hQueue, hQueue->getNextTransferStream())); + UR_CHECK_ERROR(EventPtr->start()); + } + + if (advice & UR_USM_ADVICE_FLAG_DEFAULT) { + UR_CHECK_ERROR(cuMemAdvise((CUdeviceptr)pMem, size, + CU_MEM_ADVISE_UNSET_READ_MOSTLY, + hQueue->getContext()->getDevice()->get())); + UR_CHECK_ERROR(cuMemAdvise((CUdeviceptr)pMem, size, + CU_MEM_ADVISE_UNSET_PREFERRED_LOCATION, + hQueue->getContext()->getDevice()->get())); + UR_CHECK_ERROR(cuMemAdvise((CUdeviceptr)pMem, size, + CU_MEM_ADVISE_UNSET_ACCESSED_BY, + hQueue->getContext()->getDevice()->get())); + } else { + Result = setCuMemAdvise((CUdeviceptr)pMem, size, advice, + hQueue->getContext()->getDevice()->get()); + } + + if (phEvent) { + UR_CHECK_ERROR(EventPtr->record()); + *phEvent = EventPtr.release(); + } + } catch (ur_result_t err) { + Result = err; + } catch (...) { + Result = UR_RESULT_ERROR_UNKNOWN; + } + return Result; +} + +// TODO: Implement this. Remember to return true for +// PI_EXT_ONEAPI_CONTEXT_INFO_USM_FILL2D_SUPPORT when it is implemented. +UR_APIEXPORT ur_result_t UR_APICALL urEnqueueUSMFill2D( + ur_queue_handle_t, void *, size_t, size_t, const void *, size_t, size_t, + uint32_t, const ur_event_handle_t *, ur_event_handle_t *) { + return UR_RESULT_ERROR_UNSUPPORTED_FEATURE; +} + +UR_APIEXPORT ur_result_t UR_APICALL urEnqueueUSMMemcpy2D( + ur_queue_handle_t hQueue, bool blocking, void *pDst, size_t dstPitch, + const void *pSrc, size_t srcPitch, size_t width, size_t height, + uint32_t numEventsInWaitList, const ur_event_handle_t *phEventWaitList, + ur_event_handle_t *phEvent) { + ur_result_t result = UR_RESULT_SUCCESS; + + try { + ScopedContext active(hQueue->getContext()); + CUstream cuStream = hQueue->getNextTransferStream(); + result = enqueueEventsWait(hQueue, cuStream, numEventsInWaitList, + phEventWaitList); + + std::unique_ptr RetImplEvent{nullptr}; + if (phEvent) { + RetImplEvent = + std::unique_ptr(ur_event_handle_t_::makeNative( + UR_COMMAND_MEM_BUFFER_COPY_RECT, hQueue, cuStream)); + UR_CHECK_ERROR(RetImplEvent->start()); + } + + // Determine the direction of copy using cuPointerGetAttribute + // for both the SrcPtr and DstPtr + CUDA_MEMCPY2D CpyDesc = {}; + memset(&CpyDesc, 0, sizeof(CpyDesc)); + + getUSMHostOrDevicePtr(pSrc, &CpyDesc.srcMemoryType, &CpyDesc.srcDevice, + &CpyDesc.srcHost); + getUSMHostOrDevicePtr(pDst, &CpyDesc.dstMemoryType, &CpyDesc.dstDevice, + &CpyDesc.dstHost); + + CpyDesc.dstPitch = dstPitch; + CpyDesc.srcPitch = srcPitch; + CpyDesc.WidthInBytes = width; + CpyDesc.Height = height; + + UR_CHECK_ERROR(cuMemcpy2DAsync(&CpyDesc, cuStream)); + + if (phEvent) { + UR_CHECK_ERROR(RetImplEvent->record()); + *phEvent = RetImplEvent.release(); + } + if (blocking) { + UR_CHECK_ERROR(cuStreamSynchronize(cuStream)); + } + } catch (ur_result_t err) { + result = err; + } + return result; +} + +UR_APIEXPORT ur_result_t UR_APICALL urEnqueueMemBufferRead( + ur_queue_handle_t hQueue, ur_mem_handle_t hBuffer, bool blockingRead, + size_t offset, size_t size, void *pDst, 
uint32_t numEventsInWaitList, + const ur_event_handle_t *phEventWaitList, ur_event_handle_t *phEvent) { + UR_ASSERT(!hBuffer->isImage(), UR_RESULT_ERROR_INVALID_MEM_OBJECT); + UR_ASSERT(offset + size <= hBuffer->Mem.BufferMem.Size, + UR_RESULT_ERROR_INVALID_SIZE); + + ur_result_t Result = UR_RESULT_SUCCESS; + CUdeviceptr DevPtr = hBuffer->Mem.BufferMem.get(); + std::unique_ptr RetImplEvent{nullptr}; + + try { + ScopedContext Active(hQueue->getContext()); + CUstream CuStream = hQueue->getNextTransferStream(); + + Result = enqueueEventsWait(hQueue, CuStream, numEventsInWaitList, + phEventWaitList); + + if (phEvent) { + RetImplEvent = + std::unique_ptr(ur_event_handle_t_::makeNative( + UR_COMMAND_MEM_BUFFER_READ, hQueue, CuStream)); + UR_CHECK_ERROR(RetImplEvent->start()); + } + + UR_CHECK_ERROR(cuMemcpyDtoHAsync(pDst, DevPtr + offset, size, CuStream)); + + if (phEvent) { + UR_CHECK_ERROR(RetImplEvent->record()); + } + + if (blockingRead) { + UR_CHECK_ERROR(cuStreamSynchronize(CuStream)); + } + + if (phEvent) { + *phEvent = RetImplEvent.release(); + } + + } catch (ur_result_t Err) { + Result = Err; + } + + return Result; +} + +UR_APIEXPORT ur_result_t UR_APICALL urEnqueueMemBufferWrite( + ur_queue_handle_t hQueue, ur_mem_handle_t hBuffer, bool blockingWrite, + size_t offset, size_t size, const void *pSrc, uint32_t numEventsInWaitList, + const ur_event_handle_t *phEventWaitList, ur_event_handle_t *phEvent) { + UR_ASSERT(!hBuffer->isImage(), UR_RESULT_ERROR_INVALID_MEM_OBJECT); + UR_ASSERT(offset + size <= hBuffer->Mem.BufferMem.Size, + UR_RESULT_ERROR_INVALID_SIZE); + + ur_result_t Result = UR_RESULT_SUCCESS; + CUdeviceptr DevPtr = hBuffer->Mem.BufferMem.get(); + std::unique_ptr RetImplEvent{nullptr}; + + try { + ScopedContext Active(hQueue->getContext()); + CUstream CuStream = hQueue->getNextTransferStream(); + + Result = enqueueEventsWait(hQueue, CuStream, numEventsInWaitList, + phEventWaitList); + + if (phEvent) { + RetImplEvent = + std::unique_ptr(ur_event_handle_t_::makeNative( + UR_COMMAND_MEM_BUFFER_WRITE, hQueue, CuStream)); + UR_CHECK_ERROR(RetImplEvent->start()); + } + + UR_CHECK_ERROR(cuMemcpyHtoDAsync(DevPtr + offset, pSrc, size, CuStream)); + + if (phEvent) { + UR_CHECK_ERROR(RetImplEvent->record()); + } + + if (blockingWrite) { + UR_CHECK_ERROR(cuStreamSynchronize(CuStream)); + } + + if (phEvent) { + *phEvent = RetImplEvent.release(); + } + } catch (ur_result_t Err) { + Result = Err; + } + return Result; +} + +UR_APIEXPORT ur_result_t UR_APICALL urEnqueueDeviceGlobalVariableWrite( + ur_queue_handle_t hQueue, ur_program_handle_t hProgram, const char *name, + bool blockingWrite, size_t count, size_t offset, const void *pSrc, + uint32_t numEventsInWaitList, const ur_event_handle_t *phEventWaitList, + ur_event_handle_t *phEvent) { + // Since CUDA requires a the global variable to be referenced by name, we use + // metadata to find the correct name to access it by. 
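+  // GlobalIDMD maps the name the runtime passes in to the symbol name that
+  // the generated CUDA module actually exports; if the program was built
+  // without that metadata, the lookup below fails and we report an invalid
+  // value.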
+  auto DeviceGlobalNameIt = hProgram->GlobalIDMD.find(name);
+  if (DeviceGlobalNameIt == hProgram->GlobalIDMD.end())
+    return UR_RESULT_ERROR_INVALID_VALUE;
+  std::string DeviceGlobalName = DeviceGlobalNameIt->second;
+
+  ur_result_t Result = UR_RESULT_SUCCESS;
+  try {
+    CUdeviceptr DeviceGlobal = 0;
+    size_t DeviceGlobalSize = 0;
+    UR_CHECK_ERROR(cuModuleGetGlobal(&DeviceGlobal, &DeviceGlobalSize,
+                                     hProgram->get(),
+                                     DeviceGlobalName.c_str()));
+
+    if (offset + count > DeviceGlobalSize)
+      return UR_RESULT_ERROR_INVALID_VALUE;
+
+    return urEnqueueUSMMemcpy(
+        hQueue, blockingWrite, reinterpret_cast<void *>(DeviceGlobal + offset),
+        pSrc, count, numEventsInWaitList, phEventWaitList, phEvent);
+  } catch (ur_result_t Err) {
+    Result = Err;
+  }
+  return Result;
+}
+
+UR_APIEXPORT ur_result_t UR_APICALL urEnqueueDeviceGlobalVariableRead(
+    ur_queue_handle_t hQueue, ur_program_handle_t hProgram, const char *name,
+    bool blockingRead, size_t count, size_t offset, void *pDst,
+    uint32_t numEventsInWaitList, const ur_event_handle_t *phEventWaitList,
+    ur_event_handle_t *phEvent) {
+  // Since CUDA requires the global variable to be referenced by name, we use
+  // metadata to find the correct name to access it by.
+  auto DeviceGlobalNameIt = hProgram->GlobalIDMD.find(name);
+  if (DeviceGlobalNameIt == hProgram->GlobalIDMD.end())
+    return UR_RESULT_ERROR_INVALID_VALUE;
+  std::string DeviceGlobalName = DeviceGlobalNameIt->second;
+
+  ur_result_t Result = UR_RESULT_SUCCESS;
+  try {
+    CUdeviceptr DeviceGlobal = 0;
+    size_t DeviceGlobalSize = 0;
+    UR_CHECK_ERROR(cuModuleGetGlobal(&DeviceGlobal, &DeviceGlobalSize,
+                                     hProgram->get(),
+                                     DeviceGlobalName.c_str()));
+
+    if (offset + count > DeviceGlobalSize)
+      return UR_RESULT_ERROR_INVALID_VALUE;
+
+    return urEnqueueUSMMemcpy(
+        hQueue, blockingRead, pDst,
+        reinterpret_cast<const void *>(DeviceGlobal + offset), count,
+        numEventsInWaitList, phEventWaitList, phEvent);
+  } catch (ur_result_t Err) {
+    Result = Err;
+  }
+  return Result;
+}
+
+/// Host Pipes
+UR_APIEXPORT ur_result_t UR_APICALL urEnqueueReadHostPipe(
+    ur_queue_handle_t hQueue, ur_program_handle_t hProgram,
+    const char *pipe_symbol, bool blocking, void *pDst, size_t size,
+    uint32_t numEventsInWaitList, const ur_event_handle_t *phEventWaitList,
+    ur_event_handle_t *phEvent) {
+  (void)hQueue;
+  (void)hProgram;
+  (void)pipe_symbol;
+  (void)blocking;
+  (void)pDst;
+  (void)size;
+  (void)numEventsInWaitList;
+  (void)phEventWaitList;
+  (void)phEvent;
+
+  return UR_RESULT_ERROR_UNSUPPORTED_FEATURE;
+}
+
+UR_APIEXPORT ur_result_t UR_APICALL urEnqueueWriteHostPipe(
+    ur_queue_handle_t hQueue, ur_program_handle_t hProgram,
+    const char *pipe_symbol, bool blocking, void *pSrc, size_t size,
+    uint32_t numEventsInWaitList, const ur_event_handle_t *phEventWaitList,
+    ur_event_handle_t *phEvent) {
+  (void)hQueue;
+  (void)hProgram;
+  (void)pipe_symbol;
+  (void)blocking;
+  (void)pSrc;
+  (void)size;
+  (void)numEventsInWaitList;
+  (void)phEventWaitList;
+  (void)phEvent;
+
+  return UR_RESULT_ERROR_UNSUPPORTED_FEATURE;
+}
diff --git a/source/adapters/cuda/enqueue.hpp b/source/adapters/cuda/enqueue.hpp
new file mode 100644
index 0000000000..d49853b38d
--- /dev/null
+++ b/source/adapters/cuda/enqueue.hpp
@@ -0,0 +1,16 @@
+//===--------- enqueue.hpp - CUDA Adapter ---------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +#pragma once + +#include +#include +#include + +ur_result_t enqueueEventsWait(ur_queue_handle_t CommandQueue, CUstream Stream, + uint32_t NumEventsInWaitList, + const ur_event_handle_t *EventWaitList); diff --git a/source/adapters/cuda/event.cpp b/source/adapters/cuda/event.cpp new file mode 100644 index 0000000000..18d861c4e9 --- /dev/null +++ b/source/adapters/cuda/event.cpp @@ -0,0 +1,295 @@ +//===--------- event.cpp - CUDA Adapter -----------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "event.hpp" +#include "common.hpp" +#include "context.hpp" +#include "device.hpp" +#include "queue.hpp" + +#include +#include + +ur_event_handle_t_::ur_event_handle_t_(ur_command_t Type, + ur_context_handle_t Context, + ur_queue_handle_t Queue, CUstream Stream, + uint32_t StreamToken) + : CommandType{Type}, RefCount{1}, HasOwnership{true}, + HasBeenWaitedOn{false}, IsRecorded{false}, IsStarted{false}, + StreamToken{StreamToken}, EvEnd{nullptr}, EvStart{nullptr}, + EvQueued{nullptr}, Queue{Queue}, Stream{Stream}, Context{Context} { + + bool ProfilingEnabled = Queue->URFlags & UR_QUEUE_FLAG_PROFILING_ENABLE; + + UR_CHECK_ERROR(cuEventCreate( + &EvEnd, ProfilingEnabled ? CU_EVENT_DEFAULT : CU_EVENT_DISABLE_TIMING)); + + if (ProfilingEnabled) { + UR_CHECK_ERROR(cuEventCreate(&EvQueued, CU_EVENT_DEFAULT)); + UR_CHECK_ERROR(cuEventCreate(&EvStart, CU_EVENT_DEFAULT)); + } + + if (Queue != nullptr) { + urQueueRetain(Queue); + } + urContextRetain(Context); +} + +ur_event_handle_t_::ur_event_handle_t_(ur_context_handle_t Context, + CUevent EventNative) + : CommandType{UR_COMMAND_EVENTS_WAIT}, RefCount{1}, HasOwnership{false}, + HasBeenWaitedOn{false}, IsRecorded{false}, IsStarted{false}, + StreamToken{std::numeric_limits::max()}, EvEnd{EventNative}, + EvStart{nullptr}, EvQueued{nullptr}, Queue{nullptr}, Context{Context} { + urContextRetain(Context); +} + +ur_event_handle_t_::~ur_event_handle_t_() { + if (Queue != nullptr) { + urQueueRelease(Queue); + } + urContextRelease(Context); +} + +ur_result_t ur_event_handle_t_::start() { + assert(!isStarted()); + ur_result_t Result = UR_RESULT_SUCCESS; + + try { + if (Queue->URFlags & UR_QUEUE_FLAG_PROFILING_ENABLE) { + // NOTE: This relies on the default stream to be unused. 
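+      // EvQueued is recorded on the default stream, approximating the moment
+      // the command was submitted, while EvStart is recorded on the command's
+      // own stream and marks when execution can actually begin. The pair is
+      // later consumed by the UR_PROFILING_INFO_COMMAND_QUEUED/SUBMIT and
+      // UR_PROFILING_INFO_COMMAND_START queries.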
+ UR_CHECK_ERROR(cuEventRecord(EvQueued, 0)); + UR_CHECK_ERROR(cuEventRecord(EvStart, Stream)); + } + } catch (ur_result_t Err) { + Result = Err; + } + + IsStarted = true; + return Result; +} + +bool ur_event_handle_t_::isCompleted() const noexcept { + if (!IsRecorded) { + return false; + } + if (!HasBeenWaitedOn) { + const CUresult Result = cuEventQuery(EvEnd); + if (Result != CUDA_SUCCESS && Result != CUDA_ERROR_NOT_READY) { + UR_CHECK_ERROR(Result); + return false; + } + if (Result == CUDA_ERROR_NOT_READY) { + return false; + } + } + return true; +} + +uint64_t ur_event_handle_t_::getQueuedTime() const { + assert(isStarted()); + return Queue->get_device()->getElapsedTime(EvQueued); +} + +uint64_t ur_event_handle_t_::getStartTime() const { + assert(isStarted()); + return Queue->get_device()->getElapsedTime(EvStart); +} + +uint64_t ur_event_handle_t_::getEndTime() const { + assert(isStarted() && isRecorded()); + return Queue->get_device()->getElapsedTime(EvEnd); +} + +ur_result_t ur_event_handle_t_::record() { + + if (isRecorded() || !isStarted()) { + return UR_RESULT_ERROR_INVALID_EVENT; + } + + ur_result_t Result = UR_RESULT_SUCCESS; + + UR_ASSERT(Queue, UR_RESULT_ERROR_INVALID_QUEUE); + + try { + EventID = Queue->getNextEventID(); + if (EventID == 0) { + detail::ur::die( + "Unrecoverable program state reached in event identifier overflow"); + } + UR_CHECK_ERROR(cuEventRecord(EvEnd, Stream)); + } catch (ur_result_t error) { + Result = error; + } + + if (Result == UR_RESULT_SUCCESS) { + IsRecorded = true; + } + + return Result; +} + +ur_result_t ur_event_handle_t_::wait() { + ur_result_t Result = UR_RESULT_SUCCESS; + try { + UR_CHECK_ERROR(cuEventSynchronize(EvEnd)); + HasBeenWaitedOn = true; + } catch (ur_result_t error) { + Result = error; + } + + return Result; +} + +ur_result_t ur_event_handle_t_::release() { + if (!backendHasOwnership()) + return UR_RESULT_SUCCESS; + + assert(Queue != nullptr); + + UR_CHECK_ERROR(cuEventDestroy(EvEnd)); + + if (Queue->URFlags & UR_QUEUE_FLAG_PROFILING_ENABLE) { + UR_CHECK_ERROR(cuEventDestroy(EvQueued)); + UR_CHECK_ERROR(cuEventDestroy(EvStart)); + } + + return UR_RESULT_SUCCESS; +} + +UR_APIEXPORT ur_result_t UR_APICALL urEventGetInfo(ur_event_handle_t hEvent, + ur_event_info_t propName, + size_t propValueSize, + void *pPropValue, + size_t *pPropValueSizeRet) { + UrReturnHelper ReturnValue(propValueSize, pPropValue, pPropValueSizeRet); + + switch (propName) { + case UR_EVENT_INFO_COMMAND_QUEUE: + return ReturnValue(hEvent->getQueue()); + case UR_EVENT_INFO_COMMAND_TYPE: + return ReturnValue(hEvent->getCommandType()); + case UR_EVENT_INFO_REFERENCE_COUNT: + return ReturnValue(hEvent->getReferenceCount()); + case UR_EVENT_INFO_COMMAND_EXECUTION_STATUS: + return ReturnValue(hEvent->getExecutionStatus()); + case UR_EVENT_INFO_CONTEXT: + return ReturnValue(hEvent->getContext()); + default: + detail::ur::die("Event info request not implemented"); + } + + return UR_RESULT_ERROR_INVALID_ENUMERATION; +} + +/// Obtain profiling information from PI CUDA events +/// \TODO Timings from CUDA are only elapsed time. 
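+///
+/// A caller-side sketch (illustrative only; assumes `Ev` was produced by a
+/// queue created with UR_QUEUE_FLAG_PROFILING_ENABLE):
+///
+///   uint64_t Start = 0, End = 0;
+///   urEventGetProfilingInfo(Ev, UR_PROFILING_INFO_COMMAND_START,
+///                           sizeof(Start), &Start, nullptr);
+///   urEventGetProfilingInfo(Ev, UR_PROFILING_INFO_COMMAND_END,
+///                           sizeof(End), &End, nullptr);
+///   uint64_t DurationNs = End - Start;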
+UR_APIEXPORT ur_result_t UR_APICALL urEventGetProfilingInfo( + ur_event_handle_t hEvent, ur_profiling_info_t propName, + size_t propValueSize, void *pPropValue, size_t *pPropValueSizeRet) { + UrReturnHelper ReturnValue(propValueSize, pPropValue, pPropValueSizeRet); + + ur_queue_handle_t Queue = hEvent->getQueue(); + if (Queue == nullptr || !(Queue->URFlags & UR_QUEUE_FLAG_PROFILING_ENABLE)) { + return UR_RESULT_ERROR_PROFILING_INFO_NOT_AVAILABLE; + } + + switch (propName) { + case UR_PROFILING_INFO_COMMAND_QUEUED: + case UR_PROFILING_INFO_COMMAND_SUBMIT: + // Note: No user for this case + return ReturnValue(static_cast(hEvent->getQueuedTime())); + case UR_PROFILING_INFO_COMMAND_START: + return ReturnValue(static_cast(hEvent->getStartTime())); + case UR_PROFILING_INFO_COMMAND_END: + return ReturnValue(static_cast(hEvent->getEndTime())); + default: + break; + } + detail::ur::die("Event Profiling info request not implemented"); + return {}; +} + +UR_APIEXPORT ur_result_t UR_APICALL urEventSetCallback(ur_event_handle_t, + ur_execution_info_t, + ur_event_callback_t, + void *) { + return UR_RESULT_ERROR_UNSUPPORTED_FEATURE; +} + +UR_APIEXPORT ur_result_t UR_APICALL +urEventWait(uint32_t numEvents, const ur_event_handle_t *phEventWaitList) { + try { + auto Context = phEventWaitList[0]->getContext(); + ScopedContext Active(Context); + + auto WaitFunc = [Context](ur_event_handle_t Event) -> ur_result_t { + UR_ASSERT(Event, UR_RESULT_ERROR_INVALID_EVENT); + UR_ASSERT(Event->getContext() == Context, + UR_RESULT_ERROR_INVALID_CONTEXT); + + return Event->wait(); + }; + return forLatestEvents(phEventWaitList, numEvents, WaitFunc); + } catch (ur_result_t Err) { + return Err; + } catch (...) { + return UR_RESULT_ERROR_OUT_OF_RESOURCES; + } +} + +UR_APIEXPORT ur_result_t UR_APICALL urEventRetain(ur_event_handle_t hEvent) { + const auto RefCount = hEvent->incrementReferenceCount(); + + detail::ur::assertion(RefCount != 0, + "Reference count overflow detected in urEventRetain."); + + return UR_RESULT_SUCCESS; +} + +UR_APIEXPORT ur_result_t UR_APICALL urEventRelease(ur_event_handle_t hEvent) { + // double delete or someone is messing with the ref count. + // either way, cannot safely proceed. + detail::ur::assertion(hEvent->getReferenceCount() != 0, + "Reference count overflow detected in urEventRelease."); + + // decrement ref count. If it is 0, delete the event. + if (hEvent->decrementReferenceCount() == 0) { + std::unique_ptr event_ptr{hEvent}; + ur_result_t Result = UR_RESULT_ERROR_INVALID_EVENT; + try { + ScopedContext Active(hEvent->getContext()); + Result = hEvent->release(); + } catch (...) 
{ + Result = UR_RESULT_ERROR_OUT_OF_RESOURCES; + } + return Result; + } + + return UR_RESULT_SUCCESS; +} + +UR_APIEXPORT ur_result_t UR_APICALL urEventGetNativeHandle( + ur_event_handle_t hEvent, ur_native_handle_t *phNativeEvent) { + *phNativeEvent = reinterpret_cast(hEvent->get()); + return UR_RESULT_SUCCESS; +} + +UR_APIEXPORT ur_result_t UR_APICALL urEventCreateWithNativeHandle( + ur_native_handle_t hNativeEvent, ur_context_handle_t hContext, + const ur_event_native_properties_t *pProperties, + ur_event_handle_t *phEvent) { + std::ignore = pProperties; + + std::unique_ptr EventPtr{nullptr}; + + *phEvent = ur_event_handle_t_::makeWithNative( + hContext, reinterpret_cast(hNativeEvent)); + + return UR_RESULT_SUCCESS; +} diff --git a/source/adapters/cuda/event.hpp b/source/adapters/cuda/event.hpp new file mode 100644 index 0000000000..4c788532c2 --- /dev/null +++ b/source/adapters/cuda/event.hpp @@ -0,0 +1,189 @@ +//===--------- event.hpp - CUDA Adapter -----------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +#pragma once + +#include +#include + +#include "queue.hpp" + +/// UR Event mapping to CUevent +/// +struct ur_event_handle_t_ { +public: + using native_type = CUevent; + + ur_result_t record(); + + ur_result_t wait(); + + ur_result_t start(); + + native_type get() const noexcept { return EvEnd; }; + + ur_queue_handle_t getQueue() const noexcept { return Queue; } + + CUstream getStream() const noexcept { return Stream; } + + uint32_t getComputeStreamToken() const noexcept { return StreamToken; } + + ur_command_t getCommandType() const noexcept { return CommandType; } + + uint32_t getReferenceCount() const noexcept { return RefCount; } + + bool isRecorded() const noexcept { return IsRecorded; } + + bool isStarted() const noexcept { return IsStarted; } + + bool isCompleted() const noexcept; + + uint32_t getExecutionStatus() const noexcept { + + if (!isRecorded()) { + return UR_EVENT_STATUS_SUBMITTED; + } + + if (!isCompleted()) { + return UR_EVENT_STATUS_RUNNING; + } + return UR_EVENT_STATUS_COMPLETE; + } + + ur_context_handle_t getContext() const noexcept { return Context; }; + + uint32_t incrementReferenceCount() { return ++RefCount; } + + uint32_t decrementReferenceCount() { return --RefCount; } + + uint32_t getEventID() const noexcept { return EventID; } + + bool backendHasOwnership() const noexcept { return HasOwnership; } + + // Returns the counter time when the associated command(s) were enqueued + // + uint64_t getQueuedTime() const; + + // Returns the counter time when the associated command(s) started execution + // + uint64_t getStartTime() const; + + // Returns the counter time when the associated command(s) completed + // + uint64_t getEndTime() const; + + // construct a native CUDA. This maps closely to the underlying CUDA event. 
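+  //
+  // Typical adapter-side usage (sketch; mirrors the pattern used throughout
+  // enqueue.cpp):
+  //
+  //   std::unique_ptr<ur_event_handle_t_> Ev{ur_event_handle_t_::makeNative(
+  //       UR_COMMAND_MEM_BUFFER_READ, Queue, Stream)};
+  //   Ev->start();
+  //   // ... enqueue the asynchronous work on Stream ...
+  //   Ev->record();
+  //   *phEvent = Ev.release();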
+  static ur_event_handle_t
+  makeNative(ur_command_t Type, ur_queue_handle_t Queue, CUstream Stream,
+             uint32_t StreamToken = std::numeric_limits<uint32_t>::max()) {
+    return new ur_event_handle_t_(Type, Queue->getContext(), Queue, Stream,
+                                  StreamToken);
+  }
+
+  static ur_event_handle_t makeWithNative(ur_context_handle_t context,
+                                          CUevent eventNative) {
+    return new ur_event_handle_t_(context, eventNative);
+  }
+
+  ur_result_t release();
+
+  ~ur_event_handle_t_();
+
+private:
+  // This constructor is private to force programmers to use the makeNative /
+  // makeWithNative static members in order to create an event for CUDA.
+  ur_event_handle_t_(ur_command_t Type, ur_context_handle_t Context,
+                     ur_queue_handle_t Queue, CUstream Stream,
+                     uint32_t StreamToken);
+
+  // This constructor is private to force programmers to use
+  // makeWithNative for event interop.
+  ur_event_handle_t_(ur_context_handle_t Context, CUevent EventNative);
+
+  ur_command_t CommandType; // The type of command associated with event.
+
+  std::atomic_uint32_t RefCount; // Event reference count.
+
+  bool HasOwnership; // Signifies if event owns the native type.
+
+  bool HasBeenWaitedOn; // Signifies whether the event has been waited
+                        // on through a call to wait(), which implies
+                        // that it has completed.
+
+  bool IsRecorded; // Signifies whether a native CUDA event has been recorded
+                   // yet.
+  bool IsStarted;  // Signifies whether the operation associated with the
+                   // UR event has started or not.
+
+  uint32_t StreamToken;
+  uint32_t EventID; // Queue identifier of the event.
+
+  native_type EvEnd; // CUDA event handle. If this ur_event_handle_t represents
+                     // a user event, this will be nullptr.
+
+  native_type EvStart; // CUDA event handle associated with the start.
+
+  native_type EvQueued; // CUDA event handle associated with the time
+                        // the command was enqueued.
+
+  ur_queue_handle_t Queue; // ur_queue_handle_t associated with the event. If
+                           // this is a user event, this will be nullptr.
+
+  CUstream Stream; // CUstream associated with the event. If this is a user
+                   // event, this will be uninitialized.
+
+  ur_context_handle_t Context; // ur_context_handle_t associated with the
+                               // event. If this is a native event, this will
+                               // be the same context associated with the
+                               // queue member.
+};
+
+// Iterate over the event wait list and apply the given callback `F` to the
+// latest event on each queue therein. The callback must take a single
+// ur_event_handle_t argument and return a ur_result_t. If the callback returns
+// an error, the iteration terminates and the error is returned.
+template <typename Func>
+ur_result_t forLatestEvents(const ur_event_handle_t *EventWaitList,
+                            std::size_t NumEventsInWaitList, Func &&F) {
+
+  if (EventWaitList == nullptr || NumEventsInWaitList == 0) {
+    return UR_RESULT_ERROR_INVALID_EVENT_WAIT_LIST;
+  }
+
+  // Fast path if we only have a single event
+  if (NumEventsInWaitList == 1) {
+    return F(EventWaitList[0]);
+  }
+
+  std::vector<ur_event_handle_t> Events{EventWaitList,
+                                        EventWaitList + NumEventsInWaitList};
+  std::sort(Events.begin(), Events.end(),
+            [](ur_event_handle_t Event0, ur_event_handle_t Event1) {
+              // Tiered sort creating sublists of streams (smallest value
+              // first) in which the corresponding events are sorted into a
+              // sequence of newest first.
+ return Event0->getStream() < Event1->getStream() || + (Event0->getStream() == Event1->getStream() && + Event0->getEventID() > Event1->getEventID()); + }); + + CUstream LastSeenStream = 0; + for (size_t i = 0; i < Events.size(); i++) { + auto Event = Events[i]; + if (!Event || (i != 0 && Event->getStream() == LastSeenStream)) { + continue; + } + + LastSeenStream = Event->getStream(); + + auto Result = F(Event); + if (Result != UR_RESULT_SUCCESS) { + return Result; + } + } + + return UR_RESULT_SUCCESS; +} diff --git a/source/adapters/cuda/image.cpp b/source/adapters/cuda/image.cpp new file mode 100644 index 0000000000..1b11cade5c --- /dev/null +++ b/source/adapters/cuda/image.cpp @@ -0,0 +1,1061 @@ +//===--------- image.cpp - CUDA Adapter -----------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include +#include +#include + +#include "common.hpp" +#include "context.hpp" +#include "enqueue.hpp" +#include "event.hpp" +#include "image.hpp" +#include "memory.hpp" +#include "queue.hpp" +#include "sampler.hpp" +#include "ur/ur.hpp" +#include "ur_api.h" + +ur_result_t urCalculateNumChannels(ur_image_channel_order_t order, + unsigned int *NumChannels) { + switch (order) { + case ur_image_channel_order_t::UR_IMAGE_CHANNEL_ORDER_A: + case ur_image_channel_order_t::UR_IMAGE_CHANNEL_ORDER_R: + *NumChannels = 1; + return UR_RESULT_SUCCESS; + case ur_image_channel_order_t::UR_IMAGE_CHANNEL_ORDER_RG: + case ur_image_channel_order_t::UR_IMAGE_CHANNEL_ORDER_RA: + *NumChannels = 2; + return UR_RESULT_SUCCESS; + case ur_image_channel_order_t::UR_IMAGE_CHANNEL_ORDER_RGB: + return UR_RESULT_ERROR_IMAGE_FORMAT_NOT_SUPPORTED; + case ur_image_channel_order_t::UR_IMAGE_CHANNEL_ORDER_RGBA: + case ur_image_channel_order_t::UR_IMAGE_CHANNEL_ORDER_ARGB: + case ur_image_channel_order_t::UR_IMAGE_CHANNEL_ORDER_BGRA: + case ur_image_channel_order_t::UR_IMAGE_CHANNEL_ORDER_ABGR: + *NumChannels = 4; + return UR_RESULT_SUCCESS; + case ur_image_channel_order_t::UR_IMAGE_CHANNEL_ORDER_RX: + case ur_image_channel_order_t::UR_IMAGE_CHANNEL_ORDER_RGX: + case ur_image_channel_order_t::UR_IMAGE_CHANNEL_ORDER_RGBX: + case ur_image_channel_order_t::UR_IMAGE_CHANNEL_ORDER_SRGBA: + case ur_image_channel_order_t::UR_IMAGE_CHANNEL_ORDER_INTENSITY: + case ur_image_channel_order_t::UR_IMAGE_CHANNEL_ORDER_LUMINANCE: + default: + return UR_RESULT_ERROR_IMAGE_FORMAT_NOT_SUPPORTED; + } +} + +/// Convert a UR image format to a CUDA image format and +/// get the pixel size in bytes. +/// /param image_channel_type is the ur_image_channel_type_t. +/// /param image_channel_order is the ur_image_channel_order_t. +/// this is used for normalized channel formats, as CUDA +/// combines the channel format and order for normalized +/// channel types. +/// /param return_cuda_format will be set to the equivalent cuda +/// format if not nullptr. +/// /param return_pixel_size_bytes will be set to the pixel +/// byte size if not nullptr. 
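+/// For example, UR_IMAGE_CHANNEL_TYPE_FLOAT with UR_IMAGE_CHANNEL_ORDER_RGBA
+/// maps to CU_AD_FORMAT_FLOAT with a pixel size of 16 bytes (4 channels of
+/// 4 bytes each).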
+ur_result_t
+urToCudaImageChannelFormat(ur_image_channel_type_t image_channel_type,
+                           ur_image_channel_order_t image_channel_order,
+                           CUarray_format *return_cuda_format,
+                           size_t *return_pixel_size_bytes) {
+
+  CUarray_format cuda_format;
+  size_t pixel_size_bytes = 0;
+  unsigned int num_channels = 0;
+  UR_CHECK_ERROR(urCalculateNumChannels(image_channel_order, &num_channels));
+
+  switch (image_channel_type) {
+#define CASE(FROM, TO, SIZE)                                                   \
+  case FROM: {                                                                 \
+    cuda_format = TO;                                                          \
+    pixel_size_bytes = SIZE * num_channels;                                    \
+    break;                                                                     \
+  }
+
+    CASE(UR_IMAGE_CHANNEL_TYPE_UNSIGNED_INT8, CU_AD_FORMAT_UNSIGNED_INT8, 1)
+    CASE(UR_IMAGE_CHANNEL_TYPE_SIGNED_INT8, CU_AD_FORMAT_SIGNED_INT8, 1)
+    CASE(UR_IMAGE_CHANNEL_TYPE_UNSIGNED_INT16, CU_AD_FORMAT_UNSIGNED_INT16, 2)
+    CASE(UR_IMAGE_CHANNEL_TYPE_SIGNED_INT16, CU_AD_FORMAT_SIGNED_INT16, 2)
+    CASE(UR_IMAGE_CHANNEL_TYPE_HALF_FLOAT, CU_AD_FORMAT_HALF, 2)
+    CASE(UR_IMAGE_CHANNEL_TYPE_UNSIGNED_INT32, CU_AD_FORMAT_UNSIGNED_INT32, 4)
+    CASE(UR_IMAGE_CHANNEL_TYPE_SIGNED_INT32, CU_AD_FORMAT_SIGNED_INT32, 4)
+    CASE(UR_IMAGE_CHANNEL_TYPE_FLOAT, CU_AD_FORMAT_FLOAT, 4)
+
+#undef CASE
+  default:
+    break;
+  }
+
+  // These new formats were introduced in CUDA 11.5
+#if CUDA_VERSION >= 11050
+
+  // If none of the above channel types were passed, check those below
+  if (pixel_size_bytes == 0) {
+
+    // We can't use a switch statement here because these single
+    // UR_IMAGE_CHANNEL_TYPEs can correspond to multiple [u/s]norm
+    // CU_AD_FORMATs depending on the number of channels. We use a std::map
+    // instead to retrieve the correct CUDA format.
+
+    // map<<channel type, num channels>, <CUDA format, pixel size in bytes>>
+    const std::map<std::pair<ur_image_channel_type_t, unsigned int>,
+                   std::pair<CUarray_format, size_t>>
+        norm_channel_type_map{
+            {{UR_IMAGE_CHANNEL_TYPE_UNORM_INT8, 1},
+             {CU_AD_FORMAT_UNORM_INT8X1, 1}},
+            {{UR_IMAGE_CHANNEL_TYPE_UNORM_INT8, 2},
+             {CU_AD_FORMAT_UNORM_INT8X2, 2}},
+            {{UR_IMAGE_CHANNEL_TYPE_UNORM_INT8, 4},
+             {CU_AD_FORMAT_UNORM_INT8X4, 4}},
+
+            {{UR_IMAGE_CHANNEL_TYPE_SNORM_INT8, 1},
+             {CU_AD_FORMAT_SNORM_INT8X1, 1}},
+            {{UR_IMAGE_CHANNEL_TYPE_SNORM_INT8, 2},
+             {CU_AD_FORMAT_SNORM_INT8X2, 2}},
+            {{UR_IMAGE_CHANNEL_TYPE_SNORM_INT8, 4},
+             {CU_AD_FORMAT_SNORM_INT8X4, 4}},
+
+            {{UR_IMAGE_CHANNEL_TYPE_UNORM_INT16, 1},
+             {CU_AD_FORMAT_UNORM_INT16X1, 2}},
+            {{UR_IMAGE_CHANNEL_TYPE_UNORM_INT16, 2},
+             {CU_AD_FORMAT_UNORM_INT16X2, 4}},
+            {{UR_IMAGE_CHANNEL_TYPE_UNORM_INT16, 4},
+             {CU_AD_FORMAT_UNORM_INT16X4, 8}},
+
+            {{UR_IMAGE_CHANNEL_TYPE_SNORM_INT16, 1},
+             {CU_AD_FORMAT_SNORM_INT16X1, 2}},
+            {{UR_IMAGE_CHANNEL_TYPE_SNORM_INT16, 2},
+             {CU_AD_FORMAT_SNORM_INT16X2, 4}},
+            {{UR_IMAGE_CHANNEL_TYPE_SNORM_INT16, 4},
+             {CU_AD_FORMAT_SNORM_INT16X4, 8}},
+        };
+
+    try {
+      auto cuda_format_and_size = norm_channel_type_map.at(
+          std::make_pair(image_channel_type, num_channels));
+      cuda_format = cuda_format_and_size.first;
+      pixel_size_bytes = cuda_format_and_size.second;
+    } catch (std::out_of_range &e) {
+      return UR_RESULT_ERROR_IMAGE_FORMAT_NOT_SUPPORTED;
+    }
+  }
+
+#endif
+
+  if (return_cuda_format) {
+    *return_cuda_format = cuda_format;
+  }
+  if (return_pixel_size_bytes) {
+    *return_pixel_size_bytes = pixel_size_bytes;
+  }
+  return UR_RESULT_SUCCESS;
+}
+
+ur_result_t
+cudaToUrImageChannelFormat(CUarray_format cuda_format,
+                           ur_image_channel_type_t *return_image_channel_type) {
+
+  switch (cuda_format) {
+#define CUDA_TO_UR_IMAGE_CHANNEL_TYPE(FROM, TO)                                \
+  case FROM: {                                                                 \
+    *return_image_channel_type = TO;                                           \
+    return UR_RESULT_SUCCESS;                                                  \
+  }
+    CUDA_TO_UR_IMAGE_CHANNEL_TYPE(CU_AD_FORMAT_UNSIGNED_INT8,
+                                  UR_IMAGE_CHANNEL_TYPE_UNSIGNED_INT8);
CUDA_TO_UR_IMAGE_CHANNEL_TYPE(CU_AD_FORMAT_UNSIGNED_INT16, + UR_IMAGE_CHANNEL_TYPE_UNSIGNED_INT16); + CUDA_TO_UR_IMAGE_CHANNEL_TYPE(CU_AD_FORMAT_UNSIGNED_INT32, + UR_IMAGE_CHANNEL_TYPE_UNSIGNED_INT32); + CUDA_TO_UR_IMAGE_CHANNEL_TYPE(CU_AD_FORMAT_SIGNED_INT8, + UR_IMAGE_CHANNEL_TYPE_SIGNED_INT8); + CUDA_TO_UR_IMAGE_CHANNEL_TYPE(CU_AD_FORMAT_SIGNED_INT16, + UR_IMAGE_CHANNEL_TYPE_SIGNED_INT16); + CUDA_TO_UR_IMAGE_CHANNEL_TYPE(CU_AD_FORMAT_SIGNED_INT32, + UR_IMAGE_CHANNEL_TYPE_SIGNED_INT32); + CUDA_TO_UR_IMAGE_CHANNEL_TYPE(CU_AD_FORMAT_HALF, + UR_IMAGE_CHANNEL_TYPE_HALF_FLOAT); + CUDA_TO_UR_IMAGE_CHANNEL_TYPE(CU_AD_FORMAT_FLOAT, + UR_IMAGE_CHANNEL_TYPE_FLOAT); +#if CUDA_VERSION >= 11050 + + // Note that the CUDA UNORM and SNORM formats also encode the number of + // channels. + // Since UR does not encode this, we map different CUDA formats to the same + // UR channel type. + // Since this function is only called from `urBindlessImagesImageGetInfoExp` + // which has access to `CUDA_ARRAY3D_DESCRIPTOR`, we can determine the + // number of channels in the calling function. + + CUDA_TO_UR_IMAGE_CHANNEL_TYPE(CU_AD_FORMAT_UNORM_INT8X1, + UR_IMAGE_CHANNEL_TYPE_UNORM_INT8); + CUDA_TO_UR_IMAGE_CHANNEL_TYPE(CU_AD_FORMAT_UNORM_INT8X2, + UR_IMAGE_CHANNEL_TYPE_UNORM_INT8); + CUDA_TO_UR_IMAGE_CHANNEL_TYPE(CU_AD_FORMAT_UNORM_INT8X4, + UR_IMAGE_CHANNEL_TYPE_UNORM_INT8); + + CUDA_TO_UR_IMAGE_CHANNEL_TYPE(CU_AD_FORMAT_UNORM_INT16X1, + UR_IMAGE_CHANNEL_TYPE_UNORM_INT16); + CUDA_TO_UR_IMAGE_CHANNEL_TYPE(CU_AD_FORMAT_UNORM_INT16X2, + UR_IMAGE_CHANNEL_TYPE_UNORM_INT16); + CUDA_TO_UR_IMAGE_CHANNEL_TYPE(CU_AD_FORMAT_UNORM_INT16X4, + UR_IMAGE_CHANNEL_TYPE_UNORM_INT16); + + CUDA_TO_UR_IMAGE_CHANNEL_TYPE(CU_AD_FORMAT_SNORM_INT8X1, + UR_IMAGE_CHANNEL_TYPE_SNORM_INT8); + CUDA_TO_UR_IMAGE_CHANNEL_TYPE(CU_AD_FORMAT_SNORM_INT8X2, + UR_IMAGE_CHANNEL_TYPE_SNORM_INT8); + CUDA_TO_UR_IMAGE_CHANNEL_TYPE(CU_AD_FORMAT_SNORM_INT8X4, + UR_IMAGE_CHANNEL_TYPE_SNORM_INT8); + + CUDA_TO_UR_IMAGE_CHANNEL_TYPE(CU_AD_FORMAT_SNORM_INT16X1, + UR_IMAGE_CHANNEL_TYPE_SNORM_INT16); + CUDA_TO_UR_IMAGE_CHANNEL_TYPE(CU_AD_FORMAT_SNORM_INT16X2, + UR_IMAGE_CHANNEL_TYPE_SNORM_INT16); + CUDA_TO_UR_IMAGE_CHANNEL_TYPE(CU_AD_FORMAT_SNORM_INT16X4, + UR_IMAGE_CHANNEL_TYPE_SNORM_INT16); +#endif +#undef MAP + default: + return UR_RESULT_ERROR_IMAGE_FORMAT_NOT_SUPPORTED; + } +} + +ur_result_t urTextureCreate(ur_sampler_handle_t hSampler, + const ur_image_desc_t *pImageDesc, + CUDA_RESOURCE_DESC ResourceDesc, + ur_exp_image_handle_t *phRetImage) { + + try { + /// pi_sampler_properties + /// | | + /// ----------------------------------- + /// | 31 30 ... 
6 | N/A + /// | 5 | mip filter mode + /// | 4 3 2 | addressing mode + /// | 1 | filter mode + /// | 0 | normalize coords + CUDA_TEXTURE_DESC ImageTexDesc = {}; + CUaddress_mode AddrMode = {}; + ur_sampler_addressing_mode_t AddrModeProp = hSampler->getAddressingMode(); + if (AddrModeProp == (UR_SAMPLER_ADDRESSING_MODE_CLAMP_TO_EDGE - + UR_SAMPLER_ADDRESSING_MODE_NONE)) { + AddrMode = CU_TR_ADDRESS_MODE_CLAMP; + } else if (AddrModeProp == (UR_SAMPLER_ADDRESSING_MODE_CLAMP - + UR_SAMPLER_ADDRESSING_MODE_NONE)) { + AddrMode = CU_TR_ADDRESS_MODE_BORDER; + } else if (AddrModeProp == (UR_SAMPLER_ADDRESSING_MODE_REPEAT - + UR_SAMPLER_ADDRESSING_MODE_NONE)) { + AddrMode = CU_TR_ADDRESS_MODE_WRAP; + } else if (AddrModeProp == (UR_SAMPLER_ADDRESSING_MODE_MIRRORED_REPEAT - + UR_SAMPLER_ADDRESSING_MODE_NONE)) { + AddrMode = CU_TR_ADDRESS_MODE_MIRROR; + } + CUfilter_mode FilterMode; + ur_sampler_filter_mode_t FilterModeProp = hSampler->getFilterMode(); + FilterMode = + FilterModeProp ? CU_TR_FILTER_MODE_LINEAR : CU_TR_FILTER_MODE_POINT; + ImageTexDesc.filterMode = FilterMode; + + // Mipmap attributes + CUfilter_mode MipFilterMode; + ur_sampler_filter_mode_t MipFilterModeProp = hSampler->getMipFilterMode(); + MipFilterMode = + MipFilterModeProp ? CU_TR_FILTER_MODE_LINEAR : CU_TR_FILTER_MODE_POINT; + ImageTexDesc.mipmapFilterMode = MipFilterMode; + ImageTexDesc.maxMipmapLevelClamp = hSampler->MaxMipmapLevelClamp; + ImageTexDesc.minMipmapLevelClamp = hSampler->MinMipmapLevelClamp; + ImageTexDesc.maxAnisotropy = hSampler->MaxAnisotropy; + + // The address modes can interfere with other dimensionsenqueueEventsWait + // e.g. 1D texture sampling can be interfered with when setting other + // dimension address modes despite their nonexistence. + ImageTexDesc.addressMode[0] = AddrMode; // 1D + ImageTexDesc.addressMode[1] = + pImageDesc->height > 0 ? AddrMode : ImageTexDesc.addressMode[1]; // 2D + ImageTexDesc.addressMode[2] = + pImageDesc->depth > 0 ? AddrMode : ImageTexDesc.addressMode[2]; // 3D + + // flags takes the normalized coordinates setting -- unnormalized is default + ImageTexDesc.flags = (hSampler->isNormalizedCoords()) + ? CU_TRSF_NORMALIZED_COORDINATES + : ImageTexDesc.flags; + + // CUDA default promotes 8-bit and 16-bit integers to float between [0,1] + // This flag prevents this behaviour. + ImageTexDesc.flags |= CU_TRSF_READ_AS_INTEGER; + + CUtexObject Texture; + UR_CHECK_ERROR( + cuTexObjectCreate(&Texture, &ResourceDesc, &ImageTexDesc, nullptr)); + *phRetImage = (ur_exp_image_handle_t)Texture; + } catch (ur_result_t Err) { + return Err; + } catch (...) { + return UR_RESULT_ERROR_UNKNOWN; + } + return UR_RESULT_SUCCESS; +} + +UR_APIEXPORT ur_result_t UR_APICALL urUSMPitchedAllocExp( + ur_context_handle_t hContext, ur_device_handle_t hDevice, + const ur_usm_desc_t *pUSMDesc, ur_usm_pool_handle_t pool, + size_t widthInBytes, size_t height, size_t elementSizeBytes, void **ppMem, + size_t *pResultPitch) { + UR_ASSERT((hContext->getDevice()->get() == hDevice->get()), + UR_RESULT_ERROR_INVALID_CONTEXT); + std::ignore = pUSMDesc; + std::ignore = pool; + + UR_ASSERT((widthInBytes > 0), UR_RESULT_ERROR_INVALID_VALUE); + UR_ASSERT((height > 0), UR_RESULT_ERROR_INVALID_VALUE); + UR_ASSERT((elementSizeBytes > 0), UR_RESULT_ERROR_INVALID_VALUE); + + // elementSizeBytes can only take on values of 4, 8, or 16. + // small data types need to be minimised to 4. 
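+  // cuMemAllocPitch uses ElementSizeBytes to choose a pitch that keeps rows
+  // aligned for efficient access, and it only accepts 4, 8 or 16; smaller
+  // element sizes are therefore rounded up to 4 below.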
+ if (elementSizeBytes < 4) { + elementSizeBytes = 4; + } + UR_ASSERT((elementSizeBytes == 4 || elementSizeBytes == 8 || + elementSizeBytes == 16), + UR_RESULT_ERROR_INVALID_VALUE); + ur_result_t Result = UR_RESULT_SUCCESS; + try { + ScopedContext Active(hDevice->getContext()); + UR_CHECK_ERROR(cuMemAllocPitch((CUdeviceptr *)ppMem, pResultPitch, + widthInBytes, height, elementSizeBytes)); + } catch (ur_result_t error) { + Result = error; + } catch (...) { + return UR_RESULT_ERROR_UNKNOWN; + } + + return Result; +} + +UR_APIEXPORT ur_result_t UR_APICALL +urBindlessImagesUnsampledImageHandleDestroyExp(ur_context_handle_t hContext, + ur_device_handle_t hDevice, + ur_exp_image_handle_t hImage) { + UR_ASSERT((hContext->getDevice()->get() == hDevice->get()), + UR_RESULT_ERROR_INVALID_CONTEXT); + + UR_CHECK_ERROR(cuSurfObjectDestroy((CUsurfObject)hImage)); + return UR_RESULT_SUCCESS; +} + +UR_APIEXPORT ur_result_t UR_APICALL +urBindlessImagesSampledImageHandleDestroyExp(ur_context_handle_t hContext, + ur_device_handle_t hDevice, + ur_exp_image_handle_t hImage) { + UR_ASSERT((hContext->getDevice()->get() == hDevice->get()), + UR_RESULT_ERROR_INVALID_CONTEXT); + + UR_CHECK_ERROR(cuTexObjectDestroy((CUtexObject)hImage)); + return UR_RESULT_SUCCESS; +} + +UR_APIEXPORT ur_result_t UR_APICALL urBindlessImagesImageAllocateExp( + ur_context_handle_t hContext, ur_device_handle_t hDevice, + const ur_image_format_t *pImageFormat, const ur_image_desc_t *pImageDesc, + ur_exp_image_mem_handle_t *phImageMem) { + UR_ASSERT((hContext->getDevice()->get() == hDevice->get()), + UR_RESULT_ERROR_INVALID_CONTEXT); + + // Populate descriptor + CUDA_ARRAY3D_DESCRIPTOR array_desc = {}; + + UR_CHECK_ERROR(urCalculateNumChannels(pImageFormat->channelOrder, + &array_desc.NumChannels)); + + UR_CHECK_ERROR(urToCudaImageChannelFormat(pImageFormat->channelType, + pImageFormat->channelOrder, + &array_desc.Format, nullptr)); + + array_desc.Flags = 0; // No flags required + array_desc.Width = pImageDesc->width; + if (pImageDesc->type == UR_MEM_TYPE_IMAGE1D) { + array_desc.Height = 0; + array_desc.Depth = 0; + } else if (pImageDesc->type == UR_MEM_TYPE_IMAGE2D) { + array_desc.Height = pImageDesc->height; + array_desc.Depth = 0; + } else if (pImageDesc->type == UR_MEM_TYPE_IMAGE3D) { + array_desc.Height = pImageDesc->height; + array_desc.Depth = pImageDesc->depth; + } + + ScopedContext Active(hDevice->getContext()); + + // Allocate a cuArray + if (pImageDesc->numMipLevel == 1) { + CUarray ImageArray; + + try { + UR_CHECK_ERROR(cuArray3DCreate(&ImageArray, &array_desc)); + *phImageMem = (ur_exp_image_mem_handle_t)ImageArray; + } catch (ur_result_t Err) { + cuArrayDestroy(ImageArray); + return Err; + } catch (...) { + cuArrayDestroy(ImageArray); + return UR_RESULT_ERROR_UNKNOWN; + } + } else // Allocate a cuMipmappedArray + { + CUmipmappedArray mip_array; + array_desc.Flags = CUDA_ARRAY3D_SURFACE_LDST; + + try { + UR_CHECK_ERROR(cuMipmappedArrayCreate(&mip_array, &array_desc, + pImageDesc->numMipLevel)); + *phImageMem = (ur_exp_image_mem_handle_t)mip_array; + } catch (ur_result_t Err) { + cuMipmappedArrayDestroy(mip_array); + return Err; + } catch (...) 
{ + cuMipmappedArrayDestroy(mip_array); + return UR_RESULT_ERROR_UNKNOWN; + } + } + + return UR_RESULT_SUCCESS; +} + +UR_APIEXPORT ur_result_t UR_APICALL urBindlessImagesImageFreeExp( + ur_context_handle_t hContext, ur_device_handle_t hDevice, + ur_exp_image_mem_handle_t hImageMem) { + UR_ASSERT((hContext->getDevice()->get() == hDevice->get()), + UR_RESULT_ERROR_INVALID_CONTEXT); + + ScopedContext Active(hDevice->getContext()); + try { + UR_CHECK_ERROR(cuArrayDestroy((CUarray)hImageMem)); + } catch (ur_result_t Err) { + return Err; + } catch (...) { + return UR_RESULT_ERROR_UNKNOWN; + } + return UR_RESULT_SUCCESS; +} + +UR_APIEXPORT ur_result_t UR_APICALL urBindlessImagesUnsampledImageCreateExp( + ur_context_handle_t hContext, ur_device_handle_t hDevice, + ur_exp_image_mem_handle_t hImageMem, const ur_image_format_t *pImageFormat, + const ur_image_desc_t *pImageDesc, ur_mem_handle_t *phMem, + ur_exp_image_handle_t *phImage) { + UR_ASSERT((hContext->getDevice()->get() == hDevice->get()), + UR_RESULT_ERROR_INVALID_CONTEXT); + + unsigned int NumChannels = 0; + UR_CHECK_ERROR( + urCalculateNumChannels(pImageFormat->channelOrder, &NumChannels)); + + CUarray_format format; + size_t PixelSizeBytes; + UR_CHECK_ERROR(urToCudaImageChannelFormat(pImageFormat->channelType, + pImageFormat->channelOrder, &format, + &PixelSizeBytes)); + + try { + + ScopedContext Active(hDevice->getContext()); + + CUDA_RESOURCE_DESC image_res_desc = {}; + + // We have a CUarray + image_res_desc.resType = CU_RESOURCE_TYPE_ARRAY; + image_res_desc.res.array.hArray = (CUarray)hImageMem; + + // We create surfaces in the unsampled images case as it conforms to how + // CUDA deals with unsampled images. + CUsurfObject surface; + UR_CHECK_ERROR(cuSurfObjectCreate(&surface, &image_res_desc)); + *phImage = (ur_exp_image_handle_t)surface; + + auto urMemObj = std::unique_ptr(new ur_mem_handle_t_{ + hContext, (CUarray)hImageMem, surface, pImageDesc->type}); + + if (urMemObj == nullptr) { + return UR_RESULT_ERROR_OUT_OF_HOST_MEMORY; + } + + *phMem = urMemObj.release(); + + } catch (ur_result_t Err) { + return Err; + } catch (...) { + return UR_RESULT_ERROR_UNKNOWN; + } + + return UR_RESULT_SUCCESS; +} + +UR_APIEXPORT ur_result_t UR_APICALL urBindlessImagesSampledImageCreateExp( + ur_context_handle_t hContext, ur_device_handle_t hDevice, + ur_exp_image_mem_handle_t hImageMem, const ur_image_format_t *pImageFormat, + const ur_image_desc_t *pImageDesc, ur_sampler_handle_t hSampler, + ur_mem_handle_t *phMem, ur_exp_image_handle_t *phImage) { + UR_ASSERT((hContext->getDevice()->get() == hDevice->get()), + UR_RESULT_ERROR_INVALID_CONTEXT); + + ScopedContext Active(hDevice->getContext()); + + unsigned int NumChannels = 0; + UR_CHECK_ERROR( + urCalculateNumChannels(pImageFormat->channelOrder, &NumChannels)); + + CUarray_format format; + size_t PixelSizeBytes; + UR_CHECK_ERROR(urToCudaImageChannelFormat(pImageFormat->channelType, + pImageFormat->channelOrder, &format, + &PixelSizeBytes)); + + try { + CUDA_RESOURCE_DESC image_res_desc = {}; + + unsigned int mem_type; + // If this function doesn't return successfully, we assume that hImageMem is + // a CUarray or CUmipmappedArray. If this function returns successfully, we + // check whether hImageMem is device memory (even managed memory isn't + // considered shared). 
+ CUresult Err = cuPointerGetAttribute( + &mem_type, CU_POINTER_ATTRIBUTE_MEMORY_TYPE, (CUdeviceptr)hImageMem); + if (Err != CUDA_SUCCESS) { + // We have a CUarray + if (pImageDesc->numMipLevel == 1) { + image_res_desc.resType = CU_RESOURCE_TYPE_ARRAY; + image_res_desc.res.array.hArray = (CUarray)hImageMem; + } + // We have a CUmipmappedArray + else { + image_res_desc.resType = CU_RESOURCE_TYPE_MIPMAPPED_ARRAY; + image_res_desc.res.mipmap.hMipmappedArray = (CUmipmappedArray)hImageMem; + } + } else if (mem_type == CU_MEMORYTYPE_DEVICE) { + // We have a USM pointer + if (pImageDesc->type == UR_MEM_TYPE_IMAGE1D) { + image_res_desc.resType = CU_RESOURCE_TYPE_LINEAR; + image_res_desc.res.linear.devPtr = (CUdeviceptr)hImageMem; + image_res_desc.res.linear.format = format; + image_res_desc.res.linear.numChannels = NumChannels; + image_res_desc.res.linear.sizeInBytes = + pImageDesc->width * PixelSizeBytes; + } else if (pImageDesc->type == UR_MEM_TYPE_IMAGE2D) { + image_res_desc.resType = CU_RESOURCE_TYPE_PITCH2D; + image_res_desc.res.pitch2D.devPtr = (CUdeviceptr)hImageMem; + image_res_desc.res.pitch2D.format = format; + image_res_desc.res.pitch2D.numChannels = NumChannels; + image_res_desc.res.pitch2D.width = pImageDesc->width; + image_res_desc.res.pitch2D.height = pImageDesc->height; + image_res_desc.res.pitch2D.pitchInBytes = pImageDesc->rowPitch; + } else if (pImageDesc->type == UR_MEM_TYPE_IMAGE3D) { + // Cannot create 3D image from USM. + return UR_RESULT_ERROR_INVALID_VALUE; + } + } else { + // Unknown image memory type. + return UR_RESULT_ERROR_INVALID_VALUE; + } + + UR_CHECK_ERROR( + urTextureCreate(hSampler, pImageDesc, image_res_desc, phImage)); + + auto urMemObj = std::unique_ptr(new ur_mem_handle_t_{ + hContext, (CUarray)hImageMem, (CUtexObject)*phImage, hSampler, + pImageDesc->type}); + + if (urMemObj == nullptr) { + return UR_RESULT_ERROR_OUT_OF_HOST_MEMORY; + } + + *phMem = urMemObj.release(); + } catch (ur_result_t Err) { + return Err; + } catch (...) { + return UR_RESULT_ERROR_UNKNOWN; + } + + return UR_RESULT_SUCCESS; +} + +UR_APIEXPORT ur_result_t UR_APICALL urBindlessImagesImageCopyExp( + ur_queue_handle_t hQueue, void *pDst, void *pSrc, + const ur_image_format_t *pImageFormat, const ur_image_desc_t *pImageDesc, + ur_exp_image_copy_flags_t imageCopyFlags, ur_rect_offset_t srcOffset, + ur_rect_offset_t dstOffset, ur_rect_region_t copyExtent, + ur_rect_region_t hostExtent, uint32_t numEventsInWaitList, + const ur_event_handle_t *phEventWaitList, ur_event_handle_t *phEvent) { + UR_ASSERT((imageCopyFlags == UR_EXP_IMAGE_COPY_FLAG_HOST_TO_DEVICE || + imageCopyFlags == UR_EXP_IMAGE_COPY_FLAG_DEVICE_TO_HOST || + imageCopyFlags == UR_EXP_IMAGE_COPY_FLAG_DEVICE_TO_DEVICE), + UR_RESULT_ERROR_INVALID_VALUE); + + unsigned int NumChannels = 0; + size_t PixelSizeBytes = 0; + + UR_CHECK_ERROR( + urCalculateNumChannels(pImageFormat->channelOrder, &NumChannels)); + + // We need to get this now in bytes for calculating the total image size + // later. + UR_CHECK_ERROR(urToCudaImageChannelFormat(pImageFormat->channelType, + pImageFormat->channelOrder, nullptr, + &PixelSizeBytes)); + + try { + ScopedContext Active(hQueue->getContext()); + CUstream Stream = hQueue->getNextTransferStream(); + enqueueEventsWait(hQueue, Stream, numEventsInWaitList, phEventWaitList); + // We have to use a different copy function for each image dimensionality. 
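+    // 1D transfers use cuMemcpyHtoAAsync / cuMemcpyAtoHAsync, 2D transfers
+    // use cuMemcpy2DAsync (against a CUarray or pitched device memory), and
+    // 3D transfers use cuMemcpy3DAsync, as the branches below show.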
+ + if (imageCopyFlags == UR_EXP_IMAGE_COPY_FLAG_HOST_TO_DEVICE) { + if (pImageDesc->type == UR_MEM_TYPE_IMAGE1D) { + size_t CopyExtentBytes = PixelSizeBytes * copyExtent.width; + char *SrcWithOffset = (char *)pSrc + (srcOffset.x * PixelSizeBytes); + UR_CHECK_ERROR( + cuMemcpyHtoAAsync((CUarray)pDst, dstOffset.x * PixelSizeBytes, + (void *)SrcWithOffset, CopyExtentBytes, Stream)); + } else if (pImageDesc->type == UR_MEM_TYPE_IMAGE2D) { + CUDA_MEMCPY2D cpy_desc = {}; + cpy_desc.srcMemoryType = CUmemorytype_enum::CU_MEMORYTYPE_HOST; + cpy_desc.srcHost = pSrc; + cpy_desc.srcXInBytes = srcOffset.x * PixelSizeBytes; + cpy_desc.srcY = srcOffset.y; + cpy_desc.dstXInBytes = dstOffset.x * PixelSizeBytes; + cpy_desc.dstY = dstOffset.y; + cpy_desc.srcPitch = hostExtent.width * PixelSizeBytes; + if (pImageDesc->rowPitch == 0) { + cpy_desc.dstMemoryType = CUmemorytype_enum::CU_MEMORYTYPE_ARRAY; + cpy_desc.dstArray = (CUarray)pDst; + } else { + // Pitched memory + cpy_desc.dstMemoryType = CUmemorytype_enum::CU_MEMORYTYPE_DEVICE; + cpy_desc.dstDevice = (CUdeviceptr)pDst; + cpy_desc.dstPitch = pImageDesc->rowPitch; + } + cpy_desc.WidthInBytes = PixelSizeBytes * copyExtent.width; + cpy_desc.Height = copyExtent.height; + UR_CHECK_ERROR(cuMemcpy2DAsync(&cpy_desc, Stream)); + } else if (pImageDesc->type == UR_MEM_TYPE_IMAGE3D) { + CUDA_MEMCPY3D cpy_desc = {}; + cpy_desc.srcXInBytes = srcOffset.x * PixelSizeBytes; + cpy_desc.srcY = srcOffset.y; + cpy_desc.srcZ = srcOffset.z; + cpy_desc.dstXInBytes = dstOffset.x * PixelSizeBytes; + cpy_desc.dstY = dstOffset.y; + cpy_desc.dstZ = dstOffset.z; + cpy_desc.srcMemoryType = CUmemorytype_enum::CU_MEMORYTYPE_HOST; + cpy_desc.srcHost = pSrc; + cpy_desc.srcPitch = hostExtent.width * PixelSizeBytes; + cpy_desc.srcHeight = hostExtent.height; + cpy_desc.dstMemoryType = CUmemorytype_enum::CU_MEMORYTYPE_ARRAY; + cpy_desc.dstArray = (CUarray)pDst; + cpy_desc.WidthInBytes = PixelSizeBytes * copyExtent.width; + cpy_desc.Height = copyExtent.height; + cpy_desc.Depth = copyExtent.depth; + UR_CHECK_ERROR(cuMemcpy3DAsync(&cpy_desc, Stream)); + } + } else if (imageCopyFlags == UR_EXP_IMAGE_COPY_FLAG_DEVICE_TO_HOST) { + if (pImageDesc->type == UR_MEM_TYPE_IMAGE1D) { + size_t CopyExtentBytes = PixelSizeBytes * copyExtent.width; + size_t src_offset_bytes = PixelSizeBytes * srcOffset.x; + void *dst_with_offset = + (void *)((char *)pDst + (PixelSizeBytes * dstOffset.x)); + UR_CHECK_ERROR(cuMemcpyAtoHAsync(dst_with_offset, (CUarray)pSrc, + src_offset_bytes, CopyExtentBytes, + Stream)); + } else if (pImageDesc->type == UR_MEM_TYPE_IMAGE2D) { + CUDA_MEMCPY2D cpy_desc = {}; + cpy_desc.srcXInBytes = srcOffset.x; + cpy_desc.srcY = srcOffset.y; + cpy_desc.dstXInBytes = dstOffset.x; + cpy_desc.dstY = dstOffset.y; + cpy_desc.dstMemoryType = CUmemorytype_enum::CU_MEMORYTYPE_HOST; + cpy_desc.dstHost = pDst; + if (pImageDesc->rowPitch == 0) { + cpy_desc.srcMemoryType = CUmemorytype_enum::CU_MEMORYTYPE_ARRAY; + cpy_desc.srcArray = (CUarray)pSrc; + } else { + // Pitched memory + cpy_desc.srcMemoryType = CUmemorytype_enum::CU_MEMORYTYPE_DEVICE; + cpy_desc.srcPitch = pImageDesc->rowPitch; + cpy_desc.srcDevice = (CUdeviceptr)pSrc; + } + cpy_desc.WidthInBytes = PixelSizeBytes * copyExtent.width; + cpy_desc.Height = copyExtent.height; + UR_CHECK_ERROR(cuMemcpy2DAsync(&cpy_desc, Stream)); + } else if (pImageDesc->type == UR_MEM_TYPE_IMAGE3D) { + CUDA_MEMCPY3D cpy_desc = {}; + cpy_desc.srcXInBytes = srcOffset.x; + cpy_desc.srcY = srcOffset.y; + cpy_desc.srcZ = srcOffset.z; + cpy_desc.dstXInBytes = dstOffset.x; 
+ cpy_desc.dstY = dstOffset.y; + cpy_desc.dstZ = dstOffset.z; + cpy_desc.srcMemoryType = CUmemorytype_enum::CU_MEMORYTYPE_ARRAY; + cpy_desc.srcArray = (CUarray)pSrc; + cpy_desc.dstMemoryType = CUmemorytype_enum::CU_MEMORYTYPE_HOST; + cpy_desc.dstHost = pDst; + cpy_desc.WidthInBytes = PixelSizeBytes * copyExtent.width; + cpy_desc.Height = copyExtent.height; + cpy_desc.Depth = copyExtent.depth; + UR_CHECK_ERROR(cuMemcpy3DAsync(&cpy_desc, Stream)); + } + } else { + /// imageCopyFlags == UR_EXP_IMAGE_COPY_FLAG_DEVICE_TO_DEVICE + /// TODO: implemet device to device copy + return UR_RESULT_ERROR_UNSUPPORTED_FEATURE; + } + if (phEvent) { + auto NewEvent = ur_event_handle_t_::makeNative(UR_COMMAND_MEM_IMAGE_COPY, + hQueue, Stream); + NewEvent->record(); + *phEvent = NewEvent; + } + } catch (ur_result_t Err) { + return Err; + } catch (...) { + return UR_RESULT_ERROR_UNKNOWN; + } + + return UR_RESULT_SUCCESS; +} + +UR_APIEXPORT ur_result_t UR_APICALL urBindlessImagesImageGetInfoExp( + ur_exp_image_mem_handle_t hImageMem, ur_image_info_t propName, + void *pPropValue, size_t *pPropSizeRet) { + + CUDA_ARRAY3D_DESCRIPTOR ArrayDesc; + UR_CHECK_ERROR(cuArray3DGetDescriptor(&ArrayDesc, (CUarray)hImageMem)); + switch (propName) { + case UR_IMAGE_INFO_WIDTH: + if (pPropValue) { + *(size_t *)pPropValue = ArrayDesc.Width; + } + if (pPropSizeRet) { + *pPropSizeRet = sizeof(size_t); + } + return UR_RESULT_SUCCESS; + case UR_IMAGE_INFO_HEIGHT: + if (pPropValue) { + *(size_t *)pPropValue = ArrayDesc.Height; + } + if (pPropSizeRet) { + *pPropSizeRet = sizeof(size_t); + } + return UR_RESULT_SUCCESS; + case UR_IMAGE_INFO_DEPTH: + if (pPropValue) { + *(size_t *)pPropValue = ArrayDesc.Depth; + } + if (pPropSizeRet) { + *pPropSizeRet = sizeof(size_t); + } + return UR_RESULT_SUCCESS; + case UR_IMAGE_INFO_FORMAT: + ur_image_channel_type_t ChannelType; + ur_image_channel_order_t ChannelOrder; + UR_CHECK_ERROR(cudaToUrImageChannelFormat(ArrayDesc.Format, &ChannelType)); + // CUDA does not have a notion of channel "order" in the same way that + // SYCL 1.2.1 does. + switch (ArrayDesc.NumChannels) { + case 1: + ChannelOrder = UR_IMAGE_CHANNEL_ORDER_R; + break; + case 2: + ChannelOrder = UR_IMAGE_CHANNEL_ORDER_RG; + break; + case 4: + ChannelOrder = UR_IMAGE_CHANNEL_ORDER_RGBA; + break; + } + if (pPropValue) { + ((ur_image_format_t *)pPropValue)->channelType = ChannelType; + ((ur_image_format_t *)pPropValue)->channelOrder = ChannelOrder; + } + if (pPropSizeRet) { + *pPropSizeRet = sizeof(ur_image_format_t); + } + return UR_RESULT_SUCCESS; + default: + return UR_RESULT_ERROR_INVALID_VALUE; + } +} + +UR_APIEXPORT ur_result_t UR_APICALL urBindlessImagesMipmapGetLevelExp( + ur_context_handle_t hContext, ur_device_handle_t hDevice, + ur_exp_image_mem_handle_t hImageMem, uint32_t mipmapLevel, + ur_exp_image_mem_handle_t *phImageMem) { + UR_ASSERT((hContext->getDevice()->get() == hDevice->get()), + UR_RESULT_ERROR_INVALID_CONTEXT); + + try { + ScopedContext Active(hDevice->getContext()); + CUarray ImageArray; + UR_CHECK_ERROR(cuMipmappedArrayGetLevel( + &ImageArray, (CUmipmappedArray)hImageMem, mipmapLevel)); + *phImageMem = (ur_exp_image_mem_handle_t)ImageArray; + } catch (ur_result_t Err) { + return Err; + } catch (...) 
{ + return UR_RESULT_ERROR_UNKNOWN; + } + + return UR_RESULT_SUCCESS; +} + +UR_APIEXPORT ur_result_t UR_APICALL urBindlessImagesMipmapFreeExp( + ur_context_handle_t hContext, ur_device_handle_t hDevice, + ur_exp_image_mem_handle_t hMem) { + UR_ASSERT((hContext->getDevice()->get() == hDevice->get()), + UR_RESULT_ERROR_INVALID_CONTEXT); + + ScopedContext Active(hDevice->getContext()); + try { + UR_CHECK_ERROR(cuMipmappedArrayDestroy((CUmipmappedArray)hMem)); + } catch (ur_result_t Err) { + return Err; + } catch (...) { + return UR_RESULT_ERROR_UNKNOWN; + } + return UR_RESULT_SUCCESS; +} + +UR_APIEXPORT ur_result_t UR_APICALL urBindlessImagesImportOpaqueFDExp( + ur_context_handle_t hContext, ur_device_handle_t hDevice, size_t size, + ur_exp_interop_mem_desc_t *pInteropMemDesc, + ur_exp_interop_mem_handle_t *phInteropMem) { + UR_ASSERT((hContext->getDevice()->get() == hDevice->get()), + UR_RESULT_ERROR_INVALID_CONTEXT); + + try { + ScopedContext Active(hDevice->getContext()); + + CUDA_EXTERNAL_MEMORY_HANDLE_DESC extMemDesc = {}; + extMemDesc.size = size; + + void *pNext = const_cast(pInteropMemDesc->pNext); + while (pNext != nullptr) { + const ur_base_desc_t *BaseDesc = + reinterpret_cast(pNext); + if (BaseDesc->stype == UR_STRUCTURE_TYPE_EXP_FILE_DESCRIPTOR) { + const ur_exp_file_descriptor_t *FileDescriptor = + reinterpret_cast(pNext); + + extMemDesc.handle.fd = FileDescriptor->fd; + extMemDesc.type = CU_EXTERNAL_MEMORY_HANDLE_TYPE_OPAQUE_FD; + } else if (BaseDesc->stype == UR_STRUCTURE_TYPE_EXP_WIN32_HANDLE) { + return UR_RESULT_ERROR_UNSUPPORTED_FEATURE; + } + pNext = const_cast(BaseDesc->pNext); + } + + CUexternalMemory extMem; + UR_CHECK_ERROR(cuImportExternalMemory(&extMem, &extMemDesc)); + *phInteropMem = (ur_exp_interop_mem_handle_t)extMem; + + } catch (ur_result_t Err) { + return Err; + } catch (...) { + return UR_RESULT_ERROR_UNKNOWN; + } + + return UR_RESULT_SUCCESS; +} + +UR_APIEXPORT ur_result_t UR_APICALL urBindlessImagesMapExternalArrayExp( + ur_context_handle_t hContext, ur_device_handle_t hDevice, + const ur_image_format_t *pImageFormat, const ur_image_desc_t *pImageDesc, + ur_exp_interop_mem_handle_t hInteropMem, + ur_exp_image_mem_handle_t *phImageMem) { + UR_ASSERT((hContext->getDevice()->get() == hDevice->get()), + UR_RESULT_ERROR_INVALID_CONTEXT); + + unsigned int NumChannels = 0; + UR_CHECK_ERROR( + urCalculateNumChannels(pImageFormat->channelOrder, &NumChannels)); + + CUarray_format format; + UR_CHECK_ERROR(urToCudaImageChannelFormat( + pImageFormat->channelType, pImageFormat->channelOrder, &format, nullptr)); + + try { + ScopedContext Active(hDevice->getContext()); + + CUDA_ARRAY3D_DESCRIPTOR ArrayDesc = {}; + ArrayDesc.Width = pImageDesc->width; + ArrayDesc.Height = pImageDesc->height; + ArrayDesc.Depth = pImageDesc->depth; + ArrayDesc.NumChannels = NumChannels; + ArrayDesc.Format = format; + + CUDA_EXTERNAL_MEMORY_MIPMAPPED_ARRAY_DESC mipmapDesc = {}; + mipmapDesc.numLevels = 1; + mipmapDesc.arrayDesc = ArrayDesc; + + CUmipmappedArray memMipMap; + UR_CHECK_ERROR(cuExternalMemoryGetMappedMipmappedArray( + &memMipMap, (CUexternalMemory)hInteropMem, &mipmapDesc)); + + CUarray memArray; + UR_CHECK_ERROR(cuMipmappedArrayGetLevel(&memArray, memMipMap, 0)); + + *phImageMem = (ur_exp_image_mem_handle_t)memArray; + + } catch (ur_result_t Err) { + return Err; + } catch (...) 
{ + return UR_RESULT_ERROR_UNKNOWN; + } + return UR_RESULT_SUCCESS; +} + +UR_APIEXPORT ur_result_t UR_APICALL urBindlessImagesReleaseInteropExp( + ur_context_handle_t hContext, ur_device_handle_t hDevice, + ur_exp_interop_mem_handle_t hInteropMem) { + UR_ASSERT((hContext->getDevice()->get() == hDevice->get()), + UR_RESULT_ERROR_INVALID_CONTEXT); + + try { + ScopedContext Active(hDevice->getContext()); + UR_CHECK_ERROR(cuDestroyExternalMemory((CUexternalMemory)hInteropMem)); + } catch (ur_result_t Err) { + return Err; + } catch (...) { + return UR_RESULT_ERROR_UNKNOWN; + } + return UR_RESULT_SUCCESS; +} + +UR_APIEXPORT ur_result_t UR_APICALL +urBindlessImagesImportExternalSemaphoreOpaqueFDExp( + ur_context_handle_t hContext, ur_device_handle_t hDevice, + ur_exp_interop_semaphore_desc_t *pInteropSemaphoreDesc, + ur_exp_interop_semaphore_handle_t *phInteropSemaphoreHandle) { + UR_ASSERT((hContext->getDevice()->get() == hDevice->get()), + UR_RESULT_ERROR_INVALID_CONTEXT); + + try { + ScopedContext Active(hDevice->getContext()); + + CUDA_EXTERNAL_SEMAPHORE_HANDLE_DESC extSemDesc = {}; + + void *pNext = const_cast(pInteropSemaphoreDesc->pNext); + while (pNext != nullptr) { + const ur_base_desc_t *BaseDesc = + reinterpret_cast(pNext); + if (BaseDesc->stype == UR_STRUCTURE_TYPE_EXP_FILE_DESCRIPTOR) { + const ur_exp_file_descriptor_t *FileDescriptor = + reinterpret_cast(pNext); + + extSemDesc.handle.fd = FileDescriptor->fd; + extSemDesc.type = CU_EXTERNAL_SEMAPHORE_HANDLE_TYPE_OPAQUE_FD; + } else if (BaseDesc->stype == UR_STRUCTURE_TYPE_EXP_WIN32_HANDLE) { + return UR_RESULT_ERROR_UNSUPPORTED_FEATURE; + } + pNext = const_cast(BaseDesc->pNext); + } + + CUexternalSemaphore semaphore; + UR_CHECK_ERROR(cuImportExternalSemaphore(&semaphore, &extSemDesc)); + + *phInteropSemaphoreHandle = (ur_exp_interop_semaphore_handle_t)semaphore; + } catch (ur_result_t Err) { + return Err; + } catch (...) { + return UR_RESULT_ERROR_UNKNOWN; + } + return UR_RESULT_SUCCESS; +} + +UR_APIEXPORT ur_result_t UR_APICALL urBindlessImagesDestroyExternalSemaphoreExp( + ur_context_handle_t hContext, ur_device_handle_t hDevice, + ur_exp_interop_semaphore_handle_t hInteropSemaphore) { + UR_ASSERT((hContext->getDevice()->get() == hDevice->get()), + UR_RESULT_ERROR_INVALID_CONTEXT); + + try { + ScopedContext Active(hDevice->getContext()); + UR_CHECK_ERROR( + cuDestroyExternalSemaphore((CUexternalSemaphore)hInteropSemaphore)); + } catch (ur_result_t Err) { + return Err; + } catch (...) { + return UR_RESULT_ERROR_UNKNOWN; + } + return UR_RESULT_SUCCESS; +} + +UR_APIEXPORT ur_result_t UR_APICALL urBindlessImagesWaitExternalSemaphoreExp( + ur_queue_handle_t hQueue, ur_exp_interop_semaphore_handle_t hSemaphore, + uint32_t numEventsInWaitList, const ur_event_handle_t *phEventWaitList, + ur_event_handle_t *phEvent) { + + try { + ScopedContext Active(hQueue->getContext()); + CUstream Stream = hQueue->getNextTransferStream(); + + enqueueEventsWait(hQueue, Stream, numEventsInWaitList, phEventWaitList); + + CUDA_EXTERNAL_SEMAPHORE_WAIT_PARAMS SemWaitParams = {}; + + // Wait for one external semaphore + UR_CHECK_ERROR(cuWaitExternalSemaphoresAsync( + (CUexternalSemaphore *)&hSemaphore, &SemWaitParams, 1 /* numExtSems */, + Stream)); + + if (phEvent) { + auto NewEvent = ur_event_handle_t_::makeNative( + UR_COMMAND_INTEROP_SEMAPHORE_WAIT_EXP, hQueue, Stream); + NewEvent->record(); + *phEvent = NewEvent; + } + } catch (ur_result_t Err) { + return Err; + } catch (...) 
{ + return UR_RESULT_ERROR_UNKNOWN; + } + return UR_RESULT_SUCCESS; +} + +UR_APIEXPORT ur_result_t UR_APICALL urBindlessImagesSignalExternalSemaphoreExp( + ur_queue_handle_t hQueue, ur_exp_interop_semaphore_handle_t hSemaphore, + uint32_t numEventsInWaitList, const ur_event_handle_t *phEventWaitList, + ur_event_handle_t *phEvent) { + + try { + ScopedContext Active(hQueue->getContext()); + CUstream Stream = hQueue->getNextTransferStream(); + + enqueueEventsWait(hQueue, Stream, numEventsInWaitList, phEventWaitList); + + CUDA_EXTERNAL_SEMAPHORE_SIGNAL_PARAMS SemSignalParams = {}; + + // Signal one external semaphore + UR_CHECK_ERROR(cuSignalExternalSemaphoresAsync( + (CUexternalSemaphore *)&hSemaphore, &SemSignalParams, + 1 /* numExtSems */, Stream)); + + if (phEvent) { + auto NewEvent = ur_event_handle_t_::makeNative( + UR_COMMAND_INTEROP_SEMAPHORE_SIGNAL_EXP, hQueue, Stream); + NewEvent->record(); + *phEvent = NewEvent; + } + } catch (ur_result_t Err) { + return Err; + } catch (...) { + return UR_RESULT_ERROR_UNKNOWN; + } + return UR_RESULT_SUCCESS; +} diff --git a/source/adapters/cuda/image.hpp b/source/adapters/cuda/image.hpp new file mode 100644 index 0000000000..af1d9fd194 --- /dev/null +++ b/source/adapters/cuda/image.hpp @@ -0,0 +1,32 @@ +//===--------- image.hpp - CUDA Adapter -----------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +#pragma once + +#include +#include +#include + +#include "common.hpp" +ur_result_t urCalculateNumChannels(ur_image_channel_order_t order, + unsigned int *num_channels); + +ur_result_t +urToCudaImageChannelFormat(ur_image_channel_type_t image_channel_type, + ur_image_channel_order_t image_channel_order, + CUarray_format *return_cuda_format, + size_t *return_pixel_types_size_bytes); + +ur_result_t +cudaToUrImageChannelFormat(CUarray_format cuda_format, + ur_image_channel_type_t *return_image_channel_type); + +ur_result_t urTextureCreate(ur_context_handle_t hContext, + ur_sampler_desc_t SamplerDesc, + const ur_image_desc_t *pImageDesc, + CUDA_RESOURCE_DESC ResourceDesc, + ur_exp_image_handle_t *phRetImage); diff --git a/source/adapters/cuda/kernel.cpp b/source/adapters/cuda/kernel.cpp new file mode 100644 index 0000000000..e2fa09e4dd --- /dev/null +++ b/source/adapters/cuda/kernel.cpp @@ -0,0 +1,374 @@ +//===--------- kernel.cpp - CUDA Adapter ----------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "kernel.hpp" +#include "memory.hpp" +#include "sampler.hpp" + +UR_APIEXPORT ur_result_t UR_APICALL +urKernelCreate(ur_program_handle_t hProgram, const char *pKernelName, + ur_kernel_handle_t *phKernel) { + ur_result_t Result = UR_RESULT_SUCCESS; + std::unique_ptr Kernel{nullptr}; + + try { + ScopedContext Active(hProgram->getContext()); + + CUfunction CuFunc; + CUresult FunctionResult = + cuModuleGetFunction(&CuFunc, hProgram->get(), pKernelName); + + // We can't add this as a generic mapping in UR_CHECK_ERROR since cuda's + // NOT_FOUND error applies to more than just functions. 
+ if (FunctionResult == CUDA_ERROR_NOT_FOUND) { + throw UR_RESULT_ERROR_INVALID_KERNEL_NAME; + } else { + UR_CHECK_ERROR(FunctionResult); + } + + std::string KernelNameWithOffset = + std::string(pKernelName) + "_with_offset"; + CUfunction CuFuncWithOffsetParam; + CUresult OffsetRes = cuModuleGetFunction( + &CuFuncWithOffsetParam, hProgram->get(), KernelNameWithOffset.c_str()); + + // If there is no kernel with global offset parameter we mark it as missing + if (OffsetRes == CUDA_ERROR_NOT_FOUND) { + CuFuncWithOffsetParam = nullptr; + } else { + UR_CHECK_ERROR(OffsetRes); + } + Kernel = std::unique_ptr( + new ur_kernel_handle_t_{CuFunc, CuFuncWithOffsetParam, pKernelName, + hProgram, hProgram->getContext()}); + } catch (ur_result_t Err) { + Result = Err; + } catch (...) { + Result = UR_RESULT_ERROR_OUT_OF_HOST_MEMORY; + } + + *phKernel = Kernel.release(); + return Result; +} + +UR_APIEXPORT ur_result_t UR_APICALL +urKernelGetGroupInfo(ur_kernel_handle_t hKernel, ur_device_handle_t hDevice, + ur_kernel_group_info_t propName, size_t propSize, + void *pPropValue, size_t *pPropSizeRet) { + UrReturnHelper ReturnValue(propSize, pPropValue, pPropSizeRet); + + switch (propName) { + case UR_KERNEL_GROUP_INFO_GLOBAL_WORK_SIZE: { + size_t GlobalWorkSize[3] = {0, 0, 0}; + + int MaxBlockDimX{0}, MaxBlockDimY{0}, MaxBlockDimZ{0}; + UR_CHECK_ERROR(cuDeviceGetAttribute( + &MaxBlockDimX, CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_X, hDevice->get())); + UR_CHECK_ERROR(cuDeviceGetAttribute( + &MaxBlockDimY, CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_Y, hDevice->get())); + UR_CHECK_ERROR(cuDeviceGetAttribute( + &MaxBlockDimZ, CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_Z, hDevice->get())); + + int MaxGridDimX{0}, MaxGridDimY{0}, MaxGridDimZ{0}; + UR_CHECK_ERROR(cuDeviceGetAttribute( + &MaxGridDimX, CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_X, hDevice->get())); + UR_CHECK_ERROR(cuDeviceGetAttribute( + &MaxGridDimY, CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_Y, hDevice->get())); + UR_CHECK_ERROR(cuDeviceGetAttribute( + &MaxGridDimZ, CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_Z, hDevice->get())); + + GlobalWorkSize[0] = MaxBlockDimX * MaxGridDimX; + GlobalWorkSize[1] = MaxBlockDimY * MaxGridDimY; + GlobalWorkSize[2] = MaxBlockDimZ * MaxGridDimZ; + return ReturnValue(GlobalWorkSize, 3); + } + case UR_KERNEL_GROUP_INFO_WORK_GROUP_SIZE: { + int MaxThreads = 0; + UR_CHECK_ERROR(cuFuncGetAttribute( + &MaxThreads, CU_FUNC_ATTRIBUTE_MAX_THREADS_PER_BLOCK, hKernel->get())); + return ReturnValue(size_t(MaxThreads)); + } + case UR_KERNEL_GROUP_INFO_COMPILE_WORK_GROUP_SIZE: { + size_t GroupSize[3] = {0, 0, 0}; + const auto &ReqdWGSizeMDMap = + hKernel->get_program()->KernelReqdWorkGroupSizeMD; + const auto ReqdWGSizeMD = ReqdWGSizeMDMap.find(hKernel->getName()); + if (ReqdWGSizeMD != ReqdWGSizeMDMap.end()) { + const auto ReqdWGSize = ReqdWGSizeMD->second; + GroupSize[0] = std::get<0>(ReqdWGSize); + GroupSize[1] = std::get<1>(ReqdWGSize); + GroupSize[2] = std::get<2>(ReqdWGSize); + } + return ReturnValue(GroupSize, 3); + } + case UR_KERNEL_GROUP_INFO_LOCAL_MEM_SIZE: { + // OpenCL LOCAL == CUDA SHARED + int Bytes = 0; + UR_CHECK_ERROR(cuFuncGetAttribute( + &Bytes, CU_FUNC_ATTRIBUTE_SHARED_SIZE_BYTES, hKernel->get())); + return ReturnValue(uint64_t(Bytes)); + } + case UR_KERNEL_GROUP_INFO_PREFERRED_WORK_GROUP_SIZE_MULTIPLE: { + // Work groups should be multiples of the warp size + int WarpSize = 0; + UR_CHECK_ERROR(cuDeviceGetAttribute( + &WarpSize, CU_DEVICE_ATTRIBUTE_WARP_SIZE, hDevice->get())); + return ReturnValue(static_cast(WarpSize)); + } + case 
UR_KERNEL_GROUP_INFO_PRIVATE_MEM_SIZE: { + // OpenCL PRIVATE == CUDA LOCAL + int Bytes = 0; + UR_CHECK_ERROR(cuFuncGetAttribute( + &Bytes, CU_FUNC_ATTRIBUTE_LOCAL_SIZE_BYTES, hKernel->get())); + return ReturnValue(uint64_t(Bytes)); + } + default: + break; + } + + return UR_RESULT_ERROR_INVALID_ENUMERATION; +} + +UR_APIEXPORT ur_result_t UR_APICALL urKernelRetain(ur_kernel_handle_t hKernel) { + UR_ASSERT(hKernel->getReferenceCount() > 0u, UR_RESULT_ERROR_INVALID_KERNEL); + + hKernel->incrementReferenceCount(); + return UR_RESULT_SUCCESS; +} + +UR_APIEXPORT ur_result_t UR_APICALL +urKernelRelease(ur_kernel_handle_t hKernel) { + // double delete or someone is messing with the ref count. + // either way, cannot safely proceed. + UR_ASSERT(hKernel->getReferenceCount() != 0, UR_RESULT_ERROR_INVALID_KERNEL); + + // decrement ref count. If it is 0, delete the program. + if (hKernel->decrementReferenceCount() == 0) { + // no internal cuda resources to clean up. Just delete it. + delete hKernel; + return UR_RESULT_SUCCESS; + } + + return UR_RESULT_SUCCESS; +} + +// TODO(ur): Not implemented on cuda atm. Also, need to add tests for this +// feature. +UR_APIEXPORT ur_result_t UR_APICALL urKernelGetNativeHandle( + ur_kernel_handle_t hKernel, ur_native_handle_t *phNativeKernel) { + (void)hKernel; + (void)phNativeKernel; + + return UR_RESULT_ERROR_UNSUPPORTED_FEATURE; +} + +UR_APIEXPORT ur_result_t UR_APICALL urKernelSetArgValue( + ur_kernel_handle_t hKernel, uint32_t argIndex, size_t argSize, + const ur_kernel_arg_value_properties_t *pProperties, + const void *pArgValue) { + std::ignore = pProperties; + UR_ASSERT(argSize, UR_RESULT_ERROR_INVALID_KERNEL_ARGUMENT_SIZE); + + ur_result_t Result = UR_RESULT_SUCCESS; + try { + hKernel->setKernelArg(argIndex, argSize, pArgValue); + } catch (ur_result_t Err) { + Result = Err; + } + return Result; +} + +UR_APIEXPORT ur_result_t UR_APICALL urKernelSetArgLocal( + ur_kernel_handle_t hKernel, uint32_t argIndex, size_t argSize, + const ur_kernel_arg_local_properties_t *pProperties) { + std::ignore = pProperties; + UR_ASSERT(argSize, UR_RESULT_ERROR_INVALID_KERNEL_ARGUMENT_SIZE); + + ur_result_t Result = UR_RESULT_SUCCESS; + try { + hKernel->setKernelLocalArg(argIndex, argSize); + } catch (ur_result_t Err) { + Result = Err; + } + return Result; +} + +UR_APIEXPORT ur_result_t UR_APICALL urKernelGetInfo(ur_kernel_handle_t hKernel, + ur_kernel_info_t propName, + size_t propSize, + void *pKernelInfo, + size_t *pPropSizeRet) { + UrReturnHelper ReturnValue(propSize, pKernelInfo, pPropSizeRet); + + switch (propName) { + case UR_KERNEL_INFO_FUNCTION_NAME: + return ReturnValue(hKernel->getName()); + case UR_KERNEL_INFO_NUM_ARGS: + return ReturnValue(hKernel->getNumArgs()); + case UR_KERNEL_INFO_REFERENCE_COUNT: + return ReturnValue(hKernel->getReferenceCount()); + case UR_KERNEL_INFO_CONTEXT: + return ReturnValue(hKernel->getContext()); + case UR_KERNEL_INFO_PROGRAM: + return ReturnValue(hKernel->get_program()); + case UR_KERNEL_INFO_ATTRIBUTES: + return ReturnValue(""); + case UR_KERNEL_INFO_NUM_REGS: { + int NumRegs = 0; + UR_CHECK_ERROR(cuFuncGetAttribute(&NumRegs, CU_FUNC_ATTRIBUTE_NUM_REGS, + hKernel->get())); + return ReturnValue(static_cast(NumRegs)); + } + default: + break; + } + + return UR_RESULT_ERROR_INVALID_ENUMERATION; +} + +UR_APIEXPORT ur_result_t UR_APICALL +urKernelGetSubGroupInfo(ur_kernel_handle_t hKernel, ur_device_handle_t hDevice, + ur_kernel_sub_group_info_t propName, size_t propSize, + void *pPropValue, size_t *pPropSizeRet) { + UrReturnHelper 
ReturnValue(propSize, pPropValue, pPropSizeRet); + switch (propName) { + case UR_KERNEL_SUB_GROUP_INFO_MAX_SUB_GROUP_SIZE: { + // Sub-group size is equivalent to warp size + int WarpSize = 0; + UR_CHECK_ERROR(cuDeviceGetAttribute( + &WarpSize, CU_DEVICE_ATTRIBUTE_WARP_SIZE, hDevice->get())); + return ReturnValue(static_cast(WarpSize)); + } + case UR_KERNEL_SUB_GROUP_INFO_MAX_NUM_SUB_GROUPS: { + // Number of sub-groups = max block size / warp size + possible remainder + int MaxThreads = 0; + UR_CHECK_ERROR(cuFuncGetAttribute( + &MaxThreads, CU_FUNC_ATTRIBUTE_MAX_THREADS_PER_BLOCK, hKernel->get())); + int WarpSize = 0; + urKernelGetSubGroupInfo(hKernel, hDevice, + UR_KERNEL_SUB_GROUP_INFO_MAX_SUB_GROUP_SIZE, + sizeof(uint32_t), &WarpSize, nullptr); + int MaxWarps = (MaxThreads + WarpSize - 1) / WarpSize; + return ReturnValue(static_cast(MaxWarps)); + } + case UR_KERNEL_SUB_GROUP_INFO_COMPILE_NUM_SUB_GROUPS: { + // Return value of 0 => not specified + // TODO: Revisit if PTX is generated for compile-time work-group sizes + return ReturnValue(0); + } + case UR_KERNEL_SUB_GROUP_INFO_SUB_GROUP_SIZE_INTEL: { + // Return value of 0 => unspecified or "auto" sub-group size + // Correct for now, since warp size may be read from special register + // TODO: Return warp size once default is primary sub-group size + // TODO: Revisit if we can recover [[sub_group_size]] attribute from PTX + return ReturnValue(0); + } + default: + break; + } + + return UR_RESULT_ERROR_INVALID_ENUMERATION; +} + +UR_APIEXPORT ur_result_t UR_APICALL +urKernelSetArgPointer(ur_kernel_handle_t hKernel, uint32_t argIndex, + const ur_kernel_arg_pointer_properties_t *pProperties, + const void *pArgValue) { + std::ignore = pProperties; + hKernel->setKernelArg(argIndex, sizeof(pArgValue), pArgValue); + return UR_RESULT_SUCCESS; +} + +UR_APIEXPORT ur_result_t UR_APICALL +urKernelSetArgMemObj(ur_kernel_handle_t hKernel, uint32_t argIndex, + const ur_kernel_arg_mem_obj_properties_t *Properties, + ur_mem_handle_t hArgValue) { + std::ignore = Properties; + + // Below sets kernel arg when zero-sized buffers are handled. + // In such case the corresponding memory is null. 
+ if (hArgValue == nullptr) { + hKernel->setKernelArg(argIndex, 0, nullptr); + return UR_RESULT_SUCCESS; + } + + ur_result_t Result = UR_RESULT_SUCCESS; + try { + if (hArgValue->MemType == ur_mem_handle_t_::Type::Surface) { + CUDA_ARRAY3D_DESCRIPTOR arrayDesc; + UR_CHECK_ERROR(cuArray3DGetDescriptor( + &arrayDesc, hArgValue->Mem.SurfaceMem.getArray())); + if (arrayDesc.Format != CU_AD_FORMAT_UNSIGNED_INT32 && + arrayDesc.Format != CU_AD_FORMAT_SIGNED_INT32 && + arrayDesc.Format != CU_AD_FORMAT_HALF && + arrayDesc.Format != CU_AD_FORMAT_FLOAT) { + setErrorMessage("PI CUDA kernels only support images with channel " + "types int32, uint32, float, and half.", + UR_RESULT_ERROR_ADAPTER_SPECIFIC); + return UR_RESULT_ERROR_ADAPTER_SPECIFIC; + } + CUsurfObject CuSurf = hArgValue->Mem.SurfaceMem.getSurface(); + hKernel->setKernelArg(argIndex, sizeof(CuSurf), (void *)&CuSurf); + } else { + CUdeviceptr CuPtr = hArgValue->Mem.BufferMem.get(); + hKernel->setKernelArg(argIndex, sizeof(CUdeviceptr), (void *)&CuPtr); + } + } catch (ur_result_t Err) { + Result = Err; + } + return Result; +} + +// A NOP for the CUDA backend +UR_APIEXPORT ur_result_t UR_APICALL urKernelSetExecInfo( + ur_kernel_handle_t hKernel, ur_kernel_exec_info_t propName, size_t propSize, + const ur_kernel_exec_info_properties_t *pProperties, + const void *pPropValue) { + std::ignore = hKernel; + std::ignore = propSize; + std::ignore = pPropValue; + std::ignore = pProperties; + + switch (propName) { + case UR_KERNEL_EXEC_INFO_USM_INDIRECT_ACCESS: + case UR_KERNEL_EXEC_INFO_USM_PTRS: + case UR_KERNEL_EXEC_INFO_CACHE_CONFIG: + return UR_RESULT_SUCCESS; + default: + return UR_RESULT_ERROR_INVALID_ENUMERATION; + } +} + +UR_APIEXPORT ur_result_t UR_APICALL urKernelCreateWithNativeHandle( + ur_native_handle_t hNativeKernel, ur_context_handle_t hContext, + ur_program_handle_t hProgram, + const ur_kernel_native_properties_t *pProperties, + ur_kernel_handle_t *phKernel) { + std::ignore = hNativeKernel; + std::ignore = hContext; + std::ignore = hProgram; + std::ignore = pProperties; + std::ignore = phKernel; + return UR_RESULT_ERROR_UNSUPPORTED_FEATURE; +} + +UR_APIEXPORT ur_result_t UR_APICALL +urKernelSetArgSampler(ur_kernel_handle_t hKernel, uint32_t argIndex, + const ur_kernel_arg_sampler_properties_t *pProperties, + ur_sampler_handle_t hArgValue) { + std::ignore = pProperties; + + ur_result_t Result = UR_RESULT_SUCCESS; + try { + uint32_t SamplerProps = hArgValue->Props; + hKernel->setKernelArg(argIndex, sizeof(uint32_t), (void *)&SamplerProps); + } catch (ur_result_t Err) { + Result = Err; + } + return Result; +} diff --git a/source/adapters/cuda/kernel.hpp b/source/adapters/cuda/kernel.hpp new file mode 100644 index 0000000000..ea4e565d3f --- /dev/null +++ b/source/adapters/cuda/kernel.hpp @@ -0,0 +1,206 @@ +//===--------- kernel.hpp - CUDA Adapter ----------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +#pragma once + +#include +#include + +#include +#include +#include +#include + +#include "program.hpp" + +/// Implementation of a UR Kernel for CUDA +/// +/// UR Kernels are used to set kernel arguments, +/// creating a state on the Kernel object for a given +/// invocation. 
This is not the case of CUFunction objects,
+/// which are simply passed together with the arguments on the invocation.
+/// The UR Kernel implementation for CUDA stores the list of arguments,
+/// argument sizes, and offsets to emulate the interface of UR Kernel,
+/// saving the arguments for the later dispatch.
+/// Note that in the UR API, local memory is specified as a size per
+/// individual argument, but in CUDA only the total usage of shared
+/// memory is required since it is not passed as a parameter.
+/// A compiler pass converts the UR API local memory model into the
+/// CUDA shared model. This object simply calculates the total amount of
+/// shared memory and the initial offset of each parameter.
+struct ur_kernel_handle_t_ {
+  using native_type = CUfunction;
+
+  native_type Function;
+  native_type FunctionWithOffsetParam;
+  std::string Name;
+  ur_context_handle_t Context;
+  ur_program_handle_t Program;
+  std::atomic_uint32_t RefCount;
+
+  static constexpr uint32_t ReqdThreadsPerBlockDimensions = 3u;
+  size_t ReqdThreadsPerBlock[ReqdThreadsPerBlockDimensions];
+  int RegsPerThread{0};
+
+  /// Structure that holds the arguments to the kernel.
+  /// Note each argument size is known, since it comes
+  /// from the kernel signature.
+  /// This is not something that can be queried from the CUDA API,
+  /// so there is a hard-coded maximum size (\ref MaxParamBytes)
+  /// and a fixed-size storage array.
+  struct arguments {
+    static constexpr size_t MaxParamBytes = 4000u;
+    using args_t = std::array<char, MaxParamBytes>;
+    using args_size_t = std::vector<size_t>;
+    using args_index_t = std::vector<void *>;
+    args_t Storage;
+    args_size_t ParamSizes;
+    args_index_t Indices;
+    args_size_t OffsetPerIndex;
+
+    std::uint32_t ImplicitOffsetArgs[3] = {0, 0, 0};
+
+    arguments() {
+      // Place the implicit offset index at the end of the indices collection
+      Indices.emplace_back(&ImplicitOffsetArgs);
+    }
+
+    /// Add an argument to the kernel.
+    /// If the argument existed before, it is replaced; otherwise it is added.
+    /// Gaps are filled with empty arguments.
+    /// The implicit offset argument is kept at the back of the indices
+    /// collection.
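+    /// For example, addArg(0, 4, &A) followed by addArg(1, 8, &B) leaves
+    /// Indices holding {&Storage[0], &Storage[4], &ImplicitOffsetArgs}.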
+ void addArg(size_t Index, size_t Size, const void *Arg, + size_t LocalSize = 0) { + if (Index + 2 > Indices.size()) { + // Move implicit offset argument index with the end + Indices.resize(Index + 2, Indices.back()); + // Ensure enough space for the new argument + ParamSizes.resize(Index + 1); + OffsetPerIndex.resize(Index + 1); + } + ParamSizes[Index] = Size; + // calculate the insertion point on the array + size_t InsertPos = std::accumulate(std::begin(ParamSizes), + std::begin(ParamSizes) + Index, 0); + // Update the stored value for the argument + std::memcpy(&Storage[InsertPos], Arg, Size); + Indices[Index] = &Storage[InsertPos]; + OffsetPerIndex[Index] = LocalSize; + } + + void addLocalArg(size_t Index, size_t Size) { + size_t LocalOffset = this->getLocalSize(); + + // maximum required alignment is the size of the largest vector type + const size_t MaxAlignment = sizeof(double) * 16; + + // for arguments smaller than the maximum alignment simply align to the + // size of the argument + const size_t Alignment = std::min(MaxAlignment, Size); + + // align the argument + size_t AlignedLocalOffset = LocalOffset; + size_t Pad = LocalOffset % Alignment; + if (Pad != 0) { + AlignedLocalOffset += Alignment - Pad; + } + + addArg(Index, sizeof(size_t), (const void *)&(AlignedLocalOffset), + Size + (AlignedLocalOffset - LocalOffset)); + } + + void setImplicitOffset(size_t Size, std::uint32_t *ImplicitOffset) { + assert(Size == sizeof(std::uint32_t) * 3); + std::memcpy(ImplicitOffsetArgs, ImplicitOffset, Size); + } + + void clearLocalSize() { + std::fill(std::begin(OffsetPerIndex), std::end(OffsetPerIndex), 0); + } + + const args_index_t &getIndices() const noexcept { return Indices; } + + uint32_t getLocalSize() const { + return std::accumulate(std::begin(OffsetPerIndex), + std::end(OffsetPerIndex), 0); + } + } Args; + + ur_kernel_handle_t_(CUfunction Func, CUfunction FuncWithOffsetParam, + const char *Name, ur_program_handle_t Program, + ur_context_handle_t Context) + : Function{Func}, FunctionWithOffsetParam{FuncWithOffsetParam}, + Name{Name}, Context{Context}, Program{Program}, RefCount{1} { + urProgramRetain(Program); + urContextRetain(Context); + /// Note: this code assumes that there is only one device per context + ur_result_t RetError = urKernelGetGroupInfo( + this, Context->getDevice(), + UR_KERNEL_GROUP_INFO_COMPILE_WORK_GROUP_SIZE, + sizeof(ReqdThreadsPerBlock), ReqdThreadsPerBlock, nullptr); + (void)RetError; + assert(RetError == UR_RESULT_SUCCESS); + UR_CHECK_ERROR( + cuFuncGetAttribute(&RegsPerThread, CU_FUNC_ATTRIBUTE_NUM_REGS, Func)); + } + + ~ur_kernel_handle_t_() { + urProgramRelease(Program); + urContextRelease(Context); + } + + ur_program_handle_t get_program() const noexcept { return Program; } + + uint32_t incrementReferenceCount() noexcept { return ++RefCount; } + + uint32_t decrementReferenceCount() noexcept { return --RefCount; } + + uint32_t getReferenceCount() const noexcept { return RefCount; } + + native_type get() const noexcept { return Function; }; + + native_type get_with_offset_parameter() const noexcept { + return FunctionWithOffsetParam; + }; + + bool has_with_offset_parameter() const noexcept { + return FunctionWithOffsetParam != nullptr; + } + + ur_context_handle_t getContext() const noexcept { return Context; }; + + const char *getName() const noexcept { return Name.c_str(); } + + /// Get the number of kernel arguments, excluding the implicit global offset. 
+ /// Note this only returns the current known number of arguments, not the + /// real one required by the kernel, since this cannot be queried from + /// the CUDA Driver API + size_t getNumArgs() const noexcept { return Args.Indices.size() - 1; } + + void setKernelArg(int Index, size_t Size, const void *Arg) { + Args.addArg(Index, Size, Arg); + } + + void setKernelLocalArg(int Index, size_t Size) { + Args.addLocalArg(Index, Size); + } + + void setImplicitOffsetArg(size_t Size, std::uint32_t *ImplicitOffset) { + return Args.setImplicitOffset(Size, ImplicitOffset); + } + + const arguments::args_index_t &getArgIndices() const { + return Args.getIndices(); + } + + uint32_t getLocalSize() const noexcept { return Args.getLocalSize(); } + + void clearLocalSize() { Args.clearLocalSize(); } + + size_t getRegsPerThread() const noexcept { return RegsPerThread; }; +}; diff --git a/source/adapters/cuda/memory.cpp b/source/adapters/cuda/memory.cpp new file mode 100644 index 0000000000..d51ba73d67 --- /dev/null +++ b/source/adapters/cuda/memory.cpp @@ -0,0 +1,479 @@ +//===--------- memory.cpp - CUDA Adapter ----------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include + +#include "common.hpp" +#include "context.hpp" +#include "memory.hpp" + +/// Creates a UR Memory object using a CUDA memory allocation. +/// Can trigger a manual copy depending on the mode. +/// \TODO Implement USE_HOST_PTR using cuHostRegister - See #9789 +/// +UR_APIEXPORT ur_result_t UR_APICALL urMemBufferCreate( + ur_context_handle_t hContext, ur_mem_flags_t flags, size_t size, + const ur_buffer_properties_t *pProperties, ur_mem_handle_t *phBuffer) { + // Validate flags + if (flags & + (UR_MEM_FLAG_USE_HOST_POINTER | UR_MEM_FLAG_ALLOC_COPY_HOST_POINTER)) { + UR_ASSERT(pProperties && pProperties->pHost, + UR_RESULT_ERROR_INVALID_HOST_PTR); + } + UR_ASSERT(size != 0, UR_RESULT_ERROR_INVALID_BUFFER_SIZE); + + // Currently, USE_HOST_PTR is not implemented using host register + // since this triggers a weird segfault after program ends. + // Setting this constant to true enables testing that behavior. + const bool EnableUseHostPtr = false; + const bool PerformInitialCopy = + (flags & UR_MEM_FLAG_ALLOC_COPY_HOST_POINTER) || + ((flags & UR_MEM_FLAG_USE_HOST_POINTER) && !EnableUseHostPtr); + ur_result_t Result = UR_RESULT_SUCCESS; + ur_mem_handle_t MemObj = nullptr; + + try { + ScopedContext Active(hContext); + CUdeviceptr Ptr = 0; + auto HostPtr = pProperties ? 
pProperties->pHost : nullptr; + + ur_mem_handle_t_::MemImpl::BufferMem::AllocMode AllocMode = + ur_mem_handle_t_::MemImpl::BufferMem::AllocMode::Classic; + + if ((flags & UR_MEM_FLAG_USE_HOST_POINTER) && EnableUseHostPtr) { + UR_CHECK_ERROR( + cuMemHostRegister(HostPtr, size, CU_MEMHOSTREGISTER_DEVICEMAP)); + UR_CHECK_ERROR(cuMemHostGetDevicePointer(&Ptr, HostPtr, 0)); + AllocMode = ur_mem_handle_t_::MemImpl::BufferMem::AllocMode::UseHostPtr; + } else if (flags & UR_MEM_FLAG_ALLOC_HOST_POINTER) { + UR_CHECK_ERROR(cuMemAllocHost(&HostPtr, size)); + UR_CHECK_ERROR(cuMemHostGetDevicePointer(&Ptr, HostPtr, 0)); + AllocMode = ur_mem_handle_t_::MemImpl::BufferMem::AllocMode::AllocHostPtr; + } else { + UR_CHECK_ERROR(cuMemAlloc(&Ptr, size)); + if (flags & UR_MEM_FLAG_ALLOC_COPY_HOST_POINTER) { + AllocMode = ur_mem_handle_t_::MemImpl::BufferMem::AllocMode::CopyIn; + } + } + + ur_mem_handle_t parentBuffer = nullptr; + + auto URMemObj = std::unique_ptr(new ur_mem_handle_t_{ + hContext, parentBuffer, flags, AllocMode, Ptr, HostPtr, size}); + if (URMemObj != nullptr) { + MemObj = URMemObj.release(); + if (PerformInitialCopy) { + // Operates on the default stream of the current CUDA context. + UR_CHECK_ERROR(cuMemcpyHtoD(Ptr, HostPtr, size)); + // Synchronize with default stream implicitly used by cuMemcpyHtoD + // to make buffer data available on device before any other UR call + // uses it. + CUstream defaultStream = 0; + UR_CHECK_ERROR(cuStreamSynchronize(defaultStream)); + } + } else { + Result = UR_RESULT_ERROR_OUT_OF_HOST_MEMORY; + } + } catch (ur_result_t Err) { + Result = Err; + } catch (...) { + Result = UR_RESULT_ERROR_OUT_OF_RESOURCES; + } + + *phBuffer = MemObj; + + return Result; +} + +UR_APIEXPORT ur_result_t UR_APICALL urMemRetain(ur_mem_handle_t hMem) { + UR_ASSERT(hMem->getReferenceCount() > 0, UR_RESULT_ERROR_INVALID_MEM_OBJECT); + hMem->incrementReferenceCount(); + return UR_RESULT_SUCCESS; +} + +/// Decreases the reference count of the Mem object. +/// If this is zero, calls the relevant CUDA Free function +/// \return UR_RESULT_SUCCESS unless deallocation error +UR_APIEXPORT ur_result_t UR_APICALL urMemRelease(ur_mem_handle_t hMem) { + ur_result_t Result = UR_RESULT_SUCCESS; + + try { + + // Do nothing if there are other references + if (hMem->decrementReferenceCount() > 0) { + return UR_RESULT_SUCCESS; + } + + // make sure hMem is released in case checkErrorUR throws + std::unique_ptr MemObjPtr(hMem); + + if (hMem->isSubBuffer()) { + return UR_RESULT_SUCCESS; + } + + ScopedContext Active(MemObjPtr->getContext()); + + if (hMem->MemType == ur_mem_handle_t_::Type::Buffer) { + switch (MemObjPtr->Mem.BufferMem.MemAllocMode) { + case ur_mem_handle_t_::MemImpl::BufferMem::AllocMode::CopyIn: + case ur_mem_handle_t_::MemImpl::BufferMem::AllocMode::Classic: + UR_CHECK_ERROR(cuMemFree(MemObjPtr->Mem.BufferMem.Ptr)); + break; + case ur_mem_handle_t_::MemImpl::BufferMem::AllocMode::UseHostPtr: + UR_CHECK_ERROR(cuMemHostUnregister(MemObjPtr->Mem.BufferMem.HostPtr)); + break; + case ur_mem_handle_t_::MemImpl::BufferMem::AllocMode::AllocHostPtr: + UR_CHECK_ERROR(cuMemFreeHost(MemObjPtr->Mem.BufferMem.HostPtr)); + }; + } else if (hMem->MemType == ur_mem_handle_t_::Type::Surface) { + UR_CHECK_ERROR( + cuSurfObjectDestroy(MemObjPtr->Mem.SurfaceMem.getSurface())); + UR_CHECK_ERROR(cuArrayDestroy(MemObjPtr->Mem.SurfaceMem.getArray())); + } + + } catch (ur_result_t Err) { + Result = Err; + } catch (...) 
{ + Result = UR_RESULT_ERROR_OUT_OF_RESOURCES; + } + + if (Result != UR_RESULT_SUCCESS) { + // A reported CUDA error is either an implementation or an asynchronous CUDA + // error for which it is unclear if the function that reported it succeeded + // or not. Either way, the state of the program is compromised and likely + // unrecoverable. + detail::ur::die("Unrecoverable program state reached in urMemRelease"); + } + + return UR_RESULT_SUCCESS; +} + +/// Gets the native CUDA handle of a UR mem object +/// +/// \param[in] hMem The UR mem to get the native CUDA object of. +/// \param[out] phNativeMem Set to the native handle of the UR mem object. +/// +/// \return UR_RESULT_SUCCESS +UR_APIEXPORT ur_result_t UR_APICALL +urMemGetNativeHandle(ur_mem_handle_t hMem, ur_native_handle_t *phNativeMem) { + *phNativeMem = + reinterpret_cast(hMem->Mem.BufferMem.get()); + return UR_RESULT_SUCCESS; +} + +UR_APIEXPORT ur_result_t UR_APICALL urMemGetInfo(ur_mem_handle_t hMemory, + ur_mem_info_t MemInfoType, + size_t propSize, + void *pMemInfo, + size_t *pPropSizeRet) { + UR_ASSERT(hMemory->isBuffer(), UR_RESULT_ERROR_INVALID_MEM_OBJECT); + + UrReturnHelper ReturnValue(propSize, pMemInfo, pPropSizeRet); + + ScopedContext Active(hMemory->getContext()); + + switch (MemInfoType) { + case UR_MEM_INFO_SIZE: { + try { + size_t AllocSize = 0; + UR_CHECK_ERROR(cuMemGetAddressRange(nullptr, &AllocSize, + hMemory->Mem.BufferMem.Ptr)); + return ReturnValue(AllocSize); + } catch (ur_result_t Err) { + return Err; + } catch (...) { + return UR_RESULT_ERROR_UNKNOWN; + } + } + case UR_MEM_INFO_CONTEXT: { + return ReturnValue(hMemory->getContext()); + } + + default: + return UR_RESULT_ERROR_INVALID_ENUMERATION; + } +} + +UR_APIEXPORT ur_result_t UR_APICALL urMemBufferCreateWithNativeHandle( + ur_native_handle_t, ur_context_handle_t, const ur_mem_native_properties_t *, + ur_mem_handle_t *) { + return UR_RESULT_ERROR_UNSUPPORTED_FEATURE; +} + +UR_APIEXPORT ur_result_t UR_APICALL urMemImageCreateWithNativeHandle( + ur_native_handle_t, ur_context_handle_t, const ur_image_format_t *, + const ur_image_desc_t *, const ur_mem_native_properties_t *, + ur_mem_handle_t *) { + return UR_RESULT_ERROR_UNSUPPORTED_FEATURE; +} + +/// \TODO Not implemented +UR_APIEXPORT ur_result_t UR_APICALL urMemImageCreate( + ur_context_handle_t hContext, ur_mem_flags_t flags, + const ur_image_format_t *pImageFormat, const ur_image_desc_t *pImageDesc, + void *pHost, ur_mem_handle_t *phMem) { + if (flags & + (UR_MEM_FLAG_ALLOC_COPY_HOST_POINTER | UR_MEM_FLAG_USE_HOST_POINTER)) { + UR_ASSERT(pHost, UR_RESULT_ERROR_INVALID_HOST_PTR); + } + const bool PerformInitialCopy = + (flags & UR_MEM_FLAG_ALLOC_COPY_HOST_POINTER) || + ((flags & UR_MEM_FLAG_USE_HOST_POINTER)); + + UR_ASSERT(pImageDesc->stype == UR_STRUCTURE_TYPE_IMAGE_DESC, + UR_RESULT_ERROR_INVALID_IMAGE_FORMAT_DESCRIPTOR); + UR_ASSERT(pImageDesc->type <= UR_MEM_TYPE_IMAGE1D_BUFFER, + UR_RESULT_ERROR_INVALID_IMAGE_FORMAT_DESCRIPTOR); + UR_ASSERT(pImageDesc->numMipLevel == 0, + UR_RESULT_ERROR_INVALID_IMAGE_FORMAT_DESCRIPTOR); + UR_ASSERT(pImageDesc->numSamples == 0, + UR_RESULT_ERROR_INVALID_IMAGE_FORMAT_DESCRIPTOR); + if (!pHost) { + UR_ASSERT(pImageDesc->rowPitch == 0, + UR_RESULT_ERROR_INVALID_IMAGE_FORMAT_DESCRIPTOR); + UR_ASSERT(pImageDesc->slicePitch == 0, + UR_RESULT_ERROR_INVALID_IMAGE_FORMAT_DESCRIPTOR); + } + + ur_result_t Result = UR_RESULT_SUCCESS; + + // We only support RBGA channel order + // TODO: check SYCL CTS and spec. 
May also have to support BGRA + UR_ASSERT(pImageFormat->channelOrder == UR_IMAGE_CHANNEL_ORDER_RGBA, + UR_RESULT_ERROR_UNSUPPORTED_ENUMERATION); + + // We have to use cuArray3DCreate, which has some caveats. The height and + // depth parameters must be set to 0 produce 1D or 2D arrays. pImageDesc gives + // a minimum value of 1, so we need to convert the answer. + CUDA_ARRAY3D_DESCRIPTOR ArrayDesc; + ArrayDesc.NumChannels = 4; // Only support 4 channel image + ArrayDesc.Flags = 0; // No flags required + ArrayDesc.Width = pImageDesc->width; + if (pImageDesc->type == UR_MEM_TYPE_IMAGE1D) { + ArrayDesc.Height = 0; + ArrayDesc.Depth = 0; + } else if (pImageDesc->type == UR_MEM_TYPE_IMAGE2D) { + ArrayDesc.Height = pImageDesc->height; + ArrayDesc.Depth = 0; + } else if (pImageDesc->type == UR_MEM_TYPE_IMAGE3D) { + ArrayDesc.Height = pImageDesc->height; + ArrayDesc.Depth = pImageDesc->depth; + } + + // We need to get this now in bytes for calculating the total image size later + size_t PixelTypeSizeBytes; + + switch (pImageFormat->channelType) { + case UR_IMAGE_CHANNEL_TYPE_UNORM_INT8: + case UR_IMAGE_CHANNEL_TYPE_UNSIGNED_INT8: + ArrayDesc.Format = CU_AD_FORMAT_UNSIGNED_INT8; + PixelTypeSizeBytes = 1; + break; + case UR_IMAGE_CHANNEL_TYPE_SIGNED_INT8: + ArrayDesc.Format = CU_AD_FORMAT_SIGNED_INT8; + PixelTypeSizeBytes = 1; + break; + case UR_IMAGE_CHANNEL_TYPE_UNORM_INT16: + case UR_IMAGE_CHANNEL_TYPE_UNSIGNED_INT16: + ArrayDesc.Format = CU_AD_FORMAT_UNSIGNED_INT16; + PixelTypeSizeBytes = 2; + break; + case UR_IMAGE_CHANNEL_TYPE_SIGNED_INT16: + ArrayDesc.Format = CU_AD_FORMAT_SIGNED_INT16; + PixelTypeSizeBytes = 2; + break; + case UR_IMAGE_CHANNEL_TYPE_HALF_FLOAT: + ArrayDesc.Format = CU_AD_FORMAT_HALF; + PixelTypeSizeBytes = 2; + break; + case UR_IMAGE_CHANNEL_TYPE_UNSIGNED_INT32: + ArrayDesc.Format = CU_AD_FORMAT_UNSIGNED_INT32; + PixelTypeSizeBytes = 4; + break; + case UR_IMAGE_CHANNEL_TYPE_SIGNED_INT32: + ArrayDesc.Format = CU_AD_FORMAT_SIGNED_INT32; + PixelTypeSizeBytes = 4; + break; + case UR_IMAGE_CHANNEL_TYPE_FLOAT: + ArrayDesc.Format = CU_AD_FORMAT_FLOAT; + PixelTypeSizeBytes = 4; + break; + default: + detail::ur::die( + "urMemImageCreate given unsupported image_channel_data_type"); + } + + // When a dimension isn't used pImageDesc has the size set to 1 + size_t PixelSizeBytes = + PixelTypeSizeBytes * 4; // 4 is the only number of channels we support + size_t ImageSizeBytes = PixelSizeBytes * pImageDesc->width * + pImageDesc->height * pImageDesc->depth; + + ScopedContext Active(hContext); + CUarray ImageArray = nullptr; + try { + UR_CHECK_ERROR(cuArray3DCreate(&ImageArray, &ArrayDesc)); + } catch (ur_result_t Err) { + if (Err == UR_RESULT_ERROR_INVALID_VALUE) { + return UR_RESULT_ERROR_INVALID_IMAGE_SIZE; + } + return Err; + } catch (...) 
{ + return UR_RESULT_ERROR_UNKNOWN; + } + + try { + if (PerformInitialCopy) { + // We have to use a different copy function for each image dimensionality + if (pImageDesc->type == UR_MEM_TYPE_IMAGE1D) { + UR_CHECK_ERROR(cuMemcpyHtoA(ImageArray, 0, pHost, ImageSizeBytes)); + } else if (pImageDesc->type == UR_MEM_TYPE_IMAGE2D) { + CUDA_MEMCPY2D CpyDesc; + memset(&CpyDesc, 0, sizeof(CpyDesc)); + CpyDesc.srcMemoryType = CUmemorytype_enum::CU_MEMORYTYPE_HOST; + CpyDesc.srcHost = pHost; + CpyDesc.dstMemoryType = CUmemorytype_enum::CU_MEMORYTYPE_ARRAY; + CpyDesc.dstArray = ImageArray; + CpyDesc.WidthInBytes = PixelSizeBytes * pImageDesc->width; + CpyDesc.Height = pImageDesc->height; + UR_CHECK_ERROR(cuMemcpy2D(&CpyDesc)); + } else if (pImageDesc->type == UR_MEM_TYPE_IMAGE3D) { + CUDA_MEMCPY3D CpyDesc; + memset(&CpyDesc, 0, sizeof(CpyDesc)); + CpyDesc.srcMemoryType = CUmemorytype_enum::CU_MEMORYTYPE_HOST; + CpyDesc.srcHost = pHost; + CpyDesc.dstMemoryType = CUmemorytype_enum::CU_MEMORYTYPE_ARRAY; + CpyDesc.dstArray = ImageArray; + CpyDesc.WidthInBytes = PixelSizeBytes * pImageDesc->width; + CpyDesc.Height = pImageDesc->height; + CpyDesc.Depth = pImageDesc->depth; + UR_CHECK_ERROR(cuMemcpy3D(&CpyDesc)); + } + } + + // CUDA_RESOURCE_DESC is a union of different structs, shown here + // https://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__TEXOBJECT.html + // We need to fill it as described here to use it for a surface or texture + // https://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__SURFOBJECT.html + // CUDA_RESOURCE_DESC::resType must be CU_RESOURCE_TYPE_ARRAY and + // CUDA_RESOURCE_DESC::res::array::hArray must be set to a valid CUDA array + // handle. + // CUDA_RESOURCE_DESC::flags must be set to zero + + CUDA_RESOURCE_DESC ImageResDesc; + ImageResDesc.res.array.hArray = ImageArray; + ImageResDesc.resType = CU_RESOURCE_TYPE_ARRAY; + ImageResDesc.flags = 0; + + CUsurfObject Surface; + UR_CHECK_ERROR(cuSurfObjectCreate(&Surface, &ImageResDesc)); + + auto MemObj = std::unique_ptr(new ur_mem_handle_t_( + hContext, ImageArray, Surface, flags, pImageDesc->type, phMem)); + + if (MemObj == nullptr) { + return UR_RESULT_ERROR_OUT_OF_HOST_MEMORY; + } + + *phMem = MemObj.release(); + } catch (ur_result_t Err) { + if (ImageArray) { + cuArrayDestroy(ImageArray); + } + return Err; + } catch (...) { + if (ImageArray) { + cuArrayDestroy(ImageArray); + } + return UR_RESULT_ERROR_UNKNOWN; + } + + return Result; +} + +/// \TODO Not implemented +UR_APIEXPORT ur_result_t UR_APICALL urMemImageGetInfo(ur_mem_handle_t, + ur_image_info_t, size_t, + void *, size_t *) { + return UR_RESULT_ERROR_UNSUPPORTED_FEATURE; +} + +/// Implements a buffer partition in the CUDA backend. +/// A buffer partition (or a sub-buffer, in OpenCL terms) is simply implemented +/// as an offset over an existing CUDA allocation. +UR_APIEXPORT ur_result_t UR_APICALL urMemBufferPartition( + ur_mem_handle_t hBuffer, ur_mem_flags_t flags, + ur_buffer_create_type_t bufferCreateType, const ur_buffer_region_t *pRegion, + ur_mem_handle_t *phMem) { + UR_ASSERT(hBuffer, UR_RESULT_ERROR_INVALID_NULL_HANDLE); + UR_ASSERT((flags & UR_MEM_FLAGS_MASK) == 0, + UR_RESULT_ERROR_INVALID_ENUMERATION); + UR_ASSERT(hBuffer->isBuffer(), UR_RESULT_ERROR_INVALID_MEM_OBJECT); + UR_ASSERT(!hBuffer->isSubBuffer(), UR_RESULT_ERROR_INVALID_MEM_OBJECT); + + // Default value for flags means UR_MEM_FLAG_READ_WRITE. 
+ if (flags == 0) { + flags = UR_MEM_FLAG_READ_WRITE; + } + + UR_ASSERT(!(flags & + (UR_MEM_FLAG_ALLOC_COPY_HOST_POINTER | + UR_MEM_FLAG_ALLOC_HOST_POINTER | UR_MEM_FLAG_USE_HOST_POINTER)), + UR_RESULT_ERROR_INVALID_VALUE); + if (hBuffer->MemFlags & UR_MEM_FLAG_WRITE_ONLY) { + UR_ASSERT(!(flags & (UR_MEM_FLAG_READ_WRITE | UR_MEM_FLAG_READ_ONLY)), + UR_RESULT_ERROR_INVALID_VALUE); + } + if (hBuffer->MemFlags & UR_MEM_FLAG_READ_ONLY) { + UR_ASSERT(!(flags & (UR_MEM_FLAG_READ_WRITE | UR_MEM_FLAG_WRITE_ONLY)), + UR_RESULT_ERROR_INVALID_VALUE); + } + + UR_ASSERT(bufferCreateType == UR_BUFFER_CREATE_TYPE_REGION, + UR_RESULT_ERROR_INVALID_ENUMERATION); + UR_ASSERT(pRegion != nullptr, UR_RESULT_ERROR_INVALID_NULL_POINTER); + UR_ASSERT(phMem, UR_RESULT_ERROR_INVALID_NULL_POINTER); + + UR_ASSERT(pRegion->size != 0u, UR_RESULT_ERROR_INVALID_BUFFER_SIZE); + + assert((pRegion->origin <= (pRegion->origin + pRegion->size)) && "Overflow"); + UR_ASSERT( + ((pRegion->origin + pRegion->size) <= hBuffer->Mem.BufferMem.getSize()), + UR_RESULT_ERROR_INVALID_BUFFER_SIZE); + // Retained indirectly due to retaining parent buffer below. + ur_context_handle_t Context = hBuffer->Context; + + ur_mem_handle_t_::MemImpl::BufferMem::AllocMode AllocMode = + ur_mem_handle_t_::MemImpl::BufferMem::AllocMode::Classic; + + assert(hBuffer->Mem.BufferMem.Ptr != + ur_mem_handle_t_::MemImpl::BufferMem::native_type{0}); + ur_mem_handle_t_::MemImpl::BufferMem::native_type Ptr = + hBuffer->Mem.BufferMem.Ptr + pRegion->origin; + + void *HostPtr = nullptr; + if (hBuffer->Mem.BufferMem.HostPtr) { + HostPtr = + static_cast(hBuffer->Mem.BufferMem.HostPtr) + pRegion->origin; + } + + std::unique_ptr MemObj{nullptr}; + try { + MemObj = std::unique_ptr{new ur_mem_handle_t_{ + Context, hBuffer, flags, AllocMode, Ptr, HostPtr, pRegion->size}}; + } catch (ur_result_t Err) { + *phMem = nullptr; + return Err; + } catch (...) { + *phMem = nullptr; + return UR_RESULT_ERROR_OUT_OF_HOST_MEMORY; + } + + *phMem = MemObj.release(); + return UR_RESULT_SUCCESS; +} diff --git a/source/adapters/cuda/memory.hpp b/source/adapters/cuda/memory.hpp new file mode 100644 index 0000000000..81a83b2d67 --- /dev/null +++ b/source/adapters/cuda/memory.hpp @@ -0,0 +1,232 @@ +//===--------- memory.hpp - CUDA Adapter ----------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +#pragma once + +#include +#include +#include + +#include "common.hpp" + +/// UR Mem mapping to CUDA memory allocations, both data and texture/surface. +/// \brief Represents non-SVM allocations on the CUDA backend. +/// Keeps tracks of all mapped regions used for Map/Unmap calls. +/// Only one region can be active at the same time per allocation. +struct ur_mem_handle_t_ { + // Context where the memory object is accessible + ur_context_handle_t Context; + + /// Reference counting of the handler + std::atomic_uint32_t RefCount; + enum class Type { Buffer, Surface, Texture } MemType; + + // Original mem flags passed + ur_mem_flags_t MemFlags; + + /// A UR Memory object represents either plain memory allocations ("Buffers" + /// in OpenCL) or typed allocations ("Images" in OpenCL). + /// In CUDA their API handlers are different. Whereas "Buffers" are allocated + /// as pointer-like structs, "Images" are stored in Textures or Surfaces. 
+ /// This union allows implementation to use either from the same handler. + union MemImpl { + // Handler for plain, pointer-based CUDA allocations + struct BufferMem { + using native_type = CUdeviceptr; + + // If this allocation is a sub-buffer (i.e., a view on an existing + // allocation), this is the pointer to the parent handler structure + ur_mem_handle_t Parent; + // CUDA handler for the pointer + native_type Ptr; + + /// Pointer associated with this device on the host + void *HostPtr; + /// Size of the allocation in bytes + size_t Size; + /// Size of the active mapped region. + size_t MapSize; + /// Offset of the active mapped region. + size_t MapOffset; + /// Pointer to the active mapped region, if any + void *MapPtr; + /// Original flags for the mapped region + ur_map_flags_t MapFlags; + + /** AllocMode + * classic: Just a normal buffer allocated on the device via cuda malloc + * use_host_ptr: Use an address on the host for the device + * copy_in: The data for the device comes from the host but the host + pointer is not available later for re-use + * alloc_host_ptr: Uses pinned-memory allocation + */ + enum class AllocMode { + Classic, + UseHostPtr, + CopyIn, + AllocHostPtr, + } MemAllocMode; + + native_type get() const noexcept { return Ptr; } + + size_t getSize() const noexcept { return Size; } + + void *getMapPtr() const noexcept { return MapPtr; } + + size_t getMapSize() const noexcept { return MapSize; } + + size_t getMapOffset() const noexcept { return MapOffset; } + + /// Returns a pointer to data visible on the host that contains + /// the data on the device associated with this allocation. + /// The offset is used to index into the CUDA allocation. + void *mapToPtr(size_t Size, size_t Offset, + ur_map_flags_t Flags) noexcept { + assert(MapPtr == nullptr); + MapSize = Size; + MapOffset = Offset; + MapFlags = Flags; + if (HostPtr) { + MapPtr = static_cast(HostPtr) + Offset; + } else { + // TODO: Allocate only what is needed based on the offset + MapPtr = static_cast(malloc(this->getSize())); + } + return MapPtr; + } + + /// Detach the allocation from the host memory. + void unmap(void *) noexcept { + assert(MapPtr != nullptr); + + if (MapPtr != HostPtr) { + free(MapPtr); + } + MapPtr = nullptr; + MapSize = 0; + MapOffset = 0; + } + + ur_map_flags_t getMapFlags() const noexcept { + assert(MapPtr != nullptr); + return MapFlags; + } + } BufferMem; + + // Handler data for surface object (i.e. 
Images) + struct SurfaceMem { + CUarray Array; + CUsurfObject SurfObj; + ur_mem_type_t ImageType; + + CUarray getArray() const noexcept { return Array; } + + CUsurfObject getSurface() const noexcept { return SurfObj; } + + ur_mem_type_t getImageType() const noexcept { return ImageType; } + } SurfaceMem; + + struct ImageMem { + CUarray Array; + void *Handle; + ur_mem_type_t ImageType; + ur_sampler_handle_t Sampler; + + CUarray get_array() const noexcept { return Array; } + + void *get_handle() const noexcept { return Handle; } + + ur_mem_type_t get_image_type() const noexcept { return ImageType; } + + ur_sampler_handle_t get_sampler() const noexcept { return Sampler; } + } ImageMem; + } Mem; + + /// Constructs the UR mem handler for a non-typed allocation ("buffer") + ur_mem_handle_t_(ur_context_handle_t Context, ur_mem_handle_t Parent, + ur_mem_flags_t MemFlags, MemImpl::BufferMem::AllocMode Mode, + CUdeviceptr Ptr, void *HostPtr, size_t Size) + : Context{Context}, RefCount{1}, MemType{Type::Buffer}, MemFlags{ + MemFlags} { + Mem.BufferMem.Ptr = Ptr; + Mem.BufferMem.Parent = Parent; + Mem.BufferMem.HostPtr = HostPtr; + Mem.BufferMem.Size = Size; + Mem.BufferMem.MapSize = 0; + Mem.BufferMem.MapOffset = 0; + Mem.BufferMem.MapPtr = nullptr; + Mem.BufferMem.MapFlags = UR_MAP_FLAG_WRITE; + Mem.BufferMem.MemAllocMode = Mode; + if (isSubBuffer()) { + urMemRetain(Mem.BufferMem.Parent); + } else { + urContextRetain(Context); + } + }; + + /// Constructs the UR allocation for an Image object (surface in CUDA) + ur_mem_handle_t_(ur_context_handle_t Context, CUarray Array, + CUsurfObject Surf, ur_mem_flags_t MemFlags, + ur_mem_type_t ImageType, void *HostPtr) + : Context{Context}, RefCount{1}, MemType{Type::Surface}, MemFlags{ + MemFlags} { + (void)HostPtr; + + Mem.SurfaceMem.Array = Array; + Mem.SurfaceMem.SurfObj = Surf; + Mem.SurfaceMem.ImageType = ImageType; + urContextRetain(Context); + } + + /// Constructs the UR allocation for an unsampled image object + ur_mem_handle_t_(ur_context_handle_t Context, CUarray Array, + CUsurfObject Surf, ur_mem_type_t ImageType) + : Context{Context}, RefCount{1}, MemType{Type::Surface} { + + Mem.ImageMem.Array = Array; + Mem.ImageMem.Handle = (void *)Surf; + Mem.ImageMem.ImageType = ImageType; + Mem.ImageMem.Sampler = nullptr; + urContextRetain(Context); + } + + /// Constructs the UR allocation for a sampled image object + ur_mem_handle_t_(ur_context_handle_t Context, CUarray Array, CUtexObject Tex, + ur_sampler_handle_t Sampler, ur_mem_type_t ImageType) + : Context{Context}, RefCount{1}, MemType{Type::Texture} { + + Mem.ImageMem.Array = Array; + Mem.ImageMem.Handle = (void *)Tex; + Mem.ImageMem.ImageType = ImageType; + Mem.ImageMem.Sampler = Sampler; + urContextRetain(Context); + } + + ~ur_mem_handle_t_() { + if (isBuffer() && isSubBuffer()) { + urMemRelease(Mem.BufferMem.Parent); + return; + } + urContextRelease(Context); + } + + bool isBuffer() const noexcept { return MemType == Type::Buffer; } + + bool isSubBuffer() const noexcept { + return (isBuffer() && (Mem.BufferMem.Parent != nullptr)); + } + + bool isImage() const noexcept { return MemType == Type::Surface; } + + ur_context_handle_t getContext() const noexcept { return Context; } + + uint32_t incrementReferenceCount() noexcept { return ++RefCount; } + + uint32_t decrementReferenceCount() noexcept { return --RefCount; } + + uint32_t getReferenceCount() const noexcept { return RefCount; } +}; diff --git a/source/adapters/cuda/platform.cpp b/source/adapters/cuda/platform.cpp new file mode 100644 index 
0000000000..876f83921d --- /dev/null +++ b/source/adapters/cuda/platform.cpp @@ -0,0 +1,195 @@ +//===--------- platform.cpp - CUDA Adapter --------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "platform.hpp" +#include "common.hpp" +#include "context.hpp" +#include "device.hpp" + +#include +#include +#include + +UR_APIEXPORT ur_result_t UR_APICALL urPlatformGetInfo( + ur_platform_handle_t hPlatform, ur_platform_info_t PlatformInfoType, + size_t Size, void *pPlatformInfo, size_t *pSizeRet) { + + UR_ASSERT(hPlatform, UR_RESULT_ERROR_INVALID_NULL_HANDLE); + UrReturnHelper ReturnValue(Size, pPlatformInfo, pSizeRet); + + switch (PlatformInfoType) { + case UR_PLATFORM_INFO_NAME: + return ReturnValue("NVIDIA CUDA BACKEND"); + case UR_PLATFORM_INFO_VENDOR_NAME: + return ReturnValue("NVIDIA Corporation"); + case UR_PLATFORM_INFO_PROFILE: + return ReturnValue("FULL PROFILE"); + case UR_PLATFORM_INFO_VERSION: { + auto Version = getCudaVersionString(); + return ReturnValue(Version.c_str()); + } + case UR_PLATFORM_INFO_EXTENSIONS: { + return ReturnValue(""); + } + case UR_PLATFORM_INFO_BACKEND: { + return ReturnValue(UR_PLATFORM_BACKEND_CUDA); + } + default: + return UR_RESULT_ERROR_INVALID_ENUMERATION; + } + + return UR_RESULT_SUCCESS; +} + +/// Obtains the CUDA platform. +/// There is only one CUDA platform, and contains all devices on the system. +/// Triggers the CUDA Driver initialization (cuInit) the first time, so this +/// must be the first PI API called. +/// +/// However because multiple devices in a context is not currently supported, +/// place each device in a separate platform. 
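// A caller-side sketch (illustrative only, not part of this file) of the usual
// two-call discovery pattern against this entry point; `Adapters`/`NumAdapters`
// are assumed to come from a prior urAdapterGet call, and error checking is
// omitted. With the platform-per-device scheme described above, the reported
// count equals the number of CUDA devices.
//
//   uint32_t NumPlatforms = 0;
//   urPlatformGet(Adapters, NumAdapters, 0, nullptr, &NumPlatforms);
//   std::vector<ur_platform_handle_t> Platforms(NumPlatforms);
//   urPlatformGet(Adapters, NumAdapters, NumPlatforms, Platforms.data(),
//                 nullptr);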
+UR_APIEXPORT ur_result_t UR_APICALL +urPlatformGet(ur_adapter_handle_t *, uint32_t, uint32_t NumEntries, + ur_platform_handle_t *phPlatforms, uint32_t *pNumPlatforms) { + + try { + static std::once_flag InitFlag; + static uint32_t NumPlatforms = 1; + static std::vector Platforms; + + UR_ASSERT(phPlatforms || pNumPlatforms, UR_RESULT_ERROR_INVALID_VALUE); + UR_ASSERT(!phPlatforms || NumEntries > 0, UR_RESULT_ERROR_INVALID_SIZE); + + ur_result_t Result = UR_RESULT_SUCCESS; + + std::call_once( + InitFlag, + [](ur_result_t &Result) { + UR_CHECK_ERROR(cuInit(0)); + int NumDevices = 0; + UR_CHECK_ERROR(cuDeviceGetCount(&NumDevices)); + try { + // make one platform per device + NumPlatforms = NumDevices; + Platforms.resize(NumDevices); + + for (int i = 0; i < NumDevices; ++i) { + CUdevice Device; + UR_CHECK_ERROR(cuDeviceGet(&Device, i)); + CUcontext Context; + UR_CHECK_ERROR(cuDevicePrimaryCtxRetain(&Context, Device)); + + ScopedContext active(Context); + CUevent EvBase; + UR_CHECK_ERROR(cuEventCreate(&EvBase, CU_EVENT_DEFAULT)); + + // Use default stream to record base event counter + UR_CHECK_ERROR(cuEventRecord(EvBase, 0)); + + Platforms[i].Devices.emplace_back(new ur_device_handle_t_{ + Device, Context, EvBase, &Platforms[i]}); + { + const auto &Dev = Platforms[i].Devices.back().get(); + size_t MaxWorkGroupSize = 0u; + size_t MaxThreadsPerBlock[3] = {}; + UR_CHECK_ERROR(urDeviceGetInfo( + Dev, UR_DEVICE_INFO_MAX_WORK_ITEM_SIZES, + sizeof(MaxThreadsPerBlock), MaxThreadsPerBlock, nullptr)); + + UR_CHECK_ERROR(urDeviceGetInfo( + Dev, UR_DEVICE_INFO_MAX_WORK_GROUP_SIZE, + sizeof(MaxWorkGroupSize), &MaxWorkGroupSize, nullptr)); + + Dev->saveMaxWorkItemSizes(sizeof(MaxThreadsPerBlock), + MaxThreadsPerBlock); + Dev->saveMaxWorkGroupSize(MaxWorkGroupSize); + } + } + } catch (const std::bad_alloc &) { + // Signal out-of-memory situation + for (int i = 0; i < NumDevices; ++i) { + Platforms[i].Devices.clear(); + } + Platforms.clear(); + Result = UR_RESULT_ERROR_OUT_OF_HOST_MEMORY; + } catch (ur_result_t Err) { + // Clear and rethrow to allow retry + for (int i = 0; i < NumDevices; ++i) { + Platforms[i].Devices.clear(); + } + Platforms.clear(); + Result = Err; + throw Err; + } catch (...) { + Result = UR_RESULT_ERROR_OUT_OF_RESOURCES; + throw; + } + }, + Result); + + if (pNumPlatforms != nullptr) { + *pNumPlatforms = NumPlatforms; + } + + if (phPlatforms != nullptr) { + for (unsigned i = 0; i < std::min(NumEntries, NumPlatforms); ++i) { + phPlatforms[i] = &Platforms[i]; + } + } + + return Result; + } catch (ur_result_t Err) { + return Err; + } catch (...) { + return UR_RESULT_ERROR_OUT_OF_RESOURCES; + } +} + +UR_APIEXPORT ur_result_t UR_APICALL urPlatformGetApiVersion( + ur_platform_handle_t hDriver, ur_api_version_t *pVersion) { + std::ignore = hDriver; + *pVersion = UR_API_VERSION_CURRENT; + return UR_RESULT_SUCCESS; +} + +UR_APIEXPORT ur_result_t UR_APICALL urPlatformGetNativeHandle( + ur_platform_handle_t hPlatform, ur_native_handle_t *phNativePlatform) { + std::ignore = hPlatform; + std::ignore = phNativePlatform; + return UR_RESULT_ERROR_UNSUPPORTED_FEATURE; +} + +UR_APIEXPORT ur_result_t UR_APICALL urPlatformCreateWithNativeHandle( + ur_native_handle_t hNativePlatform, + const ur_platform_native_properties_t *pProperties, + ur_platform_handle_t *phPlatform) { + std::ignore = hNativePlatform; + std::ignore = pProperties; + std::ignore = phPlatform; + return UR_RESULT_ERROR_UNSUPPORTED_FEATURE; +} + +// Get CUDA plugin specific backend option. +// Current support is only for optimization options. 
+// Return empty string for cuda. +// TODO: Determine correct string to be passed. +UR_APIEXPORT ur_result_t UR_APICALL urPlatformGetBackendOption( + ur_platform_handle_t hPlatform, const char *pFrontendOption, + const char **ppPlatformOption) { + std::ignore = hPlatform; + using namespace std::literals; + if (pFrontendOption == nullptr) + return UR_RESULT_ERROR_INVALID_NULL_POINTER; + if (pFrontendOption == "-O0"sv || pFrontendOption == "-O1"sv || + pFrontendOption == "-O2"sv || pFrontendOption == "-O3"sv || + pFrontendOption == ""sv) { + *ppPlatformOption = ""; + return UR_RESULT_SUCCESS; + } + return UR_RESULT_ERROR_INVALID_VALUE; +} diff --git a/source/adapters/cuda/platform.hpp b/source/adapters/cuda/platform.hpp new file mode 100644 index 0000000000..c9b6550610 --- /dev/null +++ b/source/adapters/cuda/platform.hpp @@ -0,0 +1,15 @@ +//===--------- platform.hpp - CUDA Adapter --------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +#pragma once + +#include +#include + +struct ur_platform_handle_t_ { + std::vector> Devices; +}; diff --git a/source/adapters/cuda/program.cpp b/source/adapters/cuda/program.cpp new file mode 100644 index 0000000000..7e238dd7fe --- /dev/null +++ b/source/adapters/cuda/program.cpp @@ -0,0 +1,452 @@ +//===--------- program.cpp - CUDA Adapter ---------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "program.hpp" + +bool getMaxRegistersJitOptionValue(const std::string &BuildOptions, + unsigned int &Value) { + using namespace std::string_view_literals; + const std::size_t OptionPos = BuildOptions.find_first_of("maxrregcount"sv); + if (OptionPos == std::string::npos) { + return false; + } + + const std::size_t DelimPos = BuildOptions.find('=', OptionPos + 1u); + if (DelimPos == std::string::npos) { + return false; + } + + const std::size_t Length = BuildOptions.length(); + const std::size_t StartPos = DelimPos + 1u; + if (DelimPos == std::string::npos || StartPos >= Length) { + return false; + } + + std::size_t Pos = StartPos; + while (Pos < Length && + std::isdigit(static_cast(BuildOptions[Pos]))) { + Pos++; + } + + const std::string ValueString = BuildOptions.substr(StartPos, Pos - StartPos); + if (ValueString.empty()) { + return false; + } + + Value = static_cast(std::stoi(ValueString)); + return true; +} + +ur_program_handle_t_::ur_program_handle_t_(ur_context_handle_t Context) + : Module{nullptr}, Binary{}, BinarySizeInBytes{0}, RefCount{1}, + Context{Context}, KernelReqdWorkGroupSizeMD{} { + urContextRetain(Context); +} + +ur_program_handle_t_::~ur_program_handle_t_() { urContextRelease(Context); } + +std::pair +splitMetadataName(const std::string &metadataName) { + size_t splitPos = metadataName.rfind('@'); + if (splitPos == std::string::npos) + return std::make_pair(metadataName, std::string{}); + return std::make_pair(metadataName.substr(0, splitPos), + metadataName.substr(splitPos, metadataName.length())); +} + +ur_result_t +ur_program_handle_t_::setMetadata(const ur_program_metadata_t *Metadata, + 
size_t Length) { + for (size_t i = 0; i < Length; ++i) { + const ur_program_metadata_t MetadataElement = Metadata[i]; + std::string MetadataElementName{MetadataElement.pName}; + + auto [Prefix, Tag] = splitMetadataName(MetadataElementName); + + if (Tag == __SYCL_UR_PROGRAM_METADATA_TAG_REQD_WORK_GROUP_SIZE) { + // If metadata is reqd_work_group_size, record it for the corresponding + // kernel name. + size_t MDElemsSize = MetadataElement.size - sizeof(std::uint64_t); + + // Expect between 1 and 3 32-bit integer values. + UR_ASSERT(MDElemsSize >= sizeof(std::uint32_t) && + MDElemsSize <= sizeof(std::uint32_t) * 3, + UR_RESULT_ERROR_INVALID_WORK_GROUP_SIZE); + + // Get pointer to data, skipping 64-bit size at the start of the data. + const char *ValuePtr = + reinterpret_cast(MetadataElement.value.pData) + + sizeof(std::uint64_t); + // Read values and pad with 1's for values not present. + std::uint32_t ReqdWorkGroupElements[] = {1, 1, 1}; + std::memcpy(ReqdWorkGroupElements, ValuePtr, MDElemsSize); + KernelReqdWorkGroupSizeMD[Prefix] = + std::make_tuple(ReqdWorkGroupElements[0], ReqdWorkGroupElements[1], + ReqdWorkGroupElements[2]); + } else if (Tag == __SYCL_UR_PROGRAM_METADATA_GLOBAL_ID_MAPPING) { + const char *MetadataValPtr = + reinterpret_cast(MetadataElement.value.pData) + + sizeof(std::uint64_t); + const char *MetadataValPtrEnd = + MetadataValPtr + MetadataElement.size - sizeof(std::uint64_t); + GlobalIDMD[Prefix] = std::string{MetadataValPtr, MetadataValPtrEnd}; + } + } + return UR_RESULT_SUCCESS; +} + +ur_result_t ur_program_handle_t_::setBinary(const char *Source, size_t Length) { + // Do not re-set program binary data which has already been set as that will + // delete the old binary data. + UR_ASSERT(Binary == nullptr && BinarySizeInBytes == 0, + UR_RESULT_ERROR_INVALID_OPERATION); + Binary = Source; + BinarySizeInBytes = Length; + return UR_RESULT_SUCCESS; +} + +ur_result_t ur_program_handle_t_::buildProgram(const char *BuildOptions) { + if (BuildOptions) { + this->BuildOptions = BuildOptions; + } + + constexpr const unsigned int NumberOfOptions = 4u; + + std::vector Options(NumberOfOptions); + std::vector OptionVals(NumberOfOptions); + + // Pass a buffer for info messages + Options[0] = CU_JIT_INFO_LOG_BUFFER; + OptionVals[0] = (void *)InfoLog; + // Pass the size of the info buffer + Options[1] = CU_JIT_INFO_LOG_BUFFER_SIZE_BYTES; + OptionVals[1] = (void *)(long)MaxLogSize; + // Pass a buffer for error message + Options[2] = CU_JIT_ERROR_LOG_BUFFER; + OptionVals[2] = (void *)ErrorLog; + // Pass the size of the error buffer + Options[3] = CU_JIT_ERROR_LOG_BUFFER_SIZE_BYTES; + OptionVals[3] = (void *)(long)MaxLogSize; + + if (!this->BuildOptions.empty()) { + unsigned int MaxRegs; + bool Valid = getMaxRegistersJitOptionValue(BuildOptions, MaxRegs); + if (Valid) { + Options.push_back(CU_JIT_MAX_REGISTERS); + OptionVals.push_back(reinterpret_cast(MaxRegs)); + } + } + + UR_CHECK_ERROR(cuModuleLoadDataEx(&Module, static_cast(Binary), + Options.size(), Options.data(), + OptionVals.data())); + + BuildStatus = UR_PROGRAM_BUILD_STATUS_SUCCESS; + + // If no exception, result is correct + return UR_RESULT_SUCCESS; +} + +/// Finds kernel names by searching for entry points in the PTX source, as the +/// CUDA driver API doesn't expose an operation for this. +/// Note: This is currently only being used by the SYCL program class for the +/// has_kernel method, so an alternative would be to move the has_kernel +/// query to UR and use cuModuleGetFunction to check for a kernel. 
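// A minimal sketch of the cuModuleGetFunction alternative mentioned above
// (illustrative only; `Module` is an assumed, already-loaded CUmodule and
// "my_kernel" is a made-up entry point name):
//
//   CUfunction Func;
//   bool HasKernel =
//       (cuModuleGetFunction(&Func, Module, "my_kernel") == CUDA_SUCCESS);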
+/// Note: Another alternative is to add kernel names as metadata, like with +/// reqd_work_group_size. +ur_result_t getKernelNames(ur_program_handle_t) { + return UR_RESULT_ERROR_UNSUPPORTED_FEATURE; +} + +/// CUDA will handle the PTX/CUBIN binaries internally through CUmodule object. +/// So, urProgramCreateWithIL and urProgramCreateWithBinary are equivalent in +/// terms of CUDA adapter. See \ref urProgramCreateWithBinary. +UR_APIEXPORT ur_result_t UR_APICALL +urProgramCreateWithIL(ur_context_handle_t hContext, const void *pIL, + size_t length, const ur_program_properties_t *pProperties, + ur_program_handle_t *phProgram) { + ur_device_handle_t hDevice = hContext->getDevice(); + auto pBinary = reinterpret_cast(pIL); + + return urProgramCreateWithBinary(hContext, hDevice, length, pBinary, + pProperties, phProgram); +} + +/// CUDA will handle the PTX/CUBIN binaries internally through a call to +/// cuModuleLoadDataEx. So, urProgramCompile and urProgramBuild are equivalent +/// in terms of CUDA adapter. \TODO Implement asynchronous compilation +UR_APIEXPORT ur_result_t UR_APICALL +urProgramCompile(ur_context_handle_t hContext, ur_program_handle_t hProgram, + const char *pOptions) { + return urProgramBuild(hContext, hProgram, pOptions); +} + +/// Loads the images from a UR program into a CUmodule that can be +/// used later on to extract functions (kernels). +/// See \ref ur_program_handle_t for implementation details. +UR_APIEXPORT ur_result_t UR_APICALL urProgramBuild(ur_context_handle_t hContext, + ur_program_handle_t hProgram, + const char *pOptions) { + std::ignore = hContext; + + ur_result_t Result = UR_RESULT_SUCCESS; + + try { + ScopedContext Active(hProgram->getContext()); + + hProgram->buildProgram(pOptions); + + } catch (ur_result_t Err) { + Result = Err; + } + return Result; +} + +/// Creates a new UR program object that is the outcome of linking all input +/// programs. +/// \TODO Implement linker options, requires mapping of OpenCL to CUDA +UR_APIEXPORT ur_result_t UR_APICALL +urProgramLink(ur_context_handle_t hContext, uint32_t count, + const ur_program_handle_t *phPrograms, const char *pOptions, + ur_program_handle_t *phProgram) { + ur_result_t Result = UR_RESULT_SUCCESS; + + try { + ScopedContext Active(hContext); + + CUlinkState State; + std::unique_ptr RetProgram{ + new ur_program_handle_t_{hContext}}; + + UR_CHECK_ERROR(cuLinkCreate(0, nullptr, nullptr, &State)); + try { + for (size_t i = 0; i < count; ++i) { + ur_program_handle_t Program = phPrograms[i]; + UR_CHECK_ERROR(cuLinkAddData( + State, CU_JIT_INPUT_PTX, const_cast(Program->Binary), + Program->BinarySizeInBytes, nullptr, 0, nullptr, nullptr)); + } + void *CuBin = nullptr; + size_t CuBinSize = 0; + UR_CHECK_ERROR(cuLinkComplete(State, &CuBin, &CuBinSize)); + + Result = + RetProgram->setBinary(static_cast(CuBin), CuBinSize); + + Result = RetProgram->buildProgram(pOptions); + } catch (...) { + // Upon error attempt cleanup + UR_CHECK_ERROR(cuLinkDestroy(State)); + throw; + } + + UR_CHECK_ERROR(cuLinkDestroy(State)); + *phProgram = RetProgram.release(); + + } catch (ur_result_t Err) { + Result = Err; + } + return Result; +} + +/// Created a UR program object from a CUDA program handle. +/// TODO: Implement this. +/// NOTE: The created UR object takes ownership of the native handle. +/// +/// \param[in] nativeHandle The native handle to create UR program object from. +/// \param[in] context The UR context of the program. +/// \param[out] program Set to the UR program object created from native handle. 
+/// +/// \return TBD +UR_APIEXPORT ur_result_t UR_APICALL urProgramCreateWithNativeHandle( + ur_native_handle_t, ur_context_handle_t, + const ur_program_native_properties_t *, ur_program_handle_t *) { + return UR_RESULT_ERROR_UNSUPPORTED_FEATURE; +} + +UR_APIEXPORT ur_result_t UR_APICALL +urProgramGetBuildInfo(ur_program_handle_t hProgram, ur_device_handle_t hDevice, + ur_program_build_info_t propName, size_t propSize, + void *pPropValue, size_t *pPropSizeRet) { + std::ignore = hDevice; + + UrReturnHelper ReturnValue(propSize, pPropValue, pPropSizeRet); + + switch (propName) { + case UR_PROGRAM_BUILD_INFO_STATUS: { + return ReturnValue(hProgram->BuildStatus); + } + case UR_PROGRAM_BUILD_INFO_OPTIONS: + return ReturnValue(hProgram->BuildOptions.c_str()); + case UR_PROGRAM_BUILD_INFO_LOG: + return ReturnValue(hProgram->InfoLog, hProgram->MaxLogSize); + default: + break; + } + return UR_RESULT_ERROR_INVALID_ENUMERATION; +} + +UR_APIEXPORT ur_result_t UR_APICALL +urProgramGetInfo(ur_program_handle_t hProgram, ur_program_info_t propName, + size_t propSize, void *pProgramInfo, size_t *pPropSizeRet) { + UrReturnHelper ReturnValue(propSize, pProgramInfo, pPropSizeRet); + + switch (propName) { + case UR_PROGRAM_INFO_REFERENCE_COUNT: + return ReturnValue(hProgram->getReferenceCount()); + case UR_PROGRAM_INFO_CONTEXT: + return ReturnValue(hProgram->Context); + case UR_PROGRAM_INFO_NUM_DEVICES: + return ReturnValue(1u); + case UR_PROGRAM_INFO_DEVICES: + return ReturnValue(&hProgram->Context->DeviceID, 1); + case UR_PROGRAM_INFO_SOURCE: + return ReturnValue(hProgram->Binary); + case UR_PROGRAM_INFO_BINARY_SIZES: + return ReturnValue(&hProgram->BinarySizeInBytes, 1); + case UR_PROGRAM_INFO_BINARIES: + return ReturnValue(&hProgram->Binary, 1); + case UR_PROGRAM_INFO_KERNEL_NAMES: + /* TODO: Add implementation for getKernelNames */ + UR_ASSERT(getKernelNames(hProgram), UR_RESULT_ERROR_UNSUPPORTED_FEATURE); + return UR_RESULT_ERROR_UNSUPPORTED_ENUMERATION; + case UR_PROGRAM_INFO_NUM_KERNELS: + return UR_RESULT_ERROR_UNSUPPORTED_ENUMERATION; + default: + break; + } + return UR_RESULT_ERROR_INVALID_ENUMERATION; +} + +UR_APIEXPORT ur_result_t UR_APICALL +urProgramRetain(ur_program_handle_t hProgram) { + UR_ASSERT(hProgram->getReferenceCount() > 0, UR_RESULT_ERROR_INVALID_PROGRAM); + hProgram->incrementReferenceCount(); + return UR_RESULT_SUCCESS; +} + +/// Decreases the reference count of a ur_program_handle_t object. +/// When the reference count reaches 0, it unloads the module from +/// the context. +UR_APIEXPORT ur_result_t UR_APICALL +urProgramRelease(ur_program_handle_t hProgram) { + // double delete or someone is messing with the ref count. + // either way, cannot safely proceed. + UR_ASSERT(hProgram->getReferenceCount() != 0, + UR_RESULT_ERROR_INVALID_PROGRAM); + + // decrement ref count. If it is 0, delete the program. + if (hProgram->decrementReferenceCount() == 0) { + + std::unique_ptr ProgramPtr{hProgram}; + + ur_result_t Result = UR_RESULT_ERROR_INVALID_PROGRAM; + + try { + ScopedContext Active(hProgram->getContext()); + auto cuModule = hProgram->get(); + // "0" is a valid handle for a cuModule, so the best way to check if we + // actually loaded a module and need to unload it is to look at the build + // status. + if (hProgram->BuildStatus == UR_PROGRAM_BUILD_STATUS_SUCCESS) { + UR_CHECK_ERROR(cuModuleUnload(cuModule)); + Result = UR_RESULT_SUCCESS; + } else if (hProgram->BuildStatus == UR_PROGRAM_BUILD_STATUS_NONE) { + // Nothing to free. + Result = UR_RESULT_SUCCESS; + } + } catch (...) 
{ + Result = UR_RESULT_ERROR_OUT_OF_RESOURCES; + } + + return Result; + } + + return UR_RESULT_SUCCESS; +} + +/// Gets the native CUDA handle of a UR program object +/// +/// \param[in] program The UR program handle to get the native CUDA object of. +/// \param[out] nativeHandle Set to the native handle of the UR program object. +/// +/// \return ur_result_t +UR_APIEXPORT ur_result_t UR_APICALL urProgramGetNativeHandle( + ur_program_handle_t hProgram, ur_native_handle_t *nativeHandle) { + *nativeHandle = reinterpret_cast(hProgram->get()); + return UR_RESULT_SUCCESS; +} + +/// Loads images from a list of PTX or CUBIN binaries. +/// Note: No calls to CUDA driver API in this function, only store binaries +/// for later. +/// +/// Note: Only supports one device +/// +UR_APIEXPORT ur_result_t UR_APICALL urProgramCreateWithBinary( + ur_context_handle_t hContext, ur_device_handle_t hDevice, size_t size, + const uint8_t *pBinary, const ur_program_properties_t *pProperties, + ur_program_handle_t *phProgram) { + UR_ASSERT(hContext->getDevice()->get() == hDevice->get(), + UR_RESULT_ERROR_INVALID_CONTEXT); + UR_ASSERT(size, UR_RESULT_ERROR_INVALID_SIZE); + + ur_result_t Result = UR_RESULT_SUCCESS; + + std::unique_ptr RetProgram{ + new ur_program_handle_t_{hContext}}; + + if (pProperties) { + if (pProperties->count > 0 && pProperties->pMetadatas == nullptr) { + return UR_RESULT_ERROR_INVALID_NULL_POINTER; + } else if (pProperties->count == 0 && pProperties->pMetadatas != nullptr) { + return UR_RESULT_ERROR_INVALID_SIZE; + } + Result = + RetProgram->setMetadata(pProperties->pMetadatas, pProperties->count); + } + UR_ASSERT(Result == UR_RESULT_SUCCESS, Result); + + auto pBinary_string = reinterpret_cast(pBinary); + + Result = RetProgram->setBinary(pBinary_string, size); + UR_ASSERT(Result == UR_RESULT_SUCCESS, Result); + + *phProgram = RetProgram.release(); + + return Result; +} + +// This entry point is only used for native specialization constants (SPIR-V), +// and the CUDA plugin is AOT only so this entry point is not supported. +UR_APIEXPORT ur_result_t UR_APICALL urProgramSetSpecializationConstants( + ur_program_handle_t, uint32_t, const ur_specialization_constant_info_t *) { + return UR_RESULT_ERROR_UNSUPPORTED_FEATURE; +} + +UR_APIEXPORT ur_result_t UR_APICALL urProgramGetFunctionPointer( + ur_device_handle_t hDevice, ur_program_handle_t hProgram, + const char *pFunctionName, void **ppFunctionPointer) { + // Check if device passed is the same the device bound to the context + UR_ASSERT(hDevice == hProgram->getContext()->getDevice(), + UR_RESULT_ERROR_INVALID_DEVICE); + + CUfunction Func; + CUresult Ret = cuModuleGetFunction(&Func, hProgram->get(), pFunctionName); + *ppFunctionPointer = Func; + ur_result_t Result = UR_RESULT_SUCCESS; + + if (Ret != CUDA_SUCCESS && Ret != CUDA_ERROR_NOT_FOUND) + UR_CHECK_ERROR(Ret); + if (Ret == CUDA_ERROR_NOT_FOUND) { + *ppFunctionPointer = 0; + Result = UR_RESULT_ERROR_INVALID_FUNCTION_NAME; + } + + return Result; +} diff --git a/source/adapters/cuda/program.hpp b/source/adapters/cuda/program.hpp new file mode 100644 index 0000000000..99ed9a3862 --- /dev/null +++ b/source/adapters/cuda/program.hpp @@ -0,0 +1,54 @@ +//===--------- program.hpp - CUDA Adapter ---------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +#pragma once + +#include +#include + +#include +#include + +#include "context.hpp" + +struct ur_program_handle_t_ { + using native_type = CUmodule; + native_type Module; + const char *Binary; + size_t BinarySizeInBytes; + std::atomic_uint32_t RefCount; + ur_context_handle_t Context; + + // Metadata + std::unordered_map> + KernelReqdWorkGroupSizeMD; + std::unordered_map GlobalIDMD; + + constexpr static size_t MaxLogSize = 8192u; + + char ErrorLog[MaxLogSize], InfoLog[MaxLogSize]; + std::string BuildOptions; + ur_program_build_status_t BuildStatus = UR_PROGRAM_BUILD_STATUS_NONE; + + ur_program_handle_t_(ur_context_handle_t Context); + ~ur_program_handle_t_(); + + ur_result_t setMetadata(const ur_program_metadata_t *Metadata, size_t Length); + + ur_result_t setBinary(const char *Binary, size_t BinarySizeInBytes); + + ur_result_t buildProgram(const char *BuildOptions); + ur_context_handle_t getContext() const { return Context; }; + + native_type get() const noexcept { return Module; }; + + uint32_t incrementReferenceCount() noexcept { return ++RefCount; } + + uint32_t decrementReferenceCount() noexcept { return --RefCount; } + + uint32_t getReferenceCount() const noexcept { return RefCount; } +}; diff --git a/source/adapters/cuda/queue.cpp b/source/adapters/cuda/queue.cpp new file mode 100644 index 0000000000..ed356275fe --- /dev/null +++ b/source/adapters/cuda/queue.cpp @@ -0,0 +1,319 @@ +//===--------- queue.cpp - CUDA Adapter -----------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "queue.hpp" +#include "common.hpp" +#include "context.hpp" +#include "event.hpp" + +#include +#include + +void ur_queue_handle_t_::computeStreamWaitForBarrierIfNeeded(CUstream Stream, + uint32_t StreamI) { + if (BarrierEvent && !ComputeAppliedBarrier[StreamI]) { + UR_CHECK_ERROR(cuStreamWaitEvent(Stream, BarrierEvent, 0)); + ComputeAppliedBarrier[StreamI] = true; + } +} + +void ur_queue_handle_t_::transferStreamWaitForBarrierIfNeeded( + CUstream Stream, uint32_t StreamI) { + if (BarrierEvent && !TransferAppliedBarrier[StreamI]) { + UR_CHECK_ERROR(cuStreamWaitEvent(Stream, BarrierEvent, 0)); + TransferAppliedBarrier[StreamI] = true; + } +} + +CUstream ur_queue_handle_t_::getNextComputeStream(uint32_t *StreamToken) { + uint32_t StreamI; + uint32_t Token; + while (true) { + if (NumComputeStreams < ComputeStreams.size()) { + // the check above is for performance - so as not to lock mutex every time + std::lock_guard guard(ComputeStreamMutex); + // The second check is done after mutex is locked so other threads can not + // change NumComputeStreams after that + if (NumComputeStreams < ComputeStreams.size()) { + UR_CHECK_ERROR( + cuStreamCreate(&ComputeStreams[NumComputeStreams++], Flags)); + } + } + Token = ComputeStreamIndex++; + StreamI = Token % ComputeStreams.size(); + // if a stream has been reused before it was next selected round-robin + // fashion, we want to delay its next use and instead select another one + // that is more likely to have completed all the enqueued work. 
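    // For example, with the default pool of 128 compute streams, token 130
    // maps to ComputeStreams[130 % 128], i.e. stream 2; if that stream was
    // just reused via the event-based overload below, DelayCompute[2] was set
    // there, so this iteration clears the flag and advances to token 131
    // (stream 3) instead.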
+ if (DelayCompute[StreamI]) { + DelayCompute[StreamI] = false; + } else { + break; + } + } + if (StreamToken) { + *StreamToken = Token; + } + CUstream res = ComputeStreams[StreamI]; + computeStreamWaitForBarrierIfNeeded(res, StreamI); + return res; +} + +CUstream ur_queue_handle_t_::getNextComputeStream( + uint32_t NumEventsInWaitList, const ur_event_handle_t *EventWaitList, + ur_stream_guard_ &Guard, uint32_t *StreamToken) { + for (uint32_t i = 0; i < NumEventsInWaitList; i++) { + uint32_t Token = EventWaitList[i]->getComputeStreamToken(); + if (reinterpret_cast(EventWaitList[i]->getQueue()) == + this && + canReuseStream(Token)) { + std::unique_lock ComputeSyncGuard(ComputeStreamSyncMutex); + // redo the check after lock to avoid data races on + // LastSyncComputeStreams + if (canReuseStream(Token)) { + uint32_t StreamI = Token % DelayCompute.size(); + DelayCompute[StreamI] = true; + if (StreamToken) { + *StreamToken = Token; + } + Guard = ur_stream_guard_{std::move(ComputeSyncGuard)}; + CUstream Result = EventWaitList[i]->getStream(); + computeStreamWaitForBarrierIfNeeded(Result, StreamI); + return Result; + } + } + } + Guard = {}; + return getNextComputeStream(StreamToken); +} + +CUstream ur_queue_handle_t_::getNextTransferStream() { + if (TransferStreams.empty()) { // for example in in-order queue + return getNextComputeStream(); + } + if (NumTransferStreams < TransferStreams.size()) { + // the check above is for performance - so as not to lock mutex every time + std::lock_guard Guuard(TransferStreamMutex); + // The second check is done after mutex is locked so other threads can not + // change NumTransferStreams after that + if (NumTransferStreams < TransferStreams.size()) { + UR_CHECK_ERROR( + cuStreamCreate(&TransferStreams[NumTransferStreams++], Flags)); + } + } + uint32_t StreamI = TransferStreamIndex++ % TransferStreams.size(); + CUstream Result = TransferStreams[StreamI]; + transferStreamWaitForBarrierIfNeeded(Result, StreamI); + return Result; +} + +/// Creates a `ur_queue_handle_t` object on the CUDA backend. +/// Valid properties +/// * __SYCL_PI_CUDA_USE_DEFAULT_STREAM -> CU_STREAM_DEFAULT +/// * __SYCL_PI_CUDA_SYNC_WITH_DEFAULT -> CU_STREAM_NON_BLOCKING +UR_APIEXPORT ur_result_t UR_APICALL +urQueueCreate(ur_context_handle_t hContext, ur_device_handle_t hDevice, + const ur_queue_properties_t *pProps, ur_queue_handle_t *phQueue) { + try { + std::unique_ptr Queue{nullptr}; + + if (hContext->getDevice() != hDevice) { + *phQueue = nullptr; + return UR_RESULT_ERROR_INVALID_DEVICE; + } + + unsigned int Flags = CU_STREAM_NON_BLOCKING; + ur_queue_flags_t URFlags = 0; + bool IsOutOfOrder = false; + if (pProps && pProps->stype == UR_STRUCTURE_TYPE_QUEUE_PROPERTIES) { + URFlags = pProps->flags; + if (URFlags == UR_QUEUE_FLAG_USE_DEFAULT_STREAM) { + Flags = CU_STREAM_DEFAULT; + } else if (URFlags == UR_QUEUE_FLAG_SYNC_WITH_DEFAULT_STREAM) { + Flags = 0; + } + + if (URFlags & UR_QUEUE_FLAG_OUT_OF_ORDER_EXEC_MODE_ENABLE) { + IsOutOfOrder = true; + } + } + + std::vector ComputeCuStreams( + IsOutOfOrder ? ur_queue_handle_t_::DefaultNumComputeStreams : 1); + std::vector TransferCuStreams( + IsOutOfOrder ? ur_queue_handle_t_::DefaultNumTransferStreams : 0); + + Queue = std::unique_ptr(new ur_queue_handle_t_{ + std::move(ComputeCuStreams), std::move(TransferCuStreams), hContext, + hDevice, Flags, URFlags}); + + *phQueue = Queue.release(); + + return UR_RESULT_SUCCESS; + } catch (ur_result_t Err) { + + return Err; + + } catch (...) 
{ + + return UR_RESULT_ERROR_OUT_OF_RESOURCES; + } +} + +UR_APIEXPORT ur_result_t UR_APICALL urQueueRetain(ur_queue_handle_t hQueue) { + assert(hQueue->getReferenceCount() > 0); + + hQueue->incrementReferenceCount(); + return UR_RESULT_SUCCESS; +} + +UR_APIEXPORT ur_result_t UR_APICALL urQueueRelease(ur_queue_handle_t hQueue) { + if (hQueue->decrementReferenceCount() > 0) { + return UR_RESULT_SUCCESS; + } + + try { + std::unique_ptr Queue(hQueue); + + if (!hQueue->backendHasOwnership()) + return UR_RESULT_SUCCESS; + + ScopedContext Active(hQueue->getContext()); + + hQueue->forEachStream([](CUstream S) { + UR_CHECK_ERROR(cuStreamSynchronize(S)); + UR_CHECK_ERROR(cuStreamDestroy(S)); + }); + + return UR_RESULT_SUCCESS; + } catch (ur_result_t Err) { + return Err; + } catch (...) { + return UR_RESULT_ERROR_OUT_OF_RESOURCES; + } +} + +UR_APIEXPORT ur_result_t UR_APICALL urQueueFinish(ur_queue_handle_t hQueue) { + ur_result_t Result = UR_RESULT_SUCCESS; + + try { + ScopedContext active(hQueue->getContext()); + + hQueue->syncStreams( + [](CUstream s) { UR_CHECK_ERROR(cuStreamSynchronize(s)); }); + + } catch (ur_result_t Err) { + + Result = Err; + + } catch (...) { + + Result = UR_RESULT_ERROR_OUT_OF_RESOURCES; + } + + return Result; +} + +// There is no CUDA counterpart for queue flushing and we don't run into the +// same problem of having to flush cross-queue dependencies as some of the +// other plugins, so it can be left as no-op. +UR_APIEXPORT ur_result_t UR_APICALL urQueueFlush(ur_queue_handle_t hQueue) { + std::ignore = hQueue; + return UR_RESULT_SUCCESS; +} + +UR_APIEXPORT ur_result_t UR_APICALL +urQueueGetNativeHandle(ur_queue_handle_t hQueue, ur_queue_native_desc_t *pDesc, + ur_native_handle_t *phNativeQueue) { + std::ignore = pDesc; + + ScopedContext Active(hQueue->getContext()); + *phNativeQueue = + reinterpret_cast(hQueue->getNextComputeStream()); + return UR_RESULT_SUCCESS; +} + +UR_APIEXPORT ur_result_t UR_APICALL urQueueCreateWithNativeHandle( + ur_native_handle_t hNativeQueue, ur_context_handle_t hContext, + ur_device_handle_t hDevice, const ur_queue_native_properties_t *pProperties, + ur_queue_handle_t *phQueue) { + (void)hDevice; + + unsigned int CuFlags; + CUstream CuStream = reinterpret_cast(hNativeQueue); + + UR_CHECK_ERROR(cuStreamGetFlags(CuStream, &CuFlags)); + + ur_queue_flags_t Flags = 0; + if (CuFlags == CU_STREAM_DEFAULT) + Flags = UR_QUEUE_FLAG_USE_DEFAULT_STREAM; + else if (CuFlags == CU_STREAM_NON_BLOCKING) + Flags = UR_QUEUE_FLAG_SYNC_WITH_DEFAULT_STREAM; + else + detail::ur::die("Unknown cuda stream"); + + std::vector ComputeCuStreams(1, CuStream); + std::vector TransferCuStreams(0); + + // Create queue and set num_compute_streams to 1, as computeCuStreams has + // valid stream + *phQueue = + new ur_queue_handle_t_{std::move(ComputeCuStreams), + std::move(TransferCuStreams), + hContext, + hContext->getDevice(), + CuFlags, + Flags, + /*backend_owns*/ pProperties->isNativeHandleOwned}; + (*phQueue)->NumComputeStreams = 1; + + return UR_RESULT_SUCCESS; +} + +UR_APIEXPORT ur_result_t UR_APICALL urQueueGetInfo(ur_queue_handle_t hQueue, + ur_queue_info_t propName, + size_t propValueSize, + void *pPropValue, + size_t *pPropSizeRet) { + UrReturnHelper ReturnValue(propValueSize, pPropValue, pPropSizeRet); + + switch (propName) { + case UR_QUEUE_INFO_CONTEXT: + return ReturnValue(hQueue->Context); + case UR_QUEUE_INFO_DEVICE: + return ReturnValue(hQueue->Device); + case UR_QUEUE_INFO_REFERENCE_COUNT: + return ReturnValue(hQueue->getReferenceCount()); + case 
UR_QUEUE_INFO_FLAGS: + return ReturnValue(hQueue->URFlags); + case UR_QUEUE_INFO_EMPTY: { + try { + bool IsReady = hQueue->allOf([](CUstream S) -> bool { + const CUresult Ret = cuStreamQuery(S); + if (Ret == CUDA_SUCCESS) + return true; + + if (Ret == CUDA_ERROR_NOT_READY) + return false; + + UR_CHECK_ERROR(Ret); + return false; + }); + return ReturnValue(IsReady); + } catch (ur_result_t Err) { + return Err; + } catch (...) { + return UR_RESULT_ERROR_OUT_OF_RESOURCES; + } + } + case UR_QUEUE_INFO_DEVICE_DEFAULT: + case UR_QUEUE_INFO_SIZE: + return UR_RESULT_ERROR_UNSUPPORTED_ENUMERATION; + default: + return UR_RESULT_ERROR_INVALID_ENUMERATION; + } +} diff --git a/source/adapters/cuda/queue.hpp b/source/adapters/cuda/queue.hpp new file mode 100644 index 0000000000..093cc57b85 --- /dev/null +++ b/source/adapters/cuda/queue.hpp @@ -0,0 +1,245 @@ +//===--------- queue.hpp - CUDA Adapter -----------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +#pragma once + +#include + +#include +#include +#include + +using ur_stream_guard_ = std::unique_lock; + +/// UR queue mapping on to CUstream objects. +/// +struct ur_queue_handle_t_ { + + using native_type = CUstream; + static constexpr int DefaultNumComputeStreams = 128; + static constexpr int DefaultNumTransferStreams = 64; + + std::vector ComputeStreams; + std::vector TransferStreams; + // delay_compute_ keeps track of which streams have been recently reused and + // their next use should be delayed. If a stream has been recently reused it + // will be skipped the next time it would be selected round-robin style. When + // skipped, its delay flag is cleared. 
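  // (Illustrative cross-reference: the flag is consulted and cleared in
  // getNextComputeStream() in queue.cpp, so a flagged stream is skipped
  // exactly once before rejoining the rotation.)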
+ std::vector DelayCompute; + // keep track of which streams have applied barrier + std::vector ComputeAppliedBarrier; + std::vector TransferAppliedBarrier; + ur_context_handle_t_ *Context; + ur_device_handle_t_ *Device; + CUevent BarrierEvent = nullptr; + CUevent BarrierTmpEvent = nullptr; + std::atomic_uint32_t RefCount; + std::atomic_uint32_t EventCount; + std::atomic_uint32_t ComputeStreamIndex; + std::atomic_uint32_t TransferStreamIndex; + unsigned int NumComputeStreams; + unsigned int NumTransferStreams; + unsigned int LastSyncComputeStreams; + unsigned int LastSyncTransferStreams; + unsigned int Flags; + ur_queue_flags_t URFlags; + // When ComputeStreamSyncMutex and ComputeStreamMutex both need to be + // locked at the same time, ComputeStreamSyncMutex should be locked first + // to avoid deadlocks + std::mutex ComputeStreamSyncMutex; + std::mutex ComputeStreamMutex; + std::mutex TransferStreamMutex; + std::mutex BarrierMutex; + bool HasOwnership; + + ur_queue_handle_t_(std::vector &&ComputeStreams, + std::vector &&TransferStreams, + ur_context_handle_t_ *Context, ur_device_handle_t_ *Device, + unsigned int Flags, ur_queue_flags_t URFlags, + bool BackendOwns = true) + : ComputeStreams{std::move(ComputeStreams)}, TransferStreams{std::move( + TransferStreams)}, + DelayCompute(this->ComputeStreams.size(), false), + ComputeAppliedBarrier(this->ComputeStreams.size()), + TransferAppliedBarrier(this->TransferStreams.size()), Context{Context}, + Device{Device}, RefCount{1}, EventCount{0}, ComputeStreamIndex{0}, + TransferStreamIndex{0}, NumComputeStreams{0}, NumTransferStreams{0}, + LastSyncComputeStreams{0}, LastSyncTransferStreams{0}, Flags(Flags), + URFlags(URFlags), HasOwnership{BackendOwns} { + urContextRetain(Context); + urDeviceRetain(Device); + } + + ~ur_queue_handle_t_() { + urContextRelease(Context); + urDeviceRelease(Device); + } + + void computeStreamWaitForBarrierIfNeeded(CUstream Strean, uint32_t StreamI); + void transferStreamWaitForBarrierIfNeeded(CUstream Stream, uint32_t StreamI); + + // get_next_compute/transfer_stream() functions return streams from + // appropriate pools in round-robin fashion + native_type getNextComputeStream(uint32_t *StreamToken = nullptr); + // this overload tries select a stream that was used by one of dependencies. + // If that is not possible returns a new stream. 
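  // Caller-side sketch (illustrative, not part of this header) of how these
  // pools get sized: with the out-of-order flag the constructor receives
  // DefaultNumComputeStreams / DefaultNumTransferStreams slots (actual
  // CUstreams are created lazily), otherwise a single compute stream and no
  // transfer streams. `Context` and `Device` are assumed pre-existing handles;
  // error handling is omitted.
  //
  //   ur_queue_properties_t Props{};
  //   Props.stype = UR_STRUCTURE_TYPE_QUEUE_PROPERTIES;
  //   Props.flags = UR_QUEUE_FLAG_OUT_OF_ORDER_EXEC_MODE_ENABLE;
  //
  //   ur_queue_handle_t Queue = nullptr;
  //   urQueueCreate(Context, Device, &Props, &Queue);
  //   ...
  //   urQueueRelease(Queue);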
If a stream is reused it + // returns a lock that needs to remain locked as long as the stream is in use + native_type getNextComputeStream(uint32_t NumEventsInWaitList, + const ur_event_handle_t *EventWaitList, + ur_stream_guard_ &Guard, + uint32_t *StreamToken = nullptr); + native_type getNextTransferStream(); + native_type get() { return getNextComputeStream(); }; + + bool hasBeenSynchronized(uint32_t StreamToken) { + // stream token not associated with one of the compute streams + if (StreamToken == std::numeric_limits::max()) { + return false; + } + return LastSyncComputeStreams > StreamToken; + } + + bool canReuseStream(uint32_t StreamToken) { + // stream token not associated with one of the compute streams + if (StreamToken == std::numeric_limits::max()) { + return false; + } + // If the command represented by the stream token was not the last command + // enqueued to the stream we can not reuse the stream - we need to allow for + // commands enqueued after it and the one we are about to enqueue to run + // concurrently + bool IsLastCommand = + (ComputeStreamIndex - StreamToken) <= ComputeStreams.size(); + // If there was a barrier enqueued to the queue after the command + // represented by the stream token we should not reuse the stream, as we can + // not take that stream into account for the bookkeeping for the next + // barrier - such a stream would not be synchronized with. Performance-wise + // it does not matter that we do not reuse the stream, as the work + // represented by the stream token is guaranteed to be complete by the + // barrier before any work we are about to enqueue to the stream will start, + // so the event does not need to be synchronized with. + return IsLastCommand && !hasBeenSynchronized(StreamToken); + } + + template bool allOf(T &&F) { + { + std::lock_guard ComputeGuard(ComputeStreamMutex); + unsigned int End = std::min( + static_cast(ComputeStreams.size()), NumComputeStreams); + if (!std::all_of(ComputeStreams.begin(), ComputeStreams.begin() + End, F)) + return false; + } + { + std::lock_guard TransferGuard(TransferStreamMutex); + unsigned int End = + std::min(static_cast(TransferStreams.size()), + NumTransferStreams); + if (!std::all_of(TransferStreams.begin(), TransferStreams.begin() + End, + F)) + return false; + } + return true; + } + + template void forEachStream(T &&F) { + { + std::lock_guard compute_guard(ComputeStreamMutex); + unsigned int End = std::min( + static_cast(ComputeStreams.size()), NumComputeStreams); + for (unsigned int i = 0; i < End; i++) { + F(ComputeStreams[i]); + } + } + { + std::lock_guard transfer_guard(TransferStreamMutex); + unsigned int End = + std::min(static_cast(TransferStreams.size()), + NumTransferStreams); + for (unsigned int i = 0; i < End; i++) { + F(TransferStreams[i]); + } + } + } + + template void syncStreams(T &&F) { + auto SyncCompute = [&F, &Streams = ComputeStreams, &Delay = DelayCompute]( + unsigned int Start, unsigned int Stop) { + for (unsigned int i = Start; i < Stop; i++) { + F(Streams[i]); + Delay[i] = false; + } + }; + auto SyncTransfer = [&F, &streams = TransferStreams](unsigned int Start, + unsigned int Stop) { + for (unsigned int i = Start; i < Stop; i++) { + F(streams[i]); + } + }; + { + unsigned int Size = static_cast(ComputeStreams.size()); + std::lock_guard ComputeSyncGuard(ComputeStreamSyncMutex); + std::lock_guard ComputeGuard(ComputeStreamMutex); + unsigned int Start = LastSyncComputeStreams; + unsigned int End = NumComputeStreams < Size ? 
NumComputeStreams + : ComputeStreamIndex.load(); + if (ResetUsed) { + LastSyncComputeStreams = End; + } + if (End - Start >= Size) { + SyncCompute(0, Size); + } else { + Start %= Size; + End %= Size; + if (Start <= End) { + SyncCompute(Start, End); + } else { + SyncCompute(Start, Size); + SyncCompute(0, End); + } + } + } + { + unsigned int Size = static_cast(TransferStreams.size()); + if (!Size) { + return; + } + std::lock_guard TransferGuard(TransferStreamMutex); + unsigned int Start = LastSyncTransferStreams; + unsigned int End = NumTransferStreams < Size ? NumTransferStreams + : TransferStreamIndex.load(); + if (ResetUsed) { + LastSyncTransferStreams = End; + } + if (End - Start >= Size) { + SyncTransfer(0, Size); + } else { + Start %= Size; + End %= Size; + if (Start <= End) { + SyncTransfer(Start, End); + } else { + SyncTransfer(Start, Size); + SyncTransfer(0, End); + } + } + } + } + + ur_context_handle_t_ *getContext() const { return Context; }; + + ur_device_handle_t_ *get_device() const { return Device; }; + + uint32_t incrementReferenceCount() noexcept { return ++RefCount; } + + uint32_t decrementReferenceCount() noexcept { return --RefCount; } + + uint32_t getReferenceCount() const noexcept { return RefCount; } + + uint32_t getNextEventID() noexcept { return ++EventCount; } + + bool backendHasOwnership() const noexcept { return HasOwnership; } +}; diff --git a/source/adapters/cuda/sampler.cpp b/source/adapters/cuda/sampler.cpp new file mode 100644 index 0000000000..e561f4902b --- /dev/null +++ b/source/adapters/cuda/sampler.cpp @@ -0,0 +1,106 @@ +//===--------- sampler.cpp - CUDA Adapter ---------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "sampler.hpp" +#include "common.hpp" + +UR_APIEXPORT ur_result_t UR_APICALL +urSamplerCreate(ur_context_handle_t hContext, const ur_sampler_desc_t *pDesc, + ur_sampler_handle_t *phSampler) { + std::unique_ptr Sampler{ + new ur_sampler_handle_t_(hContext)}; + + if (pDesc && pDesc->stype == UR_STRUCTURE_TYPE_SAMPLER_DESC) { + Sampler->Props |= pDesc->normalizedCoords; + Sampler->Props |= pDesc->filterMode << 1; + Sampler->Props |= pDesc->addressingMode << 2; + } else { + // Set default values + Sampler->Props |= true; // Normalized Coords + Sampler->Props |= UR_SAMPLER_ADDRESSING_MODE_CLAMP << 2; + } + + void *pNext = const_cast(pDesc->pNext); + while (pNext != nullptr) { + const ur_base_desc_t *BaseDesc = + reinterpret_cast(pNext); + if (BaseDesc->stype == UR_STRUCTURE_TYPE_EXP_SAMPLER_MIP_PROPERTIES) { + const ur_exp_sampler_mip_properties_t *SamplerMipProperties = + reinterpret_cast(pNext); + Sampler->MaxMipmapLevelClamp = SamplerMipProperties->maxMipmapLevelClamp; + Sampler->MinMipmapLevelClamp = SamplerMipProperties->minMipmapLevelClamp; + Sampler->MaxAnisotropy = SamplerMipProperties->maxAnisotropy; + Sampler->Props |= SamplerMipProperties->mipFilterMode << 5; + } + pNext = const_cast(BaseDesc->pNext); + } + + *phSampler = Sampler.release(); + return UR_RESULT_SUCCESS; +} + +UR_APIEXPORT ur_result_t UR_APICALL +urSamplerGetInfo(ur_sampler_handle_t hSampler, ur_sampler_info_t propName, + size_t propValueSize, void *pPropValue, size_t *pPropSizeRet) { + UrReturnHelper ReturnValue(propValueSize, pPropValue, pPropSizeRet); + + switch (propName) { + case UR_SAMPLER_INFO_REFERENCE_COUNT: + return ReturnValue(hSampler->getReferenceCount()); + case UR_SAMPLER_INFO_CONTEXT: + return ReturnValue(hSampler->Context); + case UR_SAMPLER_INFO_NORMALIZED_COORDS: { + bool NormCoordsProp = hSampler->isNormalizedCoords(); + return ReturnValue(NormCoordsProp); + } + case UR_SAMPLER_INFO_FILTER_MODE: { + ur_sampler_filter_mode_t FilterProp = hSampler->getFilterMode(); + return ReturnValue(FilterProp); + } + case UR_SAMPLER_INFO_ADDRESSING_MODE: { + ur_sampler_addressing_mode_t AddressingProp = hSampler->getAddressingMode(); + return ReturnValue(AddressingProp); + } + default: + return UR_RESULT_ERROR_UNSUPPORTED_ENUMERATION; + } + return {}; +} + +UR_APIEXPORT ur_result_t UR_APICALL +urSamplerRetain(ur_sampler_handle_t hSampler) { + hSampler->incrementReferenceCount(); + return UR_RESULT_SUCCESS; +} + +UR_APIEXPORT ur_result_t UR_APICALL +urSamplerRelease(ur_sampler_handle_t hSampler) { + // double delete or someone is messing with the ref count. + // either way, cannot safely proceed. + detail::ur::assertion( + hSampler->getReferenceCount() != 0, + "Reference count overflow detected in urSamplerRelease."); + + // decrement ref count. If it is 0, delete the sampler. 
+ if (hSampler->decrementReferenceCount() == 0) { + delete hSampler; + } + + return UR_RESULT_SUCCESS; +} + +UR_APIEXPORT ur_result_t UR_APICALL +urSamplerGetNativeHandle(ur_sampler_handle_t, ur_native_handle_t *) { + return UR_RESULT_ERROR_UNSUPPORTED_FEATURE; +} + +UR_APIEXPORT ur_result_t UR_APICALL urSamplerCreateWithNativeHandle( + ur_native_handle_t, ur_context_handle_t, + const ur_sampler_native_properties_t *, ur_sampler_handle_t *) { + return UR_RESULT_ERROR_UNSUPPORTED_FEATURE; +} diff --git a/source/adapters/cuda/sampler.hpp b/source/adapters/cuda/sampler.hpp new file mode 100644 index 0000000000..8c362b98c9 --- /dev/null +++ b/source/adapters/cuda/sampler.hpp @@ -0,0 +1,54 @@ +//===--------- sampler.hpp - CUDA Adapter ---------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include + +/// Implementation of samplers for CUDA +/// +/// Sampler property layout: +/// | | +/// ----------------------------------- +/// | 31 30 ... 6 | N/A +/// | 5 | mip filter mode +/// | 4 3 2 | addressing mode +/// | 1 | filter mode +/// | 0 | normalize coords +struct ur_sampler_handle_t_ { + std::atomic_uint32_t RefCount; + uint32_t Props; + float MinMipmapLevelClamp; + float MaxMipmapLevelClamp; + float MaxAnisotropy; + ur_context_handle_t Context; + + ur_sampler_handle_t_(ur_context_handle_t Context) + : RefCount(1), Props(0), MinMipmapLevelClamp(0.0f), + MaxMipmapLevelClamp(0.0f), MaxAnisotropy(0.0f), Context(Context) {} + + uint32_t incrementReferenceCount() noexcept { return ++RefCount; } + + uint32_t decrementReferenceCount() noexcept { return --RefCount; } + + uint32_t getReferenceCount() const noexcept { return RefCount; } + + ur_bool_t isNormalizedCoords() const noexcept { + return static_cast(Props & 0b1); + } + + ur_sampler_filter_mode_t getFilterMode() const noexcept { + return static_cast((Props >> 1) & 0b1); + } + + ur_sampler_addressing_mode_t getAddressingMode() const noexcept { + return static_cast((Props >> 2) & 0b111); + } + + ur_sampler_filter_mode_t getMipFilterMode() const noexcept { + return static_cast((Props >> 5) & 0b1); + } +}; diff --git a/source/adapters/cuda/tracing.cpp b/source/adapters/cuda/tracing.cpp new file mode 100644 index 0000000000..9c0183960e --- /dev/null +++ b/source/adapters/cuda/tracing.cpp @@ -0,0 +1,109 @@ +//===-------------- tracing.cpp - CUDA Host API Tracing --------------------==// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifdef XPTI_ENABLE_INSTRUMENTATION +#include +#include +#endif + +#include +#ifdef XPTI_ENABLE_INSTRUMENTATION +#include +#endif // XPTI_ENABLE_INSTRUMENTATION + +#include +#include + +#ifdef XPTI_ENABLE_INSTRUMENTATION +constexpr auto CUDA_CALL_STREAM_NAME = "sycl.experimental.cuda.call"; +constexpr auto CUDA_DEBUG_STREAM_NAME = "sycl.experimental.cuda.debug"; + +thread_local uint64_t CallCorrelationID = 0; +thread_local uint64_t DebugCorrelationID = 0; + +static xpti_td *GCallEvent = nullptr; +static xpti_td *GDebugEvent = nullptr; + +constexpr auto GVerStr = "0.1"; +constexpr int GMajVer = 0; +constexpr int GMinVer = 1; + +static void cuptiCallback(void *, CUpti_CallbackDomain, CUpti_CallbackId CBID, + const void *CBData) { + if (xptiTraceEnabled()) { + const auto *CBInfo = static_cast(CBData); + + if (CBInfo->callbackSite == CUPTI_API_ENTER) { + CallCorrelationID = xptiGetUniqueId(); + DebugCorrelationID = xptiGetUniqueId(); + } + + const char *FuncName = CBInfo->functionName; + uint32_t FuncID = static_cast(CBID); + uint16_t TraceTypeArgs = CBInfo->callbackSite == CUPTI_API_ENTER + ? xpti::trace_function_with_args_begin + : xpti::trace_function_with_args_end; + uint16_t TraceType = CBInfo->callbackSite == CUPTI_API_ENTER + ? xpti::trace_function_begin + : xpti::trace_function_end; + + uint8_t CallStreamID = xptiRegisterStream(CUDA_CALL_STREAM_NAME); + uint8_t DebugStreamID = xptiRegisterStream(CUDA_DEBUG_STREAM_NAME); + + xptiNotifySubscribers(CallStreamID, TraceType, GCallEvent, nullptr, + CallCorrelationID, FuncName); + + xpti::function_with_args_t Payload{ + FuncID, FuncName, const_cast(CBInfo->functionParams), + CBInfo->functionReturnValue, CBInfo->context}; + xptiNotifySubscribers(DebugStreamID, TraceTypeArgs, GDebugEvent, nullptr, + DebugCorrelationID, &Payload); + } +} +#endif + +void enableCUDATracing() { +#ifdef XPTI_ENABLE_INSTRUMENTATION + if (!xptiTraceEnabled()) + return; + + xptiRegisterStream(CUDA_CALL_STREAM_NAME); + xptiInitialize(CUDA_CALL_STREAM_NAME, GMajVer, GMinVer, GVerStr); + xptiRegisterStream(CUDA_DEBUG_STREAM_NAME); + xptiInitialize(CUDA_DEBUG_STREAM_NAME, GMajVer, GMinVer, GVerStr); + + uint64_t Dummy; + xpti::payload_t CUDAPayload("CUDA Plugin Layer"); + GCallEvent = + xptiMakeEvent("CUDA Plugin Layer", &CUDAPayload, + xpti::trace_algorithm_event, xpti_at::active, &Dummy); + + xpti::payload_t CUDADebugPayload("CUDA Plugin Debug Layer"); + GDebugEvent = + xptiMakeEvent("CUDA Plugin Debug Layer", &CUDADebugPayload, + xpti::trace_algorithm_event, xpti_at::active, &Dummy); + + CUpti_SubscriberHandle Subscriber; + cuptiSubscribe(&Subscriber, cuptiCallback, nullptr); + cuptiEnableDomain(1, Subscriber, CUPTI_CB_DOMAIN_DRIVER_API); + cuptiEnableCallback(0, Subscriber, CUPTI_CB_DOMAIN_DRIVER_API, + CUPTI_DRIVER_TRACE_CBID_cuGetErrorString); + cuptiEnableCallback(0, Subscriber, CUPTI_CB_DOMAIN_DRIVER_API, + CUPTI_DRIVER_TRACE_CBID_cuGetErrorName); +#endif +} + +void disableCUDATracing() { +#ifdef XPTI_ENABLE_INSTRUMENTATION + if (!xptiTraceEnabled()) + return; + + xptiFinalize(CUDA_CALL_STREAM_NAME); + xptiFinalize(CUDA_DEBUG_STREAM_NAME); +#endif // XPTI_ENABLE_INSTRUMENTATION +} diff --git a/source/adapters/cuda/ur_interface_loader.cpp b/source/adapters/cuda/ur_interface_loader.cpp new file mode 100644 index 0000000000..73eace5818 --- /dev/null +++ b/source/adapters/cuda/ur_interface_loader.cpp @@ -0,0 +1,355 @@ 
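A minimal sketch of how a loader-side consumer might drive one of the DDI tables
populated by the file below (illustrative only; the variable names are made up,
this adapter ignores the adapter-handle arguments of pfnGet, and error handling
is omitted):

    ur_platform_dditable_t PlatformTable{};
    if (urGetPlatformProcAddrTable(UR_API_VERSION_CURRENT, &PlatformTable) ==
        UR_RESULT_SUCCESS) {
      uint32_t NumPlatforms = 0;
      PlatformTable.pfnGet(/*phAdapters=*/nullptr, /*NumAdapters=*/0,
                           /*NumEntries=*/0, nullptr, &NumPlatforms);
    }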
+//===--------- ur_interface_loader.cpp - Unified Runtime -----------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include +#include + +namespace { + +// TODO - this is a duplicate of what is in the L0 plugin +// We should move this to somewhere common +ur_result_t validateProcInputs(ur_api_version_t version, void *pDdiTable) { + if (pDdiTable == nullptr) { + return UR_RESULT_ERROR_INVALID_NULL_POINTER; + } + // Pre 1.0 we enforce that loader and adapter must have the same version. + // Post 1.0 only a major version match should be required. + if (version != UR_API_VERSION_CURRENT) { + return UR_RESULT_ERROR_UNSUPPORTED_VERSION; + } + return UR_RESULT_SUCCESS; +} +} // namespace + +#if defined(__cplusplus) +extern "C" { +#endif + +UR_DLLEXPORT ur_result_t UR_APICALL urGetPlatformProcAddrTable( + ur_api_version_t version, ur_platform_dditable_t *pDdiTable) { + auto result = validateProcInputs(version, pDdiTable); + if (UR_RESULT_SUCCESS != result) { + return result; + } + pDdiTable->pfnCreateWithNativeHandle = nullptr; + pDdiTable->pfnGet = urPlatformGet; + pDdiTable->pfnGetApiVersion = urPlatformGetApiVersion; + pDdiTable->pfnGetInfo = urPlatformGetInfo; + pDdiTable->pfnGetNativeHandle = urPlatformGetNativeHandle; + pDdiTable->pfnGetBackendOption = urPlatformGetBackendOption; + return UR_RESULT_SUCCESS; +} + +UR_DLLEXPORT ur_result_t UR_APICALL urGetContextProcAddrTable( + ur_api_version_t version, ur_context_dditable_t *pDdiTable) { + auto result = validateProcInputs(version, pDdiTable); + if (UR_RESULT_SUCCESS != result) { + return result; + } + pDdiTable->pfnCreate = urContextCreate; + pDdiTable->pfnCreateWithNativeHandle = urContextCreateWithNativeHandle; + pDdiTable->pfnGetInfo = urContextGetInfo; + pDdiTable->pfnGetNativeHandle = urContextGetNativeHandle; + pDdiTable->pfnRelease = urContextRelease; + pDdiTable->pfnRetain = urContextRetain; + pDdiTable->pfnSetExtendedDeleter = urContextSetExtendedDeleter; + return UR_RESULT_SUCCESS; +} + +UR_DLLEXPORT ur_result_t UR_APICALL urGetEventProcAddrTable( + ur_api_version_t version, ur_event_dditable_t *pDdiTable) { + auto result = validateProcInputs(version, pDdiTable); + if (UR_RESULT_SUCCESS != result) { + return result; + } + pDdiTable->pfnCreateWithNativeHandle = urEventCreateWithNativeHandle; + pDdiTable->pfnGetInfo = urEventGetInfo; + pDdiTable->pfnGetNativeHandle = urEventGetNativeHandle; + pDdiTable->pfnGetProfilingInfo = urEventGetProfilingInfo; + pDdiTable->pfnRelease = urEventRelease; + pDdiTable->pfnRetain = urEventRetain; + pDdiTable->pfnSetCallback = urEventSetCallback; + pDdiTable->pfnWait = urEventWait; + return UR_RESULT_SUCCESS; +} + +UR_DLLEXPORT ur_result_t UR_APICALL urGetProgramProcAddrTable( + ur_api_version_t version, ur_program_dditable_t *pDdiTable) { + auto result = validateProcInputs(version, pDdiTable); + if (UR_RESULT_SUCCESS != result) { + return result; + } + pDdiTable->pfnBuild = urProgramBuild; + pDdiTable->pfnCompile = urProgramCompile; + pDdiTable->pfnCreateWithBinary = urProgramCreateWithBinary; + pDdiTable->pfnCreateWithIL = urProgramCreateWithIL; + pDdiTable->pfnCreateWithNativeHandle = urProgramCreateWithNativeHandle; + pDdiTable->pfnGetBuildInfo = urProgramGetBuildInfo; + pDdiTable->pfnGetFunctionPointer = urProgramGetFunctionPointer; + 
pDdiTable->pfnGetInfo = urProgramGetInfo; + pDdiTable->pfnGetNativeHandle = urProgramGetNativeHandle; + pDdiTable->pfnLink = urProgramLink; + pDdiTable->pfnRelease = urProgramRelease; + pDdiTable->pfnRetain = urProgramRetain; + pDdiTable->pfnSetSpecializationConstants = + urProgramSetSpecializationConstants; + return UR_RESULT_SUCCESS; +} + +UR_DLLEXPORT ur_result_t UR_APICALL urGetKernelProcAddrTable( + ur_api_version_t version, ur_kernel_dditable_t *pDdiTable) { + auto result = validateProcInputs(version, pDdiTable); + if (UR_RESULT_SUCCESS != result) { + return result; + } + pDdiTable->pfnCreate = urKernelCreate; + pDdiTable->pfnCreateWithNativeHandle = urKernelCreateWithNativeHandle; + pDdiTable->pfnGetGroupInfo = urKernelGetGroupInfo; + pDdiTable->pfnGetInfo = urKernelGetInfo; + pDdiTable->pfnGetNativeHandle = urKernelGetNativeHandle; + pDdiTable->pfnGetSubGroupInfo = urKernelGetSubGroupInfo; + pDdiTable->pfnRelease = urKernelRelease; + pDdiTable->pfnRetain = urKernelRetain; + pDdiTable->pfnSetArgLocal = urKernelSetArgLocal; + pDdiTable->pfnSetArgMemObj = urKernelSetArgMemObj; + pDdiTable->pfnSetArgPointer = urKernelSetArgPointer; + pDdiTable->pfnSetArgSampler = urKernelSetArgSampler; + pDdiTable->pfnSetArgValue = urKernelSetArgValue; + pDdiTable->pfnSetExecInfo = urKernelSetExecInfo; + pDdiTable->pfnSetSpecializationConstants = nullptr; + return UR_RESULT_SUCCESS; +} + +UR_DLLEXPORT ur_result_t UR_APICALL urGetSamplerProcAddrTable( + ur_api_version_t version, ur_sampler_dditable_t *pDdiTable) { + auto result = validateProcInputs(version, pDdiTable); + if (UR_RESULT_SUCCESS != result) { + return result; + } + pDdiTable->pfnCreate = urSamplerCreate; + pDdiTable->pfnCreateWithNativeHandle = urSamplerCreateWithNativeHandle; + pDdiTable->pfnGetInfo = urSamplerGetInfo; + pDdiTable->pfnGetNativeHandle = urSamplerGetNativeHandle; + pDdiTable->pfnRelease = urSamplerRelease; + pDdiTable->pfnRetain = urSamplerRetain; + return UR_RESULT_SUCCESS; +} + +UR_DLLEXPORT ur_result_t UR_APICALL +urGetMemProcAddrTable(ur_api_version_t version, ur_mem_dditable_t *pDdiTable) { + auto result = validateProcInputs(version, pDdiTable); + if (UR_RESULT_SUCCESS != result) { + return result; + } + pDdiTable->pfnBufferCreate = urMemBufferCreate; + pDdiTable->pfnBufferPartition = urMemBufferPartition; + pDdiTable->pfnBufferCreateWithNativeHandle = + urMemBufferCreateWithNativeHandle; + pDdiTable->pfnImageCreateWithNativeHandle = urMemImageCreateWithNativeHandle; + pDdiTable->pfnGetInfo = urMemGetInfo; + pDdiTable->pfnGetNativeHandle = urMemGetNativeHandle; + pDdiTable->pfnImageCreate = urMemImageCreate; + pDdiTable->pfnImageGetInfo = urMemImageGetInfo; + pDdiTable->pfnRelease = urMemRelease; + pDdiTable->pfnRetain = urMemRetain; + return UR_RESULT_SUCCESS; +} + +UR_DLLEXPORT ur_result_t UR_APICALL urGetEnqueueProcAddrTable( + ur_api_version_t version, ur_enqueue_dditable_t *pDdiTable) { + auto result = validateProcInputs(version, pDdiTable); + if (UR_RESULT_SUCCESS != result) { + return result; + } + pDdiTable->pfnDeviceGlobalVariableRead = urEnqueueDeviceGlobalVariableRead; + pDdiTable->pfnDeviceGlobalVariableWrite = urEnqueueDeviceGlobalVariableWrite; + pDdiTable->pfnEventsWait = urEnqueueEventsWait; + pDdiTable->pfnEventsWaitWithBarrier = urEnqueueEventsWaitWithBarrier; + pDdiTable->pfnKernelLaunch = urEnqueueKernelLaunch; + pDdiTable->pfnMemBufferCopy = urEnqueueMemBufferCopy; + pDdiTable->pfnMemBufferCopyRect = urEnqueueMemBufferCopyRect; + pDdiTable->pfnMemBufferFill = urEnqueueMemBufferFill; + 
pDdiTable->pfnMemBufferMap = urEnqueueMemBufferMap; + pDdiTable->pfnMemBufferRead = urEnqueueMemBufferRead; + pDdiTable->pfnMemBufferReadRect = urEnqueueMemBufferReadRect; + pDdiTable->pfnMemBufferWrite = urEnqueueMemBufferWrite; + pDdiTable->pfnMemBufferWriteRect = urEnqueueMemBufferWriteRect; + pDdiTable->pfnMemImageCopy = urEnqueueMemImageCopy; + pDdiTable->pfnMemImageRead = urEnqueueMemImageRead; + pDdiTable->pfnMemImageWrite = urEnqueueMemImageWrite; + pDdiTable->pfnMemUnmap = urEnqueueMemUnmap; + pDdiTable->pfnUSMFill2D = urEnqueueUSMFill2D; + pDdiTable->pfnUSMFill = urEnqueueUSMFill; + pDdiTable->pfnUSMAdvise = urEnqueueUSMAdvise; + pDdiTable->pfnUSMMemcpy2D = urEnqueueUSMMemcpy2D; + pDdiTable->pfnUSMMemcpy = urEnqueueUSMMemcpy; + pDdiTable->pfnUSMPrefetch = urEnqueueUSMPrefetch; + pDdiTable->pfnReadHostPipe = urEnqueueReadHostPipe; + pDdiTable->pfnWriteHostPipe = urEnqueueWriteHostPipe; + return UR_RESULT_SUCCESS; +} + +UR_DLLEXPORT ur_result_t UR_APICALL urGetGlobalProcAddrTable( + ur_api_version_t version, ur_global_dditable_t *pDdiTable) { + auto result = validateProcInputs(version, pDdiTable); + if (UR_RESULT_SUCCESS != result) { + return result; + } + pDdiTable->pfnInit = urInit; + pDdiTable->pfnTearDown = urTearDown; + pDdiTable->pfnAdapterGet = urAdapterGet; + pDdiTable->pfnAdapterRelease = urAdapterRelease; + pDdiTable->pfnAdapterRetain = urAdapterRetain; + pDdiTable->pfnAdapterGetLastError = urAdapterGetLastError; + pDdiTable->pfnAdapterGetInfo = urAdapterGetInfo; + + return UR_RESULT_SUCCESS; +} + +UR_DLLEXPORT ur_result_t UR_APICALL urGetQueueProcAddrTable( + ur_api_version_t version, ur_queue_dditable_t *pDdiTable) { + auto result = validateProcInputs(version, pDdiTable); + if (UR_RESULT_SUCCESS != result) { + return result; + } + pDdiTable->pfnCreate = urQueueCreate; + pDdiTable->pfnCreateWithNativeHandle = urQueueCreateWithNativeHandle; + pDdiTable->pfnFinish = urQueueFinish; + pDdiTable->pfnFlush = urQueueFlush; + pDdiTable->pfnGetInfo = urQueueGetInfo; + pDdiTable->pfnGetNativeHandle = urQueueGetNativeHandle; + pDdiTable->pfnRelease = urQueueRelease; + pDdiTable->pfnRetain = urQueueRetain; + return UR_RESULT_SUCCESS; +} + +UR_DLLEXPORT ur_result_t UR_APICALL +urGetUSMProcAddrTable(ur_api_version_t version, ur_usm_dditable_t *pDdiTable) { + auto result = validateProcInputs(version, pDdiTable); + if (UR_RESULT_SUCCESS != result) { + return result; + } + pDdiTable->pfnDeviceAlloc = urUSMDeviceAlloc; + pDdiTable->pfnFree = urUSMFree; + pDdiTable->pfnGetMemAllocInfo = urUSMGetMemAllocInfo; + pDdiTable->pfnHostAlloc = urUSMHostAlloc; + pDdiTable->pfnPoolCreate = urUSMPoolCreate; + pDdiTable->pfnPoolRetain = urUSMPoolRetain; + pDdiTable->pfnPoolRelease = urUSMPoolRelease; + pDdiTable->pfnPoolGetInfo = urUSMPoolGetInfo; + pDdiTable->pfnSharedAlloc = urUSMSharedAlloc; + return UR_RESULT_SUCCESS; +} + +UR_DLLEXPORT ur_result_t UR_APICALL urGetDeviceProcAddrTable( + ur_api_version_t version, ur_device_dditable_t *pDdiTable) { + auto result = validateProcInputs(version, pDdiTable); + if (UR_RESULT_SUCCESS != result) { + return result; + } + pDdiTable->pfnCreateWithNativeHandle = urDeviceCreateWithNativeHandle; + pDdiTable->pfnGet = urDeviceGet; + pDdiTable->pfnGetGlobalTimestamps = urDeviceGetGlobalTimestamps; + pDdiTable->pfnGetInfo = urDeviceGetInfo; + pDdiTable->pfnGetNativeHandle = urDeviceGetNativeHandle; + pDdiTable->pfnPartition = urDevicePartition; + pDdiTable->pfnRelease = urDeviceRelease; + pDdiTable->pfnRetain = urDeviceRetain; + pDdiTable->pfnSelectBinary = 
urDeviceSelectBinary; + return UR_RESULT_SUCCESS; +} + +UR_DLLEXPORT ur_result_t UR_APICALL urGetCommandBufferExpProcAddrTable( + ur_api_version_t version, ///< [in] API version requested + ur_command_buffer_exp_dditable_t + *pDdiTable ///< [in,out] pointer to table of DDI function pointers +) { + auto retVal = validateProcInputs(version, pDdiTable); + if (UR_RESULT_SUCCESS != retVal) { + return retVal; + } + pDdiTable->pfnCreateExp = urCommandBufferCreateExp; + pDdiTable->pfnRetainExp = urCommandBufferRetainExp; + pDdiTable->pfnReleaseExp = urCommandBufferReleaseExp; + pDdiTable->pfnFinalizeExp = urCommandBufferFinalizeExp; + pDdiTable->pfnAppendKernelLaunchExp = urCommandBufferAppendKernelLaunchExp; + pDdiTable->pfnAppendMemcpyUSMExp = urCommandBufferAppendMemcpyUSMExp; + pDdiTable->pfnAppendMembufferCopyExp = urCommandBufferAppendMembufferCopyExp; + pDdiTable->pfnAppendMembufferCopyRectExp = + urCommandBufferAppendMembufferCopyRectExp; + pDdiTable->pfnAppendMembufferReadExp = urCommandBufferAppendMembufferReadExp; + pDdiTable->pfnAppendMembufferReadRectExp = + urCommandBufferAppendMembufferReadRectExp; + pDdiTable->pfnAppendMembufferWriteExp = + urCommandBufferAppendMembufferWriteExp; + pDdiTable->pfnAppendMembufferWriteRectExp = + urCommandBufferAppendMembufferWriteRectExp; + pDdiTable->pfnEnqueueExp = urCommandBufferEnqueueExp; + + return retVal; +} + +UR_DLLEXPORT ur_result_t UR_APICALL urGetUsmP2PExpProcAddrTable( + ur_api_version_t version, ur_usm_p2p_exp_dditable_t *pDdiTable) { + auto retVal = validateProcInputs(version, pDdiTable); + if (UR_RESULT_SUCCESS != retVal) { + return retVal; + } + pDdiTable->pfnEnablePeerAccessExp = urUsmP2PEnablePeerAccessExp; + pDdiTable->pfnDisablePeerAccessExp = urUsmP2PDisablePeerAccessExp; + pDdiTable->pfnPeerAccessGetInfoExp = urUsmP2PPeerAccessGetInfoExp; + + return retVal; +} + +UR_DLLEXPORT ur_result_t UR_APICALL urGetBindlessImagesExpProcAddrTable( + ur_api_version_t version, ur_bindless_images_exp_dditable_t *pDdiTable) { + auto result = validateProcInputs(version, pDdiTable); + if (UR_RESULT_SUCCESS != result) { + return result; + } + pDdiTable->pfnUnsampledImageHandleDestroyExp = + urBindlessImagesUnsampledImageHandleDestroyExp; + pDdiTable->pfnSampledImageHandleDestroyExp = + urBindlessImagesSampledImageHandleDestroyExp; + pDdiTable->pfnImageAllocateExp = urBindlessImagesImageAllocateExp; + pDdiTable->pfnImageFreeExp = urBindlessImagesImageFreeExp; + pDdiTable->pfnUnsampledImageCreateExp = + urBindlessImagesUnsampledImageCreateExp; + pDdiTable->pfnSampledImageCreateExp = urBindlessImagesSampledImageCreateExp; + pDdiTable->pfnImageCopyExp = urBindlessImagesImageCopyExp; + pDdiTable->pfnImageGetInfoExp = urBindlessImagesImageGetInfoExp; + pDdiTable->pfnMipmapGetLevelExp = urBindlessImagesMipmapGetLevelExp; + pDdiTable->pfnMipmapFreeExp = urBindlessImagesMipmapFreeExp; + pDdiTable->pfnImportOpaqueFDExp = urBindlessImagesImportOpaqueFDExp; + pDdiTable->pfnMapExternalArrayExp = urBindlessImagesMapExternalArrayExp; + pDdiTable->pfnReleaseInteropExp = urBindlessImagesReleaseInteropExp; + pDdiTable->pfnImportExternalSemaphoreOpaqueFDExp = + urBindlessImagesImportExternalSemaphoreOpaqueFDExp; + pDdiTable->pfnDestroyExternalSemaphoreExp = + urBindlessImagesDestroyExternalSemaphoreExp; + pDdiTable->pfnWaitExternalSemaphoreExp = + urBindlessImagesWaitExternalSemaphoreExp; + pDdiTable->pfnSignalExternalSemaphoreExp = + urBindlessImagesSignalExternalSemaphoreExp; + return UR_RESULT_SUCCESS; +} + +UR_DLLEXPORT ur_result_t UR_APICALL 
urGetUSMExpProcAddrTable( + ur_api_version_t version, ur_usm_exp_dditable_t *pDdiTable) { + auto result = validateProcInputs(version, pDdiTable); + if (UR_RESULT_SUCCESS != result) { + return result; + } + pDdiTable->pfnPitchedAllocExp = urUSMPitchedAllocExp; + return UR_RESULT_SUCCESS; +} + +#if defined(__cplusplus) +} // extern "C" +#endif diff --git a/source/adapters/cuda/usm.cpp b/source/adapters/cuda/usm.cpp new file mode 100644 index 0000000000..d272a836e6 --- /dev/null +++ b/source/adapters/cuda/usm.cpp @@ -0,0 +1,503 @@ +//===--------- usm.cpp - CUDA Adapter -------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include + +#include "adapter.hpp" +#include "common.hpp" +#include "context.hpp" +#include "device.hpp" +#include "event.hpp" +#include "platform.hpp" +#include "queue.hpp" +#include "usm.hpp" + +#include + +/// USM: Implements USM Host allocations using CUDA Pinned Memory +/// https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#page-locked-host-memory +UR_APIEXPORT ur_result_t UR_APICALL +urUSMHostAlloc(ur_context_handle_t hContext, const ur_usm_desc_t *pUSMDesc, + ur_usm_pool_handle_t hPool, size_t size, void **ppMem) { + auto alignment = pUSMDesc ? pUSMDesc->align : 0u; + UR_ASSERT(!pUSMDesc || + (alignment == 0 || ((alignment & (alignment - 1)) == 0)), + UR_RESULT_ERROR_INVALID_VALUE); + + if (!hPool) { + return USMHostAllocImpl(ppMem, hContext, nullptr, size, alignment); + } + + auto UMFPool = hPool->HostMemPool.get(); + *ppMem = umfPoolAlignedMalloc(UMFPool, size, alignment); + if (*ppMem == nullptr) { + auto umfErr = umfPoolGetLastAllocationError(UMFPool); + return umf::umf2urResult(umfErr); + } + return UR_RESULT_SUCCESS; +} + +/// USM: Implements USM device allocations using a normal CUDA device pointer +/// +UR_APIEXPORT ur_result_t UR_APICALL +urUSMDeviceAlloc(ur_context_handle_t hContext, ur_device_handle_t hDevice, + const ur_usm_desc_t *pUSMDesc, ur_usm_pool_handle_t hPool, + size_t size, void **ppMem) { + auto alignment = pUSMDesc ? pUSMDesc->align : 0u; + UR_ASSERT(!pUSMDesc || + (alignment == 0 || ((alignment & (alignment - 1)) == 0)), + UR_RESULT_ERROR_INVALID_VALUE); + + if (!hPool) { + return USMDeviceAllocImpl(ppMem, hContext, hDevice, nullptr, size, + alignment); + } + + auto UMFPool = hPool->DeviceMemPool.get(); + *ppMem = umfPoolAlignedMalloc(UMFPool, size, alignment); + if (*ppMem == nullptr) { + auto umfErr = umfPoolGetLastAllocationError(UMFPool); + return umf::umf2urResult(umfErr); + } + return UR_RESULT_SUCCESS; +} + +/// USM: Implements USM Shared allocations using CUDA Managed Memory +/// +UR_APIEXPORT ur_result_t UR_APICALL +urUSMSharedAlloc(ur_context_handle_t hContext, ur_device_handle_t hDevice, + const ur_usm_desc_t *pUSMDesc, ur_usm_pool_handle_t hPool, + size_t size, void **ppMem) { + auto alignment = pUSMDesc ? 
pUSMDesc->align : 0u;
+  UR_ASSERT(!pUSMDesc ||
+                (alignment == 0 || ((alignment & (alignment - 1)) == 0)),
+            UR_RESULT_ERROR_INVALID_VALUE);
+
+  if (!hPool) {
+    return USMSharedAllocImpl(ppMem, hContext, hDevice, nullptr, nullptr, size,
+                              alignment);
+  }
+
+  auto UMFPool = hPool->SharedMemPool.get();
+  *ppMem = umfPoolAlignedMalloc(UMFPool, size, alignment);
+  if (*ppMem == nullptr) {
+    auto umfErr = umfPoolGetLastAllocationError(UMFPool);
+    return umf::umf2urResult(umfErr);
+  }
+  return UR_RESULT_SUCCESS;
+}
+
+ur_result_t USMFreeImpl(ur_context_handle_t Context, void *Pointer) {
+  ur_result_t Result = UR_RESULT_SUCCESS;
+  try {
+    ScopedContext Active(Context);
+    bool IsManaged;
+    unsigned int Type;
+    void *AttributeValues[2] = {&IsManaged, &Type};
+    CUpointer_attribute Attributes[2] = {CU_POINTER_ATTRIBUTE_IS_MANAGED,
+                                         CU_POINTER_ATTRIBUTE_MEMORY_TYPE};
+    UR_CHECK_ERROR(cuPointerGetAttributes(2, Attributes, AttributeValues,
+                                          (CUdeviceptr)Pointer));
+    UR_ASSERT(Type == CU_MEMORYTYPE_DEVICE || Type == CU_MEMORYTYPE_HOST,
+              UR_RESULT_ERROR_INVALID_MEM_OBJECT);
+    if (IsManaged || Type == CU_MEMORYTYPE_DEVICE) {
+      // Memory allocated with cuMemAlloc and cuMemAllocManaged must be freed
+      // with cuMemFree
+      UR_CHECK_ERROR(cuMemFree((CUdeviceptr)Pointer));
+    } else {
+      // Memory allocated with cuMemAllocHost must be freed with cuMemFreeHost
+      UR_CHECK_ERROR(cuMemFreeHost(Pointer));
+    }
+  } catch (ur_result_t Err) {
+    Result = Err;
+  }
+  return Result;
+}
+
+/// USM: Frees the given USM pointer associated with the context.
+///
+UR_APIEXPORT ur_result_t UR_APICALL urUSMFree(ur_context_handle_t hContext,
+                                              void *pMem) {
+  if (auto Pool = umfPoolByPtr(pMem))
+    return umf::umf2urResult(umfPoolFree(Pool, pMem));
+  return USMFreeImpl(hContext, pMem);
+}
+
+ur_result_t USMDeviceAllocImpl(void **ResultPtr, ur_context_handle_t Context,
+                               ur_device_handle_t, ur_usm_device_mem_flags_t *,
+                               size_t Size, uint32_t Alignment) {
+  try {
+    ScopedContext Active(Context);
+    UR_CHECK_ERROR(cuMemAlloc((CUdeviceptr *)ResultPtr, Size));
+  } catch (ur_result_t Err) {
+    return Err;
+  }
+
+#ifdef NDEBUG
+  std::ignore = Alignment;
+#else
+  assert((Alignment == 0 ||
+          reinterpret_cast<std::uintptr_t>(*ResultPtr) % Alignment == 0));
+#endif
+  return UR_RESULT_SUCCESS;
+}
+
+ur_result_t USMSharedAllocImpl(void **ResultPtr, ur_context_handle_t Context,
+                               ur_device_handle_t, ur_usm_host_mem_flags_t *,
+                               ur_usm_device_mem_flags_t *, size_t Size,
+                               uint32_t Alignment) {
+  try {
+    ScopedContext Active(Context);
+    UR_CHECK_ERROR(cuMemAllocManaged((CUdeviceptr *)ResultPtr, Size,
+                                     CU_MEM_ATTACH_GLOBAL));
+  } catch (ur_result_t Err) {
+    return Err;
+  }
+
+#ifdef NDEBUG
+  std::ignore = Alignment;
+#else
+  assert((Alignment == 0 ||
+          reinterpret_cast<std::uintptr_t>(*ResultPtr) % Alignment == 0));
+#endif
+  return UR_RESULT_SUCCESS;
+}
+
+ur_result_t USMHostAllocImpl(void **ResultPtr, ur_context_handle_t Context,
+                             ur_usm_host_mem_flags_t *, size_t Size,
+                             uint32_t Alignment) {
+  try {
+    ScopedContext Active(Context);
+    UR_CHECK_ERROR(cuMemAllocHost(ResultPtr, Size));
+  } catch (ur_result_t Err) {
+    return Err;
+  }
+
+#ifdef NDEBUG
+  std::ignore = Alignment;
+#else
+  assert((Alignment == 0 ||
+          reinterpret_cast<std::uintptr_t>(*ResultPtr) % Alignment == 0));
+#endif
+  return UR_RESULT_SUCCESS;
+}
+
+UR_APIEXPORT ur_result_t UR_APICALL
+urUSMGetMemAllocInfo(ur_context_handle_t hContext, const void *pMem,
+                     ur_usm_alloc_info_t propName, size_t propValueSize,
+                     void *pPropValue, size_t *pPropValueSizeRet) {
+  ur_result_t Result = UR_RESULT_SUCCESS;
+
+  UrReturnHelper
ReturnValue(propValueSize, pPropValue, pPropValueSizeRet); + + try { + ScopedContext Active(hContext); + switch (propName) { + case UR_USM_ALLOC_INFO_TYPE: { + unsigned int Value; + // do not throw if cuPointerGetAttribute returns CUDA_ERROR_INVALID_VALUE + CUresult Ret = cuPointerGetAttribute( + &Value, CU_POINTER_ATTRIBUTE_IS_MANAGED, (CUdeviceptr)pMem); + if (Ret == CUDA_ERROR_INVALID_VALUE) { + // pointer not known to the CUDA subsystem + return ReturnValue(UR_USM_TYPE_UNKNOWN); + } + checkErrorUR(Ret, __func__, __LINE__ - 5, __FILE__); + if (Value) { + // pointer to managed memory + return ReturnValue(UR_USM_TYPE_SHARED); + } + UR_CHECK_ERROR(cuPointerGetAttribute( + &Value, CU_POINTER_ATTRIBUTE_MEMORY_TYPE, (CUdeviceptr)pMem)); + UR_ASSERT(Value == CU_MEMORYTYPE_DEVICE || Value == CU_MEMORYTYPE_HOST, + UR_RESULT_ERROR_INVALID_MEM_OBJECT); + if (Value == CU_MEMORYTYPE_DEVICE) { + // pointer to device memory + return ReturnValue(UR_USM_TYPE_DEVICE); + } + if (Value == CU_MEMORYTYPE_HOST) { + // pointer to host memory + return ReturnValue(UR_USM_TYPE_HOST); + } + // should never get here +#ifdef _MSC_VER + __assume(0); +#else + __builtin_unreachable(); +#endif + } + case UR_USM_ALLOC_INFO_BASE_PTR: { +#if CUDA_VERSION >= 10020 + // CU_POINTER_ATTRIBUTE_RANGE_START_ADDR was introduced in CUDA 10.2 + void *Base; + UR_CHECK_ERROR(cuPointerGetAttribute( + &Base, CU_POINTER_ATTRIBUTE_RANGE_START_ADDR, (CUdeviceptr)pMem)); + return ReturnValue(Base); +#else + return UR_RESULT_ERROR_INVALID_VALUE; +#endif + } + case UR_USM_ALLOC_INFO_SIZE: { +#if CUDA_VERSION >= 10020 + // CU_POINTER_ATTRIBUTE_RANGE_SIZE was introduced in CUDA 10.2 + size_t Value; + UR_CHECK_ERROR(cuPointerGetAttribute( + &Value, CU_POINTER_ATTRIBUTE_RANGE_SIZE, (CUdeviceptr)pMem)); + return ReturnValue(Value); +#else + return UR_RESULT_ERROR_INVALID_VALUE; +#endif + } + case UR_USM_ALLOC_INFO_DEVICE: { + // get device index associated with this pointer + unsigned int DeviceIndex; + UR_CHECK_ERROR(cuPointerGetAttribute(&DeviceIndex, + CU_POINTER_ATTRIBUTE_DEVICE_ORDINAL, + (CUdeviceptr)pMem)); + + // currently each device is in its own platform, so find the platform at + // the same index + std::vector Platforms; + Platforms.resize(DeviceIndex + 1); + ur_adapter_handle_t AdapterHandle = &adapter; + Result = urPlatformGet(&AdapterHandle, 1, DeviceIndex + 1, + Platforms.data(), nullptr); + + // get the device from the platform + ur_device_handle_t Device = Platforms[DeviceIndex]->Devices[0].get(); + return ReturnValue(Device); + } + case UR_USM_ALLOC_INFO_POOL: { + auto UMFPool = umfPoolByPtr(pMem); + if (!UMFPool) { + return UR_RESULT_ERROR_INVALID_VALUE; + } + ur_usm_pool_handle_t Pool = hContext->getOwningURPool(UMFPool); + if (!Pool) { + return UR_RESULT_ERROR_INVALID_VALUE; + } + return ReturnValue(Pool); + } + default: + return UR_RESULT_ERROR_UNSUPPORTED_ENUMERATION; + } + } catch (ur_result_t Err) { + Result = Err; + } + return Result; +} + +UR_APIEXPORT ur_result_t UR_APICALL urUSMImportExp(ur_context_handle_t Context, + void *HostPtr, size_t Size) { + UR_ASSERT(Context, UR_RESULT_ERROR_INVALID_CONTEXT); + UR_ASSERT(!HostPtr, UR_RESULT_ERROR_INVALID_VALUE); + UR_ASSERT(Size > 0, UR_RESULT_ERROR_INVALID_VALUE); + return UR_RESULT_SUCCESS; +} + +UR_APIEXPORT ur_result_t UR_APICALL urUSMReleaseExp(ur_context_handle_t Context, + void *HostPtr) { + UR_ASSERT(Context, UR_RESULT_ERROR_INVALID_CONTEXT); + UR_ASSERT(!HostPtr, UR_RESULT_ERROR_INVALID_VALUE); + return UR_RESULT_SUCCESS; +} + +umf_result_t 
USMMemoryProvider::initialize(ur_context_handle_t Ctx,
+                              ur_device_handle_t Dev) {
+  Context = Ctx;
+  Device = Dev;
+  // There isn't a way to query this in cuda, and there isn't much info on
+  // cuda's approach to alignment or transfer granularity between host and
+  // device. Within UMF this is only used to influence alignment, and since we
+  // discard that in our alloc implementations it seems we can safely ignore
+  // this as well, for now.
+  MinPageSize = 0;
+
+  return UMF_RESULT_SUCCESS;
+}
+
+enum umf_result_t USMMemoryProvider::alloc(size_t Size, size_t Align,
+                                           void **Ptr) {
+  auto Res = allocateImpl(Ptr, Size, Align);
+  if (Res != UR_RESULT_SUCCESS) {
+    getLastStatusRef() = Res;
+    return UMF_RESULT_ERROR_MEMORY_PROVIDER_SPECIFIC;
+  }
+
+  return UMF_RESULT_SUCCESS;
+}
+
+enum umf_result_t USMMemoryProvider::free(void *Ptr, size_t Size) {
+  (void)Size;
+
+  auto Res = USMFreeImpl(Context, Ptr);
+  if (Res != UR_RESULT_SUCCESS) {
+    getLastStatusRef() = Res;
+    return UMF_RESULT_ERROR_MEMORY_PROVIDER_SPECIFIC;
+  }
+
+  return UMF_RESULT_SUCCESS;
+}
+
+void USMMemoryProvider::get_last_native_error(const char **ErrMsg,
+                                              int32_t *ErrCode) {
+  (void)ErrMsg;
+  *ErrCode = static_cast<int32_t>(getLastStatusRef());
+}
+
+umf_result_t USMMemoryProvider::get_min_page_size(void *Ptr, size_t *PageSize) {
+  (void)Ptr;
+  *PageSize = MinPageSize;
+
+  return UMF_RESULT_SUCCESS;
+}
+
+ur_result_t USMSharedMemoryProvider::allocateImpl(void **ResultPtr, size_t Size,
+                                                  uint32_t Alignment) {
+  return USMSharedAllocImpl(ResultPtr, Context, Device, nullptr, nullptr, Size,
+                            Alignment);
+}
+
+ur_result_t USMDeviceMemoryProvider::allocateImpl(void **ResultPtr, size_t Size,
+                                                  uint32_t Alignment) {
+  return USMDeviceAllocImpl(ResultPtr, Context, Device, nullptr, Size,
+                            Alignment);
+}
+
+ur_result_t USMHostMemoryProvider::allocateImpl(void **ResultPtr, size_t Size,
+                                                uint32_t Alignment) {
+  return USMHostAllocImpl(ResultPtr, Context, nullptr, Size, Alignment);
+}
+
+ur_usm_pool_handle_t_::ur_usm_pool_handle_t_(ur_context_handle_t Context,
+                                             ur_usm_pool_desc_t *PoolDesc)
+    : Context(Context) {
+  const void *pNext = PoolDesc->pNext;
+  while (pNext != nullptr) {
+    const ur_base_desc_t *BaseDesc =
+        static_cast<const ur_base_desc_t *>(pNext);
+    switch (BaseDesc->stype) {
+    case UR_STRUCTURE_TYPE_USM_POOL_LIMITS_DESC: {
+      const ur_usm_pool_limits_desc_t *Limits =
+          reinterpret_cast<const ur_usm_pool_limits_desc_t *>(BaseDesc);
+      for (auto &config : DisjointPoolConfigs.Configs) {
+        config.MaxPoolableSize = Limits->maxPoolableSize;
+        config.SlabMinSize = Limits->minDriverAllocSize;
+      }
+      break;
+    }
+    default: {
+      throw UsmAllocationException(UR_RESULT_ERROR_INVALID_ARGUMENT);
+    }
+    }
+    pNext = BaseDesc->pNext;
+  }
+
+  auto MemProvider =
+      umf::memoryProviderMakeUnique<USMHostMemoryProvider>(Context, nullptr)
+          .second;
+
+  HostMemPool =
+      umf::poolMakeUnique<usm::DisjointPool, 1>(
+          {std::move(MemProvider)},
+          this->DisjointPoolConfigs.Configs[usm::DisjointPoolMemType::Host])
+          .second;
+
+  auto Device = Context->DeviceID;
+  MemProvider =
+      umf::memoryProviderMakeUnique<USMDeviceMemoryProvider>(Context, Device)
+          .second;
+  DeviceMemPool =
+      umf::poolMakeUnique<usm::DisjointPool, 1>(
+          {std::move(MemProvider)},
+          this->DisjointPoolConfigs.Configs[usm::DisjointPoolMemType::Device])
+          .second;
+
+  MemProvider =
+      umf::memoryProviderMakeUnique<USMSharedMemoryProvider>(Context, Device)
+          .second;
+  SharedMemPool =
+      umf::poolMakeUnique<usm::DisjointPool, 1>(
+          {std::move(MemProvider)},
+          this->DisjointPoolConfigs.Configs[usm::DisjointPoolMemType::Shared])
+          .second;
+  Context->addPool(this);
+}
+
+bool ur_usm_pool_handle_t_::hasUMFPool(umf_memory_pool_t *umf_pool) {
+  return DeviceMemPool.get() == umf_pool ||
SharedMemPool.get() == umf_pool || + HostMemPool.get() == umf_pool; +} + +UR_APIEXPORT ur_result_t UR_APICALL urUSMPoolCreate( + ur_context_handle_t Context, ///< [in] handle of the context object + ur_usm_pool_desc_t + *PoolDesc, ///< [in] pointer to USM pool descriptor. Can be chained with + ///< ::ur_usm_pool_limits_desc_t + ur_usm_pool_handle_t *Pool ///< [out] pointer to USM memory pool +) { + // Without pool tracking we can't free pool allocations. +#ifdef UMF_ENABLE_POOL_TRACKING + if (PoolDesc->flags & UR_USM_POOL_FLAG_ZERO_INITIALIZE_BLOCK) { + return UR_RESULT_ERROR_UNSUPPORTED_FEATURE; + } + try { + *Pool = reinterpret_cast( + new ur_usm_pool_handle_t_(Context, PoolDesc)); + } catch (const UsmAllocationException &Ex) { + return Ex.getError(); + } + return UR_RESULT_SUCCESS; +#else + std::ignore = Context; + std::ignore = PoolDesc; + std::ignore = Pool; + return UR_RESULT_ERROR_UNSUPPORTED_FEATURE; +#endif +} + +UR_APIEXPORT ur_result_t UR_APICALL urUSMPoolRetain( + ur_usm_pool_handle_t Pool ///< [in] pointer to USM memory pool +) { + Pool->incrementReferenceCount(); + return UR_RESULT_SUCCESS; +} + +UR_APIEXPORT ur_result_t UR_APICALL urUSMPoolRelease( + ur_usm_pool_handle_t Pool ///< [in] pointer to USM memory pool +) { + if (Pool->decrementReferenceCount() > 0) { + return UR_RESULT_SUCCESS; + } + Pool->Context->removePool(Pool); + delete Pool; + return UR_RESULT_SUCCESS; +} + +UR_APIEXPORT ur_result_t UR_APICALL urUSMPoolGetInfo( + ur_usm_pool_handle_t hPool, ///< [in] handle of the USM memory pool + ur_usm_pool_info_t propName, ///< [in] name of the pool property to query + size_t propSize, ///< [in] size in bytes of the pool property value provided + void *pPropValue, ///< [out][optional][typename(propName, propSize)] value + ///< of the pool property + size_t *pPropSizeRet ///< [out][optional] size in bytes returned in pool + ///< property value +) { + UrReturnHelper ReturnValue(propSize, pPropValue, pPropSizeRet); + + switch (propName) { + case UR_USM_POOL_INFO_REFERENCE_COUNT: { + return ReturnValue(hPool->getReferenceCount()); + } + case UR_USM_POOL_INFO_CONTEXT: { + return ReturnValue(hPool->Context); + } + default: { + return UR_RESULT_ERROR_UNSUPPORTED_ENUMERATION; + } + } +} diff --git a/source/adapters/cuda/usm.hpp b/source/adapters/cuda/usm.hpp new file mode 100644 index 0000000000..d4cfba7641 --- /dev/null +++ b/source/adapters/cuda/usm.hpp @@ -0,0 +1,130 @@ +//===--------- usm.hpp - CUDA Adapter -------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "common.hpp" + +#include +#include + +usm::DisjointPoolAllConfigs InitializeDisjointPoolConfig(); + +struct ur_usm_pool_handle_t_ { + std::atomic_uint32_t RefCount = 1; + + ur_context_handle_t Context = nullptr; + + usm::DisjointPoolAllConfigs DisjointPoolConfigs = + usm::DisjointPoolAllConfigs(); + + umf::pool_unique_handle_t DeviceMemPool; + umf::pool_unique_handle_t SharedMemPool; + umf::pool_unique_handle_t HostMemPool; + + ur_usm_pool_handle_t_(ur_context_handle_t Context, + ur_usm_pool_desc_t *PoolDesc); + + uint32_t incrementReferenceCount() noexcept { return ++RefCount; } + + uint32_t decrementReferenceCount() noexcept { return --RefCount; } + + uint32_t getReferenceCount() const noexcept { return RefCount; } + + bool hasUMFPool(umf_memory_pool_t *umf_pool); +}; + +// Exception type to pass allocation errors +class UsmAllocationException { + const ur_result_t Error; + +public: + UsmAllocationException(ur_result_t Err) : Error{Err} {} + ur_result_t getError() const { return Error; } +}; + +// Implements memory allocation via driver API for USM allocator interface. +class USMMemoryProvider { +private: + ur_result_t &getLastStatusRef() { + static thread_local ur_result_t LastStatus = UR_RESULT_SUCCESS; + return LastStatus; + } + +protected: + ur_context_handle_t Context; + ur_device_handle_t Device; + size_t MinPageSize; + + // Internal allocation routine which must be implemented for each allocation + // type + virtual ur_result_t allocateImpl(void **ResultPtr, size_t Size, + uint32_t Alignment) = 0; + +public: + umf_result_t initialize(ur_context_handle_t Ctx, ur_device_handle_t Dev); + umf_result_t alloc(size_t Size, size_t Align, void **Ptr); + umf_result_t free(void *Ptr, size_t Size); + void get_last_native_error(const char **ErrMsg, int32_t *ErrCode); + umf_result_t get_min_page_size(void *, size_t *); + umf_result_t get_recommended_page_size(size_t, size_t *) { + return UMF_RESULT_ERROR_NOT_SUPPORTED; + }; + umf_result_t purge_lazy(void *, size_t) { + return UMF_RESULT_ERROR_NOT_SUPPORTED; + }; + umf_result_t purge_force(void *, size_t) { + return UMF_RESULT_ERROR_NOT_SUPPORTED; + }; + virtual const char *get_name() = 0; + + virtual ~USMMemoryProvider() = default; +}; + +// Allocation routines for shared memory type +class USMSharedMemoryProvider final : public USMMemoryProvider { +public: + const char *get_name() override { return "USMSharedMemoryProvider"; } + +protected: + ur_result_t allocateImpl(void **ResultPtr, size_t Size, + uint32_t Alignment) override; +}; + +// Allocation routines for device memory type +class USMDeviceMemoryProvider final : public USMMemoryProvider { +public: + const char *get_name() override { return "USMSharedMemoryProvider"; } + +protected: + ur_result_t allocateImpl(void **ResultPtr, size_t Size, + uint32_t Alignment) override; +}; + +// Allocation routines for host memory type +class USMHostMemoryProvider final : public USMMemoryProvider { +public: + const char *get_name() override { return "USMSharedMemoryProvider"; } + +protected: + ur_result_t allocateImpl(void **ResultPtr, size_t Size, + uint32_t Alignment) override; +}; + +ur_result_t USMDeviceAllocImpl(void **ResultPtr, ur_context_handle_t Context, + ur_device_handle_t Device, + ur_usm_device_mem_flags_t *Flags, size_t Size, + uint32_t Alignment); + +ur_result_t USMSharedAllocImpl(void **ResultPtr, ur_context_handle_t Context, + 
ur_device_handle_t Device, + ur_usm_host_mem_flags_t *, + ur_usm_device_mem_flags_t *, size_t Size, + uint32_t Alignment); + +ur_result_t USMHostAllocImpl(void **ResultPtr, ur_context_handle_t Context, + ur_usm_host_mem_flags_t *Flags, size_t Size, + uint32_t Alignment); diff --git a/source/adapters/cuda/usm_p2p.cpp b/source/adapters/cuda/usm_p2p.cpp new file mode 100644 index 0000000000..ed580dd5d8 --- /dev/null +++ b/source/adapters/cuda/usm_p2p.cpp @@ -0,0 +1,69 @@ +//===--------- usm_p2p.cpp - CUDA Adapter----------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "common.hpp" +#include "context.hpp" + +UR_APIEXPORT ur_result_t UR_APICALL urUsmP2PEnablePeerAccessExp( + ur_device_handle_t commandDevice, ur_device_handle_t peerDevice) { + + ur_result_t result = UR_RESULT_SUCCESS; + try { + ScopedContext active(commandDevice->getContext()); + UR_CHECK_ERROR(cuCtxEnablePeerAccess(peerDevice->getContext(), 0)); + } catch (ur_result_t err) { + result = err; + } + return result; +} + +UR_APIEXPORT ur_result_t UR_APICALL urUsmP2PDisablePeerAccessExp( + ur_device_handle_t commandDevice, ur_device_handle_t peerDevice) { + + ur_result_t result = UR_RESULT_SUCCESS; + try { + ScopedContext active(commandDevice->getContext()); + UR_CHECK_ERROR(cuCtxDisablePeerAccess(peerDevice->getContext())); + } catch (ur_result_t err) { + result = err; + } + return result; +} + +UR_APIEXPORT ur_result_t UR_APICALL urUsmP2PPeerAccessGetInfoExp( + ur_device_handle_t commandDevice, ur_device_handle_t peerDevice, + ur_exp_peer_info_t propName, size_t propSize, void *pPropValue, + size_t *pPropSizeRet) { + + UrReturnHelper ReturnValue(propSize, pPropValue, pPropSizeRet); + + int value; + CUdevice_P2PAttribute cu_attr; + try { + ScopedContext active(commandDevice->getContext()); + switch (propName) { + case UR_EXP_PEER_INFO_UR_PEER_ACCESS_SUPPORTED: { + cu_attr = CU_DEVICE_P2P_ATTRIBUTE_ACCESS_SUPPORTED; + break; + } + case UR_EXP_PEER_INFO_UR_PEER_ATOMICS_SUPPORTED: { + cu_attr = CU_DEVICE_P2P_ATTRIBUTE_NATIVE_ATOMIC_SUPPORTED; + break; + } + default: { + return UR_RESULT_ERROR_INVALID_ENUMERATION; + } + } + + UR_CHECK_ERROR(cuDeviceGetP2PAttribute( + &value, cu_attr, commandDevice->get(), peerDevice->get())); + } catch (ur_result_t err) { + return err; + } + return ReturnValue(value); +} diff --git a/source/adapters/level_zero/ur_interface_loader.cpp b/source/adapters/level_zero/ur_interface_loader.cpp index 23182a1af2..6b3116d87c 100644 --- a/source/adapters/level_zero/ur_interface_loader.cpp +++ b/source/adapters/level_zero/ur_interface_loader.cpp @@ -390,3 +390,41 @@ UR_DLLEXPORT ur_result_t UR_APICALL urGetUSMExpProcAddrTable( pDdiTable->pfnPitchedAllocExp = urUSMPitchedAllocExp; return UR_RESULT_SUCCESS; } + +UR_DLLEXPORT ur_result_t UR_APICALL urGetVirtualMemProcAddrTable( + ur_api_version_t version, ///< [in] API version requested + ur_virtual_mem_dditable_t + *pDdiTable ///< [in,out] pointer to table of DDI function pointers +) { + auto retVal = validateProcInputs(version, pDdiTable); + if (UR_RESULT_SUCCESS != retVal) { + return retVal; + } + + pDdiTable->pfnFree = nullptr; + pDdiTable->pfnGetInfo = nullptr; + pDdiTable->pfnGranularityGetInfo = nullptr; + pDdiTable->pfnMap = nullptr; + pDdiTable->pfnReserve = nullptr; + 
pDdiTable->pfnSetAccess = nullptr; + pDdiTable->pfnUnmap = nullptr; + + return retVal; +} + +UR_DLLEXPORT ur_result_t UR_APICALL urGetPhysicalMemProcAddrTable( + ur_api_version_t version, ///< [in] API version requested + ur_physical_mem_dditable_t + *pDdiTable ///< [in,out] pointer to table of DDI function pointers +) { + auto retVal = validateProcInputs(version, pDdiTable); + if (UR_RESULT_SUCCESS != retVal) { + return retVal; + } + + pDdiTable->pfnCreate = nullptr; + pDdiTable->pfnRelease = nullptr; + pDdiTable->pfnRetain = nullptr; + + return retVal; +} diff --git a/test/conformance/adapters/cuda/CMakeLists.txt b/test/conformance/adapters/cuda/CMakeLists.txt index ca7b0ea7e4..8b060686cf 100644 --- a/test/conformance/adapters/cuda/CMakeLists.txt +++ b/test/conformance/adapters/cuda/CMakeLists.txt @@ -16,7 +16,7 @@ add_conformance_test_with_devices_environment(adapter-cuda memory_tests.cpp ) target_link_libraries(test-adapter-cuda PRIVATE cudadrv ur_adapter_cuda) -target_include_directories(test-adapter-cuda PRIVATE ${CUDA_DIR} "${CUDA_DIR}/../../../" ) +target_include_directories(test-adapter-cuda PRIVATE ${CUDA_DIR} "${CUDA_DIR}/../../" ) set_tests_properties(adapter-cuda PROPERTIES LABELS "conformance:cuda" diff --git a/test/conformance/event/event_adapter_cuda.match b/test/conformance/event/event_adapter_cuda.match index c97aeab323..19f3ddeba0 100644 --- a/test/conformance/event/event_adapter_cuda.match +++ b/test/conformance/event/event_adapter_cuda.match @@ -1 +1,5 @@ -{{Segmentation fault|Aborted}} +urEventSetCallbackTest.Success/NVIDIA_CUDA_BACKEND___{{.*}}_ +urEventSetCallbackTest.ValidateParameters/NVIDIA_CUDA_BACKEND___{{.*}}_ +urEventSetCallbackTest.AllStates/NVIDIA_CUDA_BACKEND___{{.*}}_ +urEventSetCallbackTest.EventAlreadyCompleted/NVIDIA_CUDA_BACKEND___{{.*}}_ +urEventSetCallbackNegativeTest.InvalidNullHandle/NVIDIA_CUDA_BACKEND___{{.*}}_ diff --git a/test/conformance/platform/platform_adapter_cuda.match b/test/conformance/platform/platform_adapter_cuda.match index 052cc6555c..e69de29bb2 100644 --- a/test/conformance/platform/platform_adapter_cuda.match +++ b/test/conformance/platform/platform_adapter_cuda.match @@ -1,3 +0,0 @@ -urPlatformGetNativeHandleTest.Success -urPlatformGetNativeHandleTest.InvalidNullHandlePlatform -urPlatformGetNativeHandleTest.InvalidNullPointerNativePlatform diff --git a/test/conformance/sampler/sampler_adapter_cuda.match b/test/conformance/sampler/sampler_adapter_cuda.match index 67eb2fec0b..e69de29bb2 100644 --- a/test/conformance/sampler/sampler_adapter_cuda.match +++ b/test/conformance/sampler/sampler_adapter_cuda.match @@ -1,3 +0,0 @@ -urSamplerGetNativeHandleTest.Success/NVIDIA_CUDA_BACKEND___{{.*}}_ -urSamplerGetNativeHandleTest.InvalidNullHandleSampler/NVIDIA_CUDA_BACKEND___{{.*}}_ -urSamplerGetNativeHandleTest.InvalidNullPointerNativeHandle/NVIDIA_CUDA_BACKEND___{{.*}}_