From 18b21f726679e8457248d2b7e169ca901f27906a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?F=C3=A1bio=20Mestre?= Date: Thu, 28 Sep 2023 14:30:44 +0100 Subject: [PATCH 1/2] [SYCL][CUDA] Fetch the adapter source from UR repo The CUDA adapter source files have been moved to the unified runtime repository at https://github.com/oneapi-src/unified-runtime This commit removes the sources files from intel/llvm and updates cmake to fetch them directly from the unified runtime repository. --- sycl/plugins/cuda/CMakeLists.txt | 70 +- sycl/plugins/cuda/pi_cuda.hpp | 20 +- sycl/plugins/unified_runtime/CMakeLists.txt | 66 +- .../ur/adapters/cuda/README.md | 7 + .../ur/adapters/cuda/adapter.cpp | 89 - .../ur/adapters/cuda/adapter.hpp | 11 - .../ur/adapters/cuda/command_buffer.cpp | 253 --- .../ur/adapters/cuda/command_buffer.hpp | 13 - .../ur/adapters/cuda/common.cpp | 139 -- .../ur/adapters/cuda/common.hpp | 59 - .../ur/adapters/cuda/context.cpp | 161 -- .../ur/adapters/cuda/context.hpp | 149 -- .../ur/adapters/cuda/device.cpp | 1212 ------------ .../ur/adapters/cuda/device.hpp | 119 -- .../ur/adapters/cuda/enqueue.cpp | 1690 ----------------- .../ur/adapters/cuda/enqueue.hpp | 16 - .../ur/adapters/cuda/event.cpp | 295 --- .../ur/adapters/cuda/event.hpp | 189 -- .../ur/adapters/cuda/image.cpp | 1061 ----------- .../ur/adapters/cuda/image.hpp | 32 - .../ur/adapters/cuda/kernel.cpp | 374 ---- .../ur/adapters/cuda/kernel.hpp | 206 -- .../ur/adapters/cuda/memory.cpp | 479 ----- .../ur/adapters/cuda/memory.hpp | 232 --- .../ur/adapters/cuda/platform.cpp | 195 -- .../ur/adapters/cuda/platform.hpp | 15 - .../ur/adapters/cuda/program.cpp | 452 ----- .../ur/adapters/cuda/program.hpp | 54 - .../ur/adapters/cuda/queue.cpp | 328 ---- .../ur/adapters/cuda/queue.hpp | 246 --- .../ur/adapters/cuda/sampler.cpp | 106 -- .../ur/adapters/cuda/sampler.hpp | 54 - .../ur/adapters/cuda/tracing.cpp | 109 -- .../ur/adapters/cuda/ur_interface_loader.cpp | 355 ---- .../unified_runtime/ur/adapters/cuda/usm.cpp | 503 ----- .../unified_runtime/ur/adapters/cuda/usm.hpp | 130 -- .../ur/adapters/cuda/usm_p2p.cpp | 69 - 37 files changed, 35 insertions(+), 9523 deletions(-) create mode 100644 sycl/plugins/unified_runtime/ur/adapters/cuda/README.md delete mode 100644 sycl/plugins/unified_runtime/ur/adapters/cuda/adapter.cpp delete mode 100644 sycl/plugins/unified_runtime/ur/adapters/cuda/adapter.hpp delete mode 100644 sycl/plugins/unified_runtime/ur/adapters/cuda/command_buffer.cpp delete mode 100644 sycl/plugins/unified_runtime/ur/adapters/cuda/command_buffer.hpp delete mode 100644 sycl/plugins/unified_runtime/ur/adapters/cuda/common.cpp delete mode 100644 sycl/plugins/unified_runtime/ur/adapters/cuda/common.hpp delete mode 100644 sycl/plugins/unified_runtime/ur/adapters/cuda/context.cpp delete mode 100644 sycl/plugins/unified_runtime/ur/adapters/cuda/context.hpp delete mode 100644 sycl/plugins/unified_runtime/ur/adapters/cuda/device.cpp delete mode 100644 sycl/plugins/unified_runtime/ur/adapters/cuda/device.hpp delete mode 100644 sycl/plugins/unified_runtime/ur/adapters/cuda/enqueue.cpp delete mode 100644 sycl/plugins/unified_runtime/ur/adapters/cuda/enqueue.hpp delete mode 100644 sycl/plugins/unified_runtime/ur/adapters/cuda/event.cpp delete mode 100644 sycl/plugins/unified_runtime/ur/adapters/cuda/event.hpp delete mode 100644 sycl/plugins/unified_runtime/ur/adapters/cuda/image.cpp delete mode 100644 sycl/plugins/unified_runtime/ur/adapters/cuda/image.hpp delete mode 100644 sycl/plugins/unified_runtime/ur/adapters/cuda/kernel.cpp delete 
mode 100644 sycl/plugins/unified_runtime/ur/adapters/cuda/kernel.hpp delete mode 100644 sycl/plugins/unified_runtime/ur/adapters/cuda/memory.cpp delete mode 100644 sycl/plugins/unified_runtime/ur/adapters/cuda/memory.hpp delete mode 100644 sycl/plugins/unified_runtime/ur/adapters/cuda/platform.cpp delete mode 100644 sycl/plugins/unified_runtime/ur/adapters/cuda/platform.hpp delete mode 100644 sycl/plugins/unified_runtime/ur/adapters/cuda/program.cpp delete mode 100644 sycl/plugins/unified_runtime/ur/adapters/cuda/program.hpp delete mode 100644 sycl/plugins/unified_runtime/ur/adapters/cuda/queue.cpp delete mode 100644 sycl/plugins/unified_runtime/ur/adapters/cuda/queue.hpp delete mode 100644 sycl/plugins/unified_runtime/ur/adapters/cuda/sampler.cpp delete mode 100644 sycl/plugins/unified_runtime/ur/adapters/cuda/sampler.hpp delete mode 100644 sycl/plugins/unified_runtime/ur/adapters/cuda/tracing.cpp delete mode 100644 sycl/plugins/unified_runtime/ur/adapters/cuda/ur_interface_loader.cpp delete mode 100644 sycl/plugins/unified_runtime/ur/adapters/cuda/usm.cpp delete mode 100644 sycl/plugins/unified_runtime/ur/adapters/cuda/usm.hpp delete mode 100644 sycl/plugins/unified_runtime/ur/adapters/cuda/usm_p2p.cpp diff --git a/sycl/plugins/cuda/CMakeLists.txt b/sycl/plugins/cuda/CMakeLists.txt index 80d8d0c2f5525..700130e2f33fe 100644 --- a/sycl/plugins/cuda/CMakeLists.txt +++ b/sycl/plugins/cuda/CMakeLists.txt @@ -1,29 +1,11 @@ message(STATUS "Including the PI API CUDA backend.") - # cannot rely on cmake support for CUDA; it assumes runtime API is being used. - # we only require the CUDA driver API to be used - # CUDA_CUDA_LIBRARY variable defines the path to libcuda.so, the CUDA Driver API library. - -find_package(CUDA 10.1 REQUIRED) - -# Make imported library global to use it within the project. -add_library(cudadrv SHARED IMPORTED GLOBAL) - -if (WIN32) - set_target_properties( - cudadrv PROPERTIES - IMPORTED_IMPLIB ${CUDA_CUDA_LIBRARY} - INTERFACE_INCLUDE_DIRECTORIES ${CUDA_INCLUDE_DIRS} - ) -else() - set_target_properties( - cudadrv PROPERTIES - IMPORTED_LOCATION ${CUDA_CUDA_LIBRARY} - INTERFACE_INCLUDE_DIRECTORIES ${CUDA_INCLUDE_DIRS} - ) -endif() - if (SYCL_ENABLE_XPTI_TRACING) + # cannot rely on cmake support for CUDA; it assumes runtime API is being used. + # we only require the CUDA driver API to be used + # CUDA_CUDA_LIBRARY variable defines the path to libcuda.so, the CUDA Driver API library. + find_package(CUDA 10.1 REQUIRED) + # The following two if's can be removed when FindCUDA -> FindCUDAToolkit. 
# CUDA_CUPTI_INCLUDE_DIR -> CUDAToolkit_CUPTI_INCLUDE_DIR include(FindCUDACupti) @@ -46,46 +28,15 @@ if (SYCL_ENABLE_XPTI_TRACING) ) endif() +# Get the CUDA adapter sources so they can be shared with the CUDA PI plugin +get_target_property(UR_CUDA_ADAPTER_SOURCES ur_adapter_cuda SOURCES) + add_sycl_plugin(cuda SOURCES + ${UR_CUDA_ADAPTER_SOURCES} # Some code is shared with the UR adapter "../unified_runtime/pi2ur.hpp" "../unified_runtime/pi2ur.cpp" - "../unified_runtime/ur/ur.hpp" - "../unified_runtime/ur/ur.cpp" - "../unified_runtime/ur/adapters/cuda/adapter.cpp" - "../unified_runtime/ur/adapters/cuda/adapter.hpp" - "../unified_runtime/ur/adapters/cuda/command_buffer.cpp" - "../unified_runtime/ur/adapters/cuda/command_buffer.hpp" - "../unified_runtime/ur/adapters/cuda/common.cpp" - "../unified_runtime/ur/adapters/cuda/common.hpp" - "../unified_runtime/ur/adapters/cuda/context.cpp" - "../unified_runtime/ur/adapters/cuda/context.hpp" - "../unified_runtime/ur/adapters/cuda/device.cpp" - "../unified_runtime/ur/adapters/cuda/device.hpp" - "../unified_runtime/ur/adapters/cuda/enqueue.cpp" - "../unified_runtime/ur/adapters/cuda/event.cpp" - "../unified_runtime/ur/adapters/cuda/event.hpp" - "../unified_runtime/ur/adapters/cuda/image.cpp" - "../unified_runtime/ur/adapters/cuda/image.hpp" - "../unified_runtime/ur/adapters/cuda/kernel.cpp" - "../unified_runtime/ur/adapters/cuda/kernel.hpp" - "../unified_runtime/ur/adapters/cuda/memory.cpp" - "../unified_runtime/ur/adapters/cuda/memory.hpp" - "../unified_runtime/ur/adapters/cuda/platform.cpp" - "../unified_runtime/ur/adapters/cuda/platform.hpp" - "../unified_runtime/ur/adapters/cuda/program.cpp" - "../unified_runtime/ur/adapters/cuda/program.hpp" - "../unified_runtime/ur/adapters/cuda/queue.cpp" - "../unified_runtime/ur/adapters/cuda/queue.hpp" - "../unified_runtime/ur/adapters/cuda/sampler.cpp" - "../unified_runtime/ur/adapters/cuda/sampler.hpp" - "../unified_runtime/ur/adapters/cuda/tracing.cpp" - "../unified_runtime/ur/adapters/cuda/ur_interface_loader.cpp" - "../unified_runtime/ur/adapters/cuda/usm.cpp" - "../unified_runtime/ur/adapters/cuda/usm.hpp" - "../unified_runtime/ur/adapters/cuda/usm_p2p.cpp" - # --- "${sycl_inc_dir}/sycl/detail/pi.h" "${sycl_inc_dir}/sycl/detail/pi.hpp" "pi_cuda.hpp" @@ -94,7 +45,8 @@ add_sycl_plugin(cuda INCLUDE_DIRS ${sycl_inc_dir} ${XPTI_INCLUDE} - ${CMAKE_CURRENT_SOURCE_DIR}/../unified_runtime + ${CMAKE_CURRENT_SOURCE_DIR}/../unified_runtime # for Unified Runtime + ${UNIFIED_RUNTIME_SOURCE_DIR}/source/ # for adapters/cuda LIBRARIES cudadrv ${XPTI_LIBS} diff --git a/sycl/plugins/cuda/pi_cuda.hpp b/sycl/plugins/cuda/pi_cuda.hpp index b65c867c71a03..2b5d77b26ea9d 100644 --- a/sycl/plugins/cuda/pi_cuda.hpp +++ b/sycl/plugins/cuda/pi_cuda.hpp @@ -25,16 +25,16 @@ #define _PI_CUDA_PLUGIN_VERSION_STRING \ _PI_PLUGIN_VERSION_STRING(_PI_CUDA_PLUGIN_VERSION) -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include // Share code between the PI Plugin and UR Adapter #include diff --git a/sycl/plugins/unified_runtime/CMakeLists.txt b/sycl/plugins/unified_runtime/CMakeLists.txt index c5bbb404c56f9..df97b6eb07812 100644 --- a/sycl/plugins/unified_runtime/CMakeLists.txt +++ b/sycl/plugins/unified_runtime/CMakeLists.txt @@ -3,11 +3,15 @@ if (NOT DEFINED UNIFIED_RUNTIME_LIBRARY OR NOT DEFINED UNIFIED_RUNTIME_INCLUDE_DIR) include(FetchContent) - # The UR tag should be from the 'adapters' branch 
set(UNIFIED_RUNTIME_REPO "https://github.com/oneapi-src/unified-runtime.git") - set(UNIFIED_RUNTIME_TAG c791b8bba63af1c1880ae278e9d6df90021636dd) + set(UNIFIED_RUNTIME_TAG 00c7edb98f0c57ad968196a9cef393c380b6d6f7) set(UR_BUILD_ADAPTER_L0 ON) + + if ("cuda" IN_LIST SYCL_ENABLE_PLUGINS) + set(UR_BUILD_ADAPTER_CUDA ON) + endif() + set(UMF_ENABLE_POOL_TRACKING ON) message(STATUS "Will fetch Unified Runtime from ${UNIFIED_RUNTIME_REPO}") FetchContent_Declare(unified-runtime @@ -81,63 +85,7 @@ add_sycl_plugin(unified_runtime add_dependencies(sycl-runtime-libraries ur_adapter_level_zero) if ("cuda" IN_LIST SYCL_ENABLE_PLUGINS) - # Build CUDA adapter - add_sycl_library("ur_adapter_cuda" SHARED - SOURCES - "ur/ur.hpp" - "ur/ur.cpp" - "ur/adapters/cuda/adapter.cpp" - "ur/adapters/cuda/adapter.hpp" - "ur/adapters/cuda/command_buffer.cpp" - "ur/adapters/cuda/command_buffer.hpp" - "ur/adapters/cuda/common.cpp" - "ur/adapters/cuda/common.hpp" - "ur/adapters/cuda/context.cpp" - "ur/adapters/cuda/context.hpp" - "ur/adapters/cuda/device.cpp" - "ur/adapters/cuda/device.hpp" - "ur/adapters/cuda/enqueue.cpp" - "ur/adapters/cuda/event.cpp" - "ur/adapters/cuda/event.hpp" - "ur/adapters/cuda/image.cpp" - "ur/adapters/cuda/image.hpp" - "ur/adapters/cuda/kernel.cpp" - "ur/adapters/cuda/kernel.hpp" - "ur/adapters/cuda/memory.cpp" - "ur/adapters/cuda/memory.hpp" - "ur/adapters/cuda/platform.cpp" - "ur/adapters/cuda/platform.hpp" - "ur/adapters/cuda/program.cpp" - "ur/adapters/cuda/program.hpp" - "ur/adapters/cuda/queue.cpp" - "ur/adapters/cuda/queue.hpp" - "ur/adapters/cuda/sampler.cpp" - "ur/adapters/cuda/sampler.hpp" - "ur/adapters/cuda/tracing.cpp" - "ur/adapters/cuda/ur_interface_loader.cpp" - "ur/adapters/cuda/usm.cpp" - "ur/adapters/cuda/usm.hpp" - "ur/adapters/cuda/usm_p2p.cpp" - INCLUDE_DIRS - ${sycl_inc_dir} - LIBRARIES - UnifiedRuntime-Headers - UnifiedRuntimeCommon - Threads::Threads - cudadrv - ) - - set_target_properties("ur_adapter_cuda" PROPERTIES - VERSION "0.0.0" - SOVERSION "0" - ) - - if(UMF_ENABLE_POOL_TRACKING) - target_compile_definitions("ur_adapter_cuda" PRIVATE - UMF_ENABLE_POOL_TRACKING) - else() - message(WARNING "CUDA adapter USM pools are disabled, set UMF_ENABLE_POOL_TRACKING to enable them") - endif() + add_dependencies(sycl-runtime-libraries ur_adapter_cuda) endif() if ("hip" IN_LIST SYCL_ENABLE_PLUGINS) diff --git a/sycl/plugins/unified_runtime/ur/adapters/cuda/README.md b/sycl/plugins/unified_runtime/ur/adapters/cuda/README.md new file mode 100644 index 0000000000000..3ce735f91aa4e --- /dev/null +++ b/sycl/plugins/unified_runtime/ur/adapters/cuda/README.md @@ -0,0 +1,7 @@ +# CUDA adapter +The source for the CUDA adapter has been moved to the +[adapters](https://github.com/oneapi-src/unified-runtime/tree/adapters) branch +of the [Unified Runtime](https://github.com/oneapi-src/unified-runtime/) repo. +Changes can be made by opening pull requests against that branch, and updating +the Unified Runtime commit in the parent +[CMakeLists.txt](../../../CMakeLists.txt). diff --git a/sycl/plugins/unified_runtime/ur/adapters/cuda/adapter.cpp b/sycl/plugins/unified_runtime/ur/adapters/cuda/adapter.cpp deleted file mode 100644 index e1179f487d4fd..0000000000000 --- a/sycl/plugins/unified_runtime/ur/adapters/cuda/adapter.cpp +++ /dev/null @@ -1,89 +0,0 @@ -//===--------- adapter.cpp - CUDA Adapter ---------------------------------===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// - -#include - -#include "common.hpp" - -void enableCUDATracing(); -void disableCUDATracing(); - -struct ur_adapter_handle_t_ { - std::atomic RefCount = 0; - std::mutex Mutex; -}; - -ur_adapter_handle_t_ adapter{}; - -UR_APIEXPORT ur_result_t UR_APICALL urInit(ur_device_init_flags_t, - ur_loader_config_handle_t) { - return UR_RESULT_SUCCESS; -} - -UR_APIEXPORT ur_result_t UR_APICALL urTearDown(void *) { - return UR_RESULT_SUCCESS; -} - -UR_APIEXPORT ur_result_t UR_APICALL -urAdapterGet(uint32_t NumEntries, ur_adapter_handle_t *phAdapters, - uint32_t *pNumAdapters) { - if (NumEntries > 0 && phAdapters) { - std::lock_guard Lock{adapter.Mutex}; - if (adapter.RefCount++ == 0) { - enableCUDATracing(); - } - - *phAdapters = &adapter; - } - - if (pNumAdapters) { - *pNumAdapters = 1; - } - - return UR_RESULT_SUCCESS; -} - -UR_APIEXPORT ur_result_t UR_APICALL urAdapterRetain(ur_adapter_handle_t) { - adapter.RefCount++; - - return UR_RESULT_SUCCESS; -} - -UR_APIEXPORT ur_result_t UR_APICALL urAdapterRelease(ur_adapter_handle_t) { - std::lock_guard Lock{adapter.Mutex}; - if (--adapter.RefCount == 0) { - disableCUDATracing(); - } - return UR_RESULT_SUCCESS; -} - -UR_APIEXPORT ur_result_t UR_APICALL urAdapterGetLastError( - ur_adapter_handle_t, const char **ppMessage, int32_t *pError) { - std::ignore = pError; - *ppMessage = ErrorMessage; - return ErrorMessageCode; -} - -UR_APIEXPORT ur_result_t UR_APICALL urAdapterGetInfo(ur_adapter_handle_t, - ur_adapter_info_t propName, - size_t propSize, - void *pPropValue, - size_t *pPropSizeRet) { - UrReturnHelper ReturnValue(propSize, pPropValue, pPropSizeRet); - - switch (propName) { - case UR_ADAPTER_INFO_BACKEND: - return ReturnValue(UR_ADAPTER_BACKEND_CUDA); - case UR_ADAPTER_INFO_REFERENCE_COUNT: - return ReturnValue(adapter.RefCount.load()); - default: - return UR_RESULT_ERROR_INVALID_ENUMERATION; - } - - return UR_RESULT_SUCCESS; -} diff --git a/sycl/plugins/unified_runtime/ur/adapters/cuda/adapter.hpp b/sycl/plugins/unified_runtime/ur/adapters/cuda/adapter.hpp deleted file mode 100644 index 7edf36e636dba..0000000000000 --- a/sycl/plugins/unified_runtime/ur/adapters/cuda/adapter.hpp +++ /dev/null @@ -1,11 +0,0 @@ -//===--------- adapter.hpp - CUDA Adapter ---------------------------------===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// - -struct ur_adapter_handle_t_; - -extern ur_adapter_handle_t_ adapter; diff --git a/sycl/plugins/unified_runtime/ur/adapters/cuda/command_buffer.cpp b/sycl/plugins/unified_runtime/ur/adapters/cuda/command_buffer.cpp deleted file mode 100644 index e2e1784d13e5b..0000000000000 --- a/sycl/plugins/unified_runtime/ur/adapters/cuda/command_buffer.cpp +++ /dev/null @@ -1,253 +0,0 @@ -//===--------- command_buffer.cpp - CUDA Adapter --------------------------===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. 
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// - -#include "command_buffer.hpp" -#include "common.hpp" - -/// Stub implementations of UR experimental feature command-buffers - -UR_APIEXPORT ur_result_t UR_APICALL urCommandBufferCreateExp( - ur_context_handle_t hContext, ur_device_handle_t hDevice, - const ur_exp_command_buffer_desc_t *pCommandBufferDesc, - ur_exp_command_buffer_handle_t *phCommandBuffer) { - (void)hContext; - (void)hDevice; - (void)pCommandBufferDesc; - (void)phCommandBuffer; - detail::ur::die("Experimental Command-buffer feature is not " - "implemented for CUDA adapter."); - return UR_RESULT_ERROR_UNSUPPORTED_FEATURE; -} - -UR_APIEXPORT ur_result_t UR_APICALL -urCommandBufferRetainExp(ur_exp_command_buffer_handle_t hCommandBuffer) { - (void)hCommandBuffer; - - detail::ur::die("Experimental Command-buffer feature is not " - "implemented for CUDA adapter."); - return UR_RESULT_ERROR_UNSUPPORTED_FEATURE; -} - -UR_APIEXPORT ur_result_t UR_APICALL -urCommandBufferReleaseExp(ur_exp_command_buffer_handle_t hCommandBuffer) { - (void)hCommandBuffer; - - detail::ur::die("Experimental Command-buffer feature is not " - "implemented for CUDA adapter."); - return UR_RESULT_ERROR_UNSUPPORTED_FEATURE; -} - -UR_APIEXPORT ur_result_t UR_APICALL -urCommandBufferFinalizeExp(ur_exp_command_buffer_handle_t hCommandBuffer) { - (void)hCommandBuffer; - - detail::ur::die("Experimental Command-buffer feature is not " - "implemented for CUDA adapter."); - return UR_RESULT_ERROR_UNSUPPORTED_FEATURE; -} - -UR_APIEXPORT ur_result_t UR_APICALL urCommandBufferAppendKernelLaunchExp( - ur_exp_command_buffer_handle_t hCommandBuffer, ur_kernel_handle_t hKernel, - uint32_t workDim, const size_t *pGlobalWorkOffset, - const size_t *pGlobalWorkSize, const size_t *pLocalWorkSize, - uint32_t numSyncPointsInWaitList, - const ur_exp_command_buffer_sync_point_t *pSyncPointWaitList, - ur_exp_command_buffer_sync_point_t *pSyncPoint) { - (void)hCommandBuffer; - (void)hKernel; - (void)workDim; - (void)pGlobalWorkOffset; - (void)pGlobalWorkSize; - (void)pLocalWorkSize; - (void)numSyncPointsInWaitList; - (void)pSyncPointWaitList; - (void)pSyncPoint; - - detail::ur::die("Experimental Command-buffer feature is not " - "implemented for CUDA adapter."); - return UR_RESULT_ERROR_UNSUPPORTED_FEATURE; -} - -UR_APIEXPORT ur_result_t UR_APICALL urCommandBufferAppendMemcpyUSMExp( - ur_exp_command_buffer_handle_t hCommandBuffer, void *pDst, const void *pSrc, - size_t size, uint32_t numSyncPointsInWaitList, - const ur_exp_command_buffer_sync_point_t *pSyncPointWaitList, - ur_exp_command_buffer_sync_point_t *pSyncPoint) { - (void)hCommandBuffer; - (void)pDst; - (void)pSrc; - (void)size; - (void)numSyncPointsInWaitList; - (void)pSyncPointWaitList; - (void)pSyncPoint; - - detail::ur::die("Experimental Command-buffer feature is not " - "implemented for CUDA adapter."); - return UR_RESULT_ERROR_UNSUPPORTED_FEATURE; -} - -UR_APIEXPORT ur_result_t UR_APICALL urCommandBufferAppendMembufferCopyExp( - ur_exp_command_buffer_handle_t hCommandBuffer, ur_mem_handle_t hSrcMem, - ur_mem_handle_t hDstMem, size_t srcOffset, size_t dstOffset, size_t size, - uint32_t numSyncPointsInWaitList, - const ur_exp_command_buffer_sync_point_t *pSyncPointWaitList, - ur_exp_command_buffer_sync_point_t *pSyncPoint) { - (void)hCommandBuffer; - (void)hSrcMem; - (void)hDstMem; - (void)srcOffset; - (void)dstOffset; - (void)size; - (void)numSyncPointsInWaitList; - 
(void)pSyncPointWaitList; - (void)pSyncPoint; - - detail::ur::die("Experimental Command-buffer feature is not " - "implemented for CUDA adapter."); - return UR_RESULT_ERROR_UNSUPPORTED_FEATURE; -} - -UR_APIEXPORT ur_result_t UR_APICALL urCommandBufferAppendMembufferCopyRectExp( - ur_exp_command_buffer_handle_t hCommandBuffer, ur_mem_handle_t hSrcMem, - ur_mem_handle_t hDstMem, ur_rect_offset_t srcOrigin, - ur_rect_offset_t dstOrigin, ur_rect_region_t region, size_t srcRowPitch, - size_t srcSlicePitch, size_t dstRowPitch, size_t dstSlicePitch, - uint32_t numSyncPointsInWaitList, - const ur_exp_command_buffer_sync_point_t *pSyncPointWaitList, - ur_exp_command_buffer_sync_point_t *pSyncPoint) { - (void)hCommandBuffer; - (void)hSrcMem; - (void)hDstMem; - (void)srcOrigin; - (void)dstOrigin; - (void)region; - (void)srcRowPitch; - (void)srcSlicePitch; - (void)dstRowPitch; - (void)dstSlicePitch; - (void)numSyncPointsInWaitList; - (void)pSyncPointWaitList; - (void)pSyncPoint; - - detail::ur::die("Experimental Command-buffer feature is not " - "implemented for CUDA adapter."); - return UR_RESULT_ERROR_UNSUPPORTED_FEATURE; -} - -UR_APIEXPORT -ur_result_t UR_APICALL urCommandBufferAppendMembufferWriteExp( - ur_exp_command_buffer_handle_t hCommandBuffer, ur_mem_handle_t hBuffer, - size_t offset, size_t size, const void *pSrc, - uint32_t numSyncPointsInWaitList, - const ur_exp_command_buffer_sync_point_t *pSyncPointWaitList, - ur_exp_command_buffer_sync_point_t *pSyncPoint) { - (void)hCommandBuffer; - (void)hBuffer; - (void)offset; - (void)size; - (void)pSrc; - (void)numSyncPointsInWaitList; - (void)pSyncPointWaitList; - (void)pSyncPoint; - - detail::ur::die("Experimental Command-buffer feature is not " - "implemented for CUDA adapter."); - return UR_RESULT_ERROR_UNSUPPORTED_FEATURE; -} - -UR_APIEXPORT -ur_result_t UR_APICALL urCommandBufferAppendMembufferReadExp( - ur_exp_command_buffer_handle_t hCommandBuffer, ur_mem_handle_t hBuffer, - size_t offset, size_t size, void *pDst, uint32_t numSyncPointsInWaitList, - const ur_exp_command_buffer_sync_point_t *pSyncPointWaitList, - ur_exp_command_buffer_sync_point_t *pSyncPoint) { - (void)hCommandBuffer; - (void)hBuffer; - (void)offset; - (void)size; - (void)pDst; - (void)numSyncPointsInWaitList; - (void)pSyncPointWaitList; - (void)pSyncPoint; - - detail::ur::die("Experimental Command-buffer feature is not " - "implemented for CUDA adapter."); - return UR_RESULT_ERROR_UNSUPPORTED_FEATURE; -} - -UR_APIEXPORT -ur_result_t UR_APICALL urCommandBufferAppendMembufferWriteRectExp( - ur_exp_command_buffer_handle_t hCommandBuffer, ur_mem_handle_t hBuffer, - ur_rect_offset_t bufferOffset, ur_rect_offset_t hostOffset, - ur_rect_region_t region, size_t bufferRowPitch, size_t bufferSlicePitch, - size_t hostRowPitch, size_t hostSlicePitch, void *pSrc, - uint32_t numSyncPointsInWaitList, - const ur_exp_command_buffer_sync_point_t *pSyncPointWaitList, - ur_exp_command_buffer_sync_point_t *pSyncPoint) { - (void)hCommandBuffer; - (void)hBuffer; - (void)bufferOffset; - (void)hostOffset; - (void)region; - (void)bufferRowPitch; - (void)bufferSlicePitch; - (void)hostRowPitch; - (void)hostSlicePitch; - (void)pSrc; - (void)numSyncPointsInWaitList; - (void)pSyncPointWaitList; - (void)pSyncPoint; - - detail::ur::die("Experimental Command-buffer feature is not " - "implemented for CUDA adapter."); - return UR_RESULT_ERROR_UNSUPPORTED_FEATURE; -} - -UR_APIEXPORT -ur_result_t UR_APICALL urCommandBufferAppendMembufferReadRectExp( - ur_exp_command_buffer_handle_t hCommandBuffer, 
ur_mem_handle_t hBuffer, - ur_rect_offset_t bufferOffset, ur_rect_offset_t hostOffset, - ur_rect_region_t region, size_t bufferRowPitch, size_t bufferSlicePitch, - size_t hostRowPitch, size_t hostSlicePitch, void *pDst, - uint32_t numSyncPointsInWaitList, - const ur_exp_command_buffer_sync_point_t *pSyncPointWaitList, - ur_exp_command_buffer_sync_point_t *pSyncPoint) { - (void)hCommandBuffer; - (void)hBuffer; - (void)bufferOffset; - (void)hostOffset; - (void)region; - (void)bufferRowPitch; - (void)bufferSlicePitch; - (void)hostRowPitch; - (void)hostSlicePitch; - (void)pDst; - - (void)numSyncPointsInWaitList; - (void)pSyncPointWaitList; - (void)pSyncPoint; - - detail::ur::die("Experimental Command-buffer feature is not " - "implemented for CUDA adapter."); - return UR_RESULT_ERROR_UNSUPPORTED_FEATURE; -} - -UR_APIEXPORT ur_result_t UR_APICALL urCommandBufferEnqueueExp( - ur_exp_command_buffer_handle_t hCommandBuffer, ur_queue_handle_t hQueue, - uint32_t numEventsInWaitList, const ur_event_handle_t *phEventWaitList, - ur_event_handle_t *phEvent) { - (void)hCommandBuffer; - (void)hQueue; - (void)numEventsInWaitList; - (void)phEventWaitList; - (void)phEvent; - - detail::ur::die("Experimental Command-buffer feature is not " - "implemented for CUDA adapter."); - return UR_RESULT_ERROR_UNSUPPORTED_FEATURE; -} diff --git a/sycl/plugins/unified_runtime/ur/adapters/cuda/command_buffer.hpp b/sycl/plugins/unified_runtime/ur/adapters/cuda/command_buffer.hpp deleted file mode 100644 index 31ea4372ea2b1..0000000000000 --- a/sycl/plugins/unified_runtime/ur/adapters/cuda/command_buffer.hpp +++ /dev/null @@ -1,13 +0,0 @@ -//===--------- command_buffer.hpp - CUDA Adapter --------------------------===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// - -#include - -/// Stub implementation of command-buffers for CUDA - -struct ur_exp_command_buffer_handle_t_ {}; diff --git a/sycl/plugins/unified_runtime/ur/adapters/cuda/common.cpp b/sycl/plugins/unified_runtime/ur/adapters/cuda/common.cpp deleted file mode 100644 index 5fcfe5993eee3..0000000000000 --- a/sycl/plugins/unified_runtime/ur/adapters/cuda/common.cpp +++ /dev/null @@ -1,139 +0,0 @@ -//===--------- common.cpp - CUDA Adapter ----------------------------------===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. 
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// - -#include "common.hpp" - -#include - -#include - -ur_result_t mapErrorUR(CUresult Result) { - switch (Result) { - case CUDA_SUCCESS: - return UR_RESULT_SUCCESS; - case CUDA_ERROR_NOT_PERMITTED: - return UR_RESULT_ERROR_INVALID_OPERATION; - case CUDA_ERROR_INVALID_CONTEXT: - return UR_RESULT_ERROR_INVALID_CONTEXT; - case CUDA_ERROR_INVALID_DEVICE: - return UR_RESULT_ERROR_INVALID_DEVICE; - case CUDA_ERROR_INVALID_VALUE: - return UR_RESULT_ERROR_INVALID_VALUE; - case CUDA_ERROR_OUT_OF_MEMORY: - return UR_RESULT_ERROR_OUT_OF_HOST_MEMORY; - case CUDA_ERROR_LAUNCH_OUT_OF_RESOURCES: - return UR_RESULT_ERROR_OUT_OF_RESOURCES; - default: - return UR_RESULT_ERROR_UNKNOWN; - } -} - -void checkErrorUR(CUresult Result, const char *Function, int Line, - const char *File) { - if (Result == CUDA_SUCCESS || Result == CUDA_ERROR_DEINITIALIZED) { - return; - } - - if (std::getenv("SYCL_PI_SUPPRESS_ERROR_MESSAGE") == nullptr && - std::getenv("UR_SUPPRESS_ERROR_MESSAGE") == nullptr) { - const char *ErrorString = nullptr; - const char *ErrorName = nullptr; - cuGetErrorName(Result, &ErrorName); - cuGetErrorString(Result, &ErrorString); - std::stringstream SS; - SS << "\nUR CUDA ERROR:" - << "\n\tValue: " << Result - << "\n\tName: " << ErrorName - << "\n\tDescription: " << ErrorString - << "\n\tFunction: " << Function << "\n\tSource Location: " << File - << ":" << Line << "\n" - << std::endl; - std::cerr << SS.str(); - } - - if (std::getenv("PI_CUDA_ABORT") != nullptr || - std::getenv("UR_CUDA_ABORT") != nullptr) { - std::abort(); - } - - throw mapErrorUR(Result); -} - -void checkErrorUR(ur_result_t Result, const char *Function, int Line, - const char *File) { - if (Result == UR_RESULT_SUCCESS) { - return; - } - - if (std::getenv("SYCL_PI_SUPPRESS_ERROR_MESSAGE") == nullptr && - std::getenv("UR_SUPPRESS_ERROR_MESSAGE") == nullptr) { - std::stringstream SS; - SS << "\nUR ERROR:" - << "\n\tValue: " << Result - << "\n\tFunction: " << Function << "\n\tSource Location: " << File - << ":" << Line << "\n" - << std::endl; - std::cerr << SS.str(); - } - - if (std::getenv("PI_CUDA_ABORT") != nullptr) { - std::abort(); - } - - throw Result; -} - -std::string getCudaVersionString() { - int driver_version = 0; - cuDriverGetVersion(&driver_version); - // The version is returned as (1000 major + 10 minor). - std::stringstream stream; - stream << "CUDA " << driver_version / 1000 << "." 
- << driver_version % 1000 / 10; - return stream.str(); -} - -void detail::ur::die(const char *Message) { - std::cerr << "ur_die: " << Message << std::endl; - std::terminate(); -} - -void detail::ur::assertion(bool Condition, const char *Message) { - if (!Condition) - die(Message); -} - -void detail::ur::cuPrint(const char *Message) { - std::cerr << "ur_print: " << Message << std::endl; -} - -// Global variables for ZER_EXT_RESULT_ADAPTER_SPECIFIC_ERROR -thread_local ur_result_t ErrorMessageCode = UR_RESULT_SUCCESS; -thread_local char ErrorMessage[MaxMessageSize]; - -// Utility function for setting a message and warning -[[maybe_unused]] void setErrorMessage(const char *pMessage, - ur_result_t ErrorCode) { - assert(strlen(pMessage) <= MaxMessageSize); - strcpy(ErrorMessage, pMessage); - ErrorMessageCode = ErrorCode; -} - -void setPluginSpecificMessage(CUresult cu_res) { - const char *error_string; - const char *error_name; - cuGetErrorName(cu_res, &error_name); - cuGetErrorString(cu_res, &error_string); - char *message = (char *)malloc(strlen(error_string) + strlen(error_name) + 2); - strcpy(message, error_name); - strcat(message, "\n"); - strcat(message, error_string); - - setErrorMessage(message, UR_RESULT_ERROR_ADAPTER_SPECIFIC); - free(message); -} diff --git a/sycl/plugins/unified_runtime/ur/adapters/cuda/common.hpp b/sycl/plugins/unified_runtime/ur/adapters/cuda/common.hpp deleted file mode 100644 index 1f73a7030e6e5..0000000000000 --- a/sycl/plugins/unified_runtime/ur/adapters/cuda/common.hpp +++ /dev/null @@ -1,59 +0,0 @@ -//===--------- common.hpp - CUDA Adapter ----------------------------------===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// -#pragma once - -#include -#include - -ur_result_t mapErrorUR(CUresult Result); - -/// Converts CUDA error into UR error codes, and outputs error information -/// to stderr. -/// If PI_CUDA_ABORT env variable is defined, it aborts directly instead of -/// throwing the error. This is intended for debugging purposes. -/// \return UR_RESULT_SUCCESS if \param Result was CUDA_SUCCESS. -/// \throw ur_result_t exception (integer) if input was not success. -/// -void checkErrorUR(CUresult Result, const char *Function, int Line, - const char *File); - -void checkErrorUR(ur_result_t Result, const char *Function, int Line, - const char *File); - -#define UR_CHECK_ERROR(Result) \ - checkErrorUR(Result, __func__, __LINE__, __FILE__) - -std::string getCudaVersionString(); - -constexpr size_t MaxMessageSize = 256; -extern thread_local ur_result_t ErrorMessageCode; -extern thread_local char ErrorMessage[MaxMessageSize]; - -// Utility function for setting a message and warning -[[maybe_unused]] void setErrorMessage(const char *pMessage, - ur_result_t ErrorCode); - -void setPluginSpecificMessage(CUresult cu_res); - -/// ------ Error handling, matching OpenCL plugin semantics. -namespace detail { -namespace ur { - -// Report error and no return (keeps compiler from printing warnings). -// TODO: Probably change that to throw a catchable exception, -// but for now it is useful to see every failure. 
-// -[[noreturn]] void die(const char *Message); - -// Reports error messages -void cuPrint(const char *Message); - -void assertion(bool Condition, const char *Message = nullptr); - -} // namespace ur -} // namespace detail diff --git a/sycl/plugins/unified_runtime/ur/adapters/cuda/context.cpp b/sycl/plugins/unified_runtime/ur/adapters/cuda/context.cpp deleted file mode 100644 index 179902a538831..0000000000000 --- a/sycl/plugins/unified_runtime/ur/adapters/cuda/context.cpp +++ /dev/null @@ -1,161 +0,0 @@ -//===--------- context.cpp - CUDA Adapter ---------------------------------===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// - -#include "context.hpp" -#include "usm.hpp" - -#include - -void ur_context_handle_t_::addPool(ur_usm_pool_handle_t Pool) { - std::lock_guard Lock(Mutex); - PoolHandles.insert(Pool); -} - -void ur_context_handle_t_::removePool(ur_usm_pool_handle_t Pool) { - std::lock_guard Lock(Mutex); - PoolHandles.erase(Pool); -} - -ur_usm_pool_handle_t -ur_context_handle_t_::getOwningURPool(umf_memory_pool_t *UMFPool) { - std::lock_guard Lock(Mutex); - for (auto &Pool : PoolHandles) { - if (Pool->hasUMFPool(UMFPool)) { - return Pool; - } - } - return nullptr; -} - -/// Create a UR CUDA context. -/// -/// By default creates a scoped context and keeps the last active CUDA context -/// on top of the CUDA context stack. -/// With the __SYCL_PI_CONTEXT_PROPERTIES_CUDA_PRIMARY key/id and a value of -/// PI_TRUE creates a primary CUDA context and activates it on the CUDA context -/// stack. -/// -UR_APIEXPORT ur_result_t UR_APICALL -urContextCreate(uint32_t DeviceCount, const ur_device_handle_t *phDevices, - const ur_context_properties_t *pProperties, - ur_context_handle_t *phContext) { - std::ignore = DeviceCount; - std::ignore = pProperties; - - assert(DeviceCount == 1); - ur_result_t RetErr = UR_RESULT_SUCCESS; - - std::unique_ptr ContextPtr{nullptr}; - try { - ContextPtr = std::unique_ptr( - new ur_context_handle_t_{*phDevices}); - *phContext = ContextPtr.release(); - } catch (ur_result_t Err) { - RetErr = Err; - } catch (...) { - RetErr = UR_RESULT_ERROR_OUT_OF_RESOURCES; - } - return RetErr; -} - -UR_APIEXPORT ur_result_t UR_APICALL urContextGetInfo( - ur_context_handle_t hContext, ur_context_info_t ContextInfoType, - size_t propSize, void *pContextInfo, size_t *pPropSizeRet) { - UrReturnHelper ReturnValue(propSize, pContextInfo, pPropSizeRet); - - switch (static_cast(ContextInfoType)) { - case UR_CONTEXT_INFO_NUM_DEVICES: - return ReturnValue(1); - case UR_CONTEXT_INFO_DEVICES: - return ReturnValue(hContext->getDevice()); - case UR_CONTEXT_INFO_REFERENCE_COUNT: - return ReturnValue(hContext->getReferenceCount()); - case UR_CONTEXT_INFO_ATOMIC_MEMORY_ORDER_CAPABILITIES: { - uint32_t Capabilities = UR_MEMORY_ORDER_CAPABILITY_FLAG_RELAXED | - UR_MEMORY_ORDER_CAPABILITY_FLAG_ACQUIRE | - UR_MEMORY_ORDER_CAPABILITY_FLAG_RELEASE | - UR_MEMORY_ORDER_CAPABILITY_FLAG_ACQ_REL; - return ReturnValue(Capabilities); - } - case UR_CONTEXT_INFO_ATOMIC_MEMORY_SCOPE_CAPABILITIES: { - int Major = 0; - UR_CHECK_ERROR(cuDeviceGetAttribute( - &Major, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR, - hContext->getDevice()->get())); - uint32_t Capabilities = - (Major >= 7) ? 
UR_MEMORY_SCOPE_CAPABILITY_FLAG_WORK_ITEM | - UR_MEMORY_SCOPE_CAPABILITY_FLAG_SUB_GROUP | - UR_MEMORY_SCOPE_CAPABILITY_FLAG_WORK_GROUP | - UR_MEMORY_SCOPE_CAPABILITY_FLAG_DEVICE | - UR_MEMORY_SCOPE_CAPABILITY_FLAG_SYSTEM - : UR_MEMORY_SCOPE_CAPABILITY_FLAG_WORK_ITEM | - UR_MEMORY_SCOPE_CAPABILITY_FLAG_SUB_GROUP | - UR_MEMORY_SCOPE_CAPABILITY_FLAG_WORK_GROUP | - UR_MEMORY_SCOPE_CAPABILITY_FLAG_DEVICE; - return ReturnValue(Capabilities); - } - case UR_CONTEXT_INFO_USM_MEMCPY2D_SUPPORT: - // 2D USM memcpy is supported. - return ReturnValue(true); - case UR_CONTEXT_INFO_USM_FILL2D_SUPPORT: - // 2D USM operations currently not supported. - return ReturnValue(false); - - default: - break; - } - - return UR_RESULT_ERROR_INVALID_ENUMERATION; -} - -UR_APIEXPORT ur_result_t UR_APICALL -urContextRelease(ur_context_handle_t hContext) { - if (hContext->decrementReferenceCount() > 0) { - return UR_RESULT_SUCCESS; - } - hContext->invokeExtendedDeleters(); - - std::unique_ptr Context{hContext}; - - return UR_RESULT_SUCCESS; -} - -UR_APIEXPORT ur_result_t UR_APICALL -urContextRetain(ur_context_handle_t hContext) { - assert(hContext->getReferenceCount() > 0); - - hContext->incrementReferenceCount(); - return UR_RESULT_SUCCESS; -} - -UR_APIEXPORT ur_result_t UR_APICALL urContextGetNativeHandle( - ur_context_handle_t hContext, ur_native_handle_t *phNativeContext) { - *phNativeContext = reinterpret_cast(hContext->get()); - return UR_RESULT_SUCCESS; -} - -UR_APIEXPORT ur_result_t UR_APICALL urContextCreateWithNativeHandle( - ur_native_handle_t hNativeContext, uint32_t numDevices, - const ur_device_handle_t *phDevices, - const ur_context_native_properties_t *pProperties, - ur_context_handle_t *phContext) { - std::ignore = hNativeContext; - std::ignore = numDevices; - std::ignore = phDevices; - std::ignore = pProperties; - std::ignore = phContext; - - return UR_RESULT_ERROR_UNSUPPORTED_FEATURE; -} - -UR_APIEXPORT ur_result_t UR_APICALL urContextSetExtendedDeleter( - ur_context_handle_t hContext, ur_context_extended_deleter_t pfnDeleter, - void *pUserData) { - hContext->setExtendedDeleter(pfnDeleter, pUserData); - return UR_RESULT_SUCCESS; -} diff --git a/sycl/plugins/unified_runtime/ur/adapters/cuda/context.hpp b/sycl/plugins/unified_runtime/ur/adapters/cuda/context.hpp deleted file mode 100644 index a321c148940b2..0000000000000 --- a/sycl/plugins/unified_runtime/ur/adapters/cuda/context.hpp +++ /dev/null @@ -1,149 +0,0 @@ -//===--------- context.hpp - CUDA Adapter ---------------------------------===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// -#pragma once - -#include -#include - -#include -#include -#include -#include - -#include "common.hpp" -#include "device.hpp" - -#include - -typedef void (*ur_context_extended_deleter_t)(void *user_data); - -/// UR context mapping to a CUDA context object. -/// -/// There is no direct mapping between a CUDA context and a UR context. -/// The main differences are described below: -/// -/// CUDA context vs UR context -/// -/// One of the main differences between the UR API and the CUDA driver API is -/// that the second modifies the state of the threads by assigning -/// `CUcontext` objects to threads. `CUcontext` objects store data associated -/// with a given device and control access to said device from the user side. 
-/// UR API context are objects that are passed to functions, and not bound -/// to threads. -/// The ur_context_handle_t_ object doesn't implement this behavior. It only -/// holds the CUDA context data. The RAII object \ref ScopedContext implements -/// the active context behavior. -/// -/// Primary vs User-defined context -/// -/// CUDA has two different types of context, the Primary context, -/// which is usable by all threads on a given process for a given device, and -/// the aforementioned custom contexts. -/// The CUDA documentation, confirmed with performance analysis, suggest using -/// the Primary context whenever possible. -/// The Primary context is also used by the CUDA Runtime API. -/// For UR applications to interop with CUDA Runtime API, they have to use -/// the primary context - and make that active in the thread. -/// The `ur_context_handle_t_` object can be constructed with a `kind` parameter -/// that allows to construct a Primary or `user-defined` context, so that -/// the UR object interface is always the same. -/// -/// Destructor callback -/// -/// Required to implement CP023, SYCL Extended Context Destruction, -/// the PI Context can store a number of callback functions that will be -/// called upon destruction of the UR Context. -/// See proposal for details. -/// https://github.com/codeplaysoftware/standards-proposals/blob/master/extended-context-destruction/index.md -/// -struct ur_context_handle_t_ { - - struct deleter_data { - ur_context_extended_deleter_t Function; - void *UserData; - - void operator()() { Function(UserData); } - }; - - using native_type = CUcontext; - - native_type CUContext; - ur_device_handle_t DeviceID; - std::atomic_uint32_t RefCount; - - ur_context_handle_t_(ur_device_handle_t_ *DevID) - : CUContext{DevID->getContext()}, DeviceID{DevID}, RefCount{1} { - urDeviceRetain(DeviceID); - }; - - ~ur_context_handle_t_() { urDeviceRelease(DeviceID); } - - void invokeExtendedDeleters() { - std::lock_guard Guard(Mutex); - for (auto &Deleter : ExtendedDeleters) { - Deleter(); - } - } - - void setExtendedDeleter(ur_context_extended_deleter_t Function, - void *UserData) { - std::lock_guard Guard(Mutex); - ExtendedDeleters.emplace_back(deleter_data{Function, UserData}); - } - - ur_device_handle_t getDevice() const noexcept { return DeviceID; } - - native_type get() const noexcept { return CUContext; } - - uint32_t incrementReferenceCount() noexcept { return ++RefCount; } - - uint32_t decrementReferenceCount() noexcept { return --RefCount; } - - uint32_t getReferenceCount() const noexcept { return RefCount; } - - void addPool(ur_usm_pool_handle_t Pool); - - void removePool(ur_usm_pool_handle_t Pool); - - ur_usm_pool_handle_t getOwningURPool(umf_memory_pool_t *UMFPool); - -private: - std::mutex Mutex; - std::vector ExtendedDeleters; - std::set PoolHandles; -}; - -namespace { -class ScopedContext { -public: - ScopedContext(ur_context_handle_t Context) { - if (!Context) { - throw UR_RESULT_ERROR_INVALID_CONTEXT; - } - - setContext(Context->get()); - } - - ScopedContext(CUcontext NativeContext) { setContext(NativeContext); } - - ~ScopedContext() {} - -private: - void setContext(CUcontext Desired) { - CUcontext Original = nullptr; - - UR_CHECK_ERROR(cuCtxGetCurrent(&Original)); - - // Make sure the desired context is active on the current thread, setting - // it if necessary - if (Original != Desired) { - UR_CHECK_ERROR(cuCtxSetCurrent(Desired)); - } - } -}; -} // namespace diff --git a/sycl/plugins/unified_runtime/ur/adapters/cuda/device.cpp 
b/sycl/plugins/unified_runtime/ur/adapters/cuda/device.cpp deleted file mode 100644 index ece3dca15a3b3..0000000000000 --- a/sycl/plugins/unified_runtime/ur/adapters/cuda/device.cpp +++ /dev/null @@ -1,1212 +0,0 @@ -//===--------- device.cpp - CUDA Adapter ----------------------------------===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// - -#include -#include -#include - -#include "adapter.hpp" -#include "context.hpp" -#include "device.hpp" -#include "platform.hpp" - -int getAttribute(ur_device_handle_t device, CUdevice_attribute attribute) { - int value; - - UR_CHECK_ERROR(cuDeviceGetAttribute(&value, attribute, device->get())); - return value; -} - -uint64_t ur_device_handle_t_::getElapsedTime(CUevent ev) const { - float Milliseconds = 0.0f; - - // cuEventSynchronize waits till the event is ready for call to - // cuEventElapsedTime. - UR_CHECK_ERROR(cuEventSynchronize(EvBase)); - UR_CHECK_ERROR(cuEventSynchronize(ev)); - UR_CHECK_ERROR(cuEventElapsedTime(&Milliseconds, EvBase, ev)); - - return static_cast(Milliseconds * 1.0e6); -} - -UR_APIEXPORT ur_result_t UR_APICALL urDeviceGetInfo(ur_device_handle_t hDevice, - ur_device_info_t propName, - size_t propSize, - void *pPropValue, - size_t *pPropSizeRet) { - UrReturnHelper ReturnValue(propSize, pPropValue, pPropSizeRet); - - static constexpr uint32_t MaxWorkItemDimensions = 3u; - - ScopedContext Active(hDevice->getContext()); - - switch ((uint32_t)propName) { - case UR_DEVICE_INFO_TYPE: { - return ReturnValue(UR_DEVICE_TYPE_GPU); - } - case UR_DEVICE_INFO_VENDOR_ID: { - return ReturnValue(4318u); - } - case UR_DEVICE_INFO_MAX_COMPUTE_UNITS: { - int ComputeUnits = 0; - UR_CHECK_ERROR(cuDeviceGetAttribute( - &ComputeUnits, CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT, - hDevice->get())); - detail::ur::assertion(ComputeUnits >= 0); - return ReturnValue(static_cast(ComputeUnits)); - } - case UR_DEVICE_INFO_MAX_WORK_ITEM_DIMENSIONS: { - return ReturnValue(MaxWorkItemDimensions); - } - case UR_DEVICE_INFO_MAX_WORK_ITEM_SIZES: { - struct { - size_t Sizes[MaxWorkItemDimensions]; - } ReturnSizes; - - int MaxX = 0, MaxY = 0, MaxZ = 0; - UR_CHECK_ERROR(cuDeviceGetAttribute( - &MaxX, CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_X, hDevice->get())); - detail::ur::assertion(MaxX >= 0); - - UR_CHECK_ERROR(cuDeviceGetAttribute( - &MaxY, CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_Y, hDevice->get())); - detail::ur::assertion(MaxY >= 0); - - UR_CHECK_ERROR(cuDeviceGetAttribute( - &MaxZ, CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_Z, hDevice->get())); - detail::ur::assertion(MaxZ >= 0); - - ReturnSizes.Sizes[0] = size_t(MaxX); - ReturnSizes.Sizes[1] = size_t(MaxY); - ReturnSizes.Sizes[2] = size_t(MaxZ); - return ReturnValue(ReturnSizes); - } - - case UR_DEVICE_INFO_MAX_WORK_GROUPS_3D: { - struct { - size_t Sizes[MaxWorkItemDimensions]; - } ReturnSizes; - int MaxX = 0, MaxY = 0, MaxZ = 0; - UR_CHECK_ERROR(cuDeviceGetAttribute( - &MaxX, CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_X, hDevice->get())); - detail::ur::assertion(MaxX >= 0); - - UR_CHECK_ERROR(cuDeviceGetAttribute( - &MaxY, CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_Y, hDevice->get())); - detail::ur::assertion(MaxY >= 0); - - UR_CHECK_ERROR(cuDeviceGetAttribute( - &MaxZ, CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_Z, hDevice->get())); - detail::ur::assertion(MaxZ >= 0); - - ReturnSizes.Sizes[0] = size_t(MaxX); - 
ReturnSizes.Sizes[1] = size_t(MaxY); - ReturnSizes.Sizes[2] = size_t(MaxZ); - return ReturnValue(ReturnSizes); - } - - case UR_DEVICE_INFO_MAX_WORK_GROUP_SIZE: { - int MaxWorkGroupSize = 0; - UR_CHECK_ERROR(cuDeviceGetAttribute( - &MaxWorkGroupSize, CU_DEVICE_ATTRIBUTE_MAX_THREADS_PER_BLOCK, - hDevice->get())); - - detail::ur::assertion(MaxWorkGroupSize >= 0); - - return ReturnValue(size_t(MaxWorkGroupSize)); - } - case UR_DEVICE_INFO_PREFERRED_VECTOR_WIDTH_CHAR: { - return ReturnValue(1u); - } - case UR_DEVICE_INFO_PREFERRED_VECTOR_WIDTH_SHORT: { - return ReturnValue(1u); - } - case UR_DEVICE_INFO_PREFERRED_VECTOR_WIDTH_INT: { - return ReturnValue(1u); - } - case UR_DEVICE_INFO_PREFERRED_VECTOR_WIDTH_LONG: { - return ReturnValue(1u); - } - case UR_DEVICE_INFO_PREFERRED_VECTOR_WIDTH_FLOAT: { - return ReturnValue(1u); - } - case UR_DEVICE_INFO_PREFERRED_VECTOR_WIDTH_DOUBLE: { - return ReturnValue(1u); - } - case UR_DEVICE_INFO_PREFERRED_VECTOR_WIDTH_HALF: { - return ReturnValue(0u); - } - case UR_DEVICE_INFO_NATIVE_VECTOR_WIDTH_CHAR: { - return ReturnValue(1u); - } - case UR_DEVICE_INFO_NATIVE_VECTOR_WIDTH_SHORT: { - return ReturnValue(1u); - } - case UR_DEVICE_INFO_NATIVE_VECTOR_WIDTH_INT: { - return ReturnValue(1u); - } - case UR_DEVICE_INFO_NATIVE_VECTOR_WIDTH_LONG: { - return ReturnValue(1u); - } - case UR_DEVICE_INFO_NATIVE_VECTOR_WIDTH_FLOAT: { - return ReturnValue(1u); - } - case UR_DEVICE_INFO_NATIVE_VECTOR_WIDTH_DOUBLE: { - return ReturnValue(1u); - } - case UR_DEVICE_INFO_NATIVE_VECTOR_WIDTH_HALF: { - return ReturnValue(0u); - } - case UR_DEVICE_INFO_MAX_NUM_SUB_GROUPS: { - // Number of sub-groups = max block size / warp size + possible remainder - int MaxThreads = 0; - UR_CHECK_ERROR(cuDeviceGetAttribute( - &MaxThreads, CU_DEVICE_ATTRIBUTE_MAX_THREADS_PER_BLOCK, - hDevice->get())); - int WarpSize = 0; - UR_CHECK_ERROR(cuDeviceGetAttribute( - &WarpSize, CU_DEVICE_ATTRIBUTE_WARP_SIZE, hDevice->get())); - int MaxWarps = (MaxThreads + WarpSize - 1) / WarpSize; - return ReturnValue(MaxWarps); - } - case UR_DEVICE_INFO_SUB_GROUP_INDEPENDENT_FORWARD_PROGRESS: { - // Volta provides independent thread scheduling - // TODO: Revisit for previous generation GPUs - int Major = 0; - UR_CHECK_ERROR(cuDeviceGetAttribute( - &Major, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR, hDevice->get())); - bool IFP = (Major >= 7); - return ReturnValue(IFP); - } - - case UR_DEVICE_INFO_ATOMIC_64: { - int Major = 0; - UR_CHECK_ERROR(cuDeviceGetAttribute( - &Major, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR, hDevice->get())); - - bool Atomic64 = (Major >= 6) ? true : false; - return ReturnValue(Atomic64); - } - case UR_DEVICE_INFO_ATOMIC_MEMORY_ORDER_CAPABILITIES: { - ur_memory_order_capability_flags_t Capabilities = - UR_MEMORY_ORDER_CAPABILITY_FLAG_RELAXED | - UR_MEMORY_ORDER_CAPABILITY_FLAG_ACQUIRE | - UR_MEMORY_ORDER_CAPABILITY_FLAG_RELEASE | - UR_MEMORY_ORDER_CAPABILITY_FLAG_ACQ_REL; - return ReturnValue(Capabilities); - } - case UR_DEVICE_INFO_ATOMIC_MEMORY_SCOPE_CAPABILITIES: { - int Major = 0; - UR_CHECK_ERROR(cuDeviceGetAttribute( - &Major, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR, hDevice->get())); - uint64_t Capabilities = - (Major >= 7) ? 
UR_MEMORY_SCOPE_CAPABILITY_FLAG_WORK_ITEM | - UR_MEMORY_SCOPE_CAPABILITY_FLAG_SUB_GROUP | - UR_MEMORY_SCOPE_CAPABILITY_FLAG_WORK_GROUP | - UR_MEMORY_SCOPE_CAPABILITY_FLAG_DEVICE | - UR_MEMORY_SCOPE_CAPABILITY_FLAG_SYSTEM - : UR_MEMORY_SCOPE_CAPABILITY_FLAG_WORK_ITEM | - UR_MEMORY_SCOPE_CAPABILITY_FLAG_SUB_GROUP | - UR_MEMORY_SCOPE_CAPABILITY_FLAG_WORK_GROUP | - UR_MEMORY_SCOPE_CAPABILITY_FLAG_DEVICE; - return ReturnValue(Capabilities); - } - - case UR_DEVICE_INFO_ATOMIC_FENCE_ORDER_CAPABILITIES: { - // SYCL2020 4.6.4.2 minimum mandated capabilities for - // atomic_fence_order_capabilities. - ur_memory_order_capability_flags_t Capabilities = - UR_MEMORY_ORDER_CAPABILITY_FLAG_RELAXED | - UR_MEMORY_ORDER_CAPABILITY_FLAG_ACQUIRE | - UR_MEMORY_ORDER_CAPABILITY_FLAG_RELEASE | - UR_MEMORY_ORDER_CAPABILITY_FLAG_ACQ_REL; - return ReturnValue(Capabilities); - } - case UR_DEVICE_INFO_ATOMIC_FENCE_SCOPE_CAPABILITIES: { - // SYCL2020 4.6.4.2 minimum mandated capabilities for - // atomic_fence/memory_scope_capabilities. - // Because scopes are hierarchical, wider scopes support all narrower - // scopes. At a minimum, each device must support WORK_ITEM, SUB_GROUP and - // WORK_GROUP. (https://github.com/KhronosGroup/SYCL-Docs/pull/382) - ur_memory_scope_capability_flags_t Capabilities = - UR_MEMORY_SCOPE_CAPABILITY_FLAG_WORK_ITEM | - UR_MEMORY_SCOPE_CAPABILITY_FLAG_SUB_GROUP | - UR_MEMORY_SCOPE_CAPABILITY_FLAG_WORK_GROUP; - return ReturnValue(Capabilities); - } - case UR_DEVICE_INFO_BFLOAT16: { - int Major = 0; - UR_CHECK_ERROR(cuDeviceGetAttribute( - &Major, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR, hDevice->get())); - - bool BFloat16 = (Major >= 8) ? true : false; - return ReturnValue(BFloat16); - } - case UR_DEVICE_INFO_SUB_GROUP_SIZES_INTEL: { - // NVIDIA devices only support one sub-group size (the warp size) - int WarpSize = 0; - UR_CHECK_ERROR(cuDeviceGetAttribute( - &WarpSize, CU_DEVICE_ATTRIBUTE_WARP_SIZE, hDevice->get())); - size_t Sizes[1] = {static_cast(WarpSize)}; - return ReturnValue(Sizes, 1); - } - case UR_DEVICE_INFO_MAX_CLOCK_FREQUENCY: { - int ClockFreq = 0; - UR_CHECK_ERROR(cuDeviceGetAttribute( - &ClockFreq, CU_DEVICE_ATTRIBUTE_CLOCK_RATE, hDevice->get())); - detail::ur::assertion(ClockFreq >= 0); - return ReturnValue(static_cast(ClockFreq) / 1000u); - } - case UR_DEVICE_INFO_ADDRESS_BITS: { - auto Bits = uint32_t{std::numeric_limits::digits}; - return ReturnValue(Bits); - } - case UR_DEVICE_INFO_MAX_MEM_ALLOC_SIZE: { - return ReturnValue(uint64_t{hDevice->getMaxAllocSize()}); - } - case UR_DEVICE_INFO_IMAGE_SUPPORTED: { - bool Enabled = false; - - if (std::getenv("SYCL_PI_CUDA_ENABLE_IMAGE_SUPPORT") != nullptr || - std::getenv("UR_CUDA_ENABLE_IMAGE_SUPPORT") != nullptr) { - Enabled = true; - } else { - detail::ur::cuPrint( - "Images are not fully supported by the CUDA BE, their support is " - "disabled by default. Their partial support can be activated by " - "setting SYCL_PI_CUDA_ENABLE_IMAGE_SUPPORT environment variable at " - "runtime."); - } - - return ReturnValue(Enabled); - } - case UR_DEVICE_INFO_MAX_READ_IMAGE_ARGS: { - // This call doesn't match to CUDA as it doesn't have images, but instead - // surfaces and textures. No clear call in the CUDA API to determine this, - // but some searching found as of SM 2.x 128 are supported. - return ReturnValue(128u); - } - case UR_DEVICE_INFO_MAX_WRITE_IMAGE_ARGS: { - // This call doesn't match to CUDA as it doesn't have images, but instead - // surfaces and textures. 
No clear call in the CUDA API to determine this, - // but some searching found as of SM 2.x 128 are supported. - return ReturnValue(128u); - } - case UR_DEVICE_INFO_IMAGE2D_MAX_HEIGHT: { - // Take the smaller of maximum surface and maximum texture height. - int TexHeight = 0; - UR_CHECK_ERROR(cuDeviceGetAttribute( - &TexHeight, CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_HEIGHT, - hDevice->get())); - detail::ur::assertion(TexHeight >= 0); - int SurfHeight = 0; - UR_CHECK_ERROR(cuDeviceGetAttribute( - &SurfHeight, CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE2D_HEIGHT, - hDevice->get())); - detail::ur::assertion(SurfHeight >= 0); - - int Min = std::min(TexHeight, SurfHeight); - - return ReturnValue(static_cast(Min)); - } - case UR_DEVICE_INFO_IMAGE2D_MAX_WIDTH: { - // Take the smaller of maximum surface and maximum texture width. - int TexWidth = 0; - UR_CHECK_ERROR(cuDeviceGetAttribute( - &TexWidth, CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_WIDTH, - hDevice->get())); - detail::ur::assertion(TexWidth >= 0); - int SurfWidth = 0; - UR_CHECK_ERROR(cuDeviceGetAttribute( - &SurfWidth, CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE2D_WIDTH, - hDevice->get())); - detail::ur::assertion(SurfWidth >= 0); - - int Min = std::min(TexWidth, SurfWidth); - - return ReturnValue(static_cast(Min)); - } - case UR_DEVICE_INFO_IMAGE3D_MAX_HEIGHT: { - // Take the smaller of maximum surface and maximum texture height. - int TexHeight = 0; - UR_CHECK_ERROR(cuDeviceGetAttribute( - &TexHeight, CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE3D_HEIGHT, - hDevice->get())); - detail::ur::assertion(TexHeight >= 0); - int SurfHeight = 0; - UR_CHECK_ERROR(cuDeviceGetAttribute( - &SurfHeight, CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE3D_HEIGHT, - hDevice->get())); - detail::ur::assertion(SurfHeight >= 0); - - int Min = std::min(TexHeight, SurfHeight); - - return ReturnValue(static_cast(Min)); - } - case UR_DEVICE_INFO_IMAGE3D_MAX_WIDTH: { - // Take the smaller of maximum surface and maximum texture width. - int TexWidth = 0; - UR_CHECK_ERROR(cuDeviceGetAttribute( - &TexWidth, CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE3D_WIDTH, - hDevice->get())); - detail::ur::assertion(TexWidth >= 0); - int SurfWidth = 0; - UR_CHECK_ERROR(cuDeviceGetAttribute( - &SurfWidth, CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE3D_WIDTH, - hDevice->get())); - detail::ur::assertion(SurfWidth >= 0); - - int Min = std::min(TexWidth, SurfWidth); - - return ReturnValue(static_cast(Min)); - } - case UR_DEVICE_INFO_IMAGE3D_MAX_DEPTH: { - // Take the smaller of maximum surface and maximum texture depth. - int TexDepth = 0; - UR_CHECK_ERROR(cuDeviceGetAttribute( - &TexDepth, CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE3D_DEPTH, - hDevice->get())); - detail::ur::assertion(TexDepth >= 0); - int SurfDepth = 0; - UR_CHECK_ERROR(cuDeviceGetAttribute( - &SurfDepth, CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE3D_DEPTH, - hDevice->get())); - detail::ur::assertion(SurfDepth >= 0); - - int Min = std::min(TexDepth, SurfDepth); - - return ReturnValue(static_cast(Min)); - } - case UR_DEVICE_INFO_IMAGE_MAX_BUFFER_SIZE: { - // Take the smaller of maximum surface and maximum texture width. 
- int TexWidth = 0; - UR_CHECK_ERROR(cuDeviceGetAttribute( - &TexWidth, CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE1D_WIDTH, - hDevice->get())); - detail::ur::assertion(TexWidth >= 0); - int SurfWidth = 0; - UR_CHECK_ERROR(cuDeviceGetAttribute( - &SurfWidth, CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE1D_WIDTH, - hDevice->get())); - detail::ur::assertion(SurfWidth >= 0); - - int Min = std::min(TexWidth, SurfWidth); - - return ReturnValue(static_cast(Min)); - } - case UR_DEVICE_INFO_IMAGE_MAX_ARRAY_SIZE: { - return ReturnValue(0lu); - } - case UR_DEVICE_INFO_MAX_SAMPLERS: { - // This call is kind of meaningless for cuda, as samplers don't exist. - // Closest thing is textures, which is 128. - return ReturnValue(128u); - } - case UR_DEVICE_INFO_MAX_PARAMETER_SIZE: { - // https://docs.nvidia.com/cuda/cuda-c-programming-guide/#function-parameters - // __global__ function parameters are passed to the device via constant - // memory and are limited to 4 KB. - return ReturnValue(4000lu); - } - case UR_DEVICE_INFO_MEM_BASE_ADDR_ALIGN: { - int MemBaseAddrAlign = 0; - UR_CHECK_ERROR(cuDeviceGetAttribute(&MemBaseAddrAlign, - CU_DEVICE_ATTRIBUTE_TEXTURE_ALIGNMENT, - hDevice->get())); - // Multiply by 8 as clGetDeviceInfo returns this value in bits - MemBaseAddrAlign *= 8; - return ReturnValue(MemBaseAddrAlign); - } - case UR_DEVICE_INFO_HALF_FP_CONFIG: { - // TODO: is this config consistent across all NVIDIA GPUs? - return ReturnValue(0u); - } - case UR_DEVICE_INFO_SINGLE_FP_CONFIG: { - // TODO: is this config consistent across all NVIDIA GPUs? - ur_device_fp_capability_flags_t Config = - UR_DEVICE_FP_CAPABILITY_FLAG_DENORM | - UR_DEVICE_FP_CAPABILITY_FLAG_INF_NAN | - UR_DEVICE_FP_CAPABILITY_FLAG_ROUND_TO_NEAREST | - UR_DEVICE_FP_CAPABILITY_FLAG_ROUND_TO_ZERO | - UR_DEVICE_FP_CAPABILITY_FLAG_ROUND_TO_INF | - UR_DEVICE_FP_CAPABILITY_FLAG_FMA | - UR_DEVICE_FP_CAPABILITY_FLAG_CORRECTLY_ROUNDED_DIVIDE_SQRT; - return ReturnValue(Config); - } - case UR_DEVICE_INFO_DOUBLE_FP_CONFIG: { - // TODO: is this config consistent across all NVIDIA GPUs? - ur_device_fp_capability_flags_t Config = - UR_DEVICE_FP_CAPABILITY_FLAG_DENORM | - UR_DEVICE_FP_CAPABILITY_FLAG_INF_NAN | - UR_DEVICE_FP_CAPABILITY_FLAG_ROUND_TO_NEAREST | - UR_DEVICE_FP_CAPABILITY_FLAG_ROUND_TO_ZERO | - UR_DEVICE_FP_CAPABILITY_FLAG_ROUND_TO_INF | - UR_DEVICE_FP_CAPABILITY_FLAG_FMA; - return ReturnValue(Config); - } - case UR_DEVICE_INFO_GLOBAL_MEM_CACHE_TYPE: { - // TODO: is this config consistent across all NVIDIA GPUs? - return ReturnValue(UR_DEVICE_MEM_CACHE_TYPE_READ_WRITE_CACHE); - } - case UR_DEVICE_INFO_GLOBAL_MEM_CACHELINE_SIZE: { - // The value is documented for all existing GPUs in the CUDA programming - // guidelines, section "H.3.2. Global Memory". - return ReturnValue(128u); - } - case UR_DEVICE_INFO_GLOBAL_MEM_CACHE_SIZE: { - int CacheSize = 0; - UR_CHECK_ERROR(cuDeviceGetAttribute( - &CacheSize, CU_DEVICE_ATTRIBUTE_L2_CACHE_SIZE, hDevice->get())); - detail::ur::assertion(CacheSize >= 0); - // The L2 cache is global to the GPU. - return ReturnValue(static_cast(CacheSize)); - } - case UR_DEVICE_INFO_GLOBAL_MEM_SIZE: { - size_t Bytes = 0; - // Runtime API has easy access to this value, driver API info is scarse. 
- detail::ur::assertion(cuDeviceTotalMem(&Bytes, hDevice->get()) == - CUDA_SUCCESS); - return ReturnValue(uint64_t{Bytes}); - } - case UR_DEVICE_INFO_MAX_CONSTANT_BUFFER_SIZE: { - int ConstantMemory = 0; - UR_CHECK_ERROR(cuDeviceGetAttribute( - &ConstantMemory, CU_DEVICE_ATTRIBUTE_TOTAL_CONSTANT_MEMORY, - hDevice->get())); - detail::ur::assertion(ConstantMemory >= 0); - - return ReturnValue(static_cast<uint64_t>(ConstantMemory)); - } - case UR_DEVICE_INFO_MAX_CONSTANT_ARGS: { - // TODO: is there a way to retrieve this from CUDA driver API? - // Hard coded to value returned by clinfo for OpenCL 1.2 CUDA | GeForce GTX - // 1060 3GB - return ReturnValue(9u); - } - case UR_DEVICE_INFO_LOCAL_MEM_TYPE: { - return ReturnValue(UR_DEVICE_LOCAL_MEM_TYPE_LOCAL); - } - case UR_DEVICE_INFO_LOCAL_MEM_SIZE: { - // OpenCL's "local memory" maps most closely to CUDA's "shared memory". - // CUDA has its own definition of "local memory", which maps to OpenCL's - // "private memory". - if (hDevice->maxLocalMemSizeChosen()) { - return ReturnValue( - static_cast<uint64_t>(hDevice->getMaxChosenLocalMem())); - } else { - int LocalMemSize = 0; - UR_CHECK_ERROR(cuDeviceGetAttribute( - &LocalMemSize, CU_DEVICE_ATTRIBUTE_MAX_SHARED_MEMORY_PER_BLOCK, - hDevice->get())); - detail::ur::assertion(LocalMemSize >= 0); - return ReturnValue(static_cast<uint64_t>(LocalMemSize)); - } - } - case UR_DEVICE_INFO_ERROR_CORRECTION_SUPPORT: { - int ECCEnabled = 0; - UR_CHECK_ERROR(cuDeviceGetAttribute( - &ECCEnabled, CU_DEVICE_ATTRIBUTE_ECC_ENABLED, hDevice->get())); - - detail::ur::assertion((ECCEnabled == 0) | (ECCEnabled == 1)); - auto Result = static_cast<bool>(ECCEnabled); - return ReturnValue(Result); - } - case UR_DEVICE_INFO_HOST_UNIFIED_MEMORY: { - int IsIntegrated = 0; - UR_CHECK_ERROR(cuDeviceGetAttribute( - &IsIntegrated, CU_DEVICE_ATTRIBUTE_INTEGRATED, hDevice->get())); - - detail::ur::assertion((IsIntegrated == 0) | (IsIntegrated == 1)); - auto result = static_cast<bool>(IsIntegrated); - return ReturnValue(result); - } - case UR_DEVICE_INFO_PROFILING_TIMER_RESOLUTION: { - // Hard coded to value returned by clinfo for OpenCL 1.2 CUDA | GeForce GTX - // 1060 3GB - return ReturnValue(1000lu); - } - case UR_DEVICE_INFO_ENDIAN_LITTLE: { - return ReturnValue(true); - } - case UR_DEVICE_INFO_AVAILABLE: { - return ReturnValue(true); - } - case UR_DEVICE_INFO_BUILD_ON_SUBDEVICE: { - return ReturnValue(true); - } - case UR_DEVICE_INFO_COMPILER_AVAILABLE: { - return ReturnValue(true); - } - case UR_DEVICE_INFO_LINKER_AVAILABLE: { - return ReturnValue(true); - } - case UR_DEVICE_INFO_EXECUTION_CAPABILITIES: { - auto Capability = ur_device_exec_capability_flags_t{ - UR_DEVICE_EXEC_CAPABILITY_FLAG_KERNEL}; - return ReturnValue(Capability); - } - case UR_DEVICE_INFO_QUEUE_PROPERTIES: - return ReturnValue( - ur_queue_flag_t(UR_QUEUE_FLAG_OUT_OF_ORDER_EXEC_MODE_ENABLE | - UR_QUEUE_FLAG_PROFILING_ENABLE)); - case UR_DEVICE_INFO_QUEUE_ON_DEVICE_PROPERTIES: { - // The mandated minimum capability: - ur_queue_flags_t Capability = UR_QUEUE_FLAG_PROFILING_ENABLE | - UR_QUEUE_FLAG_OUT_OF_ORDER_EXEC_MODE_ENABLE; - return ReturnValue(Capability); - } - case UR_DEVICE_INFO_QUEUE_ON_HOST_PROPERTIES: { - // The mandated minimum capability: - ur_queue_flags_t Capability = UR_QUEUE_FLAG_PROFILING_ENABLE; - return ReturnValue(Capability); - } - case UR_DEVICE_INFO_BUILT_IN_KERNELS: { - // An empty string is returned if no built-in kernels are supported by the - // device.
- return ReturnValue(""); - } - case UR_DEVICE_INFO_PLATFORM: { - return ReturnValue(hDevice->getPlatform()); - } - case UR_DEVICE_INFO_NAME: { - static constexpr size_t MaxDeviceNameLength = 256u; - char Name[MaxDeviceNameLength]; - UR_CHECK_ERROR(cuDeviceGetName(Name, MaxDeviceNameLength, hDevice->get())); - return ReturnValue(Name, strlen(Name) + 1); - } - case UR_DEVICE_INFO_VENDOR: { - return ReturnValue("NVIDIA Corporation"); - } - case UR_DEVICE_INFO_DRIVER_VERSION: { - auto Version = getCudaVersionString(); - return ReturnValue(Version.c_str()); - } - case UR_DEVICE_INFO_PROFILE: { - return ReturnValue("CUDA"); - } - case UR_DEVICE_INFO_REFERENCE_COUNT: { - return ReturnValue(hDevice->getReferenceCount()); - } - case UR_DEVICE_INFO_VERSION: { - std::stringstream SS; - int Major; - UR_CHECK_ERROR(cuDeviceGetAttribute( - &Major, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR, hDevice->get())); - SS << Major; - int Minor; - UR_CHECK_ERROR(cuDeviceGetAttribute( - &Minor, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR, hDevice->get())); - SS << "." << Minor; - return ReturnValue(SS.str().c_str()); - } - case UR_EXT_DEVICE_INFO_OPENCL_C_VERSION: { - return ReturnValue(""); - } - case UR_DEVICE_INFO_EXTENSIONS: { - - std::string SupportedExtensions = "cl_khr_fp64 cl_khr_subgroups "; - SupportedExtensions += "pi_ext_intel_devicelib_assert "; - SupportedExtensions += " "; - - int Major = 0; - int Minor = 0; - - UR_CHECK_ERROR(cuDeviceGetAttribute( - &Major, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR, hDevice->get())); - UR_CHECK_ERROR(cuDeviceGetAttribute( - &Minor, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR, hDevice->get())); - - if ((Major >= 6) || ((Major == 5) && (Minor >= 3))) { - SupportedExtensions += "cl_khr_fp16 "; - } - - return ReturnValue(SupportedExtensions.c_str()); - } - case UR_DEVICE_INFO_PRINTF_BUFFER_SIZE: { - // The minimum value for the FULL profile is 1 MB. - return ReturnValue(1024lu); - } - case UR_DEVICE_INFO_PREFERRED_INTEROP_USER_SYNC: { - return ReturnValue(true); - } - case UR_DEVICE_INFO_PARENT_DEVICE: { - return ReturnValue(nullptr); - } - case UR_DEVICE_INFO_PARTITION_MAX_SUB_DEVICES: { - return ReturnValue(0u); - } - case UR_DEVICE_INFO_SUPPORTED_PARTITIONS: { - if (pPropSizeRet) { - *pPropSizeRet = 0; - } - return UR_RESULT_SUCCESS; - } - - case UR_DEVICE_INFO_PARTITION_AFFINITY_DOMAIN: { - return ReturnValue(0u); - } - case UR_DEVICE_INFO_PARTITION_TYPE: { - if (pPropSizeRet) { - *pPropSizeRet = 0; - } - return UR_RESULT_SUCCESS; - } - - // Intel USM extensions - - case UR_DEVICE_INFO_USM_HOST_SUPPORT: { - // from cl_intel_unified_shared_memory: "The host memory access capabilities - // apply to any host allocation." 
- // - // query if/how the device can access page-locked host memory, possibly - // through PCIe, using the same pointer as the host - uint32_t Value = {}; - if (getAttribute(hDevice, CU_DEVICE_ATTRIBUTE_UNIFIED_ADDRESSING)) { - // the device shares a unified address space with the host - if (getAttribute(hDevice, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR) >= - 6) { - // compute capability 6.x introduces operations that are atomic with - // respect to other CPUs and GPUs in the system - Value = UR_DEVICE_USM_ACCESS_CAPABILITY_FLAG_ACCESS | - UR_DEVICE_USM_ACCESS_CAPABILITY_FLAG_ATOMIC_ACCESS | - UR_DEVICE_USM_ACCESS_CAPABILITY_FLAG_CONCURRENT_ACCESS | - UR_DEVICE_USM_ACCESS_CAPABILITY_FLAG_ATOMIC_CONCURRENT_ACCESS; - } else { - // on GPU architectures with compute capability lower than 6.x, atomic - // operations from the GPU to CPU memory will not be atomic with respect - // to CPU initiated atomic operations - Value = UR_DEVICE_USM_ACCESS_CAPABILITY_FLAG_ACCESS | - UR_DEVICE_USM_ACCESS_CAPABILITY_FLAG_CONCURRENT_ACCESS; - } - } - return ReturnValue(Value); - } - case UR_DEVICE_INFO_USM_DEVICE_SUPPORT: { - // from cl_intel_unified_shared_memory: - // "The device memory access capabilities apply to any device allocation - // associated with this device." - // - // query how the device can access memory allocated on the device itself (?) - uint32_t Value = - UR_DEVICE_USM_ACCESS_CAPABILITY_FLAG_ACCESS | - UR_DEVICE_USM_ACCESS_CAPABILITY_FLAG_ATOMIC_ACCESS | - UR_DEVICE_USM_ACCESS_CAPABILITY_FLAG_CONCURRENT_ACCESS | - UR_DEVICE_USM_ACCESS_CAPABILITY_FLAG_ATOMIC_CONCURRENT_ACCESS; - return ReturnValue(Value); - } - case UR_DEVICE_INFO_USM_SINGLE_SHARED_SUPPORT: { - // from cl_intel_unified_shared_memory: - // "The single device shared memory access capabilities apply to any shared - // allocation associated with this device." - // - // query if/how the device can access managed memory associated to it - uint32_t Value = {}; - if (getAttribute(hDevice, CU_DEVICE_ATTRIBUTE_MANAGED_MEMORY)) { - // the device can allocate managed memory on this system - Value = UR_DEVICE_USM_ACCESS_CAPABILITY_FLAG_ACCESS | - UR_DEVICE_USM_ACCESS_CAPABILITY_FLAG_ATOMIC_ACCESS; - } - if (getAttribute(hDevice, CU_DEVICE_ATTRIBUTE_CONCURRENT_MANAGED_ACCESS)) { - // the device can coherently access managed memory concurrently with the - // CPU - Value |= UR_DEVICE_USM_ACCESS_CAPABILITY_FLAG_CONCURRENT_ACCESS; - if (getAttribute(hDevice, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR) >= - 6) { - // compute capability 6.x introduces operations that are atomic with - // respect to other CPUs and GPUs in the system - Value |= UR_DEVICE_USM_ACCESS_CAPABILITY_FLAG_ATOMIC_CONCURRENT_ACCESS; - } - } - return ReturnValue(Value); - } - case UR_DEVICE_INFO_USM_CROSS_SHARED_SUPPORT: { - // from cl_intel_unified_shared_memory: - // "The cross-device shared memory access capabilities apply to any shared - // allocation associated with this device, or to any shared memory - // allocation on another device that also supports the same cross-device - // shared memory access capability." 
- // - // query if/how the device can access managed memory associated to other - // devices - uint32_t Value = {}; - if (getAttribute(hDevice, CU_DEVICE_ATTRIBUTE_MANAGED_MEMORY)) { - // the device can allocate managed memory on this system - Value |= UR_DEVICE_USM_ACCESS_CAPABILITY_FLAG_ACCESS; - } - if (getAttribute(hDevice, CU_DEVICE_ATTRIBUTE_CONCURRENT_MANAGED_ACCESS)) { - // all devices with the CU_DEVICE_ATTRIBUTE_CONCURRENT_MANAGED_ACCESS - // attribute can coherently access managed memory concurrently with the - // CPU - Value |= UR_DEVICE_USM_ACCESS_CAPABILITY_FLAG_CONCURRENT_ACCESS; - } - if (getAttribute(hDevice, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR) >= - 6) { - // compute capability 6.x introduces operations that are atomic with - // respect to other CPUs and GPUs in the system - if (Value & UR_DEVICE_USM_ACCESS_CAPABILITY_FLAG_ACCESS) - Value |= UR_DEVICE_USM_ACCESS_CAPABILITY_FLAG_ATOMIC_ACCESS; - if (Value & UR_DEVICE_USM_ACCESS_CAPABILITY_FLAG_CONCURRENT_ACCESS) - Value |= UR_DEVICE_USM_ACCESS_CAPABILITY_FLAG_ATOMIC_CONCURRENT_ACCESS; - } - return ReturnValue(Value); - } - case UR_DEVICE_INFO_USM_SYSTEM_SHARED_SUPPORT: { - // from cl_intel_unified_shared_memory: - // "The shared system memory access capabilities apply to any allocations - // made by a system allocator, such as malloc or new." - // - // query if/how the device can access pageable host memory allocated by the - // system allocator - uint32_t Value = {}; - if (getAttribute(hDevice, CU_DEVICE_ATTRIBUTE_PAGEABLE_MEMORY_ACCESS)) { - // the device supports coherently accessing pageable memory without - // calling cuMemHostRegister/cudaHostRegister on it - if (getAttribute(hDevice, - CU_DEVICE_ATTRIBUTE_HOST_NATIVE_ATOMIC_SUPPORTED)) { - // the link between the device and the host supports native atomic - // operations - Value = UR_DEVICE_USM_ACCESS_CAPABILITY_FLAG_ACCESS | - UR_DEVICE_USM_ACCESS_CAPABILITY_FLAG_ATOMIC_ACCESS | - UR_DEVICE_USM_ACCESS_CAPABILITY_FLAG_CONCURRENT_ACCESS | - UR_DEVICE_USM_ACCESS_CAPABILITY_FLAG_ATOMIC_CONCURRENT_ACCESS; - } else { - // the link between the device and the host does not support native - // atomic operations - Value = UR_DEVICE_USM_ACCESS_CAPABILITY_FLAG_ACCESS | - UR_DEVICE_USM_ACCESS_CAPABILITY_FLAG_CONCURRENT_ACCESS; - } - } - return ReturnValue(Value); - } - case UR_DEVICE_INFO_ASYNC_BARRIER: { - int Value = getAttribute(hDevice, - CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR) >= 8; - return ReturnValue(static_cast<ur_bool_t>(Value)); - } - case UR_DEVICE_INFO_BACKEND_RUNTIME_VERSION: { - int Major = - getAttribute(hDevice, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR); - int Minor = - getAttribute(hDevice, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR); - std::string Result = std::to_string(Major) + "." + std::to_string(Minor); - return ReturnValue(Result.c_str()); - } - - case UR_DEVICE_INFO_GLOBAL_MEM_FREE: { - size_t FreeMemory = 0; - size_t TotalMemory = 0; - detail::ur::assertion(cuMemGetInfo(&FreeMemory, &TotalMemory) == - CUDA_SUCCESS, - "failed cuMemGetInfo() API."); - return ReturnValue(FreeMemory); - } - case UR_DEVICE_INFO_MEMORY_CLOCK_RATE: { - int Value = 0; - UR_CHECK_ERROR(cuDeviceGetAttribute( - &Value, CU_DEVICE_ATTRIBUTE_MEMORY_CLOCK_RATE, hDevice->get())); - detail::ur::assertion(Value >= 0); - // Convert kilohertz to megahertz when returning.
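For example, a device for which CU_DEVICE_ATTRIBUTE_MEMORY_CLOCK_RATE reports 877000 (kHz) is returned by the case below as 877000 / 1000 = 877 MHz.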
- return ReturnValue(Value / 1000); - } - case UR_DEVICE_INFO_MEMORY_BUS_WIDTH: { - int Value = 0; - UR_CHECK_ERROR(cuDeviceGetAttribute( - &Value, CU_DEVICE_ATTRIBUTE_GLOBAL_MEMORY_BUS_WIDTH, hDevice->get())); - detail::ur::assertion(Value >= 0); - return ReturnValue(Value); - } - case UR_DEVICE_INFO_MAX_COMPUTE_QUEUE_INDICES: { - return ReturnValue(int32_t{1}); - } - case UR_DEVICE_INFO_BINDLESS_IMAGES_SUPPORT_EXP: { - // On CUDA bindless images are supported. - return ReturnValue(true); - } - case UR_DEVICE_INFO_BINDLESS_IMAGES_SHARED_USM_SUPPORT_EXP: { - // On CUDA bindless images can be backed by shared (managed) USM. - return ReturnValue(true); - } - case UR_DEVICE_INFO_BINDLESS_IMAGES_1D_USM_SUPPORT_EXP: { - // On CUDA 1D bindless image USM is not supported. - // More specifically, linear filtering is not supported. - return ReturnValue(false); - } - case UR_DEVICE_INFO_BINDLESS_IMAGES_2D_USM_SUPPORT_EXP: { - // On CUDA 2D bindless image USM is supported. - return ReturnValue(true); - } - case UR_DEVICE_INFO_IMAGE_PITCH_ALIGN_EXP: { - int32_t tex_pitch_align = 0; - UR_CHECK_ERROR(cuDeviceGetAttribute( - &tex_pitch_align, CU_DEVICE_ATTRIBUTE_TEXTURE_PITCH_ALIGNMENT, - hDevice->get())); - return ReturnValue(tex_pitch_align); - } - case UR_DEVICE_INFO_MAX_IMAGE_LINEAR_WIDTH_EXP: { - int32_t tex_max_linear_width = 0; - UR_CHECK_ERROR(cuDeviceGetAttribute( - &tex_max_linear_width, - CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_LINEAR_WIDTH, hDevice->get())); - return ReturnValue(tex_max_linear_width); - } - case UR_DEVICE_INFO_MAX_IMAGE_LINEAR_HEIGHT_EXP: { - int32_t tex_max_linear_height = 0; - UR_CHECK_ERROR(cuDeviceGetAttribute( - &tex_max_linear_height, - CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_LINEAR_HEIGHT, hDevice->get())); - return ReturnValue(tex_max_linear_height); - } - case UR_DEVICE_INFO_MAX_IMAGE_LINEAR_PITCH_EXP: { - int32_t tex_max_linear_pitch = 0; - UR_CHECK_ERROR(cuDeviceGetAttribute( - &tex_max_linear_pitch, - CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_LINEAR_PITCH, hDevice->get())); - return ReturnValue(tex_max_linear_pitch); - } - case UR_DEVICE_INFO_MIPMAP_SUPPORT_EXP: { - // CUDA supports mipmaps. - return ReturnValue(true); - } - case UR_DEVICE_INFO_MIPMAP_ANISOTROPY_SUPPORT_EXP: { - // CUDA supports anisotropic filtering. - return ReturnValue(true); - } - case UR_DEVICE_INFO_MIPMAP_MAX_ANISOTROPY_EXP: { - // CUDA has no query for this, but documentation states max value is 16. - return ReturnValue(16.f); - } - case UR_DEVICE_INFO_MIPMAP_LEVEL_REFERENCE_SUPPORT_EXP: { - // CUDA supports creation of images from individual mipmap levels. - return ReturnValue(true); - } - - case UR_DEVICE_INFO_INTEROP_MEMORY_IMPORT_SUPPORT_EXP: { - // CUDA supports importing external memory. - return ReturnValue(true); - } - case UR_DEVICE_INFO_INTEROP_MEMORY_EXPORT_SUPPORT_EXP: { - // CUDA does not support exporting its own device memory. - return ReturnValue(false); - } - case UR_DEVICE_INFO_INTEROP_SEMAPHORE_IMPORT_SUPPORT_EXP: { - // CUDA supports importing external semaphores. - return ReturnValue(true); - } - case UR_DEVICE_INFO_INTEROP_SEMAPHORE_EXPORT_SUPPORT_EXP: { - // CUDA does not support exporting semaphores or events.
- return ReturnValue(false); - } - case UR_DEVICE_INFO_DEVICE_ID: { - int Value = 0; - UR_CHECK_ERROR(cuDeviceGetAttribute( - &Value, CU_DEVICE_ATTRIBUTE_PCI_DEVICE_ID, hDevice->get())); - detail::ur::assertion(Value >= 0); - return ReturnValue(Value); - } - case UR_DEVICE_INFO_UUID: { - CUuuid UUID; -#if (CUDA_VERSION >= 11040) - detail::ur::assertion(cuDeviceGetUuid_v2(&UUID, hDevice->get()) == - CUDA_SUCCESS); -#else - detail::ur::assertion(cuDeviceGetUuid(&UUID, hDevice->get()) == - CUDA_SUCCESS); -#endif - std::array<unsigned char, 16> Name; - std::copy(UUID.bytes, UUID.bytes + 16, Name.begin()); - return ReturnValue(Name.data(), 16); - } - case UR_DEVICE_INFO_MAX_MEMORY_BANDWIDTH: { - int Major = 0; - UR_CHECK_ERROR(cuDeviceGetAttribute( - &Major, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR, hDevice->get())); - - int Minor = 0; - UR_CHECK_ERROR(cuDeviceGetAttribute( - &Minor, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR, hDevice->get())); - - // Some specific devices seem to need special handling. See reference - // https://github.com/jeffhammond/HPCInfo/blob/master/cuda/gpu-detect.cu - bool IsXavierAGX = Major == 7 && Minor == 2; - bool IsOrinAGX = Major == 8 && Minor == 7; - - int MemoryClockKHz = 0; - if (IsXavierAGX) { - MemoryClockKHz = 2133000; - } else if (IsOrinAGX) { - MemoryClockKHz = 3200000; - } else { - UR_CHECK_ERROR(cuDeviceGetAttribute(&MemoryClockKHz, - CU_DEVICE_ATTRIBUTE_MEMORY_CLOCK_RATE, - hDevice->get())); - } - - int MemoryBusWidth = 0; - if (IsOrinAGX) { - MemoryBusWidth = 256; - } else { - UR_CHECK_ERROR(cuDeviceGetAttribute( - &MemoryBusWidth, CU_DEVICE_ATTRIBUTE_GLOBAL_MEMORY_BUS_WIDTH, - hDevice->get())); - } - - uint32_t MemoryBandwidth = MemoryClockKHz * MemoryBusWidth * 250; - - return ReturnValue(MemoryBandwidth); - } - case UR_DEVICE_INFO_IL_VERSION: { - std::string ILVersion = "nvptx-"; - - int DriverVersion = 0; - cuDriverGetVersion(&DriverVersion); - int Major = DriverVersion / 1000; - int Minor = DriverVersion % 1000 / 10; - - // We can work out which ptx ISA version we support based on the versioning - // table published here - // https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#release-notes - // Major versions that we support are consistent in how they line up, so we - // can derive that easily. The minor versions for version 10 don't line up - // the same so it needs a special case. This is not ideal but it does seem - // to be the best bet to avoid a maintenance burden here. - ILVersion += std::to_string(Major - 4) + "."; - if (Major == 10) { - ILVersion += std::to_string(Minor + 3); - } else if (Major >= 11) { - ILVersion += std::to_string(Minor); - } else { - return UR_RESULT_ERROR_INVALID_VALUE; - } - - return ReturnValue(ILVersion.data(), ILVersion.size()); - } - case UR_DEVICE_INFO_MAX_REGISTERS_PER_WORK_GROUP: { - // Maximum number of 32-bit registers available to a thread block. - // Note: This number is shared by all thread blocks simultaneously resident - // on a multiprocessor.
- int MaxRegisters{-1}; - UR_CHECK_ERROR(cuDeviceGetAttribute( - &MaxRegisters, CU_DEVICE_ATTRIBUTE_MAX_REGISTERS_PER_BLOCK, - hDevice->get())); - - detail::ur::assertion(MaxRegisters >= 0); - - return ReturnValue(static_cast<uint32_t>(MaxRegisters)); - } - case UR_DEVICE_INFO_MEM_CHANNEL_SUPPORT: - return ReturnValue(false); - case UR_DEVICE_INFO_IMAGE_SRGB: - return ReturnValue(false); - case UR_DEVICE_INFO_PCI_ADDRESS: { - constexpr size_t AddressBufferSize = 13; - char AddressBuffer[AddressBufferSize]; - UR_CHECK_ERROR( - cuDeviceGetPCIBusId(AddressBuffer, AddressBufferSize, hDevice->get())); - // CUDA API (8.x - 12.1) guarantees 12 bytes + \0 are written - detail::ur::assertion(strnlen(AddressBuffer, AddressBufferSize) == 12); - return ReturnValue(AddressBuffer, - strnlen(AddressBuffer, AddressBufferSize - 1) + 1); - } - case UR_DEVICE_INFO_KERNEL_SET_SPECIALIZATION_CONSTANTS: - return ReturnValue(false); - // TODO: Investigate if this information is available on CUDA. - case UR_DEVICE_INFO_HOST_PIPE_READ_WRITE_SUPPORTED: - return ReturnValue(false); - case UR_DEVICE_INFO_ESIMD_SUPPORT: - return ReturnValue(false); - case UR_DEVICE_INFO_MAX_READ_WRITE_IMAGE_ARGS: - case UR_DEVICE_INFO_GPU_EU_COUNT: - case UR_DEVICE_INFO_GPU_EU_SIMD_WIDTH: - case UR_DEVICE_INFO_GPU_EU_SLICES: - case UR_DEVICE_INFO_GPU_SUBSLICES_PER_SLICE: - case UR_DEVICE_INFO_GPU_EU_COUNT_PER_SUBSLICE: - case UR_DEVICE_INFO_GPU_HW_THREADS_PER_EU: - case UR_DEVICE_INFO_VIRTUAL_MEMORY_SUPPORT: - return UR_RESULT_ERROR_UNSUPPORTED_ENUMERATION; - - default: - break; - } - return UR_RESULT_ERROR_INVALID_ENUMERATION; -} - -/// \return UR_RESULT_SUCCESS if the function is executed successfully -/// CUDA devices are always root devices so retain always returns success. -UR_APIEXPORT ur_result_t UR_APICALL urDeviceRetain(ur_device_handle_t hDevice) { - std::ignore = hDevice; - return UR_RESULT_SUCCESS; -} - -UR_APIEXPORT ur_result_t UR_APICALL -urDevicePartition(ur_device_handle_t, const ur_device_partition_properties_t *, - uint32_t, ur_device_handle_t *, uint32_t *) { - return UR_RESULT_ERROR_UNSUPPORTED_FEATURE; -} - -/// \return UR_RESULT_SUCCESS always since CUDA devices are always root -/// devices. -UR_APIEXPORT ur_result_t UR_APICALL -urDeviceRelease(ur_device_handle_t hDevice) { - std::ignore = hDevice; - return UR_RESULT_SUCCESS; -} - -UR_APIEXPORT ur_result_t UR_APICALL urDeviceGet(ur_platform_handle_t hPlatform, - ur_device_type_t DeviceType, - uint32_t NumEntries, - ur_device_handle_t *phDevices, - uint32_t *pNumDevices) { - ur_result_t Result = UR_RESULT_SUCCESS; - const bool AskingForAll = DeviceType == UR_DEVICE_TYPE_ALL; - const bool AskingForDefault = DeviceType == UR_DEVICE_TYPE_DEFAULT; - const bool AskingForGPU = DeviceType == UR_DEVICE_TYPE_GPU; - const bool ReturnDevices = AskingForDefault || AskingForAll || AskingForGPU; - - size_t NumDevices = ReturnDevices ? hPlatform->Devices.size() : 0; - - try { - if (pNumDevices) { - *pNumDevices = NumDevices; - } - - if (ReturnDevices && phDevices) { - for (size_t i = 0; i < std::min(size_t(NumEntries), NumDevices); ++i) { - phDevices[i] = hPlatform->Devices[i].get(); - } - } - - return Result; - } catch (ur_result_t Err) { - return Err; - } catch (...) { - return UR_RESULT_ERROR_OUT_OF_RESOURCES; - } -} - -/// Gets the native CUDA handle of a UR device object -/// -/// \param[in] device The UR device to get the native CUDA object of. -/// \param[out] nativeHandle Set to the native handle of the UR device object.
-/// -/// \return UR_RESULT_SUCCESS - -UR_APIEXPORT ur_result_t UR_APICALL urDeviceGetNativeHandle( - ur_device_handle_t hDevice, ur_native_handle_t *phNativeHandle) { - *phNativeHandle = reinterpret_cast<ur_native_handle_t>(hDevice->get()); - return UR_RESULT_SUCCESS; -} - -/// Creates a UR device object from a CUDA device handle. -/// NOTE: The created UR object does not take ownership of the native handle. -/// -/// \param[in] nativeHandle The native handle to create UR device object from. -/// \param[in] platform is the UR platform of the device. -/// \param[out] device Set to the UR device object created from native handle. -/// -/// \return TBD - -UR_APIEXPORT ur_result_t UR_APICALL urDeviceCreateWithNativeHandle( - ur_native_handle_t hNativeDevice, ur_platform_handle_t hPlatform, - const ur_device_native_properties_t *pProperties, - ur_device_handle_t *phDevice) { - std::ignore = pProperties; - - // We can't cast between ur_native_handle_t and CUdevice, so memcpy the bits - // instead - CUdevice CuDevice = 0; - memcpy(&CuDevice, &hNativeDevice, sizeof(CUdevice)); - - auto IsDevice = [=](std::unique_ptr<ur_device_handle_t_> &Dev) { - return Dev->get() == CuDevice; - }; - - // If a platform is provided just check if the device is in it - if (hPlatform) { - auto SearchRes = std::find_if(begin(hPlatform->Devices), - end(hPlatform->Devices), IsDevice); - if (SearchRes != end(hPlatform->Devices)) { - *phDevice = SearchRes->get(); - return UR_RESULT_SUCCESS; - } - } - - // Get list of platforms - uint32_t NumPlatforms = 0; - ur_adapter_handle_t AdapterHandle = &adapter; - ur_result_t Result = - urPlatformGet(&AdapterHandle, 1, 0, nullptr, &NumPlatforms); - if (Result != UR_RESULT_SUCCESS) - return Result; - - ur_platform_handle_t *Plat = static_cast<ur_platform_handle_t *>( - malloc(NumPlatforms * sizeof(ur_platform_handle_t))); - Result = urPlatformGet(&AdapterHandle, 1, NumPlatforms, Plat, nullptr); - if (Result != UR_RESULT_SUCCESS) - return Result; - - // Iterate through platforms to find device that matches nativeHandle - for (uint32_t j = 0; j < NumPlatforms; ++j) { - auto SearchRes = - std::find_if(begin(Plat[j]->Devices), end(Plat[j]->Devices), IsDevice); - if (SearchRes != end(Plat[j]->Devices)) { - *phDevice = static_cast<ur_device_handle_t>((*SearchRes).get()); - return UR_RESULT_SUCCESS; - } - } - - // If the provided nativeHandle cannot be matched to an - // existing device return error - return UR_RESULT_ERROR_INVALID_OPERATION; -} - -ur_result_t UR_APICALL urDeviceGetGlobalTimestamps(ur_device_handle_t hDevice, - uint64_t *pDeviceTimestamp, - uint64_t *pHostTimestamp) { - CUevent Event; - ScopedContext Active(hDevice->getContext()); - - if (pDeviceTimestamp) { - UR_CHECK_ERROR(cuEventCreate(&Event, CU_EVENT_DEFAULT)); - UR_CHECK_ERROR(cuEventRecord(Event, 0)); - } - if (pHostTimestamp) { - - using namespace std::chrono; - *pHostTimestamp = - duration_cast<nanoseconds>(steady_clock::now().time_since_epoch()) - .count(); - } - - if (pDeviceTimestamp) { - UR_CHECK_ERROR(cuEventSynchronize(Event)); - *pDeviceTimestamp = hDevice->getElapsedTime(Event); - } - - return UR_RESULT_SUCCESS; -} - -/// \return If available, the first binary that is PTX -/// -UR_APIEXPORT ur_result_t UR_APICALL urDeviceSelectBinary( - ur_device_handle_t hDevice, const ur_device_binary_t *pBinaries, - uint32_t NumBinaries, uint32_t *pSelectedBinary) { - std::ignore = hDevice; - - // Look for an image for the NVPTX64 target, and return the first one that is - // found - for (uint32_t i = 0; i < NumBinaries; i++) { - if (strcmp(pBinaries[i].pDeviceTargetSpec, - UR_DEVICE_BINARY_TARGET_NVPTX64) == 0) { -
*pSelectedBinary = i; - return UR_RESULT_SUCCESS; - } - } - - // No image can be loaded for the given device - return UR_RESULT_ERROR_INVALID_BINARY; -} diff --git a/sycl/plugins/unified_runtime/ur/adapters/cuda/device.hpp b/sycl/plugins/unified_runtime/ur/adapters/cuda/device.hpp deleted file mode 100644 index 696630bd10ca0..0000000000000 --- a/sycl/plugins/unified_runtime/ur/adapters/cuda/device.hpp +++ /dev/null @@ -1,119 +0,0 @@ -//===--------- device.hpp - CUDA Adapter ----------------------------------===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// -#pragma once - -#include <ur/ur.hpp> - -struct ur_device_handle_t_ { -private: - using native_type = CUdevice; - - native_type CuDevice; - CUcontext CuContext; - CUevent EvBase; // CUDA event used as base counter - std::atomic_uint32_t RefCount; - ur_platform_handle_t Platform; - - static constexpr uint32_t MaxWorkItemDimensions = 3u; - size_t MaxWorkItemSizes[MaxWorkItemDimensions]; - size_t MaxWorkGroupSize{0}; - size_t MaxAllocSize{0}; - int MaxBlockDimY{0}; - int MaxBlockDimZ{0}; - int MaxRegsPerBlock{0}; - int MaxCapacityLocalMem{0}; - int MaxChosenLocalMem{0}; - bool MaxLocalMemSizeChosen{false}; - -public: - ur_device_handle_t_(native_type cuDevice, CUcontext cuContext, CUevent evBase, - ur_platform_handle_t platform) - : CuDevice(cuDevice), CuContext(cuContext), EvBase(evBase), RefCount{1}, - Platform(platform) { - - UR_CHECK_ERROR(cuDeviceGetAttribute( - &MaxBlockDimY, CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_Y, cuDevice)); - UR_CHECK_ERROR(cuDeviceGetAttribute( - &MaxBlockDimZ, CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_Z, cuDevice)); - UR_CHECK_ERROR(cuDeviceGetAttribute( - &MaxRegsPerBlock, CU_DEVICE_ATTRIBUTE_MAX_REGISTERS_PER_BLOCK, - cuDevice)); - - // Set local mem max size if env var is present - static const char *LocalMemSizePtrUR = - std::getenv("UR_CUDA_MAX_LOCAL_MEM_SIZE"); - static const char *LocalMemSizePtrPI = - std::getenv("SYCL_PI_CUDA_MAX_LOCAL_MEM_SIZE"); - static const char *LocalMemSizePtr = - LocalMemSizePtrUR ? LocalMemSizePtrUR - : (LocalMemSizePtrPI ? LocalMemSizePtrPI : nullptr); - - if (LocalMemSizePtr) { - cuDeviceGetAttribute( - &MaxCapacityLocalMem, - CU_DEVICE_ATTRIBUTE_MAX_SHARED_MEMORY_PER_BLOCK_OPTIN, cuDevice); - MaxChosenLocalMem = std::atoi(LocalMemSizePtr); - MaxLocalMemSizeChosen = true; - } - - // Max size of memory object allocation in bytes. - // The minimum value is max(min(1024 × 1024 × - // 1024, 1/4th of CL_DEVICE_GLOBAL_MEM_SIZE), - // 32 × 1024 × 1024) for devices that are not of type - // CL_DEVICE_TYPE_CUSTOM.
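Worked through for a hypothetical device with 8 GiB of global memory: Global / 4 = 2 GiB, min(1 GiB, 2 GiB) = 1 GiB, and max(1 GiB, 32 MiB) = 1 GiB, so the code below yields MaxAllocSize = 1 GiB; only devices with less than 4 GiB of memory are limited by the quarter-of-global term.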
- size_t Global = 0; - UR_CHECK_ERROR(cuDeviceTotalMem(&Global, cuDevice)); - - auto QuarterGlobal = static_cast<uint32_t>(Global / 4u); - - MaxAllocSize = std::max(std::min(1024u * 1024u * 1024u, QuarterGlobal), - 32u * 1024u * 1024u); - } - - ~ur_device_handle_t_() { cuDevicePrimaryCtxRelease(CuDevice); } - - native_type get() const noexcept { return CuDevice; }; - - CUcontext getContext() const noexcept { return CuContext; }; - - uint32_t getReferenceCount() const noexcept { return RefCount; } - - ur_platform_handle_t getPlatform() const noexcept { return Platform; }; - - uint64_t getElapsedTime(CUevent) const; - - void saveMaxWorkItemSizes(size_t Size, - size_t *SaveMaxWorkItemSizes) noexcept { - memcpy(MaxWorkItemSizes, SaveMaxWorkItemSizes, Size); - }; - - void saveMaxWorkGroupSize(int Value) noexcept { MaxWorkGroupSize = Value; }; - - void getMaxWorkItemSizes(size_t RetSize, - size_t *RetMaxWorkItemSizes) const noexcept { - memcpy(RetMaxWorkItemSizes, MaxWorkItemSizes, RetSize); - }; - - size_t getMaxWorkGroupSize() const noexcept { return MaxWorkGroupSize; }; - - size_t getMaxBlockDimY() const noexcept { return MaxBlockDimY; }; - - size_t getMaxBlockDimZ() const noexcept { return MaxBlockDimZ; }; - - size_t getMaxRegsPerBlock() const noexcept { return MaxRegsPerBlock; }; - - size_t getMaxAllocSize() const noexcept { return MaxAllocSize; }; - - int getMaxCapacityLocalMem() const noexcept { return MaxCapacityLocalMem; }; - - int getMaxChosenLocalMem() const noexcept { return MaxChosenLocalMem; }; - - bool maxLocalMemSizeChosen() { return MaxLocalMemSizeChosen; }; -}; - -int getAttribute(ur_device_handle_t Device, CUdevice_attribute Attribute); diff --git a/sycl/plugins/unified_runtime/ur/adapters/cuda/enqueue.cpp b/sycl/plugins/unified_runtime/ur/adapters/cuda/enqueue.cpp deleted file mode 100644 index ec1adce808681..0000000000000 --- a/sycl/plugins/unified_runtime/ur/adapters/cuda/enqueue.cpp +++ /dev/null @@ -1,1690 +0,0 @@ -//===--------- enqueue.cpp - CUDA Adapter ---------------------------------===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// - -#include "enqueue.hpp" -#include "common.hpp" -#include "context.hpp" -#include "event.hpp" -#include "kernel.hpp" -#include "memory.hpp" -#include "queue.hpp" - -#include <cmath> -#include <cuda.h> - -ur_result_t enqueueEventsWait(ur_queue_handle_t CommandQueue, CUstream Stream, - uint32_t NumEventsInWaitList, - const ur_event_handle_t *EventWaitList) { - UR_ASSERT(EventWaitList, UR_RESULT_SUCCESS); - - try { - ScopedContext Active(CommandQueue->getContext()); - - auto Result = forLatestEvents( - EventWaitList, NumEventsInWaitList, - [Stream](ur_event_handle_t Event) -> ur_result_t { - if (Event->getStream() == Stream) { - return UR_RESULT_SUCCESS; - } else { - UR_CHECK_ERROR(cuStreamWaitEvent(Stream, Event->get(), 0)); - return UR_RESULT_SUCCESS; - } - }); - return Result; - } catch (ur_result_t Err) { - return Err; - } catch (...)
{ - return UR_RESULT_ERROR_UNKNOWN; - } -} - -template <typename PtrT> -void getUSMHostOrDevicePtr(PtrT USMPtr, CUmemorytype *OutMemType, - CUdeviceptr *OutDevPtr, PtrT *OutHostPtr) { - // do not throw if cuPointerGetAttribute returns CUDA_ERROR_INVALID_VALUE - // checks with UR_CHECK_ERROR are not suggested - CUresult Ret = cuPointerGetAttribute( - OutMemType, CU_POINTER_ATTRIBUTE_MEMORY_TYPE, (CUdeviceptr)USMPtr); - // ARRAY, UNIFIED types are not supported! - assert(*OutMemType != CU_MEMORYTYPE_ARRAY && - *OutMemType != CU_MEMORYTYPE_UNIFIED); - - // pointer not known to the CUDA subsystem (possibly a system allocated ptr) - if (Ret == CUDA_ERROR_INVALID_VALUE) { - *OutMemType = CU_MEMORYTYPE_HOST; - *OutDevPtr = 0; - *OutHostPtr = USMPtr; - - // todo: resets the above "non-stick" error - } else if (Ret == CUDA_SUCCESS) { - *OutDevPtr = (*OutMemType == CU_MEMORYTYPE_DEVICE) - ? reinterpret_cast<CUdeviceptr>(USMPtr) - : 0; - *OutHostPtr = (*OutMemType == CU_MEMORYTYPE_HOST) ? USMPtr : nullptr; - } else { - UR_CHECK_ERROR(Ret); - } -} - -ur_result_t setCuMemAdvise(CUdeviceptr DevPtr, size_t Size, - ur_usm_advice_flags_t URAdviceFlags, - CUdevice Device) { - std::unordered_map<ur_usm_advice_flags_t, CUmem_advise> - URToCUMemAdviseDeviceFlagsMap = { - {UR_USM_ADVICE_FLAG_SET_READ_MOSTLY, CU_MEM_ADVISE_SET_READ_MOSTLY}, - {UR_USM_ADVICE_FLAG_CLEAR_READ_MOSTLY, - CU_MEM_ADVISE_UNSET_READ_MOSTLY}, - {UR_USM_ADVICE_FLAG_SET_PREFERRED_LOCATION, - CU_MEM_ADVISE_SET_PREFERRED_LOCATION}, - {UR_USM_ADVICE_FLAG_CLEAR_PREFERRED_LOCATION, - CU_MEM_ADVISE_UNSET_PREFERRED_LOCATION}, - {UR_USM_ADVICE_FLAG_SET_ACCESSED_BY_DEVICE, - CU_MEM_ADVISE_SET_ACCESSED_BY}, - {UR_USM_ADVICE_FLAG_CLEAR_ACCESSED_BY_DEVICE, - CU_MEM_ADVISE_UNSET_ACCESSED_BY}, - }; - for (auto &FlagPair : URToCUMemAdviseDeviceFlagsMap) { - if (URAdviceFlags & FlagPair.first) { - UR_CHECK_ERROR(cuMemAdvise(DevPtr, Size, FlagPair.second, Device)); - } - } - - std::unordered_map<ur_usm_advice_flags_t, CUmem_advise> - URToCUMemAdviseHostFlagsMap = { - {UR_USM_ADVICE_FLAG_SET_PREFERRED_LOCATION_HOST, - CU_MEM_ADVISE_SET_PREFERRED_LOCATION}, - {UR_USM_ADVICE_FLAG_CLEAR_PREFERRED_LOCATION_HOST, - CU_MEM_ADVISE_UNSET_PREFERRED_LOCATION}, - {UR_USM_ADVICE_FLAG_SET_ACCESSED_BY_HOST, - CU_MEM_ADVISE_SET_ACCESSED_BY}, - {UR_USM_ADVICE_FLAG_CLEAR_ACCESSED_BY_HOST, - CU_MEM_ADVISE_UNSET_ACCESSED_BY}, - }; - - for (auto &FlagPair : URToCUMemAdviseHostFlagsMap) { - if (URAdviceFlags & FlagPair.first) { - UR_CHECK_ERROR(cuMemAdvise(DevPtr, Size, FlagPair.second, CU_DEVICE_CPU)); - } - } - - std::array<ur_usm_advice_flags_t, 4> UnmappedMemAdviceFlags = { - UR_USM_ADVICE_FLAG_SET_NON_ATOMIC_MOSTLY, - UR_USM_ADVICE_FLAG_CLEAR_NON_ATOMIC_MOSTLY, - UR_USM_ADVICE_FLAG_BIAS_CACHED, UR_USM_ADVICE_FLAG_BIAS_UNCACHED}; - - for (auto &UnmappedFlag : UnmappedMemAdviceFlags) { - if (URAdviceFlags & UnmappedFlag) { - throw UR_RESULT_ERROR_INVALID_ENUMERATION; - } - } - - return UR_RESULT_SUCCESS; -} - -// Determine local work sizes that result in uniform work groups. -// The default threadsPerBlock only requires handling the first work_dim -// dimension. -void guessLocalWorkSize(ur_device_handle_t Device, size_t *ThreadsPerBlock, - const size_t *GlobalWorkSize, const uint32_t WorkDim, - const size_t MaxThreadsPerBlock[3], - ur_kernel_handle_t Kernel, uint32_t LocalSize) { - assert(ThreadsPerBlock != nullptr); - assert(GlobalWorkSize != nullptr); - assert(Kernel != nullptr); - int MinGrid, MaxBlockSize; - size_t MaxBlockDim[3]; - - // The below assumes a three dimensional range but this is not guaranteed by - // UR.
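For example, a workDim of 1 with GlobalWorkSize = {1000} is padded below to GlobalSizeNormalized = {1000, 1, 1}, so the second and third dimensions fall through the per-dimension logic harmlessly.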
- size_t GlobalSizeNormalized[3] = {1, 1, 1}; - for (uint32_t i = 0; i < WorkDim; i++) { - GlobalSizeNormalized[i] = GlobalWorkSize[i]; - } - - MaxBlockDim[1] = Device->getMaxBlockDimY(); - MaxBlockDim[2] = Device->getMaxBlockDimZ(); - - UR_CHECK_ERROR( - cuOccupancyMaxPotentialBlockSize(&MinGrid, &MaxBlockSize, Kernel->get(), - NULL, LocalSize, MaxThreadsPerBlock[0])); - - ThreadsPerBlock[2] = std::min(GlobalSizeNormalized[2], MaxBlockDim[2]); - ThreadsPerBlock[1] = - std::min(GlobalSizeNormalized[1], - std::min(MaxBlockSize / ThreadsPerBlock[2], MaxBlockDim[1])); - MaxBlockDim[0] = MaxBlockSize / (ThreadsPerBlock[1] * ThreadsPerBlock[2]); - ThreadsPerBlock[0] = std::min( - MaxThreadsPerBlock[0], std::min(GlobalSizeNormalized[0], MaxBlockDim[0])); - - static auto IsPowerOf2 = [](size_t Value) -> bool { - return Value && !(Value & (Value - 1)); - }; - - // Find a local work group size that is a divisor of the global - // work group size to produce uniform work groups. - // Additionally, for best compute utilisation, the local size has - // to be a power of two. - while (0u != (GlobalSizeNormalized[0] % ThreadsPerBlock[0]) || - !IsPowerOf2(ThreadsPerBlock[0])) { - --ThreadsPerBlock[0]; - } -} - -// Helper to verify out-of-registers case (exceeded block max registers). -// If the kernel requires a number of registers for the entire thread -// block exceeds the hardware limitations, then the cuLaunchKernel call -// will fail to launch with CUDA_ERROR_LAUNCH_OUT_OF_RESOURCES error. -bool hasExceededMaxRegistersPerBlock(ur_device_handle_t Device, - ur_kernel_handle_t Kernel, - size_t BlockSize) { - return BlockSize * Kernel->getRegsPerThread() > Device->getMaxRegsPerBlock(); -} - -/// Enqueues a wait on the given CUstream for all specified events (See -/// \ref enqueueEventWaitWithBarrier.) If the events list is empty, the enqueued -/// wait will wait on all previous events in the queue. -/// -UR_APIEXPORT ur_result_t UR_APICALL urEnqueueEventsWaitWithBarrier( - ur_queue_handle_t hQueue, uint32_t numEventsInWaitList, - const ur_event_handle_t *phEventWaitList, ur_event_handle_t *phEvent) { - // This function makes one stream work on the previous work (or work - // represented by input events) and then all future work waits on that stream. 
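The barrier described in this comment is built from the standard CUDA record-then-wait idiom; a minimal sketch under assumed names (OtherStream and ChosenStream are placeholders, not identifiers from this file):

    // One marker event per foreign stream; the chosen stream waits on each
    // marker, so work enqueued after the barrier orders behind all of them.
    CUevent Marker;
    UR_CHECK_ERROR(cuEventCreate(&Marker, CU_EVENT_DISABLE_TIMING));
    UR_CHECK_ERROR(cuEventRecord(Marker, OtherStream));
    UR_CHECK_ERROR(cuStreamWaitEvent(ChosenStream, Marker, 0));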
- try { - ScopedContext Active(hQueue->getContext()); - uint32_t StreamToken; - ur_stream_guard_ Guard; - CUstream CuStream = hQueue->getNextComputeStream( - numEventsInWaitList, phEventWaitList, Guard, &StreamToken); - { - std::lock_guard GuardBarrier(hQueue->BarrierMutex); - if (hQueue->BarrierEvent == nullptr) { - UR_CHECK_ERROR( - cuEventCreate(&hQueue->BarrierEvent, CU_EVENT_DISABLE_TIMING)); - } - if (numEventsInWaitList == 0) { // wait on all work - if (hQueue->BarrierTmpEvent == nullptr) { - UR_CHECK_ERROR( - cuEventCreate(&hQueue->BarrierTmpEvent, CU_EVENT_DISABLE_TIMING)); - } - hQueue->syncStreams( - [CuStream, TmpEvent = hQueue->BarrierTmpEvent](CUstream s) { - if (CuStream != s) { - // record a new CUDA event on every stream and make one stream - // wait for these events - UR_CHECK_ERROR(cuEventRecord(TmpEvent, s)); - UR_CHECK_ERROR(cuStreamWaitEvent(CuStream, TmpEvent, 0)); - } - }); - } else { // wait just on given events - forLatestEvents(phEventWaitList, numEventsInWaitList, - [CuStream](ur_event_handle_t Event) -> ur_result_t { - if (Event->getQueue()->hasBeenSynchronized( - Event->getComputeStreamToken())) { - return UR_RESULT_SUCCESS; - } else { - UR_CHECK_ERROR( - cuStreamWaitEvent(CuStream, Event->get(), 0)); - return UR_RESULT_SUCCESS; - } - }); - } - - UR_CHECK_ERROR(cuEventRecord(hQueue->BarrierEvent, CuStream)); - for (unsigned int i = 0; i < hQueue->ComputeAppliedBarrier.size(); i++) { - hQueue->ComputeAppliedBarrier[i] = false; - } - for (unsigned int i = 0; i < hQueue->TransferAppliedBarrier.size(); i++) { - hQueue->TransferAppliedBarrier[i] = false; - } - } - - if (phEvent) { - *phEvent = ur_event_handle_t_::makeNative( - UR_COMMAND_EVENTS_WAIT_WITH_BARRIER, hQueue, CuStream, StreamToken); - UR_CHECK_ERROR((*phEvent)->start()); - UR_CHECK_ERROR((*phEvent)->record()); - } - - return UR_RESULT_SUCCESS; - } catch (ur_result_t Err) { - return Err; - } catch (...) { - return UR_RESULT_ERROR_UNKNOWN; - } -} - -/// Enqueues a wait on the given CUstream for all events. -/// See \ref enqueueEventWait -/// TODO: Add support for multiple streams once the Event class is properly -/// refactored. 
-/// -UR_APIEXPORT ur_result_t UR_APICALL urEnqueueEventsWait( - ur_queue_handle_t hQueue, uint32_t numEventsInWaitList, - const ur_event_handle_t *phEventWaitList, ur_event_handle_t *phEvent) { - return urEnqueueEventsWaitWithBarrier(hQueue, numEventsInWaitList, - phEventWaitList, phEvent); -} - -UR_APIEXPORT ur_result_t UR_APICALL urEnqueueKernelLaunch( - ur_queue_handle_t hQueue, ur_kernel_handle_t hKernel, uint32_t workDim, - const size_t *pGlobalWorkOffset, const size_t *pGlobalWorkSize, - const size_t *pLocalWorkSize, uint32_t numEventsInWaitList, - const ur_event_handle_t *phEventWaitList, ur_event_handle_t *phEvent) { - // Preconditions - UR_ASSERT(hQueue->getContext() == hKernel->getContext(), - UR_RESULT_ERROR_INVALID_KERNEL); - UR_ASSERT(workDim > 0, UR_RESULT_ERROR_INVALID_WORK_DIMENSION); - UR_ASSERT(workDim < 4, UR_RESULT_ERROR_INVALID_WORK_DIMENSION); - - if (*pGlobalWorkSize == 0) { - return urEnqueueEventsWaitWithBarrier(hQueue, numEventsInWaitList, - phEventWaitList, phEvent); - } - - // Set the number of threads per block to the number of threads per warp - // by default unless user has provided a better number - size_t ThreadsPerBlock[3] = {32u, 1u, 1u}; - size_t MaxWorkGroupSize = 0u; - size_t MaxThreadsPerBlock[3] = {}; - bool ProvidedLocalWorkGroupSize = (pLocalWorkSize != nullptr); - uint32_t LocalSize = hKernel->getLocalSize(); - ur_result_t Result = UR_RESULT_SUCCESS; - - try { - // Set the active context here as guessLocalWorkSize needs an active context - ScopedContext Active(hQueue->getContext()); - { - size_t *ReqdThreadsPerBlock = hKernel->ReqdThreadsPerBlock; - MaxWorkGroupSize = hQueue->Device->getMaxWorkGroupSize(); - hQueue->Device->getMaxWorkItemSizes(sizeof(MaxThreadsPerBlock), - MaxThreadsPerBlock); - - if (ProvidedLocalWorkGroupSize) { - auto IsValid = [&](int Dim) { - if (ReqdThreadsPerBlock[Dim] != 0 && - pLocalWorkSize[Dim] != ReqdThreadsPerBlock[Dim]) - return UR_RESULT_ERROR_INVALID_WORK_GROUP_SIZE; - - if (pLocalWorkSize[Dim] > MaxThreadsPerBlock[Dim]) - return UR_RESULT_ERROR_INVALID_WORK_GROUP_SIZE; - // Checks that local work sizes are a divisor of the global work sizes - // which includes that the local work sizes are neither larger than - // the global work sizes and not 0. - if (0u == pLocalWorkSize[Dim]) - return UR_RESULT_ERROR_INVALID_WORK_GROUP_SIZE; - if (0u != (pGlobalWorkSize[Dim] % pLocalWorkSize[Dim])) - return UR_RESULT_ERROR_INVALID_WORK_GROUP_SIZE; - ThreadsPerBlock[Dim] = pLocalWorkSize[Dim]; - return UR_RESULT_SUCCESS; - }; - - size_t KernelLocalWorkGroupSize = 0; - for (size_t Dim = 0; Dim < workDim; Dim++) { - auto Err = IsValid(Dim); - if (Err != UR_RESULT_SUCCESS) - return Err; - // If no error then sum the total local work size per dim. 
- KernelLocalWorkGroupSize += pLocalWorkSize[Dim]; - } - - if (hasExceededMaxRegistersPerBlock(hQueue->Device, hKernel, - KernelLocalWorkGroupSize)) { - return UR_RESULT_ERROR_INVALID_WORK_GROUP_SIZE; - } - } else { - guessLocalWorkSize(hQueue->Device, ThreadsPerBlock, pGlobalWorkSize, - workDim, MaxThreadsPerBlock, hKernel, LocalSize); - } - } - - if (MaxWorkGroupSize < - ThreadsPerBlock[0] * ThreadsPerBlock[1] * ThreadsPerBlock[2]) { - return UR_RESULT_ERROR_INVALID_WORK_GROUP_SIZE; - } - - size_t BlocksPerGrid[3] = {1u, 1u, 1u}; - - for (size_t i = 0; i < workDim; i++) { - BlocksPerGrid[i] = - (pGlobalWorkSize[i] + ThreadsPerBlock[i] - 1) / ThreadsPerBlock[i]; - } - - std::unique_ptr<ur_event_handle_t_> RetImplEvent{nullptr}; - - uint32_t StreamToken; - ur_stream_guard_ Guard; - CUstream CuStream = hQueue->getNextComputeStream( - numEventsInWaitList, phEventWaitList, Guard, &StreamToken); - CUfunction CuFunc = hKernel->get(); - - Result = enqueueEventsWait(hQueue, CuStream, numEventsInWaitList, - phEventWaitList); - - // Set the implicit global offset parameter if kernel has offset variant - if (hKernel->get_with_offset_parameter()) { - std::uint32_t CudaImplicitOffset[3] = {0, 0, 0}; - if (pGlobalWorkOffset) { - for (size_t i = 0; i < workDim; i++) { - CudaImplicitOffset[i] = - static_cast<std::uint32_t>(pGlobalWorkOffset[i]); - if (pGlobalWorkOffset[i] != 0) { - CuFunc = hKernel->get_with_offset_parameter(); - } - } - } - hKernel->setImplicitOffsetArg(sizeof(CudaImplicitOffset), - CudaImplicitOffset); - } - - auto &ArgIndices = hKernel->getArgIndices(); - - if (phEvent) { - RetImplEvent = - std::unique_ptr<ur_event_handle_t_>(ur_event_handle_t_::makeNative( - UR_COMMAND_KERNEL_LAUNCH, hQueue, CuStream, StreamToken)); - UR_CHECK_ERROR(RetImplEvent->start()); - } - - if (hQueue->getContext()->getDevice()->maxLocalMemSizeChosen()) { - // Set up local memory requirements for kernel. - auto Device = hQueue->getContext()->getDevice(); - if (Device->getMaxChosenLocalMem() < 0) { - bool EnvVarHasURPrefix = - (std::getenv("UR_CUDA_MAX_LOCAL_MEM_SIZE") != nullptr); - setErrorMessage(EnvVarHasURPrefix ? "Invalid value specified for " - "UR_CUDA_MAX_LOCAL_MEM_SIZE" - : "Invalid value specified for " - "SYCL_PI_CUDA_MAX_LOCAL_MEM_SIZE", - UR_RESULT_ERROR_ADAPTER_SPECIFIC); - return UR_RESULT_ERROR_ADAPTER_SPECIFIC; - } - if (LocalSize > static_cast<uint32_t>(Device->getMaxCapacityLocalMem())) { - setErrorMessage("Too much local memory allocated for device", - UR_RESULT_ERROR_ADAPTER_SPECIFIC); - return UR_RESULT_ERROR_ADAPTER_SPECIFIC; - } - if (LocalSize > static_cast<uint32_t>(Device->getMaxChosenLocalMem())) { - bool EnvVarHasURPrefix = - (std::getenv("UR_CUDA_MAX_LOCAL_MEM_SIZE") != nullptr); - setErrorMessage( - EnvVarHasURPrefix - ? "Local memory for kernel exceeds the amount requested using " - "UR_CUDA_MAX_LOCAL_MEM_SIZE. Try increasing the value of " - "UR_CUDA_MAX_LOCAL_MEM_SIZE." - : "Local memory for kernel exceeds the amount requested using " - "SYCL_PI_CUDA_MAX_LOCAL_MEM_SIZE. 
Try increasing the " - "value of SYCL_PI_CUDA_MAX_LOCAL_MEM_SIZE.", - UR_RESULT_ERROR_ADAPTER_SPECIFIC); - return UR_RESULT_ERROR_ADAPTER_SPECIFIC; - } - UR_CHECK_ERROR(cuFuncSetAttribute( - CuFunc, CU_FUNC_ATTRIBUTE_MAX_DYNAMIC_SHARED_SIZE_BYTES, - Device->getMaxChosenLocalMem())); - } - - UR_CHECK_ERROR(cuLaunchKernel( - CuFunc, BlocksPerGrid[0], BlocksPerGrid[1], BlocksPerGrid[2], - ThreadsPerBlock[0], ThreadsPerBlock[1], ThreadsPerBlock[2], LocalSize, - CuStream, const_cast<void **>(ArgIndices.data()), nullptr)); - if (LocalSize != 0) - hKernel->clearLocalSize(); - - if (phEvent) { - UR_CHECK_ERROR(RetImplEvent->record()); - *phEvent = RetImplEvent.release(); - } - } catch (ur_result_t Err) { - Result = Err; - } - return Result; -} - -/// General 3D memory copy operation. -/// This function requires the corresponding CUDA context to be at the top of -/// the context stack -/// If the source and/or destination is on the device, SrcPtr and/or DstPtr -/// must be a pointer to a CUdeviceptr -static ur_result_t commonEnqueueMemBufferCopyRect( - CUstream cu_stream, ur_rect_region_t region, const void *SrcPtr, - const CUmemorytype_enum SrcType, ur_rect_offset_t src_offset, - size_t src_row_pitch, size_t src_slice_pitch, void *DstPtr, - const CUmemorytype_enum DstType, ur_rect_offset_t dst_offset, - size_t dst_row_pitch, size_t dst_slice_pitch) { - - UR_ASSERT(SrcType == CU_MEMORYTYPE_DEVICE || SrcType == CU_MEMORYTYPE_HOST, - UR_RESULT_ERROR_INVALID_MEM_OBJECT); - UR_ASSERT(DstType == CU_MEMORYTYPE_DEVICE || DstType == CU_MEMORYTYPE_HOST, - UR_RESULT_ERROR_INVALID_MEM_OBJECT); - - src_row_pitch = - (!src_row_pitch) ? region.width + src_offset.x : src_row_pitch; - src_slice_pitch = (!src_slice_pitch) - ? ((region.height + src_offset.y) * src_row_pitch) - : src_slice_pitch; - dst_row_pitch = - (!dst_row_pitch) ? region.width + dst_offset.x : dst_row_pitch; - dst_slice_pitch = (!dst_slice_pitch) - ? ((region.height + dst_offset.y) * dst_row_pitch) - : dst_slice_pitch; - - CUDA_MEMCPY3D params = {}; - - params.WidthInBytes = region.width; - params.Height = region.height; - params.Depth = region.depth; - - params.srcMemoryType = SrcType; - params.srcDevice = SrcType == CU_MEMORYTYPE_DEVICE - ? *static_cast<const CUdeviceptr *>(SrcPtr) - : 0; - params.srcHost = SrcType == CU_MEMORYTYPE_HOST ? SrcPtr : nullptr; - params.srcXInBytes = src_offset.x; - params.srcY = src_offset.y; - params.srcZ = src_offset.z; - params.srcPitch = src_row_pitch; - params.srcHeight = src_slice_pitch / src_row_pitch; - - params.dstMemoryType = DstType; - params.dstDevice = - DstType == CU_MEMORYTYPE_DEVICE ? *static_cast<CUdeviceptr *>(DstPtr) : 0; - params.dstHost = DstType == CU_MEMORYTYPE_HOST ? 
DstPtr : nullptr; - params.dstXInBytes = dst_offset.x; - params.dstY = dst_offset.y; - params.dstZ = dst_offset.z; - params.dstPitch = dst_row_pitch; - params.dstHeight = dst_slice_pitch / dst_row_pitch; - - UR_CHECK_ERROR(cuMemcpy3DAsync(&params, cu_stream)); - - return UR_RESULT_SUCCESS; -} - -UR_APIEXPORT ur_result_t UR_APICALL urEnqueueMemBufferReadRect( - ur_queue_handle_t hQueue, ur_mem_handle_t hBuffer, bool blockingRead, - ur_rect_offset_t bufferOrigin, ur_rect_offset_t hostOrigin, - ur_rect_region_t region, size_t bufferRowPitch, size_t bufferSlicePitch, - size_t hostRowPitch, size_t hostSlicePitch, void *pDst, - uint32_t numEventsInWaitList, const ur_event_handle_t *phEventWaitList, - ur_event_handle_t *phEvent) { - ur_result_t Result = UR_RESULT_SUCCESS; - CUdeviceptr DevPtr = hBuffer->Mem.BufferMem.get(); - std::unique_ptr<ur_event_handle_t_> RetImplEvent{nullptr}; - - try { - ScopedContext Active(hQueue->getContext()); - CUstream CuStream = hQueue->getNextTransferStream(); - - Result = enqueueEventsWait(hQueue, CuStream, numEventsInWaitList, - phEventWaitList); - - if (phEvent) { - RetImplEvent = - std::unique_ptr<ur_event_handle_t_>(ur_event_handle_t_::makeNative( - UR_COMMAND_MEM_BUFFER_READ_RECT, hQueue, CuStream)); - UR_CHECK_ERROR(RetImplEvent->start()); - } - - Result = commonEnqueueMemBufferCopyRect( - CuStream, region, &DevPtr, CU_MEMORYTYPE_DEVICE, bufferOrigin, - bufferRowPitch, bufferSlicePitch, pDst, CU_MEMORYTYPE_HOST, hostOrigin, - hostRowPitch, hostSlicePitch); - - if (phEvent) { - UR_CHECK_ERROR(RetImplEvent->record()); - } - - if (blockingRead) { - UR_CHECK_ERROR(cuStreamSynchronize(CuStream)); - } - - if (phEvent) { - *phEvent = RetImplEvent.release(); - } - - } catch (ur_result_t Err) { - Result = Err; - } - return Result; -} - -UR_APIEXPORT ur_result_t UR_APICALL urEnqueueMemBufferWriteRect( - ur_queue_handle_t hQueue, ur_mem_handle_t hBuffer, bool blockingWrite, - ur_rect_offset_t bufferOrigin, ur_rect_offset_t hostOrigin, - ur_rect_region_t region, size_t bufferRowPitch, size_t bufferSlicePitch, - size_t hostRowPitch, size_t hostSlicePitch, void *pSrc, - uint32_t numEventsInWaitList, const ur_event_handle_t *phEventWaitList, - ur_event_handle_t *phEvent) { - ur_result_t Result = UR_RESULT_SUCCESS; - CUdeviceptr DevPtr = hBuffer->Mem.BufferMem.get(); - std::unique_ptr<ur_event_handle_t_> RetImplEvent{nullptr}; - - try { - ScopedContext active(hQueue->getContext()); - CUstream cuStream = hQueue->getNextTransferStream(); - Result = enqueueEventsWait(hQueue, cuStream, numEventsInWaitList, - phEventWaitList); - - if (phEvent) { - RetImplEvent = - std::unique_ptr<ur_event_handle_t_>(ur_event_handle_t_::makeNative( - UR_COMMAND_MEM_BUFFER_WRITE_RECT, hQueue, cuStream)); - UR_CHECK_ERROR(RetImplEvent->start()); - } - - Result = commonEnqueueMemBufferCopyRect( - cuStream, region, pSrc, CU_MEMORYTYPE_HOST, hostOrigin, hostRowPitch, - hostSlicePitch, &DevPtr, CU_MEMORYTYPE_DEVICE, bufferOrigin, - bufferRowPitch, bufferSlicePitch); - - if (phEvent) { - UR_CHECK_ERROR(RetImplEvent->record()); - } - - if (blockingWrite) { - UR_CHECK_ERROR(cuStreamSynchronize(cuStream)); - } - - if (phEvent) { - *phEvent = RetImplEvent.release(); - } - - } catch (ur_result_t Err) { - Result = Err; - } - return Result; -} - -UR_APIEXPORT ur_result_t UR_APICALL urEnqueueMemBufferCopy( - ur_queue_handle_t hQueue, ur_mem_handle_t hBufferSrc, - ur_mem_handle_t hBufferDst, size_t srcOffset, size_t dstOffset, size_t size, - uint32_t numEventsInWaitList, const ur_event_handle_t *phEventWaitList, - ur_event_handle_t *phEvent) { - UR_ASSERT(size + dstOffset <= 
hBufferDst->Mem.BufferMem.getSize(), - UR_RESULT_ERROR_INVALID_SIZE); - UR_ASSERT(size + srcOffset <= hBufferSrc->Mem.BufferMem.getSize(), - UR_RESULT_ERROR_INVALID_SIZE); - - std::unique_ptr<ur_event_handle_t_> RetImplEvent{nullptr}; - - try { - ScopedContext Active(hQueue->getContext()); - ur_result_t Result = UR_RESULT_SUCCESS; - - auto Stream = hQueue->getNextTransferStream(); - Result = - enqueueEventsWait(hQueue, Stream, numEventsInWaitList, phEventWaitList); - - if (phEvent) { - RetImplEvent = - std::unique_ptr<ur_event_handle_t_>(ur_event_handle_t_::makeNative( - UR_COMMAND_MEM_BUFFER_COPY, hQueue, Stream)); - UR_CHECK_ERROR(RetImplEvent->start()); - } - - auto Src = hBufferSrc->Mem.BufferMem.get() + srcOffset; - auto Dst = hBufferDst->Mem.BufferMem.get() + dstOffset; - - UR_CHECK_ERROR(cuMemcpyDtoDAsync(Dst, Src, size, Stream)); - - if (phEvent) { - UR_CHECK_ERROR(RetImplEvent->record()); - *phEvent = RetImplEvent.release(); - } - - return Result; - } catch (ur_result_t Err) { - return Err; - } catch (...) { - return UR_RESULT_ERROR_UNKNOWN; - } -} - -UR_APIEXPORT ur_result_t UR_APICALL urEnqueueMemBufferCopyRect( - ur_queue_handle_t hQueue, ur_mem_handle_t hBufferSrc, - ur_mem_handle_t hBufferDst, ur_rect_offset_t srcOrigin, - ur_rect_offset_t dstOrigin, ur_rect_region_t region, size_t srcRowPitch, - size_t srcSlicePitch, size_t dstRowPitch, size_t dstSlicePitch, - uint32_t numEventsInWaitList, const ur_event_handle_t *phEventWaitList, - ur_event_handle_t *phEvent) { - ur_result_t Result = UR_RESULT_SUCCESS; - CUdeviceptr SrcPtr = hBufferSrc->Mem.BufferMem.get(); - CUdeviceptr DstPtr = hBufferDst->Mem.BufferMem.get(); - std::unique_ptr<ur_event_handle_t_> RetImplEvent{nullptr}; - - try { - ScopedContext Active(hQueue->getContext()); - CUstream CuStream = hQueue->getNextTransferStream(); - Result = enqueueEventsWait(hQueue, CuStream, numEventsInWaitList, - phEventWaitList); - - if (phEvent) { - RetImplEvent = - std::unique_ptr<ur_event_handle_t_>(ur_event_handle_t_::makeNative( - UR_COMMAND_MEM_BUFFER_COPY_RECT, hQueue, CuStream)); - UR_CHECK_ERROR(RetImplEvent->start()); - } - - Result = commonEnqueueMemBufferCopyRect( - CuStream, region, &SrcPtr, CU_MEMORYTYPE_DEVICE, srcOrigin, srcRowPitch, - srcSlicePitch, &DstPtr, CU_MEMORYTYPE_DEVICE, dstOrigin, dstRowPitch, - dstSlicePitch); - - if (phEvent) { - UR_CHECK_ERROR(RetImplEvent->record()); - *phEvent = RetImplEvent.release(); - } - - } catch (ur_result_t err) { - Result = err; - } - return Result; -} - -// CUDA has no memset functions that allow setting values more than 4 bytes. UR -// API lets you pass an arbitrary "pattern" to the buffer fill, which can be -// more than 4 bytes. We must break up the pattern into 1 byte values, and set -// the buffer using multiple strided calls. The first 4 bytes of the pattern -// are set using cuMemsetD32Async, then each subsequent byte is set using -// cuMemsetD2D8Async, which is called once per byte. -ur_result_t commonMemSetLargePattern(CUstream Stream, uint32_t PatternSize, - size_t Size, const void *pPattern, - CUdeviceptr Ptr) { - // Calculate the number of patterns, stride, number of times the pattern - // needs to be applied, and the number of times the first 32 bit pattern - // needs to be applied.
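Worked through for a hypothetical 12-byte pattern filling 120 bytes: NumberOfSteps = 12, Pitch = 12, Height = 10 and Count32 = 30, so cuMemsetD32Async stamps the first 4 pattern bytes across the whole region (rows start at multiples of 12, which are 4-byte aligned, so bytes 0-3 of each row end up correct), and eight cuMemsetD2D8Async calls (steps 4 through 11) each overwrite one byte per 12-byte row.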
- auto NumberOfSteps = PatternSize / sizeof(uint8_t); - auto Pitch = NumberOfSteps * sizeof(uint8_t); - auto Height = Size / NumberOfSteps; - auto Count32 = Size / sizeof(uint32_t); - - // Get 4-byte chunk of the pattern and call cuMemsetD32Async - auto Value = *(static_cast<const uint32_t *>(pPattern)); - UR_CHECK_ERROR(cuMemsetD32Async(Ptr, Value, Count32, Stream)); - for (auto step = 4u; step < NumberOfSteps; ++step) { - // take 1 byte of the pattern - Value = *(static_cast<const uint8_t *>(pPattern) + step); - - // offset the pointer to the part of the buffer we want to write to - auto OffsetPtr = Ptr + (step * sizeof(uint8_t)); - - // set all of the pattern chunks - UR_CHECK_ERROR(cuMemsetD2D8Async(OffsetPtr, Pitch, Value, sizeof(uint8_t), - Height, Stream)); - } - return UR_RESULT_SUCCESS; -} - -UR_APIEXPORT ur_result_t UR_APICALL urEnqueueMemBufferFill( - ur_queue_handle_t hQueue, ur_mem_handle_t hBuffer, const void *pPattern, - size_t patternSize, size_t offset, size_t size, - uint32_t numEventsInWaitList, const ur_event_handle_t *phEventWaitList, - ur_event_handle_t *phEvent) { - UR_ASSERT(size + offset <= hBuffer->Mem.BufferMem.getSize(), - UR_RESULT_ERROR_INVALID_SIZE); - - std::unique_ptr<ur_event_handle_t_> RetImplEvent{nullptr}; - - try { - ScopedContext Active(hQueue->getContext()); - - auto Stream = hQueue->getNextTransferStream(); - ur_result_t Result = - enqueueEventsWait(hQueue, Stream, numEventsInWaitList, phEventWaitList); - - if (phEvent) { - RetImplEvent = - std::unique_ptr<ur_event_handle_t_>(ur_event_handle_t_::makeNative( - UR_COMMAND_MEM_BUFFER_FILL, hQueue, Stream)); - UR_CHECK_ERROR(RetImplEvent->start()); - } - - auto DstDevice = hBuffer->Mem.BufferMem.get() + offset; - auto N = size / patternSize; - - // pattern size in bytes - switch (patternSize) { - case 1: { - auto Value = *static_cast<const uint8_t *>(pPattern); - UR_CHECK_ERROR(cuMemsetD8Async(DstDevice, Value, N, Stream)); - break; - } - case 2: { - auto Value = *static_cast<const uint16_t *>(pPattern); - UR_CHECK_ERROR(cuMemsetD16Async(DstDevice, Value, N, Stream)); - break; - } - case 4: { - auto Value = *static_cast<const uint32_t *>(pPattern); - UR_CHECK_ERROR(cuMemsetD32Async(DstDevice, Value, N, Stream)); - break; - } - default: { - Result = commonMemSetLargePattern(Stream, patternSize, size, pPattern, - DstDevice); - break; - } - } - - if (phEvent) { - UR_CHECK_ERROR(RetImplEvent->record()); - *phEvent = RetImplEvent.release(); - } - - return Result; - } catch (ur_result_t Err) { - return Err; - } catch (...) { - return UR_RESULT_ERROR_UNKNOWN; - } -} - -static size_t imageElementByteSize(CUDA_ARRAY_DESCRIPTOR ArrayDesc) { - switch (ArrayDesc.Format) { - case CU_AD_FORMAT_UNSIGNED_INT8: - case CU_AD_FORMAT_SIGNED_INT8: - return 1; - case CU_AD_FORMAT_UNSIGNED_INT16: - case CU_AD_FORMAT_SIGNED_INT16: - case CU_AD_FORMAT_HALF: - return 2; - case CU_AD_FORMAT_UNSIGNED_INT32: - case CU_AD_FORMAT_SIGNED_INT32: - case CU_AD_FORMAT_FLOAT: - return 4; - default: - detail::ur::die("Invalid image format."); - return 0; - } -} - -/// General ND memory copy operation for images (where N > 1). 
-/// This function requires the corresponding CUDA context to be at the top of -/// the context stack -/// If the source and/or destination is an array, SrcPtr and/or DstPtr -/// must be a pointer to a CUarray -static ur_result_t commonEnqueueMemImageNDCopy( - CUstream CuStream, ur_mem_type_t ImgType, const ur_rect_region_t Region, - const void *SrcPtr, const CUmemorytype_enum SrcType, - const ur_rect_offset_t SrcOffset, void *DstPtr, - const CUmemorytype_enum DstType, const ur_rect_offset_t DstOffset) { - UR_ASSERT(SrcType == CU_MEMORYTYPE_ARRAY || SrcType == CU_MEMORYTYPE_HOST, - UR_RESULT_ERROR_INVALID_MEM_OBJECT); - UR_ASSERT(DstType == CU_MEMORYTYPE_ARRAY || DstType == CU_MEMORYTYPE_HOST, - UR_RESULT_ERROR_INVALID_MEM_OBJECT); - - if (ImgType == UR_MEM_TYPE_IMAGE2D) { - CUDA_MEMCPY2D CpyDesc; - memset(&CpyDesc, 0, sizeof(CpyDesc)); - CpyDesc.srcMemoryType = SrcType; - if (SrcType == CU_MEMORYTYPE_ARRAY) { - CpyDesc.srcArray = *static_cast(SrcPtr); - CpyDesc.srcXInBytes = SrcOffset.x; - CpyDesc.srcY = SrcOffset.y; - } else { - CpyDesc.srcHost = SrcPtr; - } - CpyDesc.dstMemoryType = DstType; - if (DstType == CU_MEMORYTYPE_ARRAY) { - CpyDesc.dstArray = *static_cast(DstPtr); - CpyDesc.dstXInBytes = DstOffset.x; - CpyDesc.dstY = DstOffset.y; - } else { - CpyDesc.dstHost = DstPtr; - } - CpyDesc.WidthInBytes = Region.width; - CpyDesc.Height = Region.height; - UR_CHECK_ERROR(cuMemcpy2DAsync(&CpyDesc, CuStream)); - return UR_RESULT_SUCCESS; - } - if (ImgType == UR_MEM_TYPE_IMAGE3D) { - CUDA_MEMCPY3D CpyDesc; - memset(&CpyDesc, 0, sizeof(CpyDesc)); - CpyDesc.srcMemoryType = SrcType; - if (SrcType == CU_MEMORYTYPE_ARRAY) { - CpyDesc.srcArray = *static_cast(SrcPtr); - CpyDesc.srcXInBytes = SrcOffset.x; - CpyDesc.srcY = SrcOffset.y; - CpyDesc.srcZ = SrcOffset.z; - } else { - CpyDesc.srcHost = SrcPtr; - } - CpyDesc.dstMemoryType = DstType; - if (DstType == CU_MEMORYTYPE_ARRAY) { - CpyDesc.dstArray = *static_cast(DstPtr); - CpyDesc.dstXInBytes = DstOffset.x; - CpyDesc.dstY = DstOffset.y; - CpyDesc.dstZ = DstOffset.z; - } else { - CpyDesc.dstHost = DstPtr; - } - CpyDesc.WidthInBytes = Region.width; - CpyDesc.Height = Region.height; - CpyDesc.Depth = Region.depth; - UR_CHECK_ERROR(cuMemcpy3DAsync(&CpyDesc, CuStream)); - return UR_RESULT_SUCCESS; - } - return UR_RESULT_ERROR_INVALID_VALUE; -} - -UR_APIEXPORT ur_result_t UR_APICALL urEnqueueMemImageRead( - ur_queue_handle_t hQueue, ur_mem_handle_t hImage, bool blockingRead, - ur_rect_offset_t origin, ur_rect_region_t region, size_t rowPitch, - size_t slicePitch, void *pDst, uint32_t numEventsInWaitList, - const ur_event_handle_t *phEventWaitList, ur_event_handle_t *phEvent) { - std::ignore = rowPitch; - std::ignore = slicePitch; - - UR_ASSERT(hImage->MemType == ur_mem_handle_t_::Type::Surface, - UR_RESULT_ERROR_INVALID_MEM_OBJECT); - - ur_result_t Result = UR_RESULT_SUCCESS; - - try { - ScopedContext Active(hQueue->getContext()); - CUstream CuStream = hQueue->getNextTransferStream(); - Result = enqueueEventsWait(hQueue, CuStream, numEventsInWaitList, - phEventWaitList); - - CUarray Array = hImage->Mem.SurfaceMem.getArray(); - - CUDA_ARRAY_DESCRIPTOR ArrayDesc; - UR_CHECK_ERROR(cuArrayGetDescriptor(&ArrayDesc, Array)); - - int ElementByteSize = imageElementByteSize(ArrayDesc); - - size_t ByteOffsetX = origin.x * ElementByteSize * ArrayDesc.NumChannels; - size_t BytesToCopy = ElementByteSize * ArrayDesc.NumChannels * region.width; - - ur_mem_type_t ImgType = hImage->Mem.SurfaceMem.getImageType(); - - std::unique_ptr RetImplEvent{nullptr}; - if (phEvent) { 
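The image read path below reduces the element-space origin to byte units and hands commonEnqueueMemImageNDCopy a CUDA_MEMCPY2D. A minimal sketch combining both steps for a hypothetical 4-channel float image (assumes an initialized context, an existing CUarray large enough for the region, and a valid stream; error handling elided):

```cpp
#include <cuda.h>

void readRegionSketch(CUarray Array, CUstream Stream, void *HostDst,
                      size_t OriginX, size_t OriginY, size_t WidthElems,
                      size_t HeightElems) {
  const size_t ElementByteSize = 4; // CU_AD_FORMAT_FLOAT
  const size_t NumChannels = 4;     // e.g. RGBA
  size_t ByteOffsetX = OriginX * ElementByteSize * NumChannels;
  size_t BytesToCopy = ElementByteSize * NumChannels * WidthElems;

  CUDA_MEMCPY2D Cpy = {};
  Cpy.srcMemoryType = CU_MEMORYTYPE_ARRAY;
  Cpy.srcArray = Array;
  Cpy.srcXInBytes = ByteOffsetX; // x offset is expressed in bytes
  Cpy.srcY = OriginY;            // y offset stays in rows
  Cpy.dstMemoryType = CU_MEMORYTYPE_HOST;
  Cpy.dstHost = HostDst;
  Cpy.WidthInBytes = BytesToCopy;
  Cpy.Height = HeightElems;
  cuMemcpy2DAsync(&Cpy, Stream); // error handling elided in this sketch
}
```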
- RetImplEvent = - std::unique_ptr(ur_event_handle_t_::makeNative( - UR_COMMAND_MEM_IMAGE_READ, hQueue, CuStream)); - UR_CHECK_ERROR(RetImplEvent->start()); - } - if (ImgType == UR_MEM_TYPE_IMAGE1D) { - UR_CHECK_ERROR( - cuMemcpyAtoHAsync(pDst, Array, ByteOffsetX, BytesToCopy, CuStream)); - } else { - ur_rect_region_t AdjustedRegion = {BytesToCopy, region.height, - region.depth}; - ur_rect_offset_t SrcOffset = {ByteOffsetX, origin.y, origin.z}; - - Result = commonEnqueueMemImageNDCopy( - CuStream, ImgType, AdjustedRegion, &Array, CU_MEMORYTYPE_ARRAY, - SrcOffset, pDst, CU_MEMORYTYPE_HOST, ur_rect_offset_t{}); - if (Result != UR_RESULT_SUCCESS) { - return Result; - } - } - - if (phEvent) { - UR_CHECK_ERROR(RetImplEvent->record()); - *phEvent = RetImplEvent.release(); - } - - if (blockingRead) { - UR_CHECK_ERROR(cuStreamSynchronize(CuStream)); - } - } catch (ur_result_t Err) { - return Err; - } catch (...) { - return UR_RESULT_ERROR_UNKNOWN; - } - - return Result; -} - -UR_APIEXPORT ur_result_t UR_APICALL urEnqueueMemImageWrite( - ur_queue_handle_t hQueue, ur_mem_handle_t hImage, bool blockingWrite, - ur_rect_offset_t origin, ur_rect_region_t region, size_t rowPitch, - size_t slicePitch, void *pSrc, uint32_t numEventsInWaitList, - const ur_event_handle_t *phEventWaitList, ur_event_handle_t *phEvent) { - std::ignore = blockingWrite; - std::ignore = rowPitch; - std::ignore = slicePitch; - - UR_ASSERT(hImage->MemType == ur_mem_handle_t_::Type::Surface, - UR_RESULT_ERROR_INVALID_MEM_OBJECT); - - ur_result_t Result = UR_RESULT_SUCCESS; - - try { - ScopedContext Active(hQueue->getContext()); - CUstream CuStream = hQueue->getNextTransferStream(); - Result = enqueueEventsWait(hQueue, CuStream, numEventsInWaitList, - phEventWaitList); - - CUarray Array = hImage->Mem.SurfaceMem.getArray(); - - CUDA_ARRAY_DESCRIPTOR ArrayDesc; - UR_CHECK_ERROR(cuArrayGetDescriptor(&ArrayDesc, Array)); - - int ElementByteSize = imageElementByteSize(ArrayDesc); - - size_t ByteOffsetX = origin.x * ElementByteSize * ArrayDesc.NumChannels; - size_t BytesToCopy = ElementByteSize * ArrayDesc.NumChannels * region.width; - - std::unique_ptr RetImplEvent{nullptr}; - if (phEvent) { - RetImplEvent = - std::unique_ptr(ur_event_handle_t_::makeNative( - UR_COMMAND_MEM_IMAGE_WRITE, hQueue, CuStream)); - UR_CHECK_ERROR(RetImplEvent->start()); - } - - ur_mem_type_t ImgType = hImage->Mem.SurfaceMem.getImageType(); - if (ImgType == UR_MEM_TYPE_IMAGE1D) { - UR_CHECK_ERROR( - cuMemcpyHtoAAsync(Array, ByteOffsetX, pSrc, BytesToCopy, CuStream)); - } else { - ur_rect_region_t AdjustedRegion = {BytesToCopy, region.height, - region.depth}; - ur_rect_offset_t DstOffset = {ByteOffsetX, origin.y, origin.z}; - - Result = commonEnqueueMemImageNDCopy( - CuStream, ImgType, AdjustedRegion, pSrc, CU_MEMORYTYPE_HOST, - ur_rect_offset_t{}, &Array, CU_MEMORYTYPE_ARRAY, DstOffset); - - if (Result != UR_RESULT_SUCCESS) { - return Result; - } - } - - if (phEvent) { - UR_CHECK_ERROR(RetImplEvent->record()); - *phEvent = RetImplEvent.release(); - } - } catch (ur_result_t Err) { - return Err; - } catch (...) 
{ - return UR_RESULT_ERROR_UNKNOWN; - } - - return Result; -} - -UR_APIEXPORT ur_result_t UR_APICALL urEnqueueMemImageCopy( - ur_queue_handle_t hQueue, ur_mem_handle_t hImageSrc, - ur_mem_handle_t hImageDst, ur_rect_offset_t srcOrigin, - ur_rect_offset_t dstOrigin, ur_rect_region_t region, - uint32_t numEventsInWaitList, const ur_event_handle_t *phEventWaitList, - ur_event_handle_t *phEvent) { - UR_ASSERT(hImageSrc->MemType == ur_mem_handle_t_::Type::Surface, - UR_RESULT_ERROR_INVALID_MEM_OBJECT); - UR_ASSERT(hImageDst->MemType == ur_mem_handle_t_::Type::Surface, - UR_RESULT_ERROR_INVALID_MEM_OBJECT); - UR_ASSERT(hImageSrc->Mem.SurfaceMem.getImageType() == - hImageDst->Mem.SurfaceMem.getImageType(), - UR_RESULT_ERROR_INVALID_MEM_OBJECT); - - ur_result_t Result = UR_RESULT_SUCCESS; - - try { - ScopedContext Active(hQueue->getContext()); - CUstream CuStream = hQueue->getNextTransferStream(); - Result = enqueueEventsWait(hQueue, CuStream, numEventsInWaitList, - phEventWaitList); - - CUarray SrcArray = hImageSrc->Mem.SurfaceMem.getArray(); - CUarray DstArray = hImageDst->Mem.SurfaceMem.getArray(); - - CUDA_ARRAY_DESCRIPTOR SrcArrayDesc; - UR_CHECK_ERROR(cuArrayGetDescriptor(&SrcArrayDesc, SrcArray)); - CUDA_ARRAY_DESCRIPTOR DstArrayDesc; - UR_CHECK_ERROR(cuArrayGetDescriptor(&DstArrayDesc, DstArray)); - - UR_ASSERT(SrcArrayDesc.Format == DstArrayDesc.Format, - UR_RESULT_ERROR_INVALID_MEM_OBJECT); - UR_ASSERT(SrcArrayDesc.NumChannels == DstArrayDesc.NumChannels, - UR_RESULT_ERROR_INVALID_MEM_OBJECT); - - int ElementByteSize = imageElementByteSize(SrcArrayDesc); - - size_t DstByteOffsetX = - dstOrigin.x * ElementByteSize * SrcArrayDesc.NumChannels; - size_t SrcByteOffsetX = - srcOrigin.x * ElementByteSize * DstArrayDesc.NumChannels; - size_t BytesToCopy = - ElementByteSize * SrcArrayDesc.NumChannels * region.width; - - std::unique_ptr RetImplEvent{nullptr}; - if (phEvent) { - RetImplEvent = - std::unique_ptr(ur_event_handle_t_::makeNative( - UR_COMMAND_MEM_IMAGE_COPY, hQueue, CuStream)); - UR_CHECK_ERROR(RetImplEvent->start()); - } - - ur_mem_type_t ImgType = hImageSrc->Mem.SurfaceMem.getImageType(); - if (ImgType == UR_MEM_TYPE_IMAGE1D) { - UR_CHECK_ERROR(cuMemcpyAtoA(DstArray, DstByteOffsetX, SrcArray, - SrcByteOffsetX, BytesToCopy)); - } else { - ur_rect_region_t AdjustedRegion = {BytesToCopy, region.height, - region.depth}; - ur_rect_offset_t SrcOffset = {SrcByteOffsetX, srcOrigin.y, srcOrigin.z}; - ur_rect_offset_t DstOffset = {DstByteOffsetX, dstOrigin.y, dstOrigin.z}; - - Result = commonEnqueueMemImageNDCopy( - CuStream, ImgType, AdjustedRegion, &SrcArray, CU_MEMORYTYPE_ARRAY, - SrcOffset, &DstArray, CU_MEMORYTYPE_ARRAY, DstOffset); - if (Result != UR_RESULT_SUCCESS) { - return Result; - } - } - - if (phEvent) { - UR_CHECK_ERROR(RetImplEvent->record()); - *phEvent = RetImplEvent.release(); - } - } catch (ur_result_t Err) { - return Err; - } catch (...) { - return UR_RESULT_ERROR_UNKNOWN; - } - - return Result; -} - -/// Implements mapping on the host using a BufferRead operation. -/// Mapped pointers are stored in the pi_mem object. -/// If the buffer uses pinned host memory a pointer to that memory is returned -/// and no read operation is done. 
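This protocol, together with the matching unmap further below, amounts to a simple staging round trip. A host-only mock makes the shape of it explicit; all names here are illustrative, not the adapter's types:

```cpp
#include <cassert>
#include <cstring>
#include <vector>

// Host-only mock of the map/unmap staging protocol.
struct MockBuffer {
  std::vector<unsigned char> Device;  // stands in for the device allocation
  std::vector<unsigned char> Staging; // stands in for the host map pointer
  size_t MapOffset = 0, MapSize = 0;

  void *map(size_t Offset, size_t Size, bool Read) {
    MapOffset = Offset;
    MapSize = Size;
    Staging.resize(Size);
    if (Read) // mirrors the urEnqueueMemBufferRead issued by a readable map
      std::memcpy(Staging.data(), Device.data() + Offset, Size);
    return Staging.data();
  }

  void unmap(bool Write) {
    if (Write) // mirrors the urEnqueueMemBufferWrite issued on unmap
      std::memcpy(Device.data() + MapOffset, Staging.data(), MapSize);
    Staging.clear();
  }
};

int main() {
  MockBuffer Buf;
  Buf.Device.assign(16, 0);
  auto *P = static_cast<unsigned char *>(Buf.map(4, 8, /*Read=*/true));
  P[0] = 42;                 // host-side update through the mapped pointer
  Buf.unmap(/*Write=*/true); // written back at the recorded offset
  assert(Buf.Device[4] == 42);
}
```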
-/// -UR_APIEXPORT ur_result_t UR_APICALL urEnqueueMemBufferMap( - ur_queue_handle_t hQueue, ur_mem_handle_t hBuffer, bool blockingMap, - ur_map_flags_t mapFlags, size_t offset, size_t size, - uint32_t numEventsInWaitList, const ur_event_handle_t *phEventWaitList, - ur_event_handle_t *phEvent, void **ppRetMap) { - UR_ASSERT(hBuffer->MemType == ur_mem_handle_t_::Type::Buffer, - UR_RESULT_ERROR_INVALID_MEM_OBJECT); - UR_ASSERT(offset + size <= hBuffer->Mem.BufferMem.getSize(), - UR_RESULT_ERROR_INVALID_SIZE); - - ur_result_t Result = UR_RESULT_ERROR_INVALID_MEM_OBJECT; - const bool IsPinned = - hBuffer->Mem.BufferMem.MemAllocMode == - ur_mem_handle_t_::MemImpl::BufferMem::AllocMode::AllocHostPtr; - - // Currently no support for overlapping regions - if (hBuffer->Mem.BufferMem.getMapPtr() != nullptr) { - return UR_RESULT_ERROR_UNSUPPORTED_FEATURE; - } - - // Allocate a pointer in the host to store the mapped information - auto HostPtr = hBuffer->Mem.BufferMem.mapToPtr(size, offset, mapFlags); - *ppRetMap = hBuffer->Mem.BufferMem.getMapPtr(); - if (HostPtr) { - Result = UR_RESULT_SUCCESS; - } - - if (!IsPinned && - ((mapFlags & UR_MAP_FLAG_READ) || (mapFlags & UR_MAP_FLAG_WRITE))) { - // Pinned host memory is already on host so it doesn't need to be read. - Result = urEnqueueMemBufferRead(hQueue, hBuffer, blockingMap, offset, size, - HostPtr, numEventsInWaitList, - phEventWaitList, phEvent); - } else { - ScopedContext Active(hQueue->getContext()); - - if (IsPinned) { - Result = urEnqueueEventsWait(hQueue, numEventsInWaitList, phEventWaitList, - nullptr); - } - - if (phEvent) { - try { - *phEvent = ur_event_handle_t_::makeNative( - UR_COMMAND_MEM_BUFFER_MAP, hQueue, hQueue->getNextTransferStream()); - UR_CHECK_ERROR((*phEvent)->start()); - UR_CHECK_ERROR((*phEvent)->record()); - } catch (ur_result_t Err) { - Result = Err; - } - } - } - - return Result; -} - -/// Implements the unmap from the host, using a BufferWrite operation. -/// Requires the mapped pointer to be already registered in the given memobj. -/// If memobj uses pinned host memory, this will not do a write. -/// -UR_APIEXPORT ur_result_t UR_APICALL urEnqueueMemUnmap( - ur_queue_handle_t hQueue, ur_mem_handle_t hMem, void *pMappedPtr, - uint32_t numEventsInWaitList, const ur_event_handle_t *phEventWaitList, - ur_event_handle_t *phEvent) { - ur_result_t Result = UR_RESULT_SUCCESS; - UR_ASSERT(hMem->MemType == ur_mem_handle_t_::Type::Buffer, - UR_RESULT_ERROR_INVALID_MEM_OBJECT); - UR_ASSERT(hMem->Mem.BufferMem.getMapPtr() != nullptr, - UR_RESULT_ERROR_INVALID_MEM_OBJECT); - UR_ASSERT(hMem->Mem.BufferMem.getMapPtr() == pMappedPtr, - UR_RESULT_ERROR_INVALID_MEM_OBJECT); - - const bool IsPinned = - hMem->Mem.BufferMem.MemAllocMode == - ur_mem_handle_t_::MemImpl::BufferMem::AllocMode::AllocHostPtr; - - if (!IsPinned && (hMem->Mem.BufferMem.getMapFlags() & UR_MAP_FLAG_WRITE)) { - // Pinned host memory is only on host so it doesn't need to be written to. 
- Result = urEnqueueMemBufferWrite( - hQueue, hMem, true, hMem->Mem.BufferMem.getMapOffset(), - hMem->Mem.BufferMem.getMapSize(), pMappedPtr, numEventsInWaitList, - phEventWaitList, phEvent); - } else { - ScopedContext Active(hQueue->getContext()); - - if (IsPinned) { - Result = urEnqueueEventsWait(hQueue, numEventsInWaitList, phEventWaitList, - nullptr); - } - - if (phEvent) { - try { - *phEvent = ur_event_handle_t_::makeNative( - UR_COMMAND_MEM_UNMAP, hQueue, hQueue->getNextTransferStream()); - UR_CHECK_ERROR((*phEvent)->start()); - UR_CHECK_ERROR((*phEvent)->record()); - } catch (ur_result_t Err) { - Result = Err; - } - } - } - - hMem->Mem.BufferMem.unmap(pMappedPtr); - return Result; -} - -UR_APIEXPORT ur_result_t UR_APICALL urEnqueueUSMFill( - ur_queue_handle_t hQueue, void *ptr, size_t patternSize, - const void *pPattern, size_t size, uint32_t numEventsInWaitList, - const ur_event_handle_t *phEventWaitList, ur_event_handle_t *phEvent) { - ur_result_t Result = UR_RESULT_SUCCESS; - std::unique_ptr EventPtr{nullptr}; - - try { - ScopedContext Active(hQueue->getContext()); - uint32_t StreamToken; - ur_stream_guard_ Guard; - CUstream CuStream = hQueue->getNextComputeStream( - numEventsInWaitList, phEventWaitList, Guard, &StreamToken); - UR_CHECK_ERROR(enqueueEventsWait(hQueue, CuStream, numEventsInWaitList, - phEventWaitList)); - if (phEvent) { - EventPtr = - std::unique_ptr(ur_event_handle_t_::makeNative( - UR_COMMAND_USM_FILL, hQueue, CuStream, StreamToken)); - UR_CHECK_ERROR(EventPtr->start()); - } - - auto N = size / patternSize; - switch (patternSize) { - case 1: - UR_CHECK_ERROR(cuMemsetD8Async( - (CUdeviceptr)ptr, *((const uint8_t *)pPattern) & 0xFF, N, CuStream)); - break; - case 2: - UR_CHECK_ERROR(cuMemsetD16Async((CUdeviceptr)ptr, - *((const uint16_t *)pPattern) & 0xFFFF, N, - CuStream)); - break; - case 4: - UR_CHECK_ERROR(cuMemsetD32Async( - (CUdeviceptr)ptr, *((const uint32_t *)pPattern) & 0xFFFFFFFF, N, - CuStream)); - break; - default: - commonMemSetLargePattern(CuStream, patternSize, size, pPattern, - (CUdeviceptr)ptr); - break; - } - if (phEvent) { - UR_CHECK_ERROR(EventPtr->record()); - *phEvent = EventPtr.release(); - } - } catch (ur_result_t Err) { - Result = Err; - } - return Result; -} - -UR_APIEXPORT ur_result_t UR_APICALL urEnqueueUSMMemcpy( - ur_queue_handle_t hQueue, bool blocking, void *pDst, const void *pSrc, - size_t size, uint32_t numEventsInWaitList, - const ur_event_handle_t *phEventWaitList, ur_event_handle_t *phEvent) { - ur_result_t Result = UR_RESULT_SUCCESS; - - std::unique_ptr EventPtr{nullptr}; - - try { - ScopedContext Active(hQueue->getContext()); - CUstream CuStream = hQueue->getNextTransferStream(); - Result = enqueueEventsWait(hQueue, CuStream, numEventsInWaitList, - phEventWaitList); - if (phEvent) { - EventPtr = - std::unique_ptr(ur_event_handle_t_::makeNative( - UR_COMMAND_USM_MEMCPY, hQueue, CuStream)); - UR_CHECK_ERROR(EventPtr->start()); - } - UR_CHECK_ERROR( - cuMemcpyAsync((CUdeviceptr)pDst, (CUdeviceptr)pSrc, size, CuStream)); - if (phEvent) { - UR_CHECK_ERROR(EventPtr->record()); - } - if (blocking) { - UR_CHECK_ERROR(cuStreamSynchronize(CuStream)); - } - if (phEvent) { - *phEvent = EventPtr.release(); - } - } catch (ur_result_t Err) { - Result = Err; - } - return Result; -} - -UR_APIEXPORT ur_result_t UR_APICALL urEnqueueUSMPrefetch( - ur_queue_handle_t hQueue, const void *pMem, size_t size, - ur_usm_migration_flags_t flags, uint32_t numEventsInWaitList, - const ur_event_handle_t *phEventWaitList, ur_event_handle_t *phEvent) { - 
unsigned int PointerRangeSize = 0; - UR_CHECK_ERROR(cuPointerGetAttribute( - &PointerRangeSize, CU_POINTER_ATTRIBUTE_RANGE_SIZE, (CUdeviceptr)pMem)); - UR_ASSERT(size <= PointerRangeSize, UR_RESULT_ERROR_INVALID_SIZE); - ur_device_handle_t Device = hQueue->getContext()->getDevice(); - - // Certain cuda devices and Windows do not have support for some Unified - // Memory features. cuMemPrefetchAsync requires concurrent memory access - // for managed memory. Therefore, ignore the prefetch hint if concurrent - // managed memory access is not available. - if (!getAttribute(Device, CU_DEVICE_ATTRIBUTE_CONCURRENT_MANAGED_ACCESS)) { - setErrorMessage("Prefetch hint ignored as device does not support " - "concurrent managed access", - UR_RESULT_SUCCESS); - return UR_RESULT_ERROR_ADAPTER_SPECIFIC; - } - - unsigned int IsManaged; - UR_CHECK_ERROR(cuPointerGetAttribute( - &IsManaged, CU_POINTER_ATTRIBUTE_IS_MANAGED, (CUdeviceptr)pMem)); - if (!IsManaged) { - setErrorMessage("Prefetch hint ignored as prefetch only works with USM", - UR_RESULT_SUCCESS); - return UR_RESULT_ERROR_ADAPTER_SPECIFIC; - } - - // flags is currently unused so fail if set - if (flags != 0) - return UR_RESULT_ERROR_INVALID_VALUE; - - ur_result_t Result = UR_RESULT_SUCCESS; - std::unique_ptr<ur_event_handle_t_> EventPtr{nullptr}; - - try { - ScopedContext Active(hQueue->getContext()); - CUstream CuStream = hQueue->getNextTransferStream(); - Result = enqueueEventsWait(hQueue, CuStream, numEventsInWaitList, - phEventWaitList); - if (phEvent) { - EventPtr = - std::unique_ptr<ur_event_handle_t_>(ur_event_handle_t_::makeNative( - UR_COMMAND_MEM_BUFFER_COPY, hQueue, CuStream)); - UR_CHECK_ERROR(EventPtr->start()); - } - UR_CHECK_ERROR( - cuMemPrefetchAsync((CUdeviceptr)pMem, size, Device->get(), CuStream)); - if (phEvent) { - UR_CHECK_ERROR(EventPtr->record()); - *phEvent = EventPtr.release(); - } - } catch (ur_result_t Err) { - Result = Err; - } - return Result; -} - -/// USM: memadvise API to govern behavior of automatic migration mechanisms -UR_APIEXPORT ur_result_t UR_APICALL -urEnqueueUSMAdvise(ur_queue_handle_t hQueue, const void *pMem, size_t size, - ur_usm_advice_flags_t advice, ur_event_handle_t *phEvent) { - unsigned int PointerRangeSize = 0; - UR_CHECK_ERROR(cuPointerGetAttribute( - &PointerRangeSize, CU_POINTER_ATTRIBUTE_RANGE_SIZE, (CUdeviceptr)pMem)); - UR_ASSERT(size <= PointerRangeSize, UR_RESULT_ERROR_INVALID_SIZE); - - // Certain cuda devices and Windows do not have support for some Unified - // Memory features. Passing CU_MEM_ADVISE_SET/CLEAR_PREFERRED_LOCATION or - // CU_MEM_ADVISE_SET/CLEAR_ACCESSED_BY to cuMemAdvise on a GPU device - // requires the GPU device to report a non-zero value for - // CU_DEVICE_ATTRIBUTE_CONCURRENT_MANAGED_ACCESS. Therefore, ignore the - // memory advice if concurrent managed memory access is not available. - if ((advice & UR_USM_ADVICE_FLAG_SET_PREFERRED_LOCATION) || - (advice & UR_USM_ADVICE_FLAG_CLEAR_PREFERRED_LOCATION) || - (advice & UR_USM_ADVICE_FLAG_SET_ACCESSED_BY_DEVICE) || - (advice & UR_USM_ADVICE_FLAG_CLEAR_ACCESSED_BY_DEVICE) || - (advice & UR_USM_ADVICE_FLAG_DEFAULT)) { - ur_device_handle_t Device = hQueue->getContext()->getDevice(); - if (!getAttribute(Device, CU_DEVICE_ATTRIBUTE_CONCURRENT_MANAGED_ACCESS)) { - setErrorMessage("Mem advise ignored as device does not support " - "concurrent managed access", - UR_RESULT_SUCCESS); - return UR_RESULT_ERROR_ADAPTER_SPECIFIC; - } - - // TODO: If ptr points to valid system-allocated pageable memory we should - // check that the device also has the - // CU_DEVICE_ATTRIBUTE_PAGEABLE_MEMORY_ACCESS property.
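setCuMemAdvise, used further below for the non-default advice path, boils down to translating UR advice flags into cuMemAdvise calls. A hedged sketch of that translation; the flag-to-advice pairs shown are an illustrative subset, not necessarily the adapter's exact table:

```cpp
#include <cuda.h>
#include <ur_api.h>

ur_result_t setCuMemAdviseSketch(CUdeviceptr Ptr, size_t Size,
                                 ur_usm_advice_flags_t Advice, CUdevice Dev) {
  // Apply one CUDA advice when the corresponding UR flag is set.
  auto Apply = [&](ur_usm_advice_flags_t Flag, CUmem_advise CuAdvice) {
    if (Advice & Flag)
      cuMemAdvise(Ptr, Size, CuAdvice, Dev); // error handling elided
  };
  Apply(UR_USM_ADVICE_FLAG_SET_READ_MOSTLY, CU_MEM_ADVISE_SET_READ_MOSTLY);
  Apply(UR_USM_ADVICE_FLAG_CLEAR_READ_MOSTLY, CU_MEM_ADVISE_UNSET_READ_MOSTLY);
  Apply(UR_USM_ADVICE_FLAG_SET_PREFERRED_LOCATION,
        CU_MEM_ADVISE_SET_PREFERRED_LOCATION);
  Apply(UR_USM_ADVICE_FLAG_CLEAR_PREFERRED_LOCATION,
        CU_MEM_ADVISE_UNSET_PREFERRED_LOCATION);
  Apply(UR_USM_ADVICE_FLAG_SET_ACCESSED_BY_DEVICE,
        CU_MEM_ADVISE_SET_ACCESSED_BY);
  Apply(UR_USM_ADVICE_FLAG_CLEAR_ACCESSED_BY_DEVICE,
        CU_MEM_ADVISE_UNSET_ACCESSED_BY);
  return UR_RESULT_SUCCESS;
}
```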
- } - - unsigned int IsManaged; - UR_CHECK_ERROR(cuPointerGetAttribute( - &IsManaged, CU_POINTER_ATTRIBUTE_IS_MANAGED, (CUdeviceptr)pMem)); - if (!IsManaged) { - setErrorMessage( - "Memory advice ignored as memory advice only works with USM", - UR_RESULT_SUCCESS); - return UR_RESULT_ERROR_ADAPTER_SPECIFIC; - } - - ur_result_t Result = UR_RESULT_SUCCESS; - std::unique_ptr<ur_event_handle_t_> EventPtr{nullptr}; - - try { - ScopedContext Active(hQueue->getContext()); - - if (phEvent) { - EventPtr = - std::unique_ptr<ur_event_handle_t_>(ur_event_handle_t_::makeNative( - UR_COMMAND_USM_ADVISE, hQueue, hQueue->getNextTransferStream())); - UR_CHECK_ERROR(EventPtr->start()); - } - - if (advice & UR_USM_ADVICE_FLAG_DEFAULT) { - UR_CHECK_ERROR(cuMemAdvise((CUdeviceptr)pMem, size, - CU_MEM_ADVISE_UNSET_READ_MOSTLY, - hQueue->getContext()->getDevice()->get())); - UR_CHECK_ERROR(cuMemAdvise((CUdeviceptr)pMem, size, - CU_MEM_ADVISE_UNSET_PREFERRED_LOCATION, - hQueue->getContext()->getDevice()->get())); - UR_CHECK_ERROR(cuMemAdvise((CUdeviceptr)pMem, size, - CU_MEM_ADVISE_UNSET_ACCESSED_BY, - hQueue->getContext()->getDevice()->get())); - } else { - Result = setCuMemAdvise((CUdeviceptr)pMem, size, advice, - hQueue->getContext()->getDevice()->get()); - } - - if (phEvent) { - UR_CHECK_ERROR(EventPtr->record()); - *phEvent = EventPtr.release(); - } - } catch (ur_result_t err) { - Result = err; - } catch (...) { - Result = UR_RESULT_ERROR_UNKNOWN; - } - return Result; -} - -// TODO: Implement this. Remember to return true for -// PI_EXT_ONEAPI_CONTEXT_INFO_USM_FILL2D_SUPPORT when it is implemented. -UR_APIEXPORT ur_result_t UR_APICALL urEnqueueUSMFill2D( - ur_queue_handle_t, void *, size_t, size_t, const void *, size_t, size_t, - uint32_t, const ur_event_handle_t *, ur_event_handle_t *) { - return UR_RESULT_ERROR_UNSUPPORTED_FEATURE; -} - -UR_APIEXPORT ur_result_t UR_APICALL urEnqueueUSMMemcpy2D( - ur_queue_handle_t hQueue, bool blocking, void *pDst, size_t dstPitch, - const void *pSrc, size_t srcPitch, size_t width, size_t height, - uint32_t numEventsInWaitList, const ur_event_handle_t *phEventWaitList, - ur_event_handle_t *phEvent) { - ur_result_t result = UR_RESULT_SUCCESS; - - try { - ScopedContext active(hQueue->getContext()); - CUstream cuStream = hQueue->getNextTransferStream(); - result = enqueueEventsWait(hQueue, cuStream, numEventsInWaitList, - phEventWaitList); - - std::unique_ptr<ur_event_handle_t_> RetImplEvent{nullptr}; - if (phEvent) { - RetImplEvent = - std::unique_ptr<ur_event_handle_t_>(ur_event_handle_t_::makeNative( - UR_COMMAND_MEM_BUFFER_COPY_RECT, hQueue, cuStream)); - UR_CHECK_ERROR(RetImplEvent->start()); - } - - // Determine the direction of copy using cuPointerGetAttribute - // for both the SrcPtr and DstPtr - CUDA_MEMCPY2D CpyDesc = {}; - memset(&CpyDesc, 0, sizeof(CpyDesc)); - - getUSMHostOrDevicePtr(pSrc, &CpyDesc.srcMemoryType, &CpyDesc.srcDevice, - &CpyDesc.srcHost); - getUSMHostOrDevicePtr(pDst, &CpyDesc.dstMemoryType, &CpyDesc.dstDevice, - &CpyDesc.dstHost); - - CpyDesc.dstPitch = dstPitch; - CpyDesc.srcPitch = srcPitch; - CpyDesc.WidthInBytes = width; - CpyDesc.Height = height; - - UR_CHECK_ERROR(cuMemcpy2DAsync(&CpyDesc, cuStream)); - - if (phEvent) { - UR_CHECK_ERROR(RetImplEvent->record()); - *phEvent = RetImplEvent.release(); - } - if (blocking) { - UR_CHECK_ERROR(cuStreamSynchronize(cuStream)); - } - } catch (ur_result_t err) { - result = err; - } - return result; -} - -UR_APIEXPORT ur_result_t UR_APICALL urEnqueueMemBufferRead( - ur_queue_handle_t hQueue, ur_mem_handle_t hBuffer, bool blockingRead, - size_t offset, size_t size, void *pDst,
uint32_t numEventsInWaitList, - const ur_event_handle_t *phEventWaitList, ur_event_handle_t *phEvent) { - UR_ASSERT(!hBuffer->isImage(), UR_RESULT_ERROR_INVALID_MEM_OBJECT); - UR_ASSERT(offset + size <= hBuffer->Mem.BufferMem.Size, - UR_RESULT_ERROR_INVALID_SIZE); - - ur_result_t Result = UR_RESULT_SUCCESS; - CUdeviceptr DevPtr = hBuffer->Mem.BufferMem.get(); - std::unique_ptr<ur_event_handle_t_> RetImplEvent{nullptr}; - - try { - ScopedContext Active(hQueue->getContext()); - CUstream CuStream = hQueue->getNextTransferStream(); - - Result = enqueueEventsWait(hQueue, CuStream, numEventsInWaitList, - phEventWaitList); - - if (phEvent) { - RetImplEvent = - std::unique_ptr<ur_event_handle_t_>(ur_event_handle_t_::makeNative( - UR_COMMAND_MEM_BUFFER_READ, hQueue, CuStream)); - UR_CHECK_ERROR(RetImplEvent->start()); - } - - UR_CHECK_ERROR(cuMemcpyDtoHAsync(pDst, DevPtr + offset, size, CuStream)); - - if (phEvent) { - UR_CHECK_ERROR(RetImplEvent->record()); - } - - if (blockingRead) { - UR_CHECK_ERROR(cuStreamSynchronize(CuStream)); - } - - if (phEvent) { - *phEvent = RetImplEvent.release(); - } - - } catch (ur_result_t Err) { - Result = Err; - } - - return Result; -} - -UR_APIEXPORT ur_result_t UR_APICALL urEnqueueMemBufferWrite( - ur_queue_handle_t hQueue, ur_mem_handle_t hBuffer, bool blockingWrite, - size_t offset, size_t size, const void *pSrc, uint32_t numEventsInWaitList, - const ur_event_handle_t *phEventWaitList, ur_event_handle_t *phEvent) { - UR_ASSERT(!hBuffer->isImage(), UR_RESULT_ERROR_INVALID_MEM_OBJECT); - UR_ASSERT(offset + size <= hBuffer->Mem.BufferMem.Size, - UR_RESULT_ERROR_INVALID_SIZE); - - ur_result_t Result = UR_RESULT_SUCCESS; - CUdeviceptr DevPtr = hBuffer->Mem.BufferMem.get(); - std::unique_ptr<ur_event_handle_t_> RetImplEvent{nullptr}; - - try { - ScopedContext Active(hQueue->getContext()); - CUstream CuStream = hQueue->getNextTransferStream(); - - Result = enqueueEventsWait(hQueue, CuStream, numEventsInWaitList, - phEventWaitList); - - if (phEvent) { - RetImplEvent = - std::unique_ptr<ur_event_handle_t_>(ur_event_handle_t_::makeNative( - UR_COMMAND_MEM_BUFFER_WRITE, hQueue, CuStream)); - UR_CHECK_ERROR(RetImplEvent->start()); - } - - UR_CHECK_ERROR(cuMemcpyHtoDAsync(DevPtr + offset, pSrc, size, CuStream)); - - if (phEvent) { - UR_CHECK_ERROR(RetImplEvent->record()); - } - - if (blockingWrite) { - UR_CHECK_ERROR(cuStreamSynchronize(CuStream)); - } - - if (phEvent) { - *phEvent = RetImplEvent.release(); - } - } catch (ur_result_t Err) { - Result = Err; - } - return Result; -} - -UR_APIEXPORT ur_result_t UR_APICALL urEnqueueDeviceGlobalVariableWrite( - ur_queue_handle_t hQueue, ur_program_handle_t hProgram, const char *name, - bool blockingWrite, size_t count, size_t offset, const void *pSrc, - uint32_t numEventsInWaitList, const ur_event_handle_t *phEventWaitList, - ur_event_handle_t *phEvent) { - // Since CUDA requires the global variable to be referenced by name, we use - // metadata to find the correct name to access it by.
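Once the name is known, the access pattern is plain driver API: resolve the module-scope symbol, bounds-check, then copy through the returned device pointer. A minimal sketch (assumes an initialized context, a loaded CUmodule, and a symbol name that exists in it):

```cpp
#include <cuda.h>

CUresult writeDeviceGlobal(CUmodule Mod, const char *Name, const void *Src,
                           size_t Count, size_t Offset) {
  CUdeviceptr Global = 0;
  size_t GlobalSize = 0;
  // Resolve the module-scope symbol by name.
  CUresult Err = cuModuleGetGlobal(&Global, &GlobalSize, Mod, Name);
  if (Err != CUDA_SUCCESS)
    return Err;
  if (Offset + Count > GlobalSize) // same bounds check as the adapter
    return CUDA_ERROR_INVALID_VALUE;
  // Copy host data into the global at the requested offset.
  return cuMemcpyHtoD(Global + Offset, Src, Count);
}
```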
- auto DeviceGlobalNameIt = hProgram->GlobalIDMD.find(name); - if (DeviceGlobalNameIt == hProgram->GlobalIDMD.end()) - return UR_RESULT_ERROR_INVALID_VALUE; - std::string DeviceGlobalName = DeviceGlobalNameIt->second; - - ur_result_t Result = UR_RESULT_SUCCESS; - try { - CUdeviceptr DeviceGlobal = 0; - size_t DeviceGlobalSize = 0; - UR_CHECK_ERROR(cuModuleGetGlobal(&DeviceGlobal, &DeviceGlobalSize, - hProgram->get(), - DeviceGlobalName.c_str())); - - if (offset + count > DeviceGlobalSize) - return UR_RESULT_ERROR_INVALID_VALUE; - - return urEnqueueUSMMemcpy( - hQueue, blockingWrite, reinterpret_cast<void *>(DeviceGlobal + offset), - pSrc, count, numEventsInWaitList, phEventWaitList, phEvent); - } catch (ur_result_t Err) { - Result = Err; - } - return Result; -} - -UR_APIEXPORT ur_result_t UR_APICALL urEnqueueDeviceGlobalVariableRead( - ur_queue_handle_t hQueue, ur_program_handle_t hProgram, const char *name, - bool blockingRead, size_t count, size_t offset, void *pDst, - uint32_t numEventsInWaitList, const ur_event_handle_t *phEventWaitList, - ur_event_handle_t *phEvent) { - // Since CUDA requires the global variable to be referenced by name, we use - // metadata to find the correct name to access it by. - auto DeviceGlobalNameIt = hProgram->GlobalIDMD.find(name); - if (DeviceGlobalNameIt == hProgram->GlobalIDMD.end()) - return UR_RESULT_ERROR_INVALID_VALUE; - std::string DeviceGlobalName = DeviceGlobalNameIt->second; - - ur_result_t Result = UR_RESULT_SUCCESS; - try { - CUdeviceptr DeviceGlobal = 0; - size_t DeviceGlobalSize = 0; - UR_CHECK_ERROR(cuModuleGetGlobal(&DeviceGlobal, &DeviceGlobalSize, - hProgram->get(), - DeviceGlobalName.c_str())); - - if (offset + count > DeviceGlobalSize) - return UR_RESULT_ERROR_INVALID_VALUE; - - return urEnqueueUSMMemcpy( - hQueue, blockingRead, pDst, - reinterpret_cast<const void *>(DeviceGlobal + offset), count, - numEventsInWaitList, phEventWaitList, phEvent); - } catch (ur_result_t Err) { - Result = Err; - } - return Result; -} - -/// Host Pipes -UR_APIEXPORT ur_result_t UR_APICALL urEnqueueReadHostPipe( - ur_queue_handle_t hQueue, ur_program_handle_t hProgram, - const char *pipe_symbol, bool blocking, void *pDst, size_t size, - uint32_t numEventsInWaitList, const ur_event_handle_t *phEventWaitList, - ur_event_handle_t *phEvent) { - (void)hQueue; - (void)hProgram; - (void)pipe_symbol; - (void)blocking; - (void)pDst; - (void)size; - (void)numEventsInWaitList; - (void)phEventWaitList; - (void)phEvent; - - return UR_RESULT_ERROR_UNSUPPORTED_FEATURE; -} - -UR_APIEXPORT ur_result_t UR_APICALL urEnqueueWriteHostPipe( - ur_queue_handle_t hQueue, ur_program_handle_t hProgram, - const char *pipe_symbol, bool blocking, void *pSrc, size_t size, - uint32_t numEventsInWaitList, const ur_event_handle_t *phEventWaitList, - ur_event_handle_t *phEvent) { - (void)hQueue; - (void)hProgram; - (void)pipe_symbol; - (void)blocking; - (void)pSrc; - (void)size; - (void)numEventsInWaitList; - (void)phEventWaitList; - (void)phEvent; - - return UR_RESULT_ERROR_UNSUPPORTED_FEATURE; -} diff --git a/sycl/plugins/unified_runtime/ur/adapters/cuda/enqueue.hpp b/sycl/plugins/unified_runtime/ur/adapters/cuda/enqueue.hpp deleted file mode 100644 index d49853b38dccb..0000000000000 --- a/sycl/plugins/unified_runtime/ur/adapters/cuda/enqueue.hpp +++ /dev/null @@ -1,16 +0,0 @@ -//===--------- enqueue.hpp - CUDA Adapter ---------------------------------===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// -#pragma once - -#include -#include -#include - -ur_result_t enqueueEventsWait(ur_queue_handle_t CommandQueue, CUstream Stream, - uint32_t NumEventsInWaitList, - const ur_event_handle_t *EventWaitList); diff --git a/sycl/plugins/unified_runtime/ur/adapters/cuda/event.cpp b/sycl/plugins/unified_runtime/ur/adapters/cuda/event.cpp deleted file mode 100644 index 18d861c4e9ee5..0000000000000 --- a/sycl/plugins/unified_runtime/ur/adapters/cuda/event.cpp +++ /dev/null @@ -1,295 +0,0 @@ -//===--------- event.cpp - CUDA Adapter -----------------------------------===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// - -#include "event.hpp" -#include "common.hpp" -#include "context.hpp" -#include "device.hpp" -#include "queue.hpp" - -#include -#include - -ur_event_handle_t_::ur_event_handle_t_(ur_command_t Type, - ur_context_handle_t Context, - ur_queue_handle_t Queue, CUstream Stream, - uint32_t StreamToken) - : CommandType{Type}, RefCount{1}, HasOwnership{true}, - HasBeenWaitedOn{false}, IsRecorded{false}, IsStarted{false}, - StreamToken{StreamToken}, EvEnd{nullptr}, EvStart{nullptr}, - EvQueued{nullptr}, Queue{Queue}, Stream{Stream}, Context{Context} { - - bool ProfilingEnabled = Queue->URFlags & UR_QUEUE_FLAG_PROFILING_ENABLE; - - UR_CHECK_ERROR(cuEventCreate( - &EvEnd, ProfilingEnabled ? CU_EVENT_DEFAULT : CU_EVENT_DISABLE_TIMING)); - - if (ProfilingEnabled) { - UR_CHECK_ERROR(cuEventCreate(&EvQueued, CU_EVENT_DEFAULT)); - UR_CHECK_ERROR(cuEventCreate(&EvStart, CU_EVENT_DEFAULT)); - } - - if (Queue != nullptr) { - urQueueRetain(Queue); - } - urContextRetain(Context); -} - -ur_event_handle_t_::ur_event_handle_t_(ur_context_handle_t Context, - CUevent EventNative) - : CommandType{UR_COMMAND_EVENTS_WAIT}, RefCount{1}, HasOwnership{false}, - HasBeenWaitedOn{false}, IsRecorded{false}, IsStarted{false}, - StreamToken{std::numeric_limits::max()}, EvEnd{EventNative}, - EvStart{nullptr}, EvQueued{nullptr}, Queue{nullptr}, Context{Context} { - urContextRetain(Context); -} - -ur_event_handle_t_::~ur_event_handle_t_() { - if (Queue != nullptr) { - urQueueRelease(Queue); - } - urContextRelease(Context); -} - -ur_result_t ur_event_handle_t_::start() { - assert(!isStarted()); - ur_result_t Result = UR_RESULT_SUCCESS; - - try { - if (Queue->URFlags & UR_QUEUE_FLAG_PROFILING_ENABLE) { - // NOTE: This relies on the default stream to be unused. 
- UR_CHECK_ERROR(cuEventRecord(EvQueued, 0)); - UR_CHECK_ERROR(cuEventRecord(EvStart, Stream)); - } - } catch (ur_result_t Err) { - Result = Err; - } - - IsStarted = true; - return Result; -} - -bool ur_event_handle_t_::isCompleted() const noexcept { - if (!IsRecorded) { - return false; - } - if (!HasBeenWaitedOn) { - const CUresult Result = cuEventQuery(EvEnd); - if (Result != CUDA_SUCCESS && Result != CUDA_ERROR_NOT_READY) { - UR_CHECK_ERROR(Result); - return false; - } - if (Result == CUDA_ERROR_NOT_READY) { - return false; - } - } - return true; -} - -uint64_t ur_event_handle_t_::getQueuedTime() const { - assert(isStarted()); - return Queue->get_device()->getElapsedTime(EvQueued); -} - -uint64_t ur_event_handle_t_::getStartTime() const { - assert(isStarted()); - return Queue->get_device()->getElapsedTime(EvStart); -} - -uint64_t ur_event_handle_t_::getEndTime() const { - assert(isStarted() && isRecorded()); - return Queue->get_device()->getElapsedTime(EvEnd); -} - -ur_result_t ur_event_handle_t_::record() { - - if (isRecorded() || !isStarted()) { - return UR_RESULT_ERROR_INVALID_EVENT; - } - - ur_result_t Result = UR_RESULT_SUCCESS; - - UR_ASSERT(Queue, UR_RESULT_ERROR_INVALID_QUEUE); - - try { - EventID = Queue->getNextEventID(); - if (EventID == 0) { - detail::ur::die( - "Unrecoverable program state reached in event identifier overflow"); - } - UR_CHECK_ERROR(cuEventRecord(EvEnd, Stream)); - } catch (ur_result_t error) { - Result = error; - } - - if (Result == UR_RESULT_SUCCESS) { - IsRecorded = true; - } - - return Result; -} - -ur_result_t ur_event_handle_t_::wait() { - ur_result_t Result = UR_RESULT_SUCCESS; - try { - UR_CHECK_ERROR(cuEventSynchronize(EvEnd)); - HasBeenWaitedOn = true; - } catch (ur_result_t error) { - Result = error; - } - - return Result; -} - -ur_result_t ur_event_handle_t_::release() { - if (!backendHasOwnership()) - return UR_RESULT_SUCCESS; - - assert(Queue != nullptr); - - UR_CHECK_ERROR(cuEventDestroy(EvEnd)); - - if (Queue->URFlags & UR_QUEUE_FLAG_PROFILING_ENABLE) { - UR_CHECK_ERROR(cuEventDestroy(EvQueued)); - UR_CHECK_ERROR(cuEventDestroy(EvStart)); - } - - return UR_RESULT_SUCCESS; -} - -UR_APIEXPORT ur_result_t UR_APICALL urEventGetInfo(ur_event_handle_t hEvent, - ur_event_info_t propName, - size_t propValueSize, - void *pPropValue, - size_t *pPropValueSizeRet) { - UrReturnHelper ReturnValue(propValueSize, pPropValue, pPropValueSizeRet); - - switch (propName) { - case UR_EVENT_INFO_COMMAND_QUEUE: - return ReturnValue(hEvent->getQueue()); - case UR_EVENT_INFO_COMMAND_TYPE: - return ReturnValue(hEvent->getCommandType()); - case UR_EVENT_INFO_REFERENCE_COUNT: - return ReturnValue(hEvent->getReferenceCount()); - case UR_EVENT_INFO_COMMAND_EXECUTION_STATUS: - return ReturnValue(hEvent->getExecutionStatus()); - case UR_EVENT_INFO_CONTEXT: - return ReturnValue(hEvent->getContext()); - default: - detail::ur::die("Event info request not implemented"); - } - - return UR_RESULT_ERROR_INVALID_ENUMERATION; -} - -/// Obtain profiling information from PI CUDA events -/// \TODO Timings from CUDA are only elapsed time. 
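Those timings are produced by measuring elapsed time between recorded CUevents. A standalone sketch of the record/synchronize/measure pattern, simplified here to a single start/end pair (assumes an initialized context and stream; a memset stands in for the timed work):

```cpp
#include <cuda.h>

float elapsedMsSketch(CUstream Stream, CUdeviceptr Ptr, size_t Bytes) {
  CUevent Start, End;
  cuEventCreate(&Start, CU_EVENT_DEFAULT); // timing-capable events
  cuEventCreate(&End, CU_EVENT_DEFAULT);
  cuEventRecord(Start, Stream);            // like EvStart in start()
  cuMemsetD8Async(Ptr, 0, Bytes, Stream);  // the timed work
  cuEventRecord(End, Stream);              // like EvEnd in record()
  cuEventSynchronize(End);                 // like wait()
  float Ms = 0.f;
  cuEventElapsedTime(&Ms, Start, End);     // elapsed time, in milliseconds
  cuEventDestroy(Start);
  cuEventDestroy(End);
  return Ms;
}
```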
-UR_APIEXPORT ur_result_t UR_APICALL urEventGetProfilingInfo( - ur_event_handle_t hEvent, ur_profiling_info_t propName, - size_t propValueSize, void *pPropValue, size_t *pPropValueSizeRet) { - UrReturnHelper ReturnValue(propValueSize, pPropValue, pPropValueSizeRet); - - ur_queue_handle_t Queue = hEvent->getQueue(); - if (Queue == nullptr || !(Queue->URFlags & UR_QUEUE_FLAG_PROFILING_ENABLE)) { - return UR_RESULT_ERROR_PROFILING_INFO_NOT_AVAILABLE; - } - - switch (propName) { - case UR_PROFILING_INFO_COMMAND_QUEUED: - case UR_PROFILING_INFO_COMMAND_SUBMIT: - // Note: No user for this case - return ReturnValue(static_cast(hEvent->getQueuedTime())); - case UR_PROFILING_INFO_COMMAND_START: - return ReturnValue(static_cast(hEvent->getStartTime())); - case UR_PROFILING_INFO_COMMAND_END: - return ReturnValue(static_cast(hEvent->getEndTime())); - default: - break; - } - detail::ur::die("Event Profiling info request not implemented"); - return {}; -} - -UR_APIEXPORT ur_result_t UR_APICALL urEventSetCallback(ur_event_handle_t, - ur_execution_info_t, - ur_event_callback_t, - void *) { - return UR_RESULT_ERROR_UNSUPPORTED_FEATURE; -} - -UR_APIEXPORT ur_result_t UR_APICALL -urEventWait(uint32_t numEvents, const ur_event_handle_t *phEventWaitList) { - try { - auto Context = phEventWaitList[0]->getContext(); - ScopedContext Active(Context); - - auto WaitFunc = [Context](ur_event_handle_t Event) -> ur_result_t { - UR_ASSERT(Event, UR_RESULT_ERROR_INVALID_EVENT); - UR_ASSERT(Event->getContext() == Context, - UR_RESULT_ERROR_INVALID_CONTEXT); - - return Event->wait(); - }; - return forLatestEvents(phEventWaitList, numEvents, WaitFunc); - } catch (ur_result_t Err) { - return Err; - } catch (...) { - return UR_RESULT_ERROR_OUT_OF_RESOURCES; - } -} - -UR_APIEXPORT ur_result_t UR_APICALL urEventRetain(ur_event_handle_t hEvent) { - const auto RefCount = hEvent->incrementReferenceCount(); - - detail::ur::assertion(RefCount != 0, - "Reference count overflow detected in urEventRetain."); - - return UR_RESULT_SUCCESS; -} - -UR_APIEXPORT ur_result_t UR_APICALL urEventRelease(ur_event_handle_t hEvent) { - // double delete or someone is messing with the ref count. - // either way, cannot safely proceed. - detail::ur::assertion(hEvent->getReferenceCount() != 0, - "Reference count overflow detected in urEventRelease."); - - // decrement ref count. If it is 0, delete the event. - if (hEvent->decrementReferenceCount() == 0) { - std::unique_ptr event_ptr{hEvent}; - ur_result_t Result = UR_RESULT_ERROR_INVALID_EVENT; - try { - ScopedContext Active(hEvent->getContext()); - Result = hEvent->release(); - } catch (...) 
{ - Result = UR_RESULT_ERROR_OUT_OF_RESOURCES; - } - return Result; - } - - return UR_RESULT_SUCCESS; -} - -UR_APIEXPORT ur_result_t UR_APICALL urEventGetNativeHandle( - ur_event_handle_t hEvent, ur_native_handle_t *phNativeEvent) { - *phNativeEvent = reinterpret_cast(hEvent->get()); - return UR_RESULT_SUCCESS; -} - -UR_APIEXPORT ur_result_t UR_APICALL urEventCreateWithNativeHandle( - ur_native_handle_t hNativeEvent, ur_context_handle_t hContext, - const ur_event_native_properties_t *pProperties, - ur_event_handle_t *phEvent) { - std::ignore = pProperties; - - std::unique_ptr EventPtr{nullptr}; - - *phEvent = ur_event_handle_t_::makeWithNative( - hContext, reinterpret_cast(hNativeEvent)); - - return UR_RESULT_SUCCESS; -} diff --git a/sycl/plugins/unified_runtime/ur/adapters/cuda/event.hpp b/sycl/plugins/unified_runtime/ur/adapters/cuda/event.hpp deleted file mode 100644 index 4c788532c224e..0000000000000 --- a/sycl/plugins/unified_runtime/ur/adapters/cuda/event.hpp +++ /dev/null @@ -1,189 +0,0 @@ -//===--------- event.hpp - CUDA Adapter -----------------------------------===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// -#pragma once - -#include -#include - -#include "queue.hpp" - -/// UR Event mapping to CUevent -/// -struct ur_event_handle_t_ { -public: - using native_type = CUevent; - - ur_result_t record(); - - ur_result_t wait(); - - ur_result_t start(); - - native_type get() const noexcept { return EvEnd; }; - - ur_queue_handle_t getQueue() const noexcept { return Queue; } - - CUstream getStream() const noexcept { return Stream; } - - uint32_t getComputeStreamToken() const noexcept { return StreamToken; } - - ur_command_t getCommandType() const noexcept { return CommandType; } - - uint32_t getReferenceCount() const noexcept { return RefCount; } - - bool isRecorded() const noexcept { return IsRecorded; } - - bool isStarted() const noexcept { return IsStarted; } - - bool isCompleted() const noexcept; - - uint32_t getExecutionStatus() const noexcept { - - if (!isRecorded()) { - return UR_EVENT_STATUS_SUBMITTED; - } - - if (!isCompleted()) { - return UR_EVENT_STATUS_RUNNING; - } - return UR_EVENT_STATUS_COMPLETE; - } - - ur_context_handle_t getContext() const noexcept { return Context; }; - - uint32_t incrementReferenceCount() { return ++RefCount; } - - uint32_t decrementReferenceCount() { return --RefCount; } - - uint32_t getEventID() const noexcept { return EventID; } - - bool backendHasOwnership() const noexcept { return HasOwnership; } - - // Returns the counter time when the associated command(s) were enqueued - // - uint64_t getQueuedTime() const; - - // Returns the counter time when the associated command(s) started execution - // - uint64_t getStartTime() const; - - // Returns the counter time when the associated command(s) completed - // - uint64_t getEndTime() const; - - // construct a native CUDA. This maps closely to the underlying CUDA event. 
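getExecutionStatus() above derives submitted/running/complete from the recorded state plus a cuEventQuery poll, where CUDA_ERROR_NOT_READY means "still running" rather than failure. A minimal sketch of that poll (assumes a recorded CUevent):

```cpp
#include <cuda.h>

bool eventCompletedSketch(CUevent Ev) {
  CUresult Res = cuEventQuery(Ev);
  if (Res == CUDA_SUCCESS)
    return true;  // work preceding the event has finished
  if (Res == CUDA_ERROR_NOT_READY)
    return false; // still in flight; not an error
  // Any other code is a real error; a full implementation would surface it.
  return false;
}
```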
- static ur_event_handle_t - makeNative(ur_command_t Type, ur_queue_handle_t Queue, CUstream Stream, - uint32_t StreamToken = std::numeric_limits<uint32_t>::max()) { - return new ur_event_handle_t_(Type, Queue->getContext(), Queue, Stream, - StreamToken); - } - - static ur_event_handle_t makeWithNative(ur_context_handle_t context, - CUevent eventNative) { - return new ur_event_handle_t_(context, eventNative); - } - - ur_result_t release(); - - ~ur_event_handle_t_(); - -private: - // This constructor is private to force programmers to use the makeNative / - // make_user static members in order to create a pi_event for CUDA. - ur_event_handle_t_(ur_command_t Type, ur_context_handle_t Context, - ur_queue_handle_t Queue, CUstream Stream, - uint32_t StreamToken); - - // This constructor is private to force programmers to use the - // makeWithNative for event interop - ur_event_handle_t_(ur_context_handle_t Context, CUevent EventNative); - - ur_command_t CommandType; // The type of command associated with event. - - std::atomic_uint32_t RefCount; // Event reference count. - - bool HasOwnership; // Signifies if event owns the native type. - - bool HasBeenWaitedOn; // Signifies whether the event has been waited - // on through a call to wait(), which implies - // that it has completed. - - bool IsRecorded; // Signifies whether a native CUDA event has been recorded - // yet. - bool IsStarted; // Signifies whether the operation associated with the - // UR event has started or not - - uint32_t StreamToken; - uint32_t EventID; // Queue identifier of the event. - - native_type EvEnd; // CUDA event handle. If this ur_event_handle_t represents - // a user event, this will be nullptr. - - native_type EvStart; // CUDA event handle associated with the start - - native_type EvQueued; // CUDA event handle associated with the time - // the command was enqueued - - ur_queue_handle_t Queue; // ur_queue_handle_t associated with the event. If - // this is a user event, this will be nullptr. - - CUstream Stream; // CUstream associated with the event. If this is a user - // event, this will be uninitialized. - - ur_context_handle_t Context; // ur_context_handle_t associated with the event. - // If this is a native event, this will be the - // same context associated with the queue member. -}; - -// Iterate over `event_wait_list` and apply the given callback `f` to the -// latest event on each queue therein. The callback must take a single -// ur_event_handle_t argument and return a ur_result_t. If the callback returns -// an error, the iteration terminates and the error is returned. -template <typename Func> -ur_result_t forLatestEvents(const ur_event_handle_t *EventWaitList, - std::size_t NumEventsInWaitList, Func &&F) { - - if (EventWaitList == nullptr || NumEventsInWaitList == 0) { - return UR_RESULT_ERROR_INVALID_EVENT_WAIT_LIST; - } - - // Fast path if we only have a single event - if (NumEventsInWaitList == 1) { - return F(EventWaitList[0]); - } - - std::vector<ur_event_handle_t> Events{EventWaitList, - EventWaitList + NumEventsInWaitList}; - std::sort(Events.begin(), Events.end(), - [](ur_event_handle_t Event0, ur_event_handle_t Event1) { - // Tiered sort creating sublists of streams (smallest value first) - // in which the corresponding events are sorted into a sequence of - // newest first.
- return Event0->getStream() < Event1->getStream() || - (Event0->getStream() == Event1->getStream() && - Event0->getEventID() > Event1->getEventID()); - }); - - CUstream LastSeenStream = 0; - for (size_t i = 0; i < Events.size(); i++) { - auto Event = Events[i]; - if (!Event || (i != 0 && Event->getStream() == LastSeenStream)) { - continue; - } - - LastSeenStream = Event->getStream(); - - auto Result = F(Event); - if (Result != UR_RESULT_SUCCESS) { - return Result; - } - } - - return UR_RESULT_SUCCESS; -} diff --git a/sycl/plugins/unified_runtime/ur/adapters/cuda/image.cpp b/sycl/plugins/unified_runtime/ur/adapters/cuda/image.cpp deleted file mode 100644 index 1b11cade5cebc..0000000000000 --- a/sycl/plugins/unified_runtime/ur/adapters/cuda/image.cpp +++ /dev/null @@ -1,1061 +0,0 @@ -//===--------- image.cpp - CUDA Adapter -----------------------------------===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// - -#include -#include -#include - -#include "common.hpp" -#include "context.hpp" -#include "enqueue.hpp" -#include "event.hpp" -#include "image.hpp" -#include "memory.hpp" -#include "queue.hpp" -#include "sampler.hpp" -#include "ur/ur.hpp" -#include "ur_api.h" - -ur_result_t urCalculateNumChannels(ur_image_channel_order_t order, - unsigned int *NumChannels) { - switch (order) { - case ur_image_channel_order_t::UR_IMAGE_CHANNEL_ORDER_A: - case ur_image_channel_order_t::UR_IMAGE_CHANNEL_ORDER_R: - *NumChannels = 1; - return UR_RESULT_SUCCESS; - case ur_image_channel_order_t::UR_IMAGE_CHANNEL_ORDER_RG: - case ur_image_channel_order_t::UR_IMAGE_CHANNEL_ORDER_RA: - *NumChannels = 2; - return UR_RESULT_SUCCESS; - case ur_image_channel_order_t::UR_IMAGE_CHANNEL_ORDER_RGB: - return UR_RESULT_ERROR_IMAGE_FORMAT_NOT_SUPPORTED; - case ur_image_channel_order_t::UR_IMAGE_CHANNEL_ORDER_RGBA: - case ur_image_channel_order_t::UR_IMAGE_CHANNEL_ORDER_ARGB: - case ur_image_channel_order_t::UR_IMAGE_CHANNEL_ORDER_BGRA: - case ur_image_channel_order_t::UR_IMAGE_CHANNEL_ORDER_ABGR: - *NumChannels = 4; - return UR_RESULT_SUCCESS; - case ur_image_channel_order_t::UR_IMAGE_CHANNEL_ORDER_RX: - case ur_image_channel_order_t::UR_IMAGE_CHANNEL_ORDER_RGX: - case ur_image_channel_order_t::UR_IMAGE_CHANNEL_ORDER_RGBX: - case ur_image_channel_order_t::UR_IMAGE_CHANNEL_ORDER_SRGBA: - case ur_image_channel_order_t::UR_IMAGE_CHANNEL_ORDER_INTENSITY: - case ur_image_channel_order_t::UR_IMAGE_CHANNEL_ORDER_LUMINANCE: - default: - return UR_RESULT_ERROR_IMAGE_FORMAT_NOT_SUPPORTED; - } -} - -/// Convert a UR image format to a CUDA image format and -/// get the pixel size in bytes. -/// /param image_channel_type is the ur_image_channel_type_t. -/// /param image_channel_order is the ur_image_channel_order_t. -/// this is used for normalized channel formats, as CUDA -/// combines the channel format and order for normalized -/// channel types. -/// /param return_cuda_format will be set to the equivalent cuda -/// format if not nullptr. -/// /param return_pixel_size_bytes will be set to the pixel -/// byte size if not nullptr. 
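A host-only model of the forLatestEvents traversal just defined: after the tiered sort, visiting the first event seen per stream is enough, since that is the newest one. The types below are simplified stand-ins, not the adapter's handles:

```cpp
#include <algorithm>
#include <cstdint>
#include <cstdio>
#include <vector>

// Simplified stand-in for an event: its stream and a monotonically growing ID.
struct Ev {
  uintptr_t Stream;
  uint32_t Id;
};

// Visit only the newest event per stream, mirroring forLatestEvents.
template <typename Func> void forLatest(std::vector<Ev> Events, Func F) {
  std::sort(Events.begin(), Events.end(), [](const Ev &A, const Ev &B) {
    return A.Stream < B.Stream || (A.Stream == B.Stream && A.Id > B.Id);
  });
  uintptr_t LastStream = ~uintptr_t{0}; // sentinel: no stream seen yet
  for (const Ev &E : Events) {
    if (E.Stream == LastStream)
      continue; // an older event on a stream we already handled
    LastStream = E.Stream;
    F(E); // e.g. wait on this event
  }
}

int main() {
  // Two streams, three events: only (1,7) and (2,5) should be visited.
  forLatest({{1, 3}, {2, 5}, {1, 7}}, [](const Ev &E) {
    std::printf("stream %zu id %u\n", (size_t)E.Stream, E.Id);
  });
}
```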
-ur_result_t -urToCudaImageChannelFormat(ur_image_channel_type_t image_channel_type, - ur_image_channel_order_t image_channel_order, - CUarray_format *return_cuda_format, - size_t *return_pixel_size_bytes) { - - CUarray_format cuda_format; - size_t pixel_size_bytes = 0; - unsigned int num_channels = 0; - UR_CHECK_ERROR(urCalculateNumChannels(image_channel_order, &num_channels)); - - switch (image_channel_type) { -#define CASE(FROM, TO, SIZE) \ - case FROM: { \ - cuda_format = TO; \ - pixel_size_bytes = SIZE * num_channels; \ - break; \ - } - - CASE(UR_IMAGE_CHANNEL_TYPE_UNSIGNED_INT8, CU_AD_FORMAT_UNSIGNED_INT8, 1) - CASE(UR_IMAGE_CHANNEL_TYPE_SIGNED_INT8, CU_AD_FORMAT_SIGNED_INT8, 1) - CASE(UR_IMAGE_CHANNEL_TYPE_UNSIGNED_INT16, CU_AD_FORMAT_UNSIGNED_INT16, 2) - CASE(UR_IMAGE_CHANNEL_TYPE_SIGNED_INT16, CU_AD_FORMAT_SIGNED_INT16, 2) - CASE(UR_IMAGE_CHANNEL_TYPE_HALF_FLOAT, CU_AD_FORMAT_HALF, 2) - CASE(UR_IMAGE_CHANNEL_TYPE_UNSIGNED_INT32, CU_AD_FORMAT_UNSIGNED_INT32, 4) - CASE(UR_IMAGE_CHANNEL_TYPE_SIGNED_INT32, CU_AD_FORMAT_SIGNED_INT32, 4) - CASE(UR_IMAGE_CHANNEL_TYPE_FLOAT, CU_AD_FORMAT_FLOAT, 4) - -#undef CASE - default: - break; - } - - // These new formats were introduced in CUDA 11.5 -#if CUDA_VERSION >= 11050 - - // If none of the above channel types were passed, check those below - if (pixel_size_bytes == 0) { - - // We can't use a switch statement here because these single - // UR_IMAGE_CHANNEL_TYPEs can correspond to multiple [u/s]norm CU_AD_FORMATs - // depending on the number of channels. We use a std::map instead to - // retrieve the correct CUDA format - - // map < <channel type, num channels>, <CUDA format, pixel size bytes> > - const std::map<std::pair<ur_image_channel_type_t, unsigned int>, - std::pair<CUarray_format, size_t>> - norm_channel_type_map{ - {{UR_IMAGE_CHANNEL_TYPE_UNORM_INT8, 1}, - {CU_AD_FORMAT_UNORM_INT8X1, 1}}, - {{UR_IMAGE_CHANNEL_TYPE_UNORM_INT8, 2}, - {CU_AD_FORMAT_UNORM_INT8X2, 2}}, - {{UR_IMAGE_CHANNEL_TYPE_UNORM_INT8, 4}, - {CU_AD_FORMAT_UNORM_INT8X4, 4}}, - - {{UR_IMAGE_CHANNEL_TYPE_SNORM_INT8, 1}, - {CU_AD_FORMAT_SNORM_INT8X1, 1}}, - {{UR_IMAGE_CHANNEL_TYPE_SNORM_INT8, 2}, - {CU_AD_FORMAT_SNORM_INT8X2, 2}}, - {{UR_IMAGE_CHANNEL_TYPE_SNORM_INT8, 4}, - {CU_AD_FORMAT_SNORM_INT8X4, 4}}, - - {{UR_IMAGE_CHANNEL_TYPE_UNORM_INT16, 1}, - {CU_AD_FORMAT_UNORM_INT16X1, 2}}, - {{UR_IMAGE_CHANNEL_TYPE_UNORM_INT16, 2}, - {CU_AD_FORMAT_UNORM_INT16X2, 4}}, - {{UR_IMAGE_CHANNEL_TYPE_UNORM_INT16, 4}, - {CU_AD_FORMAT_UNORM_INT16X4, 8}}, - - {{UR_IMAGE_CHANNEL_TYPE_SNORM_INT16, 1}, - {CU_AD_FORMAT_SNORM_INT16X1, 2}}, - {{UR_IMAGE_CHANNEL_TYPE_SNORM_INT16, 2}, - {CU_AD_FORMAT_SNORM_INT16X2, 4}}, - {{UR_IMAGE_CHANNEL_TYPE_SNORM_INT16, 4}, - {CU_AD_FORMAT_SNORM_INT16X4, 8}}, - }; - - try { - auto cuda_format_and_size = norm_channel_type_map.at( - std::make_pair(image_channel_type, num_channels)); - cuda_format = cuda_format_and_size.first; - pixel_size_bytes = cuda_format_and_size.second; - } catch (std::out_of_range &e) { - return UR_RESULT_ERROR_IMAGE_FORMAT_NOT_SUPPORTED; - } - } - -#endif - - if (return_cuda_format) { - *return_cuda_format = cuda_format; - } - if (return_pixel_size_bytes) { - *return_pixel_size_bytes = pixel_size_bytes; - } - return UR_RESULT_SUCCESS; -} - -ur_result_t -cudaToUrImageChannelFormat(CUarray_format cuda_format, - ur_image_channel_type_t *return_image_channel_type) { - - switch (cuda_format) { -#define CUDA_TO_UR_IMAGE_CHANNEL_TYPE(FROM, TO) \ - case FROM: { \ - *return_image_channel_type = TO; \ - return UR_RESULT_SUCCESS; \ - } - CUDA_TO_UR_IMAGE_CHANNEL_TYPE(CU_AD_FORMAT_UNSIGNED_INT8, - UR_IMAGE_CHANNEL_TYPE_UNSIGNED_INT8); -
CUDA_TO_UR_IMAGE_CHANNEL_TYPE(CU_AD_FORMAT_UNSIGNED_INT16, - UR_IMAGE_CHANNEL_TYPE_UNSIGNED_INT16); - CUDA_TO_UR_IMAGE_CHANNEL_TYPE(CU_AD_FORMAT_UNSIGNED_INT32, - UR_IMAGE_CHANNEL_TYPE_UNSIGNED_INT32); - CUDA_TO_UR_IMAGE_CHANNEL_TYPE(CU_AD_FORMAT_SIGNED_INT8, - UR_IMAGE_CHANNEL_TYPE_SIGNED_INT8); - CUDA_TO_UR_IMAGE_CHANNEL_TYPE(CU_AD_FORMAT_SIGNED_INT16, - UR_IMAGE_CHANNEL_TYPE_SIGNED_INT16); - CUDA_TO_UR_IMAGE_CHANNEL_TYPE(CU_AD_FORMAT_SIGNED_INT32, - UR_IMAGE_CHANNEL_TYPE_SIGNED_INT32); - CUDA_TO_UR_IMAGE_CHANNEL_TYPE(CU_AD_FORMAT_HALF, - UR_IMAGE_CHANNEL_TYPE_HALF_FLOAT); - CUDA_TO_UR_IMAGE_CHANNEL_TYPE(CU_AD_FORMAT_FLOAT, - UR_IMAGE_CHANNEL_TYPE_FLOAT); -#if CUDA_VERSION >= 11050 - - // Note that the CUDA UNORM and SNORM formats also encode the number of - // channels. - // Since UR does not encode this, we map different CUDA formats to the same - // UR channel type. - // Since this function is only called from `urBindlessImagesImageGetInfoExp` - // which has access to `CUDA_ARRAY3D_DESCRIPTOR`, we can determine the - // number of channels in the calling function. - - CUDA_TO_UR_IMAGE_CHANNEL_TYPE(CU_AD_FORMAT_UNORM_INT8X1, - UR_IMAGE_CHANNEL_TYPE_UNORM_INT8); - CUDA_TO_UR_IMAGE_CHANNEL_TYPE(CU_AD_FORMAT_UNORM_INT8X2, - UR_IMAGE_CHANNEL_TYPE_UNORM_INT8); - CUDA_TO_UR_IMAGE_CHANNEL_TYPE(CU_AD_FORMAT_UNORM_INT8X4, - UR_IMAGE_CHANNEL_TYPE_UNORM_INT8); - - CUDA_TO_UR_IMAGE_CHANNEL_TYPE(CU_AD_FORMAT_UNORM_INT16X1, - UR_IMAGE_CHANNEL_TYPE_UNORM_INT16); - CUDA_TO_UR_IMAGE_CHANNEL_TYPE(CU_AD_FORMAT_UNORM_INT16X2, - UR_IMAGE_CHANNEL_TYPE_UNORM_INT16); - CUDA_TO_UR_IMAGE_CHANNEL_TYPE(CU_AD_FORMAT_UNORM_INT16X4, - UR_IMAGE_CHANNEL_TYPE_UNORM_INT16); - - CUDA_TO_UR_IMAGE_CHANNEL_TYPE(CU_AD_FORMAT_SNORM_INT8X1, - UR_IMAGE_CHANNEL_TYPE_SNORM_INT8); - CUDA_TO_UR_IMAGE_CHANNEL_TYPE(CU_AD_FORMAT_SNORM_INT8X2, - UR_IMAGE_CHANNEL_TYPE_SNORM_INT8); - CUDA_TO_UR_IMAGE_CHANNEL_TYPE(CU_AD_FORMAT_SNORM_INT8X4, - UR_IMAGE_CHANNEL_TYPE_SNORM_INT8); - - CUDA_TO_UR_IMAGE_CHANNEL_TYPE(CU_AD_FORMAT_SNORM_INT16X1, - UR_IMAGE_CHANNEL_TYPE_SNORM_INT16); - CUDA_TO_UR_IMAGE_CHANNEL_TYPE(CU_AD_FORMAT_SNORM_INT16X2, - UR_IMAGE_CHANNEL_TYPE_SNORM_INT16); - CUDA_TO_UR_IMAGE_CHANNEL_TYPE(CU_AD_FORMAT_SNORM_INT16X4, - UR_IMAGE_CHANNEL_TYPE_SNORM_INT16); -#endif -#undef MAP - default: - return UR_RESULT_ERROR_IMAGE_FORMAT_NOT_SUPPORTED; - } -} - -ur_result_t urTextureCreate(ur_sampler_handle_t hSampler, - const ur_image_desc_t *pImageDesc, - CUDA_RESOURCE_DESC ResourceDesc, - ur_exp_image_handle_t *phRetImage) { - - try { - /// pi_sampler_properties - /// | | - /// ----------------------------------- - /// | 31 30 ... 
6 | N/A - /// | 5 | mip filter mode - /// | 4 3 2 | addressing mode - /// | 1 | filter mode - /// | 0 | normalize coords - CUDA_TEXTURE_DESC ImageTexDesc = {}; - CUaddress_mode AddrMode = {}; - ur_sampler_addressing_mode_t AddrModeProp = hSampler->getAddressingMode(); - if (AddrModeProp == (UR_SAMPLER_ADDRESSING_MODE_CLAMP_TO_EDGE - - UR_SAMPLER_ADDRESSING_MODE_NONE)) { - AddrMode = CU_TR_ADDRESS_MODE_CLAMP; - } else if (AddrModeProp == (UR_SAMPLER_ADDRESSING_MODE_CLAMP - - UR_SAMPLER_ADDRESSING_MODE_NONE)) { - AddrMode = CU_TR_ADDRESS_MODE_BORDER; - } else if (AddrModeProp == (UR_SAMPLER_ADDRESSING_MODE_REPEAT - - UR_SAMPLER_ADDRESSING_MODE_NONE)) { - AddrMode = CU_TR_ADDRESS_MODE_WRAP; - } else if (AddrModeProp == (UR_SAMPLER_ADDRESSING_MODE_MIRRORED_REPEAT - - UR_SAMPLER_ADDRESSING_MODE_NONE)) { - AddrMode = CU_TR_ADDRESS_MODE_MIRROR; - } - CUfilter_mode FilterMode; - ur_sampler_filter_mode_t FilterModeProp = hSampler->getFilterMode(); - FilterMode = - FilterModeProp ? CU_TR_FILTER_MODE_LINEAR : CU_TR_FILTER_MODE_POINT; - ImageTexDesc.filterMode = FilterMode; - - // Mipmap attributes - CUfilter_mode MipFilterMode; - ur_sampler_filter_mode_t MipFilterModeProp = hSampler->getMipFilterMode(); - MipFilterMode = - MipFilterModeProp ? CU_TR_FILTER_MODE_LINEAR : CU_TR_FILTER_MODE_POINT; - ImageTexDesc.mipmapFilterMode = MipFilterMode; - ImageTexDesc.maxMipmapLevelClamp = hSampler->MaxMipmapLevelClamp; - ImageTexDesc.minMipmapLevelClamp = hSampler->MinMipmapLevelClamp; - ImageTexDesc.maxAnisotropy = hSampler->MaxAnisotropy; - - // The address modes can interfere with other dimensions, - // e.g. 1D texture sampling can be interfered with when setting other - // dimension address modes despite their nonexistence. - ImageTexDesc.addressMode[0] = AddrMode; // 1D - ImageTexDesc.addressMode[1] = - pImageDesc->height > 0 ? AddrMode : ImageTexDesc.addressMode[1]; // 2D - ImageTexDesc.addressMode[2] = - pImageDesc->depth > 0 ? AddrMode : ImageTexDesc.addressMode[2]; // 3D - - // flags takes the normalized coordinates setting -- unnormalized is default - ImageTexDesc.flags = (hSampler->isNormalizedCoords()) - ? CU_TRSF_NORMALIZED_COORDINATES - : ImageTexDesc.flags; - - // CUDA default promotes 8-bit and 16-bit integers to float between [0,1] - // This flag prevents this behaviour. - ImageTexDesc.flags |= CU_TRSF_READ_AS_INTEGER; - - CUtexObject Texture; - UR_CHECK_ERROR( - cuTexObjectCreate(&Texture, &ResourceDesc, &ImageTexDesc, nullptr)); - *phRetImage = (ur_exp_image_handle_t)Texture; - } catch (ur_result_t Err) { - return Err; - } catch (...) { - return UR_RESULT_ERROR_UNKNOWN; - } - return UR_RESULT_SUCCESS; -} - -UR_APIEXPORT ur_result_t UR_APICALL urUSMPitchedAllocExp( - ur_context_handle_t hContext, ur_device_handle_t hDevice, - const ur_usm_desc_t *pUSMDesc, ur_usm_pool_handle_t pool, - size_t widthInBytes, size_t height, size_t elementSizeBytes, void **ppMem, - size_t *pResultPitch) { - UR_ASSERT((hContext->getDevice()->get() == hDevice->get()), - UR_RESULT_ERROR_INVALID_CONTEXT); - std::ignore = pUSMDesc; - std::ignore = pool; - - UR_ASSERT((widthInBytes > 0), UR_RESULT_ERROR_INVALID_VALUE); - UR_ASSERT((height > 0), UR_RESULT_ERROR_INVALID_VALUE); - UR_ASSERT((elementSizeBytes > 0), UR_RESULT_ERROR_INVALID_VALUE); - - // elementSizeBytes can only take on values of 4, 8, or 16; - // smaller element sizes must be promoted to 4.
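The allocation below relies on cuMemAllocPitch, which may widen each row to an alignment-friendly pitch that the caller must honour when indexing. A small sketch with hypothetical sizes (assumes an initialized context):

```cpp
#include <cstdio>
#include <cuda.h>

void pitchedAllocSketch() {
  CUdeviceptr Ptr = 0;
  size_t Pitch = 0;
  // 1000-byte-wide rows, 64 rows, 4-byte elements (hypothetical sizes).
  if (cuMemAllocPitch(&Ptr, &Pitch, 1000, 64, 4) == CUDA_SUCCESS) {
    std::printf("requested 1000 B/row, got pitch %zu B\n", Pitch);
    // Row r, byte column c lives at Ptr + r * Pitch + c.
    cuMemFree(Ptr);
  }
}
```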
- if (elementSizeBytes < 4) { - elementSizeBytes = 4; - } - UR_ASSERT((elementSizeBytes == 4 || elementSizeBytes == 8 || - elementSizeBytes == 16), - UR_RESULT_ERROR_INVALID_VALUE); - ur_result_t Result = UR_RESULT_SUCCESS; - try { - ScopedContext Active(hDevice->getContext()); - UR_CHECK_ERROR(cuMemAllocPitch((CUdeviceptr *)ppMem, pResultPitch, - widthInBytes, height, elementSizeBytes)); - } catch (ur_result_t error) { - Result = error; - } catch (...) { - return UR_RESULT_ERROR_UNKNOWN; - } - - return Result; -} - -UR_APIEXPORT ur_result_t UR_APICALL -urBindlessImagesUnsampledImageHandleDestroyExp(ur_context_handle_t hContext, - ur_device_handle_t hDevice, - ur_exp_image_handle_t hImage) { - UR_ASSERT((hContext->getDevice()->get() == hDevice->get()), - UR_RESULT_ERROR_INVALID_CONTEXT); - - UR_CHECK_ERROR(cuSurfObjectDestroy((CUsurfObject)hImage)); - return UR_RESULT_SUCCESS; -} - -UR_APIEXPORT ur_result_t UR_APICALL -urBindlessImagesSampledImageHandleDestroyExp(ur_context_handle_t hContext, - ur_device_handle_t hDevice, - ur_exp_image_handle_t hImage) { - UR_ASSERT((hContext->getDevice()->get() == hDevice->get()), - UR_RESULT_ERROR_INVALID_CONTEXT); - - UR_CHECK_ERROR(cuTexObjectDestroy((CUtexObject)hImage)); - return UR_RESULT_SUCCESS; -} - -UR_APIEXPORT ur_result_t UR_APICALL urBindlessImagesImageAllocateExp( - ur_context_handle_t hContext, ur_device_handle_t hDevice, - const ur_image_format_t *pImageFormat, const ur_image_desc_t *pImageDesc, - ur_exp_image_mem_handle_t *phImageMem) { - UR_ASSERT((hContext->getDevice()->get() == hDevice->get()), - UR_RESULT_ERROR_INVALID_CONTEXT); - - // Populate descriptor - CUDA_ARRAY3D_DESCRIPTOR array_desc = {}; - - UR_CHECK_ERROR(urCalculateNumChannels(pImageFormat->channelOrder, - &array_desc.NumChannels)); - - UR_CHECK_ERROR(urToCudaImageChannelFormat(pImageFormat->channelType, - pImageFormat->channelOrder, - &array_desc.Format, nullptr)); - - array_desc.Flags = 0; // No flags required - array_desc.Width = pImageDesc->width; - if (pImageDesc->type == UR_MEM_TYPE_IMAGE1D) { - array_desc.Height = 0; - array_desc.Depth = 0; - } else if (pImageDesc->type == UR_MEM_TYPE_IMAGE2D) { - array_desc.Height = pImageDesc->height; - array_desc.Depth = 0; - } else if (pImageDesc->type == UR_MEM_TYPE_IMAGE3D) { - array_desc.Height = pImageDesc->height; - array_desc.Depth = pImageDesc->depth; - } - - ScopedContext Active(hDevice->getContext()); - - // Allocate a cuArray - if (pImageDesc->numMipLevel == 1) { - CUarray ImageArray; - - try { - UR_CHECK_ERROR(cuArray3DCreate(&ImageArray, &array_desc)); - *phImageMem = (ur_exp_image_mem_handle_t)ImageArray; - } catch (ur_result_t Err) { - cuArrayDestroy(ImageArray); - return Err; - } catch (...) { - cuArrayDestroy(ImageArray); - return UR_RESULT_ERROR_UNKNOWN; - } - } else // Allocate a cuMipmappedArray - { - CUmipmappedArray mip_array; - array_desc.Flags = CUDA_ARRAY3D_SURFACE_LDST; - - try { - UR_CHECK_ERROR(cuMipmappedArrayCreate(&mip_array, &array_desc, - pImageDesc->numMipLevel)); - *phImageMem = (ur_exp_image_mem_handle_t)mip_array; - } catch (ur_result_t Err) { - cuMipmappedArrayDestroy(mip_array); - return Err; - } catch (...) 
{ - cuMipmappedArrayDestroy(mip_array); - return UR_RESULT_ERROR_UNKNOWN; - } - } - - return UR_RESULT_SUCCESS; -} - -UR_APIEXPORT ur_result_t UR_APICALL urBindlessImagesImageFreeExp( - ur_context_handle_t hContext, ur_device_handle_t hDevice, - ur_exp_image_mem_handle_t hImageMem) { - UR_ASSERT((hContext->getDevice()->get() == hDevice->get()), - UR_RESULT_ERROR_INVALID_CONTEXT); - - ScopedContext Active(hDevice->getContext()); - try { - UR_CHECK_ERROR(cuArrayDestroy((CUarray)hImageMem)); - } catch (ur_result_t Err) { - return Err; - } catch (...) { - return UR_RESULT_ERROR_UNKNOWN; - } - return UR_RESULT_SUCCESS; -} - -UR_APIEXPORT ur_result_t UR_APICALL urBindlessImagesUnsampledImageCreateExp( - ur_context_handle_t hContext, ur_device_handle_t hDevice, - ur_exp_image_mem_handle_t hImageMem, const ur_image_format_t *pImageFormat, - const ur_image_desc_t *pImageDesc, ur_mem_handle_t *phMem, - ur_exp_image_handle_t *phImage) { - UR_ASSERT((hContext->getDevice()->get() == hDevice->get()), - UR_RESULT_ERROR_INVALID_CONTEXT); - - unsigned int NumChannels = 0; - UR_CHECK_ERROR( - urCalculateNumChannels(pImageFormat->channelOrder, &NumChannels)); - - CUarray_format format; - size_t PixelSizeBytes; - UR_CHECK_ERROR(urToCudaImageChannelFormat(pImageFormat->channelType, - pImageFormat->channelOrder, &format, - &PixelSizeBytes)); - - try { - - ScopedContext Active(hDevice->getContext()); - - CUDA_RESOURCE_DESC image_res_desc = {}; - - // We have a CUarray - image_res_desc.resType = CU_RESOURCE_TYPE_ARRAY; - image_res_desc.res.array.hArray = (CUarray)hImageMem; - - // We create surfaces in the unsampled images case as it conforms to how - // CUDA deals with unsampled images. - CUsurfObject surface; - UR_CHECK_ERROR(cuSurfObjectCreate(&surface, &image_res_desc)); - *phImage = (ur_exp_image_handle_t)surface; - - auto urMemObj = std::unique_ptr(new ur_mem_handle_t_{ - hContext, (CUarray)hImageMem, surface, pImageDesc->type}); - - if (urMemObj == nullptr) { - return UR_RESULT_ERROR_OUT_OF_HOST_MEMORY; - } - - *phMem = urMemObj.release(); - - } catch (ur_result_t Err) { - return Err; - } catch (...) { - return UR_RESULT_ERROR_UNKNOWN; - } - - return UR_RESULT_SUCCESS; -} - -UR_APIEXPORT ur_result_t UR_APICALL urBindlessImagesSampledImageCreateExp( - ur_context_handle_t hContext, ur_device_handle_t hDevice, - ur_exp_image_mem_handle_t hImageMem, const ur_image_format_t *pImageFormat, - const ur_image_desc_t *pImageDesc, ur_sampler_handle_t hSampler, - ur_mem_handle_t *phMem, ur_exp_image_handle_t *phImage) { - UR_ASSERT((hContext->getDevice()->get() == hDevice->get()), - UR_RESULT_ERROR_INVALID_CONTEXT); - - ScopedContext Active(hDevice->getContext()); - - unsigned int NumChannels = 0; - UR_CHECK_ERROR( - urCalculateNumChannels(pImageFormat->channelOrder, &NumChannels)); - - CUarray_format format; - size_t PixelSizeBytes; - UR_CHECK_ERROR(urToCudaImageChannelFormat(pImageFormat->channelType, - pImageFormat->channelOrder, &format, - &PixelSizeBytes)); - - try { - CUDA_RESOURCE_DESC image_res_desc = {}; - - unsigned int mem_type; - // If this function doesn't return successfully, we assume that hImageMem is - // a CUarray or CUmipmappedArray. If this function returns successfully, we - // check whether hImageMem is device memory (even managed memory isn't - // considered shared). 
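The comment above describes a three-way classification; a minimal standalone sketch of just that decision, assuming only the CUDA driver API (the enum and helper names are illustrative, not adapter code):

    #include <cuda.h>

    enum class ImageMemKind { ArrayOrMipmap, DeviceUsm, Unsupported };

    // Mirror the adapter's probe: if CUDA cannot report a memory type for the
    // handle, treat it as a (mipmapped) array; device memory means a USM
    // pointer; anything else is rejected.
    inline ImageMemKind classifyImageMem(void *Handle) {
      unsigned int MemType = 0;
      CUresult Err = cuPointerGetAttribute(
          &MemType, CU_POINTER_ATTRIBUTE_MEMORY_TYPE, (CUdeviceptr)Handle);
      if (Err != CUDA_SUCCESS)
        return ImageMemKind::ArrayOrMipmap;
      return MemType == CU_MEMORYTYPE_DEVICE ? ImageMemKind::DeviceUsm
                                             : ImageMemKind::Unsupported;
    }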
- CUresult Err = cuPointerGetAttribute( - &mem_type, CU_POINTER_ATTRIBUTE_MEMORY_TYPE, (CUdeviceptr)hImageMem); - if (Err != CUDA_SUCCESS) { - // We have a CUarray - if (pImageDesc->numMipLevel == 1) { - image_res_desc.resType = CU_RESOURCE_TYPE_ARRAY; - image_res_desc.res.array.hArray = (CUarray)hImageMem; - } - // We have a CUmipmappedArray - else { - image_res_desc.resType = CU_RESOURCE_TYPE_MIPMAPPED_ARRAY; - image_res_desc.res.mipmap.hMipmappedArray = (CUmipmappedArray)hImageMem; - } - } else if (mem_type == CU_MEMORYTYPE_DEVICE) { - // We have a USM pointer - if (pImageDesc->type == UR_MEM_TYPE_IMAGE1D) { - image_res_desc.resType = CU_RESOURCE_TYPE_LINEAR; - image_res_desc.res.linear.devPtr = (CUdeviceptr)hImageMem; - image_res_desc.res.linear.format = format; - image_res_desc.res.linear.numChannels = NumChannels; - image_res_desc.res.linear.sizeInBytes = - pImageDesc->width * PixelSizeBytes; - } else if (pImageDesc->type == UR_MEM_TYPE_IMAGE2D) { - image_res_desc.resType = CU_RESOURCE_TYPE_PITCH2D; - image_res_desc.res.pitch2D.devPtr = (CUdeviceptr)hImageMem; - image_res_desc.res.pitch2D.format = format; - image_res_desc.res.pitch2D.numChannels = NumChannels; - image_res_desc.res.pitch2D.width = pImageDesc->width; - image_res_desc.res.pitch2D.height = pImageDesc->height; - image_res_desc.res.pitch2D.pitchInBytes = pImageDesc->rowPitch; - } else if (pImageDesc->type == UR_MEM_TYPE_IMAGE3D) { - // Cannot create 3D image from USM. - return UR_RESULT_ERROR_INVALID_VALUE; - } - } else { - // Unknown image memory type. - return UR_RESULT_ERROR_INVALID_VALUE; - } - - UR_CHECK_ERROR( - urTextureCreate(hSampler, pImageDesc, image_res_desc, phImage)); - - auto urMemObj = std::unique_ptr(new ur_mem_handle_t_{ - hContext, (CUarray)hImageMem, (CUtexObject)*phImage, hSampler, - pImageDesc->type}); - - if (urMemObj == nullptr) { - return UR_RESULT_ERROR_OUT_OF_HOST_MEMORY; - } - - *phMem = urMemObj.release(); - } catch (ur_result_t Err) { - return Err; - } catch (...) { - return UR_RESULT_ERROR_UNKNOWN; - } - - return UR_RESULT_SUCCESS; -} - -UR_APIEXPORT ur_result_t UR_APICALL urBindlessImagesImageCopyExp( - ur_queue_handle_t hQueue, void *pDst, void *pSrc, - const ur_image_format_t *pImageFormat, const ur_image_desc_t *pImageDesc, - ur_exp_image_copy_flags_t imageCopyFlags, ur_rect_offset_t srcOffset, - ur_rect_offset_t dstOffset, ur_rect_region_t copyExtent, - ur_rect_region_t hostExtent, uint32_t numEventsInWaitList, - const ur_event_handle_t *phEventWaitList, ur_event_handle_t *phEvent) { - UR_ASSERT((imageCopyFlags == UR_EXP_IMAGE_COPY_FLAG_HOST_TO_DEVICE || - imageCopyFlags == UR_EXP_IMAGE_COPY_FLAG_DEVICE_TO_HOST || - imageCopyFlags == UR_EXP_IMAGE_COPY_FLAG_DEVICE_TO_DEVICE), - UR_RESULT_ERROR_INVALID_VALUE); - - unsigned int NumChannels = 0; - size_t PixelSizeBytes = 0; - - UR_CHECK_ERROR( - urCalculateNumChannels(pImageFormat->channelOrder, &NumChannels)); - - // We need to get this now in bytes for calculating the total image size - // later. - UR_CHECK_ERROR(urToCudaImageChannelFormat(pImageFormat->channelType, - pImageFormat->channelOrder, nullptr, - &PixelSizeBytes)); - - try { - ScopedContext Active(hQueue->getContext()); - CUstream Stream = hQueue->getNextTransferStream(); - enqueueEventsWait(hQueue, Stream, numEventsInWaitList, phEventWaitList); - // We have to use a different copy function for each image dimensionality. 
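The dimensionality dispatch that follows pairs 1D copies with cuMemcpyHtoAAsync/cuMemcpyAtoHAsync, 2D with cuMemcpy2DAsync, and 3D with cuMemcpy3DAsync. A distilled sketch of the 2D host-to-array case (the helper and its parameters are illustrative; the adapter builds the descriptor in place):

    #include <cuda.h>

    // Enqueue a tightly packed host buffer into a CUarray. Width/Height are
    // given in pixels; CUDA wants row widths and pitches in bytes.
    inline CUresult copyHostToArray2D(CUarray Dst, const void *Src,
                                      size_t WidthPx, size_t HeightPx,
                                      size_t PixelSizeBytes, CUstream Stream) {
      CUDA_MEMCPY2D Desc = {};
      Desc.srcMemoryType = CU_MEMORYTYPE_HOST;
      Desc.srcHost = Src;
      Desc.srcPitch = WidthPx * PixelSizeBytes; // packed host rows
      Desc.dstMemoryType = CU_MEMORYTYPE_ARRAY;
      Desc.dstArray = Dst;
      Desc.WidthInBytes = WidthPx * PixelSizeBytes;
      Desc.Height = HeightPx;
      return cuMemcpy2DAsync(&Desc, Stream);
    }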
- - if (imageCopyFlags == UR_EXP_IMAGE_COPY_FLAG_HOST_TO_DEVICE) { - if (pImageDesc->type == UR_MEM_TYPE_IMAGE1D) { - size_t CopyExtentBytes = PixelSizeBytes * copyExtent.width; - char *SrcWithOffset = (char *)pSrc + (srcOffset.x * PixelSizeBytes); - UR_CHECK_ERROR( - cuMemcpyHtoAAsync((CUarray)pDst, dstOffset.x * PixelSizeBytes, - (void *)SrcWithOffset, CopyExtentBytes, Stream)); - } else if (pImageDesc->type == UR_MEM_TYPE_IMAGE2D) { - CUDA_MEMCPY2D cpy_desc = {}; - cpy_desc.srcMemoryType = CUmemorytype_enum::CU_MEMORYTYPE_HOST; - cpy_desc.srcHost = pSrc; - cpy_desc.srcXInBytes = srcOffset.x * PixelSizeBytes; - cpy_desc.srcY = srcOffset.y; - cpy_desc.dstXInBytes = dstOffset.x * PixelSizeBytes; - cpy_desc.dstY = dstOffset.y; - cpy_desc.srcPitch = hostExtent.width * PixelSizeBytes; - if (pImageDesc->rowPitch == 0) { - cpy_desc.dstMemoryType = CUmemorytype_enum::CU_MEMORYTYPE_ARRAY; - cpy_desc.dstArray = (CUarray)pDst; - } else { - // Pitched memory - cpy_desc.dstMemoryType = CUmemorytype_enum::CU_MEMORYTYPE_DEVICE; - cpy_desc.dstDevice = (CUdeviceptr)pDst; - cpy_desc.dstPitch = pImageDesc->rowPitch; - } - cpy_desc.WidthInBytes = PixelSizeBytes * copyExtent.width; - cpy_desc.Height = copyExtent.height; - UR_CHECK_ERROR(cuMemcpy2DAsync(&cpy_desc, Stream)); - } else if (pImageDesc->type == UR_MEM_TYPE_IMAGE3D) { - CUDA_MEMCPY3D cpy_desc = {}; - cpy_desc.srcXInBytes = srcOffset.x * PixelSizeBytes; - cpy_desc.srcY = srcOffset.y; - cpy_desc.srcZ = srcOffset.z; - cpy_desc.dstXInBytes = dstOffset.x * PixelSizeBytes; - cpy_desc.dstY = dstOffset.y; - cpy_desc.dstZ = dstOffset.z; - cpy_desc.srcMemoryType = CUmemorytype_enum::CU_MEMORYTYPE_HOST; - cpy_desc.srcHost = pSrc; - cpy_desc.srcPitch = hostExtent.width * PixelSizeBytes; - cpy_desc.srcHeight = hostExtent.height; - cpy_desc.dstMemoryType = CUmemorytype_enum::CU_MEMORYTYPE_ARRAY; - cpy_desc.dstArray = (CUarray)pDst; - cpy_desc.WidthInBytes = PixelSizeBytes * copyExtent.width; - cpy_desc.Height = copyExtent.height; - cpy_desc.Depth = copyExtent.depth; - UR_CHECK_ERROR(cuMemcpy3DAsync(&cpy_desc, Stream)); - } - } else if (imageCopyFlags == UR_EXP_IMAGE_COPY_FLAG_DEVICE_TO_HOST) { - if (pImageDesc->type == UR_MEM_TYPE_IMAGE1D) { - size_t CopyExtentBytes = PixelSizeBytes * copyExtent.width; - size_t src_offset_bytes = PixelSizeBytes * srcOffset.x; - void *dst_with_offset = - (void *)((char *)pDst + (PixelSizeBytes * dstOffset.x)); - UR_CHECK_ERROR(cuMemcpyAtoHAsync(dst_with_offset, (CUarray)pSrc, - src_offset_bytes, CopyExtentBytes, - Stream)); - } else if (pImageDesc->type == UR_MEM_TYPE_IMAGE2D) { - CUDA_MEMCPY2D cpy_desc = {}; - cpy_desc.srcXInBytes = srcOffset.x; - cpy_desc.srcY = srcOffset.y; - cpy_desc.dstXInBytes = dstOffset.x; - cpy_desc.dstY = dstOffset.y; - cpy_desc.dstMemoryType = CUmemorytype_enum::CU_MEMORYTYPE_HOST; - cpy_desc.dstHost = pDst; - if (pImageDesc->rowPitch == 0) { - cpy_desc.srcMemoryType = CUmemorytype_enum::CU_MEMORYTYPE_ARRAY; - cpy_desc.srcArray = (CUarray)pSrc; - } else { - // Pitched memory - cpy_desc.srcMemoryType = CUmemorytype_enum::CU_MEMORYTYPE_DEVICE; - cpy_desc.srcPitch = pImageDesc->rowPitch; - cpy_desc.srcDevice = (CUdeviceptr)pSrc; - } - cpy_desc.WidthInBytes = PixelSizeBytes * copyExtent.width; - cpy_desc.Height = copyExtent.height; - UR_CHECK_ERROR(cuMemcpy2DAsync(&cpy_desc, Stream)); - } else if (pImageDesc->type == UR_MEM_TYPE_IMAGE3D) { - CUDA_MEMCPY3D cpy_desc = {}; - cpy_desc.srcXInBytes = srcOffset.x; - cpy_desc.srcY = srcOffset.y; - cpy_desc.srcZ = srcOffset.z; - cpy_desc.dstXInBytes = dstOffset.x; 
- cpy_desc.dstY = dstOffset.y; - cpy_desc.dstZ = dstOffset.z; - cpy_desc.srcMemoryType = CUmemorytype_enum::CU_MEMORYTYPE_ARRAY; - cpy_desc.srcArray = (CUarray)pSrc; - cpy_desc.dstMemoryType = CUmemorytype_enum::CU_MEMORYTYPE_HOST; - cpy_desc.dstHost = pDst; - cpy_desc.WidthInBytes = PixelSizeBytes * copyExtent.width; - cpy_desc.Height = copyExtent.height; - cpy_desc.Depth = copyExtent.depth; - UR_CHECK_ERROR(cuMemcpy3DAsync(&cpy_desc, Stream)); - } - } else { - /// imageCopyFlags == UR_EXP_IMAGE_COPY_FLAG_DEVICE_TO_DEVICE - /// TODO: implement device to device copy - return UR_RESULT_ERROR_UNSUPPORTED_FEATURE; - } - if (phEvent) { - auto NewEvent = ur_event_handle_t_::makeNative(UR_COMMAND_MEM_IMAGE_COPY, - hQueue, Stream); - NewEvent->record(); - *phEvent = NewEvent; - } - } catch (ur_result_t Err) { - return Err; - } catch (...) { - return UR_RESULT_ERROR_UNKNOWN; - } - - return UR_RESULT_SUCCESS; -} - -UR_APIEXPORT ur_result_t UR_APICALL urBindlessImagesImageGetInfoExp( - ur_exp_image_mem_handle_t hImageMem, ur_image_info_t propName, - void *pPropValue, size_t *pPropSizeRet) { - - CUDA_ARRAY3D_DESCRIPTOR ArrayDesc; - UR_CHECK_ERROR(cuArray3DGetDescriptor(&ArrayDesc, (CUarray)hImageMem)); - switch (propName) { - case UR_IMAGE_INFO_WIDTH: - if (pPropValue) { - *(size_t *)pPropValue = ArrayDesc.Width; - } - if (pPropSizeRet) { - *pPropSizeRet = sizeof(size_t); - } - return UR_RESULT_SUCCESS; - case UR_IMAGE_INFO_HEIGHT: - if (pPropValue) { - *(size_t *)pPropValue = ArrayDesc.Height; - } - if (pPropSizeRet) { - *pPropSizeRet = sizeof(size_t); - } - return UR_RESULT_SUCCESS; - case UR_IMAGE_INFO_DEPTH: - if (pPropValue) { - *(size_t *)pPropValue = ArrayDesc.Depth; - } - if (pPropSizeRet) { - *pPropSizeRet = sizeof(size_t); - } - return UR_RESULT_SUCCESS; - case UR_IMAGE_INFO_FORMAT: - ur_image_channel_type_t ChannelType; - ur_image_channel_order_t ChannelOrder; - UR_CHECK_ERROR(cudaToUrImageChannelFormat(ArrayDesc.Format, &ChannelType)); - // CUDA does not have a notion of channel "order" in the same way that - // SYCL 1.2.1 does. - switch (ArrayDesc.NumChannels) { - case 1: - ChannelOrder = UR_IMAGE_CHANNEL_ORDER_R; - break; - case 2: - ChannelOrder = UR_IMAGE_CHANNEL_ORDER_RG; - break; - case 4: - ChannelOrder = UR_IMAGE_CHANNEL_ORDER_RGBA; - break; - } - if (pPropValue) { - ((ur_image_format_t *)pPropValue)->channelType = ChannelType; - ((ur_image_format_t *)pPropValue)->channelOrder = ChannelOrder; - } - if (pPropSizeRet) { - *pPropSizeRet = sizeof(ur_image_format_t); - } - return UR_RESULT_SUCCESS; - default: - return UR_RESULT_ERROR_INVALID_VALUE; - } -} - -UR_APIEXPORT ur_result_t UR_APICALL urBindlessImagesMipmapGetLevelExp( - ur_context_handle_t hContext, ur_device_handle_t hDevice, - ur_exp_image_mem_handle_t hImageMem, uint32_t mipmapLevel, - ur_exp_image_mem_handle_t *phImageMem) { - UR_ASSERT((hContext->getDevice()->get() == hDevice->get()), - UR_RESULT_ERROR_INVALID_CONTEXT); - - try { - ScopedContext Active(hDevice->getContext()); - CUarray ImageArray; - UR_CHECK_ERROR(cuMipmappedArrayGetLevel( - &ImageArray, (CUmipmappedArray)hImageMem, mipmapLevel)); - *phImageMem = (ur_exp_image_mem_handle_t)ImageArray; - } catch (ur_result_t Err) { - return Err; - } catch (...)
{ - return UR_RESULT_ERROR_UNKNOWN; - } - - return UR_RESULT_SUCCESS; -} - -UR_APIEXPORT ur_result_t UR_APICALL urBindlessImagesMipmapFreeExp( - ur_context_handle_t hContext, ur_device_handle_t hDevice, - ur_exp_image_mem_handle_t hMem) { - UR_ASSERT((hContext->getDevice()->get() == hDevice->get()), - UR_RESULT_ERROR_INVALID_CONTEXT); - - ScopedContext Active(hDevice->getContext()); - try { - UR_CHECK_ERROR(cuMipmappedArrayDestroy((CUmipmappedArray)hMem)); - } catch (ur_result_t Err) { - return Err; - } catch (...) { - return UR_RESULT_ERROR_UNKNOWN; - } - return UR_RESULT_SUCCESS; -} - -UR_APIEXPORT ur_result_t UR_APICALL urBindlessImagesImportOpaqueFDExp( - ur_context_handle_t hContext, ur_device_handle_t hDevice, size_t size, - ur_exp_interop_mem_desc_t *pInteropMemDesc, - ur_exp_interop_mem_handle_t *phInteropMem) { - UR_ASSERT((hContext->getDevice()->get() == hDevice->get()), - UR_RESULT_ERROR_INVALID_CONTEXT); - - try { - ScopedContext Active(hDevice->getContext()); - - CUDA_EXTERNAL_MEMORY_HANDLE_DESC extMemDesc = {}; - extMemDesc.size = size; - - void *pNext = const_cast<void *>(pInteropMemDesc->pNext); - while (pNext != nullptr) { - const ur_base_desc_t *BaseDesc = - reinterpret_cast<const ur_base_desc_t *>(pNext); - if (BaseDesc->stype == UR_STRUCTURE_TYPE_EXP_FILE_DESCRIPTOR) { - const ur_exp_file_descriptor_t *FileDescriptor = - reinterpret_cast<const ur_exp_file_descriptor_t *>(pNext); - - extMemDesc.handle.fd = FileDescriptor->fd; - extMemDesc.type = CU_EXTERNAL_MEMORY_HANDLE_TYPE_OPAQUE_FD; - } else if (BaseDesc->stype == UR_STRUCTURE_TYPE_EXP_WIN32_HANDLE) { - return UR_RESULT_ERROR_UNSUPPORTED_FEATURE; - } - pNext = const_cast<void *>(BaseDesc->pNext); - } - - CUexternalMemory extMem; - UR_CHECK_ERROR(cuImportExternalMemory(&extMem, &extMemDesc)); - *phInteropMem = (ur_exp_interop_mem_handle_t)extMem; - - } catch (ur_result_t Err) { - return Err; - } catch (...) { - return UR_RESULT_ERROR_UNKNOWN; - } - - return UR_RESULT_SUCCESS; -} - -UR_APIEXPORT ur_result_t UR_APICALL urBindlessImagesMapExternalArrayExp( - ur_context_handle_t hContext, ur_device_handle_t hDevice, - const ur_image_format_t *pImageFormat, const ur_image_desc_t *pImageDesc, - ur_exp_interop_mem_handle_t hInteropMem, - ur_exp_image_mem_handle_t *phImageMem) { - UR_ASSERT((hContext->getDevice()->get() == hDevice->get()), - UR_RESULT_ERROR_INVALID_CONTEXT); - - unsigned int NumChannels = 0; - UR_CHECK_ERROR( - urCalculateNumChannels(pImageFormat->channelOrder, &NumChannels)); - - CUarray_format format; - UR_CHECK_ERROR(urToCudaImageChannelFormat( - pImageFormat->channelType, pImageFormat->channelOrder, &format, nullptr)); - - try { - ScopedContext Active(hDevice->getContext()); - - CUDA_ARRAY3D_DESCRIPTOR ArrayDesc = {}; - ArrayDesc.Width = pImageDesc->width; - ArrayDesc.Height = pImageDesc->height; - ArrayDesc.Depth = pImageDesc->depth; - ArrayDesc.NumChannels = NumChannels; - ArrayDesc.Format = format; - - CUDA_EXTERNAL_MEMORY_MIPMAPPED_ARRAY_DESC mipmapDesc = {}; - mipmapDesc.numLevels = 1; - mipmapDesc.arrayDesc = ArrayDesc; - - CUmipmappedArray memMipMap; - UR_CHECK_ERROR(cuExternalMemoryGetMappedMipmappedArray( - &memMipMap, (CUexternalMemory)hInteropMem, &mipmapDesc)); - - CUarray memArray; - UR_CHECK_ERROR(cuMipmappedArrayGetLevel(&memArray, memMipMap, 0)); - - *phImageMem = (ur_exp_image_mem_handle_t)memArray; - - } catch (ur_result_t Err) { - return Err; - } catch (...)
{ - return UR_RESULT_ERROR_UNKNOWN; - } - return UR_RESULT_SUCCESS; -} - -UR_APIEXPORT ur_result_t UR_APICALL urBindlessImagesReleaseInteropExp( - ur_context_handle_t hContext, ur_device_handle_t hDevice, - ur_exp_interop_mem_handle_t hInteropMem) { - UR_ASSERT((hContext->getDevice()->get() == hDevice->get()), - UR_RESULT_ERROR_INVALID_CONTEXT); - - try { - ScopedContext Active(hDevice->getContext()); - UR_CHECK_ERROR(cuDestroyExternalMemory((CUexternalMemory)hInteropMem)); - } catch (ur_result_t Err) { - return Err; - } catch (...) { - return UR_RESULT_ERROR_UNKNOWN; - } - return UR_RESULT_SUCCESS; -} - -UR_APIEXPORT ur_result_t UR_APICALL -urBindlessImagesImportExternalSemaphoreOpaqueFDExp( - ur_context_handle_t hContext, ur_device_handle_t hDevice, - ur_exp_interop_semaphore_desc_t *pInteropSemaphoreDesc, - ur_exp_interop_semaphore_handle_t *phInteropSemaphoreHandle) { - UR_ASSERT((hContext->getDevice()->get() == hDevice->get()), - UR_RESULT_ERROR_INVALID_CONTEXT); - - try { - ScopedContext Active(hDevice->getContext()); - - CUDA_EXTERNAL_SEMAPHORE_HANDLE_DESC extSemDesc = {}; - - void *pNext = const_cast<void *>(pInteropSemaphoreDesc->pNext); - while (pNext != nullptr) { - const ur_base_desc_t *BaseDesc = - reinterpret_cast<const ur_base_desc_t *>(pNext); - if (BaseDesc->stype == UR_STRUCTURE_TYPE_EXP_FILE_DESCRIPTOR) { - const ur_exp_file_descriptor_t *FileDescriptor = - reinterpret_cast<const ur_exp_file_descriptor_t *>(pNext); - - extSemDesc.handle.fd = FileDescriptor->fd; - extSemDesc.type = CU_EXTERNAL_SEMAPHORE_HANDLE_TYPE_OPAQUE_FD; - } else if (BaseDesc->stype == UR_STRUCTURE_TYPE_EXP_WIN32_HANDLE) { - return UR_RESULT_ERROR_UNSUPPORTED_FEATURE; - } - pNext = const_cast<void *>(BaseDesc->pNext); - } - - CUexternalSemaphore semaphore; - UR_CHECK_ERROR(cuImportExternalSemaphore(&semaphore, &extSemDesc)); - - *phInteropSemaphoreHandle = (ur_exp_interop_semaphore_handle_t)semaphore; - } catch (ur_result_t Err) { - return Err; - } catch (...) { - return UR_RESULT_ERROR_UNKNOWN; - } - return UR_RESULT_SUCCESS; -} - -UR_APIEXPORT ur_result_t UR_APICALL urBindlessImagesDestroyExternalSemaphoreExp( - ur_context_handle_t hContext, ur_device_handle_t hDevice, - ur_exp_interop_semaphore_handle_t hInteropSemaphore) { - UR_ASSERT((hContext->getDevice()->get() == hDevice->get()), - UR_RESULT_ERROR_INVALID_CONTEXT); - - try { - ScopedContext Active(hDevice->getContext()); - UR_CHECK_ERROR( - cuDestroyExternalSemaphore((CUexternalSemaphore)hInteropSemaphore)); - } catch (ur_result_t Err) { - return Err; - } catch (...) { - return UR_RESULT_ERROR_UNKNOWN; - } - return UR_RESULT_SUCCESS; -} - -UR_APIEXPORT ur_result_t UR_APICALL urBindlessImagesWaitExternalSemaphoreExp( - ur_queue_handle_t hQueue, ur_exp_interop_semaphore_handle_t hSemaphore, - uint32_t numEventsInWaitList, const ur_event_handle_t *phEventWaitList, - ur_event_handle_t *phEvent) { - - try { - ScopedContext Active(hQueue->getContext()); - CUstream Stream = hQueue->getNextTransferStream(); - - enqueueEventsWait(hQueue, Stream, numEventsInWaitList, phEventWaitList); - - CUDA_EXTERNAL_SEMAPHORE_WAIT_PARAMS SemWaitParams = {}; - - // Wait for one external semaphore - UR_CHECK_ERROR(cuWaitExternalSemaphoresAsync( - (CUexternalSemaphore *)&hSemaphore, &SemWaitParams, 1 /* numExtSems */, - Stream)); - - if (phEvent) { - auto NewEvent = ur_event_handle_t_::makeNative( - UR_COMMAND_INTEROP_SEMAPHORE_WAIT_EXP, hQueue, Stream); - NewEvent->record(); - *phEvent = NewEvent; - } - } catch (ur_result_t Err) { - return Err; - } catch (...)
{ - return UR_RESULT_ERROR_UNKNOWN; - } - return UR_RESULT_SUCCESS; -} - -UR_APIEXPORT ur_result_t UR_APICALL urBindlessImagesSignalExternalSemaphoreExp( - ur_queue_handle_t hQueue, ur_exp_interop_semaphore_handle_t hSemaphore, - uint32_t numEventsInWaitList, const ur_event_handle_t *phEventWaitList, - ur_event_handle_t *phEvent) { - - try { - ScopedContext Active(hQueue->getContext()); - CUstream Stream = hQueue->getNextTransferStream(); - - enqueueEventsWait(hQueue, Stream, numEventsInWaitList, phEventWaitList); - - CUDA_EXTERNAL_SEMAPHORE_SIGNAL_PARAMS SemSignalParams = {}; - - // Signal one external semaphore - UR_CHECK_ERROR(cuSignalExternalSemaphoresAsync( - (CUexternalSemaphore *)&hSemaphore, &SemSignalParams, - 1 /* numExtSems */, Stream)); - - if (phEvent) { - auto NewEvent = ur_event_handle_t_::makeNative( - UR_COMMAND_INTEROP_SEMAPHORE_SIGNAL_EXP, hQueue, Stream); - NewEvent->record(); - *phEvent = NewEvent; - } - } catch (ur_result_t Err) { - return Err; - } catch (...) { - return UR_RESULT_ERROR_UNKNOWN; - } - return UR_RESULT_SUCCESS; -} diff --git a/sycl/plugins/unified_runtime/ur/adapters/cuda/image.hpp b/sycl/plugins/unified_runtime/ur/adapters/cuda/image.hpp deleted file mode 100644 index af1d9fd194893..0000000000000 --- a/sycl/plugins/unified_runtime/ur/adapters/cuda/image.hpp +++ /dev/null @@ -1,32 +0,0 @@ -//===--------- image.hpp - CUDA Adapter -----------------------------------===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// -#pragma once - -#include -#include -#include - -#include "common.hpp" -ur_result_t urCalculateNumChannels(ur_image_channel_order_t order, - unsigned int *num_channels); - -ur_result_t -urToCudaImageChannelFormat(ur_image_channel_type_t image_channel_type, - ur_image_channel_order_t image_channel_order, - CUarray_format *return_cuda_format, - size_t *return_pixel_types_size_bytes); - -ur_result_t -cudaToUrImageChannelFormat(CUarray_format cuda_format, - ur_image_channel_type_t *return_image_channel_type); - -ur_result_t urTextureCreate(ur_context_handle_t hContext, - ur_sampler_desc_t SamplerDesc, - const ur_image_desc_t *pImageDesc, - CUDA_RESOURCE_DESC ResourceDesc, - ur_exp_image_handle_t *phRetImage); diff --git a/sycl/plugins/unified_runtime/ur/adapters/cuda/kernel.cpp b/sycl/plugins/unified_runtime/ur/adapters/cuda/kernel.cpp deleted file mode 100644 index e2fa09e4ddece..0000000000000 --- a/sycl/plugins/unified_runtime/ur/adapters/cuda/kernel.cpp +++ /dev/null @@ -1,374 +0,0 @@ -//===--------- kernel.cpp - CUDA Adapter ----------------------------------===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. 
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// - -#include "kernel.hpp" -#include "memory.hpp" -#include "sampler.hpp" - -UR_APIEXPORT ur_result_t UR_APICALL -urKernelCreate(ur_program_handle_t hProgram, const char *pKernelName, - ur_kernel_handle_t *phKernel) { - ur_result_t Result = UR_RESULT_SUCCESS; - std::unique_ptr Kernel{nullptr}; - - try { - ScopedContext Active(hProgram->getContext()); - - CUfunction CuFunc; - CUresult FunctionResult = - cuModuleGetFunction(&CuFunc, hProgram->get(), pKernelName); - - // We can't add this as a generic mapping in UR_CHECK_ERROR since cuda's - // NOT_FOUND error applies to more than just functions. - if (FunctionResult == CUDA_ERROR_NOT_FOUND) { - throw UR_RESULT_ERROR_INVALID_KERNEL_NAME; - } else { - UR_CHECK_ERROR(FunctionResult); - } - - std::string KernelNameWithOffset = - std::string(pKernelName) + "_with_offset"; - CUfunction CuFuncWithOffsetParam; - CUresult OffsetRes = cuModuleGetFunction( - &CuFuncWithOffsetParam, hProgram->get(), KernelNameWithOffset.c_str()); - - // If there is no kernel with global offset parameter we mark it as missing - if (OffsetRes == CUDA_ERROR_NOT_FOUND) { - CuFuncWithOffsetParam = nullptr; - } else { - UR_CHECK_ERROR(OffsetRes); - } - Kernel = std::unique_ptr( - new ur_kernel_handle_t_{CuFunc, CuFuncWithOffsetParam, pKernelName, - hProgram, hProgram->getContext()}); - } catch (ur_result_t Err) { - Result = Err; - } catch (...) { - Result = UR_RESULT_ERROR_OUT_OF_HOST_MEMORY; - } - - *phKernel = Kernel.release(); - return Result; -} - -UR_APIEXPORT ur_result_t UR_APICALL -urKernelGetGroupInfo(ur_kernel_handle_t hKernel, ur_device_handle_t hDevice, - ur_kernel_group_info_t propName, size_t propSize, - void *pPropValue, size_t *pPropSizeRet) { - UrReturnHelper ReturnValue(propSize, pPropValue, pPropSizeRet); - - switch (propName) { - case UR_KERNEL_GROUP_INFO_GLOBAL_WORK_SIZE: { - size_t GlobalWorkSize[3] = {0, 0, 0}; - - int MaxBlockDimX{0}, MaxBlockDimY{0}, MaxBlockDimZ{0}; - UR_CHECK_ERROR(cuDeviceGetAttribute( - &MaxBlockDimX, CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_X, hDevice->get())); - UR_CHECK_ERROR(cuDeviceGetAttribute( - &MaxBlockDimY, CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_Y, hDevice->get())); - UR_CHECK_ERROR(cuDeviceGetAttribute( - &MaxBlockDimZ, CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_Z, hDevice->get())); - - int MaxGridDimX{0}, MaxGridDimY{0}, MaxGridDimZ{0}; - UR_CHECK_ERROR(cuDeviceGetAttribute( - &MaxGridDimX, CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_X, hDevice->get())); - UR_CHECK_ERROR(cuDeviceGetAttribute( - &MaxGridDimY, CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_Y, hDevice->get())); - UR_CHECK_ERROR(cuDeviceGetAttribute( - &MaxGridDimZ, CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_Z, hDevice->get())); - - GlobalWorkSize[0] = MaxBlockDimX * MaxGridDimX; - GlobalWorkSize[1] = MaxBlockDimY * MaxGridDimY; - GlobalWorkSize[2] = MaxBlockDimZ * MaxGridDimZ; - return ReturnValue(GlobalWorkSize, 3); - } - case UR_KERNEL_GROUP_INFO_WORK_GROUP_SIZE: { - int MaxThreads = 0; - UR_CHECK_ERROR(cuFuncGetAttribute( - &MaxThreads, CU_FUNC_ATTRIBUTE_MAX_THREADS_PER_BLOCK, hKernel->get())); - return ReturnValue(size_t(MaxThreads)); - } - case UR_KERNEL_GROUP_INFO_COMPILE_WORK_GROUP_SIZE: { - size_t GroupSize[3] = {0, 0, 0}; - const auto &ReqdWGSizeMDMap = - hKernel->get_program()->KernelReqdWorkGroupSizeMD; - const auto ReqdWGSizeMD = ReqdWGSizeMDMap.find(hKernel->getName()); - if (ReqdWGSizeMD != ReqdWGSizeMDMap.end()) { - const auto ReqdWGSize = 
ReqdWGSizeMD->second; - GroupSize[0] = std::get<0>(ReqdWGSize); - GroupSize[1] = std::get<1>(ReqdWGSize); - GroupSize[2] = std::get<2>(ReqdWGSize); - } - return ReturnValue(GroupSize, 3); - } - case UR_KERNEL_GROUP_INFO_LOCAL_MEM_SIZE: { - // OpenCL LOCAL == CUDA SHARED - int Bytes = 0; - UR_CHECK_ERROR(cuFuncGetAttribute( - &Bytes, CU_FUNC_ATTRIBUTE_SHARED_SIZE_BYTES, hKernel->get())); - return ReturnValue(uint64_t(Bytes)); - } - case UR_KERNEL_GROUP_INFO_PREFERRED_WORK_GROUP_SIZE_MULTIPLE: { - // Work groups should be multiples of the warp size - int WarpSize = 0; - UR_CHECK_ERROR(cuDeviceGetAttribute( - &WarpSize, CU_DEVICE_ATTRIBUTE_WARP_SIZE, hDevice->get())); - return ReturnValue(static_cast(WarpSize)); - } - case UR_KERNEL_GROUP_INFO_PRIVATE_MEM_SIZE: { - // OpenCL PRIVATE == CUDA LOCAL - int Bytes = 0; - UR_CHECK_ERROR(cuFuncGetAttribute( - &Bytes, CU_FUNC_ATTRIBUTE_LOCAL_SIZE_BYTES, hKernel->get())); - return ReturnValue(uint64_t(Bytes)); - } - default: - break; - } - - return UR_RESULT_ERROR_INVALID_ENUMERATION; -} - -UR_APIEXPORT ur_result_t UR_APICALL urKernelRetain(ur_kernel_handle_t hKernel) { - UR_ASSERT(hKernel->getReferenceCount() > 0u, UR_RESULT_ERROR_INVALID_KERNEL); - - hKernel->incrementReferenceCount(); - return UR_RESULT_SUCCESS; -} - -UR_APIEXPORT ur_result_t UR_APICALL -urKernelRelease(ur_kernel_handle_t hKernel) { - // double delete or someone is messing with the ref count. - // either way, cannot safely proceed. - UR_ASSERT(hKernel->getReferenceCount() != 0, UR_RESULT_ERROR_INVALID_KERNEL); - - // decrement ref count. If it is 0, delete the program. - if (hKernel->decrementReferenceCount() == 0) { - // no internal cuda resources to clean up. Just delete it. - delete hKernel; - return UR_RESULT_SUCCESS; - } - - return UR_RESULT_SUCCESS; -} - -// TODO(ur): Not implemented on cuda atm. Also, need to add tests for this -// feature. 
-UR_APIEXPORT ur_result_t UR_APICALL urKernelGetNativeHandle( - ur_kernel_handle_t hKernel, ur_native_handle_t *phNativeKernel) { - (void)hKernel; - (void)phNativeKernel; - - return UR_RESULT_ERROR_UNSUPPORTED_FEATURE; -} - -UR_APIEXPORT ur_result_t UR_APICALL urKernelSetArgValue( - ur_kernel_handle_t hKernel, uint32_t argIndex, size_t argSize, - const ur_kernel_arg_value_properties_t *pProperties, - const void *pArgValue) { - std::ignore = pProperties; - UR_ASSERT(argSize, UR_RESULT_ERROR_INVALID_KERNEL_ARGUMENT_SIZE); - - ur_result_t Result = UR_RESULT_SUCCESS; - try { - hKernel->setKernelArg(argIndex, argSize, pArgValue); - } catch (ur_result_t Err) { - Result = Err; - } - return Result; -} - -UR_APIEXPORT ur_result_t UR_APICALL urKernelSetArgLocal( - ur_kernel_handle_t hKernel, uint32_t argIndex, size_t argSize, - const ur_kernel_arg_local_properties_t *pProperties) { - std::ignore = pProperties; - UR_ASSERT(argSize, UR_RESULT_ERROR_INVALID_KERNEL_ARGUMENT_SIZE); - - ur_result_t Result = UR_RESULT_SUCCESS; - try { - hKernel->setKernelLocalArg(argIndex, argSize); - } catch (ur_result_t Err) { - Result = Err; - } - return Result; -} - -UR_APIEXPORT ur_result_t UR_APICALL urKernelGetInfo(ur_kernel_handle_t hKernel, - ur_kernel_info_t propName, - size_t propSize, - void *pKernelInfo, - size_t *pPropSizeRet) { - UrReturnHelper ReturnValue(propSize, pKernelInfo, pPropSizeRet); - - switch (propName) { - case UR_KERNEL_INFO_FUNCTION_NAME: - return ReturnValue(hKernel->getName()); - case UR_KERNEL_INFO_NUM_ARGS: - return ReturnValue(hKernel->getNumArgs()); - case UR_KERNEL_INFO_REFERENCE_COUNT: - return ReturnValue(hKernel->getReferenceCount()); - case UR_KERNEL_INFO_CONTEXT: - return ReturnValue(hKernel->getContext()); - case UR_KERNEL_INFO_PROGRAM: - return ReturnValue(hKernel->get_program()); - case UR_KERNEL_INFO_ATTRIBUTES: - return ReturnValue(""); - case UR_KERNEL_INFO_NUM_REGS: { - int NumRegs = 0; - UR_CHECK_ERROR(cuFuncGetAttribute(&NumRegs, CU_FUNC_ATTRIBUTE_NUM_REGS, - hKernel->get())); - return ReturnValue(static_cast(NumRegs)); - } - default: - break; - } - - return UR_RESULT_ERROR_INVALID_ENUMERATION; -} - -UR_APIEXPORT ur_result_t UR_APICALL -urKernelGetSubGroupInfo(ur_kernel_handle_t hKernel, ur_device_handle_t hDevice, - ur_kernel_sub_group_info_t propName, size_t propSize, - void *pPropValue, size_t *pPropSizeRet) { - UrReturnHelper ReturnValue(propSize, pPropValue, pPropSizeRet); - switch (propName) { - case UR_KERNEL_SUB_GROUP_INFO_MAX_SUB_GROUP_SIZE: { - // Sub-group size is equivalent to warp size - int WarpSize = 0; - UR_CHECK_ERROR(cuDeviceGetAttribute( - &WarpSize, CU_DEVICE_ATTRIBUTE_WARP_SIZE, hDevice->get())); - return ReturnValue(static_cast(WarpSize)); - } - case UR_KERNEL_SUB_GROUP_INFO_MAX_NUM_SUB_GROUPS: { - // Number of sub-groups = max block size / warp size + possible remainder - int MaxThreads = 0; - UR_CHECK_ERROR(cuFuncGetAttribute( - &MaxThreads, CU_FUNC_ATTRIBUTE_MAX_THREADS_PER_BLOCK, hKernel->get())); - int WarpSize = 0; - urKernelGetSubGroupInfo(hKernel, hDevice, - UR_KERNEL_SUB_GROUP_INFO_MAX_SUB_GROUP_SIZE, - sizeof(uint32_t), &WarpSize, nullptr); - int MaxWarps = (MaxThreads + WarpSize - 1) / WarpSize; - return ReturnValue(static_cast(MaxWarps)); - } - case UR_KERNEL_SUB_GROUP_INFO_COMPILE_NUM_SUB_GROUPS: { - // Return value of 0 => not specified - // TODO: Revisit if PTX is generated for compile-time work-group sizes - return ReturnValue(0); - } - case UR_KERNEL_SUB_GROUP_INFO_SUB_GROUP_SIZE_INTEL: { - // Return value of 0 => unspecified or 
"auto" sub-group size - // Correct for now, since warp size may be read from special register - // TODO: Return warp size once default is primary sub-group size - // TODO: Revisit if we can recover [[sub_group_size]] attribute from PTX - return ReturnValue(0); - } - default: - break; - } - - return UR_RESULT_ERROR_INVALID_ENUMERATION; -} - -UR_APIEXPORT ur_result_t UR_APICALL -urKernelSetArgPointer(ur_kernel_handle_t hKernel, uint32_t argIndex, - const ur_kernel_arg_pointer_properties_t *pProperties, - const void *pArgValue) { - std::ignore = pProperties; - hKernel->setKernelArg(argIndex, sizeof(pArgValue), pArgValue); - return UR_RESULT_SUCCESS; -} - -UR_APIEXPORT ur_result_t UR_APICALL -urKernelSetArgMemObj(ur_kernel_handle_t hKernel, uint32_t argIndex, - const ur_kernel_arg_mem_obj_properties_t *Properties, - ur_mem_handle_t hArgValue) { - std::ignore = Properties; - - // Below sets kernel arg when zero-sized buffers are handled. - // In such case the corresponding memory is null. - if (hArgValue == nullptr) { - hKernel->setKernelArg(argIndex, 0, nullptr); - return UR_RESULT_SUCCESS; - } - - ur_result_t Result = UR_RESULT_SUCCESS; - try { - if (hArgValue->MemType == ur_mem_handle_t_::Type::Surface) { - CUDA_ARRAY3D_DESCRIPTOR arrayDesc; - UR_CHECK_ERROR(cuArray3DGetDescriptor( - &arrayDesc, hArgValue->Mem.SurfaceMem.getArray())); - if (arrayDesc.Format != CU_AD_FORMAT_UNSIGNED_INT32 && - arrayDesc.Format != CU_AD_FORMAT_SIGNED_INT32 && - arrayDesc.Format != CU_AD_FORMAT_HALF && - arrayDesc.Format != CU_AD_FORMAT_FLOAT) { - setErrorMessage("PI CUDA kernels only support images with channel " - "types int32, uint32, float, and half.", - UR_RESULT_ERROR_ADAPTER_SPECIFIC); - return UR_RESULT_ERROR_ADAPTER_SPECIFIC; - } - CUsurfObject CuSurf = hArgValue->Mem.SurfaceMem.getSurface(); - hKernel->setKernelArg(argIndex, sizeof(CuSurf), (void *)&CuSurf); - } else { - CUdeviceptr CuPtr = hArgValue->Mem.BufferMem.get(); - hKernel->setKernelArg(argIndex, sizeof(CUdeviceptr), (void *)&CuPtr); - } - } catch (ur_result_t Err) { - Result = Err; - } - return Result; -} - -// A NOP for the CUDA backend -UR_APIEXPORT ur_result_t UR_APICALL urKernelSetExecInfo( - ur_kernel_handle_t hKernel, ur_kernel_exec_info_t propName, size_t propSize, - const ur_kernel_exec_info_properties_t *pProperties, - const void *pPropValue) { - std::ignore = hKernel; - std::ignore = propSize; - std::ignore = pPropValue; - std::ignore = pProperties; - - switch (propName) { - case UR_KERNEL_EXEC_INFO_USM_INDIRECT_ACCESS: - case UR_KERNEL_EXEC_INFO_USM_PTRS: - case UR_KERNEL_EXEC_INFO_CACHE_CONFIG: - return UR_RESULT_SUCCESS; - default: - return UR_RESULT_ERROR_INVALID_ENUMERATION; - } -} - -UR_APIEXPORT ur_result_t UR_APICALL urKernelCreateWithNativeHandle( - ur_native_handle_t hNativeKernel, ur_context_handle_t hContext, - ur_program_handle_t hProgram, - const ur_kernel_native_properties_t *pProperties, - ur_kernel_handle_t *phKernel) { - std::ignore = hNativeKernel; - std::ignore = hContext; - std::ignore = hProgram; - std::ignore = pProperties; - std::ignore = phKernel; - return UR_RESULT_ERROR_UNSUPPORTED_FEATURE; -} - -UR_APIEXPORT ur_result_t UR_APICALL -urKernelSetArgSampler(ur_kernel_handle_t hKernel, uint32_t argIndex, - const ur_kernel_arg_sampler_properties_t *pProperties, - ur_sampler_handle_t hArgValue) { - std::ignore = pProperties; - - ur_result_t Result = UR_RESULT_SUCCESS; - try { - uint32_t SamplerProps = hArgValue->Props; - hKernel->setKernelArg(argIndex, sizeof(uint32_t), (void *)&SamplerProps); - } catch 
(ur_result_t Err) { - Result = Err; - } - return Result; -} diff --git a/sycl/plugins/unified_runtime/ur/adapters/cuda/kernel.hpp b/sycl/plugins/unified_runtime/ur/adapters/cuda/kernel.hpp deleted file mode 100644 index ea4e565d3f44b..0000000000000 --- a/sycl/plugins/unified_runtime/ur/adapters/cuda/kernel.hpp +++ /dev/null @@ -1,206 +0,0 @@ -//===--------- kernel.hpp - CUDA Adapter ----------------------------------===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// -#pragma once - -#include <ur_api.h> -#include <ur/ur.hpp> - -#include <array> -#include <atomic> -#include <cassert> -#include <numeric> - -#include "program.hpp" - -/// Implementation of a UR Kernel for CUDA -/// -/// UR Kernels are used to set kernel arguments, -/// creating a state on the Kernel object for a given -/// invocation. This is not the case for CUfunction objects, -/// which are simply passed together with the arguments on the invocation. -/// The UR Kernel implementation for CUDA stores the list of arguments, -/// argument sizes, and offsets to emulate the interface of UR Kernel, -/// saving the arguments for the later dispatch. -/// Note that in the UR API, local memory is specified as a size per -/// individual argument, but in CUDA only the total usage of shared -/// memory is required since it is not passed as a parameter. -/// A compiler pass converts the UR API local memory model into the -/// CUDA shared model. This object simply calculates the total of -/// shared memory, and the initial offsets of each parameter. -struct ur_kernel_handle_t_ { - using native_type = CUfunction; - - native_type Function; - native_type FunctionWithOffsetParam; - std::string Name; - ur_context_handle_t Context; - ur_program_handle_t Program; - std::atomic_uint32_t RefCount; - - static constexpr uint32_t ReqdThreadsPerBlockDimensions = 3u; - size_t ReqdThreadsPerBlock[ReqdThreadsPerBlockDimensions]; - int RegsPerThread{0}; - - /// Structure that holds the arguments to the kernel. - /// Note each argument size is known, since it comes - /// from the kernel signature. - /// This is not something that can be queried from the CUDA API - /// so there is a hard-coded size (\ref MaxParamBytes) - /// and a storage. - struct arguments { - static constexpr size_t MaxParamBytes = 4000u; - using args_t = std::array<char, MaxParamBytes>; - using args_size_t = std::vector<size_t>; - using args_index_t = std::vector<void *>; - args_t Storage; - args_size_t ParamSizes; - args_index_t Indices; - args_size_t OffsetPerIndex; - - std::uint32_t ImplicitOffsetArgs[3] = {0, 0, 0}; - - arguments() { - // Place the implicit offset index at the end of the indices collection - Indices.emplace_back(&ImplicitOffsetArgs); - } - - /// Add an argument to the kernel. - /// If the argument existed before, it is replaced. - /// Otherwise, it is added. - /// Gaps are filled with empty arguments. - /// Implicit offset argument is kept at the back of the indices collection.
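Concretely, the bookkeeping described above produces the following state for two hypothetical arguments, an 8-byte pointer at index 0 followed by a 4-byte int at index 1 (illustrative values only):

    // After addArg(0, 8, &Ptr) and addArg(1, 4, &Val):
    //   ParamSizes     = { 8, 4 }
    //   Storage        = [ Ptr bytes 0..7 ][ Val bytes 8..11 ] ...
    //   Indices        = { &Storage[0], &Storage[8], &ImplicitOffsetArgs }
    //   OffsetPerIndex = { 0, 0 }   // no local (shared) memory requested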
- void addArg(size_t Index, size_t Size, const void *Arg, - size_t LocalSize = 0) { - if (Index + 2 > Indices.size()) { - // Move implicit offset argument index with the end - Indices.resize(Index + 2, Indices.back()); - // Ensure enough space for the new argument - ParamSizes.resize(Index + 1); - OffsetPerIndex.resize(Index + 1); - } - ParamSizes[Index] = Size; - // calculate the insertion point on the array - size_t InsertPos = std::accumulate(std::begin(ParamSizes), - std::begin(ParamSizes) + Index, 0); - // Update the stored value for the argument - std::memcpy(&Storage[InsertPos], Arg, Size); - Indices[Index] = &Storage[InsertPos]; - OffsetPerIndex[Index] = LocalSize; - } - - void addLocalArg(size_t Index, size_t Size) { - size_t LocalOffset = this->getLocalSize(); - - // maximum required alignment is the size of the largest vector type - const size_t MaxAlignment = sizeof(double) * 16; - - // for arguments smaller than the maximum alignment simply align to the - // size of the argument - const size_t Alignment = std::min(MaxAlignment, Size); - - // align the argument - size_t AlignedLocalOffset = LocalOffset; - size_t Pad = LocalOffset % Alignment; - if (Pad != 0) { - AlignedLocalOffset += Alignment - Pad; - } - - addArg(Index, sizeof(size_t), (const void *)&(AlignedLocalOffset), - Size + (AlignedLocalOffset - LocalOffset)); - } - - void setImplicitOffset(size_t Size, std::uint32_t *ImplicitOffset) { - assert(Size == sizeof(std::uint32_t) * 3); - std::memcpy(ImplicitOffsetArgs, ImplicitOffset, Size); - } - - void clearLocalSize() { - std::fill(std::begin(OffsetPerIndex), std::end(OffsetPerIndex), 0); - } - - const args_index_t &getIndices() const noexcept { return Indices; } - - uint32_t getLocalSize() const { - return std::accumulate(std::begin(OffsetPerIndex), - std::end(OffsetPerIndex), 0); - } - } Args; - - ur_kernel_handle_t_(CUfunction Func, CUfunction FuncWithOffsetParam, - const char *Name, ur_program_handle_t Program, - ur_context_handle_t Context) - : Function{Func}, FunctionWithOffsetParam{FuncWithOffsetParam}, - Name{Name}, Context{Context}, Program{Program}, RefCount{1} { - urProgramRetain(Program); - urContextRetain(Context); - /// Note: this code assumes that there is only one device per context - ur_result_t RetError = urKernelGetGroupInfo( - this, Context->getDevice(), - UR_KERNEL_GROUP_INFO_COMPILE_WORK_GROUP_SIZE, - sizeof(ReqdThreadsPerBlock), ReqdThreadsPerBlock, nullptr); - (void)RetError; - assert(RetError == UR_RESULT_SUCCESS); - UR_CHECK_ERROR( - cuFuncGetAttribute(&RegsPerThread, CU_FUNC_ATTRIBUTE_NUM_REGS, Func)); - } - - ~ur_kernel_handle_t_() { - urProgramRelease(Program); - urContextRelease(Context); - } - - ur_program_handle_t get_program() const noexcept { return Program; } - - uint32_t incrementReferenceCount() noexcept { return ++RefCount; } - - uint32_t decrementReferenceCount() noexcept { return --RefCount; } - - uint32_t getReferenceCount() const noexcept { return RefCount; } - - native_type get() const noexcept { return Function; }; - - native_type get_with_offset_parameter() const noexcept { - return FunctionWithOffsetParam; - }; - - bool has_with_offset_parameter() const noexcept { - return FunctionWithOffsetParam != nullptr; - } - - ur_context_handle_t getContext() const noexcept { return Context; }; - - const char *getName() const noexcept { return Name.c_str(); } - - /// Get the number of kernel arguments, excluding the implicit global offset. 
- /// Note this only returns the current known number of arguments, not the - /// real one required by the kernel, since this cannot be queried from - /// the CUDA Driver API - size_t getNumArgs() const noexcept { return Args.Indices.size() - 1; } - - void setKernelArg(int Index, size_t Size, const void *Arg) { - Args.addArg(Index, Size, Arg); - } - - void setKernelLocalArg(int Index, size_t Size) { - Args.addLocalArg(Index, Size); - } - - void setImplicitOffsetArg(size_t Size, std::uint32_t *ImplicitOffset) { - return Args.setImplicitOffset(Size, ImplicitOffset); - } - - const arguments::args_index_t &getArgIndices() const { - return Args.getIndices(); - } - - uint32_t getLocalSize() const noexcept { return Args.getLocalSize(); } - - void clearLocalSize() { Args.clearLocalSize(); } - - size_t getRegsPerThread() const noexcept { return RegsPerThread; }; -}; diff --git a/sycl/plugins/unified_runtime/ur/adapters/cuda/memory.cpp b/sycl/plugins/unified_runtime/ur/adapters/cuda/memory.cpp deleted file mode 100644 index d51ba73d67e2a..0000000000000 --- a/sycl/plugins/unified_runtime/ur/adapters/cuda/memory.cpp +++ /dev/null @@ -1,479 +0,0 @@ -//===--------- memory.cpp - CUDA Adapter ----------------------------------===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// - -#include - -#include "common.hpp" -#include "context.hpp" -#include "memory.hpp" - -/// Creates a UR Memory object using a CUDA memory allocation. -/// Can trigger a manual copy depending on the mode. -/// \TODO Implement USE_HOST_PTR using cuHostRegister - See #9789 -/// -UR_APIEXPORT ur_result_t UR_APICALL urMemBufferCreate( - ur_context_handle_t hContext, ur_mem_flags_t flags, size_t size, - const ur_buffer_properties_t *pProperties, ur_mem_handle_t *phBuffer) { - // Validate flags - if (flags & - (UR_MEM_FLAG_USE_HOST_POINTER | UR_MEM_FLAG_ALLOC_COPY_HOST_POINTER)) { - UR_ASSERT(pProperties && pProperties->pHost, - UR_RESULT_ERROR_INVALID_HOST_PTR); - } - UR_ASSERT(size != 0, UR_RESULT_ERROR_INVALID_BUFFER_SIZE); - - // Currently, USE_HOST_PTR is not implemented using host register - // since this triggers a weird segfault after program ends. - // Setting this constant to true enables testing that behavior. - const bool EnableUseHostPtr = false; - const bool PerformInitialCopy = - (flags & UR_MEM_FLAG_ALLOC_COPY_HOST_POINTER) || - ((flags & UR_MEM_FLAG_USE_HOST_POINTER) && !EnableUseHostPtr); - ur_result_t Result = UR_RESULT_SUCCESS; - ur_mem_handle_t MemObj = nullptr; - - try { - ScopedContext Active(hContext); - CUdeviceptr Ptr = 0; - auto HostPtr = pProperties ? 
pProperties->pHost : nullptr; - - ur_mem_handle_t_::MemImpl::BufferMem::AllocMode AllocMode = - ur_mem_handle_t_::MemImpl::BufferMem::AllocMode::Classic; - - if ((flags & UR_MEM_FLAG_USE_HOST_POINTER) && EnableUseHostPtr) { - UR_CHECK_ERROR( - cuMemHostRegister(HostPtr, size, CU_MEMHOSTREGISTER_DEVICEMAP)); - UR_CHECK_ERROR(cuMemHostGetDevicePointer(&Ptr, HostPtr, 0)); - AllocMode = ur_mem_handle_t_::MemImpl::BufferMem::AllocMode::UseHostPtr; - } else if (flags & UR_MEM_FLAG_ALLOC_HOST_POINTER) { - UR_CHECK_ERROR(cuMemAllocHost(&HostPtr, size)); - UR_CHECK_ERROR(cuMemHostGetDevicePointer(&Ptr, HostPtr, 0)); - AllocMode = ur_mem_handle_t_::MemImpl::BufferMem::AllocMode::AllocHostPtr; - } else { - UR_CHECK_ERROR(cuMemAlloc(&Ptr, size)); - if (flags & UR_MEM_FLAG_ALLOC_COPY_HOST_POINTER) { - AllocMode = ur_mem_handle_t_::MemImpl::BufferMem::AllocMode::CopyIn; - } - } - - ur_mem_handle_t parentBuffer = nullptr; - - auto URMemObj = std::unique_ptr(new ur_mem_handle_t_{ - hContext, parentBuffer, flags, AllocMode, Ptr, HostPtr, size}); - if (URMemObj != nullptr) { - MemObj = URMemObj.release(); - if (PerformInitialCopy) { - // Operates on the default stream of the current CUDA context. - UR_CHECK_ERROR(cuMemcpyHtoD(Ptr, HostPtr, size)); - // Synchronize with default stream implicitly used by cuMemcpyHtoD - // to make buffer data available on device before any other UR call - // uses it. - CUstream defaultStream = 0; - UR_CHECK_ERROR(cuStreamSynchronize(defaultStream)); - } - } else { - Result = UR_RESULT_ERROR_OUT_OF_HOST_MEMORY; - } - } catch (ur_result_t Err) { - Result = Err; - } catch (...) { - Result = UR_RESULT_ERROR_OUT_OF_RESOURCES; - } - - *phBuffer = MemObj; - - return Result; -} - -UR_APIEXPORT ur_result_t UR_APICALL urMemRetain(ur_mem_handle_t hMem) { - UR_ASSERT(hMem->getReferenceCount() > 0, UR_RESULT_ERROR_INVALID_MEM_OBJECT); - hMem->incrementReferenceCount(); - return UR_RESULT_SUCCESS; -} - -/// Decreases the reference count of the Mem object. -/// If this is zero, calls the relevant CUDA Free function -/// \return UR_RESULT_SUCCESS unless deallocation error -UR_APIEXPORT ur_result_t UR_APICALL urMemRelease(ur_mem_handle_t hMem) { - ur_result_t Result = UR_RESULT_SUCCESS; - - try { - - // Do nothing if there are other references - if (hMem->decrementReferenceCount() > 0) { - return UR_RESULT_SUCCESS; - } - - // make sure hMem is released in case checkErrorUR throws - std::unique_ptr MemObjPtr(hMem); - - if (hMem->isSubBuffer()) { - return UR_RESULT_SUCCESS; - } - - ScopedContext Active(MemObjPtr->getContext()); - - if (hMem->MemType == ur_mem_handle_t_::Type::Buffer) { - switch (MemObjPtr->Mem.BufferMem.MemAllocMode) { - case ur_mem_handle_t_::MemImpl::BufferMem::AllocMode::CopyIn: - case ur_mem_handle_t_::MemImpl::BufferMem::AllocMode::Classic: - UR_CHECK_ERROR(cuMemFree(MemObjPtr->Mem.BufferMem.Ptr)); - break; - case ur_mem_handle_t_::MemImpl::BufferMem::AllocMode::UseHostPtr: - UR_CHECK_ERROR(cuMemHostUnregister(MemObjPtr->Mem.BufferMem.HostPtr)); - break; - case ur_mem_handle_t_::MemImpl::BufferMem::AllocMode::AllocHostPtr: - UR_CHECK_ERROR(cuMemFreeHost(MemObjPtr->Mem.BufferMem.HostPtr)); - }; - } else if (hMem->MemType == ur_mem_handle_t_::Type::Surface) { - UR_CHECK_ERROR( - cuSurfObjectDestroy(MemObjPtr->Mem.SurfaceMem.getSurface())); - UR_CHECK_ERROR(cuArrayDestroy(MemObjPtr->Mem.SurfaceMem.getArray())); - } - - } catch (ur_result_t Err) { - Result = Err; - } catch (...) 
{ - Result = UR_RESULT_ERROR_OUT_OF_RESOURCES; - } - - if (Result != UR_RESULT_SUCCESS) { - // A reported CUDA error is either an implementation error or an asynchronous - // CUDA error for which it is unclear if the function that reported it - // succeeded or not. Either way, the state of the program is compromised and - // likely unrecoverable. - detail::ur::die("Unrecoverable program state reached in urMemRelease"); - } - - return UR_RESULT_SUCCESS; -} - -/// Gets the native CUDA handle of a UR mem object -/// -/// \param[in] hMem The UR mem to get the native CUDA object of. -/// \param[out] phNativeMem Set to the native handle of the UR mem object. -/// -/// \return UR_RESULT_SUCCESS -UR_APIEXPORT ur_result_t UR_APICALL -urMemGetNativeHandle(ur_mem_handle_t hMem, ur_native_handle_t *phNativeMem) { - *phNativeMem = - reinterpret_cast<ur_native_handle_t>(hMem->Mem.BufferMem.get()); - return UR_RESULT_SUCCESS; -} - -UR_APIEXPORT ur_result_t UR_APICALL urMemGetInfo(ur_mem_handle_t hMemory, - ur_mem_info_t MemInfoType, - size_t propSize, - void *pMemInfo, - size_t *pPropSizeRet) { - UR_ASSERT(hMemory->isBuffer(), UR_RESULT_ERROR_INVALID_MEM_OBJECT); - - UrReturnHelper ReturnValue(propSize, pMemInfo, pPropSizeRet); - - ScopedContext Active(hMemory->getContext()); - - switch (MemInfoType) { - case UR_MEM_INFO_SIZE: { - try { - size_t AllocSize = 0; - UR_CHECK_ERROR(cuMemGetAddressRange(nullptr, &AllocSize, - hMemory->Mem.BufferMem.Ptr)); - return ReturnValue(AllocSize); - } catch (ur_result_t Err) { - return Err; - } catch (...) { - return UR_RESULT_ERROR_UNKNOWN; - } - } - case UR_MEM_INFO_CONTEXT: { - return ReturnValue(hMemory->getContext()); - } - - default: - return UR_RESULT_ERROR_INVALID_ENUMERATION; - } -} - -UR_APIEXPORT ur_result_t UR_APICALL urMemBufferCreateWithNativeHandle( - ur_native_handle_t, ur_context_handle_t, const ur_mem_native_properties_t *, - ur_mem_handle_t *) { - return UR_RESULT_ERROR_UNSUPPORTED_FEATURE; -} - -UR_APIEXPORT ur_result_t UR_APICALL urMemImageCreateWithNativeHandle( - ur_native_handle_t, ur_context_handle_t, const ur_image_format_t *, - const ur_image_desc_t *, const ur_mem_native_properties_t *, - ur_mem_handle_t *) { - return UR_RESULT_ERROR_UNSUPPORTED_FEATURE; -} - -/// \TODO Not implemented -UR_APIEXPORT ur_result_t UR_APICALL urMemImageCreate( - ur_context_handle_t hContext, ur_mem_flags_t flags, - const ur_image_format_t *pImageFormat, const ur_image_desc_t *pImageDesc, - void *pHost, ur_mem_handle_t *phMem) { - if (flags & - (UR_MEM_FLAG_ALLOC_COPY_HOST_POINTER | UR_MEM_FLAG_USE_HOST_POINTER)) { - UR_ASSERT(pHost, UR_RESULT_ERROR_INVALID_HOST_PTR); - } - const bool PerformInitialCopy = - (flags & UR_MEM_FLAG_ALLOC_COPY_HOST_POINTER) || - ((flags & UR_MEM_FLAG_USE_HOST_POINTER)); - - UR_ASSERT(pImageDesc->stype == UR_STRUCTURE_TYPE_IMAGE_DESC, - UR_RESULT_ERROR_INVALID_IMAGE_FORMAT_DESCRIPTOR); - UR_ASSERT(pImageDesc->type <= UR_MEM_TYPE_IMAGE1D_BUFFER, - UR_RESULT_ERROR_INVALID_IMAGE_FORMAT_DESCRIPTOR); - UR_ASSERT(pImageDesc->numMipLevel == 0, - UR_RESULT_ERROR_INVALID_IMAGE_FORMAT_DESCRIPTOR); - UR_ASSERT(pImageDesc->numSamples == 0, - UR_RESULT_ERROR_INVALID_IMAGE_FORMAT_DESCRIPTOR); - if (!pHost) { - UR_ASSERT(pImageDesc->rowPitch == 0, - UR_RESULT_ERROR_INVALID_IMAGE_FORMAT_DESCRIPTOR); - UR_ASSERT(pImageDesc->slicePitch == 0, - UR_RESULT_ERROR_INVALID_IMAGE_FORMAT_DESCRIPTOR); - } - - ur_result_t Result = UR_RESULT_SUCCESS; - - // We only support RGBA channel order - // TODO: check SYCL CTS and spec.
May also have to support BGRA - UR_ASSERT(pImageFormat->channelOrder == UR_IMAGE_CHANNEL_ORDER_RGBA, - UR_RESULT_ERROR_UNSUPPORTED_ENUMERATION); - - // We have to use cuArray3DCreate, which has some caveats. The height and - // depth parameters must be set to 0 to produce 1D or 2D arrays. pImageDesc - // gives a minimum value of 1, so we need to convert those values. - CUDA_ARRAY3D_DESCRIPTOR ArrayDesc; - ArrayDesc.NumChannels = 4; // Only support 4 channel image - ArrayDesc.Flags = 0; // No flags required - ArrayDesc.Width = pImageDesc->width; - if (pImageDesc->type == UR_MEM_TYPE_IMAGE1D) { - ArrayDesc.Height = 0; - ArrayDesc.Depth = 0; - } else if (pImageDesc->type == UR_MEM_TYPE_IMAGE2D) { - ArrayDesc.Height = pImageDesc->height; - ArrayDesc.Depth = 0; - } else if (pImageDesc->type == UR_MEM_TYPE_IMAGE3D) { - ArrayDesc.Height = pImageDesc->height; - ArrayDesc.Depth = pImageDesc->depth; - } - - // We need to get this now in bytes for calculating the total image size later - size_t PixelTypeSizeBytes; - - switch (pImageFormat->channelType) { - case UR_IMAGE_CHANNEL_TYPE_UNORM_INT8: - case UR_IMAGE_CHANNEL_TYPE_UNSIGNED_INT8: - ArrayDesc.Format = CU_AD_FORMAT_UNSIGNED_INT8; - PixelTypeSizeBytes = 1; - break; - case UR_IMAGE_CHANNEL_TYPE_SIGNED_INT8: - ArrayDesc.Format = CU_AD_FORMAT_SIGNED_INT8; - PixelTypeSizeBytes = 1; - break; - case UR_IMAGE_CHANNEL_TYPE_UNORM_INT16: - case UR_IMAGE_CHANNEL_TYPE_UNSIGNED_INT16: - ArrayDesc.Format = CU_AD_FORMAT_UNSIGNED_INT16; - PixelTypeSizeBytes = 2; - break; - case UR_IMAGE_CHANNEL_TYPE_SIGNED_INT16: - ArrayDesc.Format = CU_AD_FORMAT_SIGNED_INT16; - PixelTypeSizeBytes = 2; - break; - case UR_IMAGE_CHANNEL_TYPE_HALF_FLOAT: - ArrayDesc.Format = CU_AD_FORMAT_HALF; - PixelTypeSizeBytes = 2; - break; - case UR_IMAGE_CHANNEL_TYPE_UNSIGNED_INT32: - ArrayDesc.Format = CU_AD_FORMAT_UNSIGNED_INT32; - PixelTypeSizeBytes = 4; - break; - case UR_IMAGE_CHANNEL_TYPE_SIGNED_INT32: - ArrayDesc.Format = CU_AD_FORMAT_SIGNED_INT32; - PixelTypeSizeBytes = 4; - break; - case UR_IMAGE_CHANNEL_TYPE_FLOAT: - ArrayDesc.Format = CU_AD_FORMAT_FLOAT; - PixelTypeSizeBytes = 4; - break; - default: - detail::ur::die( - "urMemImageCreate given unsupported image_channel_data_type"); - } - - // When a dimension isn't used pImageDesc has the size set to 1 - size_t PixelSizeBytes = - PixelTypeSizeBytes * 4; // 4 is the only number of channels we support - size_t ImageSizeBytes = PixelSizeBytes * pImageDesc->width * - pImageDesc->height * pImageDesc->depth; - - ScopedContext Active(hContext); - CUarray ImageArray = nullptr; - try { - UR_CHECK_ERROR(cuArray3DCreate(&ImageArray, &ArrayDesc)); - } catch (ur_result_t Err) { - if (Err == UR_RESULT_ERROR_INVALID_VALUE) { - return UR_RESULT_ERROR_INVALID_IMAGE_SIZE; - } - return Err; - } catch (...)
{ - return UR_RESULT_ERROR_UNKNOWN; - } - - try { - if (PerformInitialCopy) { - // We have to use a different copy function for each image dimensionality - if (pImageDesc->type == UR_MEM_TYPE_IMAGE1D) { - UR_CHECK_ERROR(cuMemcpyHtoA(ImageArray, 0, pHost, ImageSizeBytes)); - } else if (pImageDesc->type == UR_MEM_TYPE_IMAGE2D) { - CUDA_MEMCPY2D CpyDesc; - memset(&CpyDesc, 0, sizeof(CpyDesc)); - CpyDesc.srcMemoryType = CUmemorytype_enum::CU_MEMORYTYPE_HOST; - CpyDesc.srcHost = pHost; - CpyDesc.dstMemoryType = CUmemorytype_enum::CU_MEMORYTYPE_ARRAY; - CpyDesc.dstArray = ImageArray; - CpyDesc.WidthInBytes = PixelSizeBytes * pImageDesc->width; - CpyDesc.Height = pImageDesc->height; - UR_CHECK_ERROR(cuMemcpy2D(&CpyDesc)); - } else if (pImageDesc->type == UR_MEM_TYPE_IMAGE3D) { - CUDA_MEMCPY3D CpyDesc; - memset(&CpyDesc, 0, sizeof(CpyDesc)); - CpyDesc.srcMemoryType = CUmemorytype_enum::CU_MEMORYTYPE_HOST; - CpyDesc.srcHost = pHost; - CpyDesc.dstMemoryType = CUmemorytype_enum::CU_MEMORYTYPE_ARRAY; - CpyDesc.dstArray = ImageArray; - CpyDesc.WidthInBytes = PixelSizeBytes * pImageDesc->width; - CpyDesc.Height = pImageDesc->height; - CpyDesc.Depth = pImageDesc->depth; - UR_CHECK_ERROR(cuMemcpy3D(&CpyDesc)); - } - } - - // CUDA_RESOURCE_DESC is a union of different structs, shown here - // https://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__TEXOBJECT.html - // We need to fill it as described here to use it for a surface or texture - // https://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__SURFOBJECT.html - // CUDA_RESOURCE_DESC::resType must be CU_RESOURCE_TYPE_ARRAY and - // CUDA_RESOURCE_DESC::res::array::hArray must be set to a valid CUDA array - // handle. - // CUDA_RESOURCE_DESC::flags must be set to zero - - CUDA_RESOURCE_DESC ImageResDesc; - ImageResDesc.res.array.hArray = ImageArray; - ImageResDesc.resType = CU_RESOURCE_TYPE_ARRAY; - ImageResDesc.flags = 0; - - CUsurfObject Surface; - UR_CHECK_ERROR(cuSurfObjectCreate(&Surface, &ImageResDesc)); - - auto MemObj = std::unique_ptr(new ur_mem_handle_t_( - hContext, ImageArray, Surface, flags, pImageDesc->type, phMem)); - - if (MemObj == nullptr) { - return UR_RESULT_ERROR_OUT_OF_HOST_MEMORY; - } - - *phMem = MemObj.release(); - } catch (ur_result_t Err) { - if (ImageArray) { - cuArrayDestroy(ImageArray); - } - return Err; - } catch (...) { - if (ImageArray) { - cuArrayDestroy(ImageArray); - } - return UR_RESULT_ERROR_UNKNOWN; - } - - return Result; -} - -/// \TODO Not implemented -UR_APIEXPORT ur_result_t UR_APICALL urMemImageGetInfo(ur_mem_handle_t, - ur_image_info_t, size_t, - void *, size_t *) { - return UR_RESULT_ERROR_UNSUPPORTED_FEATURE; -} - -/// Implements a buffer partition in the CUDA backend. -/// A buffer partition (or a sub-buffer, in OpenCL terms) is simply implemented -/// as an offset over an existing CUDA allocation. -UR_APIEXPORT ur_result_t UR_APICALL urMemBufferPartition( - ur_mem_handle_t hBuffer, ur_mem_flags_t flags, - ur_buffer_create_type_t bufferCreateType, const ur_buffer_region_t *pRegion, - ur_mem_handle_t *phMem) { - UR_ASSERT(hBuffer, UR_RESULT_ERROR_INVALID_NULL_HANDLE); - UR_ASSERT((flags & UR_MEM_FLAGS_MASK) == 0, - UR_RESULT_ERROR_INVALID_ENUMERATION); - UR_ASSERT(hBuffer->isBuffer(), UR_RESULT_ERROR_INVALID_MEM_OBJECT); - UR_ASSERT(!hBuffer->isSubBuffer(), UR_RESULT_ERROR_INVALID_MEM_OBJECT); - - // Default value for flags means UR_MEM_FLAG_READ_WRITE. 
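[Editor's note: for reviewers unfamiliar with the image path being removed above, it reduces to three driver-API steps: describe the array, upload the pixels, wrap the array in a surface object. A minimal standalone sketch follows; CreateRgba8Surface and the CHECK macro are illustrative names, not adapter entry points, and it assumes cuInit has run and a context is current.]

```cpp
#include <cuda.h>
#include <cstddef>
#include <cstdio>
#include <cstdlib>

// Abort on any driver-API failure; reused by the later sketches in this note.
#define CHECK(Call)                                                            \
  do {                                                                         \
    if (CUresult Res = (Call); Res != CUDA_SUCCESS) {                          \
      std::fprintf(stderr, "%s failed: %d\n", #Call, static_cast<int>(Res));   \
      std::exit(1);                                                            \
    }                                                                          \
  } while (0)

// Build a 2D RGBA8 CUDA array, upload host pixels, and wrap it in a surface.
void CreateRgba8Surface(const unsigned char *HostPixels, size_t Width,
                        size_t Height, CUarray *OutArray,
                        CUsurfObject *OutSurf) {
  CUDA_ARRAY3D_DESCRIPTOR Desc = {};
  Desc.Format = CU_AD_FORMAT_UNSIGNED_INT8;
  Desc.NumChannels = 4; // four channels: RGBA
  Desc.Width = Width;
  Desc.Height = Height; // Depth stays 0, so this is a 2D array
  CHECK(cuArray3DCreate(OutArray, &Desc));

  CUDA_MEMCPY2D Cpy = {};
  Cpy.srcMemoryType = CU_MEMORYTYPE_HOST;
  Cpy.srcHost = HostPixels;
  Cpy.srcPitch = Width * 4; // tightly packed rows of 4-byte pixels
  Cpy.dstMemoryType = CU_MEMORYTYPE_ARRAY;
  Cpy.dstArray = *OutArray;
  Cpy.WidthInBytes = Width * 4;
  Cpy.Height = Height;
  CHECK(cuMemcpy2D(&Cpy));

  // Surface creation needs exactly resType, res.array.hArray, and zero flags.
  CUDA_RESOURCE_DESC ResDesc = {};
  ResDesc.resType = CU_RESOURCE_TYPE_ARRAY;
  ResDesc.res.array.hArray = *OutArray;
  CHECK(cuSurfObjectCreate(OutSurf, &ResDesc));
}
```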
- if (flags == 0) { - flags = UR_MEM_FLAG_READ_WRITE; - } - - UR_ASSERT(!(flags & - (UR_MEM_FLAG_ALLOC_COPY_HOST_POINTER | - UR_MEM_FLAG_ALLOC_HOST_POINTER | UR_MEM_FLAG_USE_HOST_POINTER)), - UR_RESULT_ERROR_INVALID_VALUE); - if (hBuffer->MemFlags & UR_MEM_FLAG_WRITE_ONLY) { - UR_ASSERT(!(flags & (UR_MEM_FLAG_READ_WRITE | UR_MEM_FLAG_READ_ONLY)), - UR_RESULT_ERROR_INVALID_VALUE); - } - if (hBuffer->MemFlags & UR_MEM_FLAG_READ_ONLY) { - UR_ASSERT(!(flags & (UR_MEM_FLAG_READ_WRITE | UR_MEM_FLAG_WRITE_ONLY)), - UR_RESULT_ERROR_INVALID_VALUE); - } - - UR_ASSERT(bufferCreateType == UR_BUFFER_CREATE_TYPE_REGION, - UR_RESULT_ERROR_INVALID_ENUMERATION); - UR_ASSERT(pRegion != nullptr, UR_RESULT_ERROR_INVALID_NULL_POINTER); - UR_ASSERT(phMem, UR_RESULT_ERROR_INVALID_NULL_POINTER); - - UR_ASSERT(pRegion->size != 0u, UR_RESULT_ERROR_INVALID_BUFFER_SIZE); - - assert((pRegion->origin <= (pRegion->origin + pRegion->size)) && "Overflow"); - UR_ASSERT( - ((pRegion->origin + pRegion->size) <= hBuffer->Mem.BufferMem.getSize()), - UR_RESULT_ERROR_INVALID_BUFFER_SIZE); - // Retained indirectly due to retaining parent buffer below. - ur_context_handle_t Context = hBuffer->Context; - - ur_mem_handle_t_::MemImpl::BufferMem::AllocMode AllocMode = - ur_mem_handle_t_::MemImpl::BufferMem::AllocMode::Classic; - - assert(hBuffer->Mem.BufferMem.Ptr != - ur_mem_handle_t_::MemImpl::BufferMem::native_type{0}); - ur_mem_handle_t_::MemImpl::BufferMem::native_type Ptr = - hBuffer->Mem.BufferMem.Ptr + pRegion->origin; - - void *HostPtr = nullptr; - if (hBuffer->Mem.BufferMem.HostPtr) { - HostPtr = - static_cast(hBuffer->Mem.BufferMem.HostPtr) + pRegion->origin; - } - - std::unique_ptr MemObj{nullptr}; - try { - MemObj = std::unique_ptr{new ur_mem_handle_t_{ - Context, hBuffer, flags, AllocMode, Ptr, HostPtr, pRegion->size}}; - } catch (ur_result_t Err) { - *phMem = nullptr; - return Err; - } catch (...) { - *phMem = nullptr; - return UR_RESULT_ERROR_OUT_OF_HOST_MEMORY; - } - - *phMem = MemObj.release(); - return UR_RESULT_SUCCESS; -} diff --git a/sycl/plugins/unified_runtime/ur/adapters/cuda/memory.hpp b/sycl/plugins/unified_runtime/ur/adapters/cuda/memory.hpp deleted file mode 100644 index 33f8a3342f05d..0000000000000 --- a/sycl/plugins/unified_runtime/ur/adapters/cuda/memory.hpp +++ /dev/null @@ -1,232 +0,0 @@ -//===--------- memory.hpp - CUDA Adapter ----------------------------------===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// -#pragma once - -#include -#include -#include - -#include "common.hpp" - -/// UR Mem mapping to CUDA memory allocations, both data and texture/surface. -/// \brief Represents non-SVM allocations on the CUDA backend. -/// Keeps tracks of all mapped regions used for Map/Unmap calls. -/// Only one region can be active at the same time per allocation. -struct ur_mem_handle_t_ { - // Context where the memory object is accessible - ur_context_handle_t Context; - - /// Reference counting of the handler - std::atomic_uint32_t RefCount; - enum class Type { Buffer, Surface, Texture } MemType; - - // Original mem flags passed - ur_mem_flags_t MemFlags; - - /// A UR Memory object represents either plain memory allocations ("Buffers" - /// in OpenCL) or typed allocations ("Images" in OpenCL). - /// In CUDA their API handlers are different. 
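[Editor's note: the buffer-partition scheme that ends above is worth restating, since no CUDA call is involved at all: a sub-buffer is only the parent's device pointer plus the region origin. A minimal sketch under that reading; SubBuffer and its fields are illustrative names, not adapter types.]

```cpp
#include <cuda.h>
#include <cassert>
#include <cstddef>

struct SubBuffer {
  CUdeviceptr Parent; // base of the existing CUDA allocation
  CUdeviceptr Ptr;    // Parent + Origin, what kernels actually receive
  size_t Size;
};

SubBuffer MakeSubBuffer(CUdeviceptr Parent, size_t ParentSize, size_t Origin,
                        size_t Size) {
  assert(Origin <= Origin + Size && "Overflow");        // same check as above
  assert(Origin + Size <= ParentSize && "Out of range");
  return SubBuffer{Parent, Parent + Origin, Size};      // no new allocation
}
```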
Whereas "Buffers" are allocated - /// as pointer-like structs, "Images" are stored in Textures or Surfaces. - /// This union allows implementation to use either from the same handler. - union MemImpl { - // Handler for plain, pointer-based CUDA allocations - struct BufferMem { - using native_type = CUdeviceptr; - - // If this allocation is a sub-buffer (i.e., a view on an existing - // allocation), this is the pointer to the parent handler structure - ur_mem_handle_t Parent; - // CUDA handler for the pointer - native_type Ptr; - - /// Pointer associated with this device on the host - void *HostPtr; - /// Size of the allocation in bytes - size_t Size; - /// Size of the active mapped region. - size_t MapSize; - /// Offset of the active mapped region. - size_t MapOffset; - /// Pointer to the active mapped region, if any - void *MapPtr; - /// Original flags for the mapped region - ur_map_flags_t MapFlags; - - /** AllocMode - * classic: Just a normal buffer allocated on the device via cuda malloc - * use_host_ptr: Use an address on the host for the device - * copy_in: The data for the device comes from the host but the host - pointer is not available later for re-use - * alloc_host_ptr: Uses pinned-memory allocation - */ - enum class AllocMode { - Classic, - UseHostPtr, - CopyIn, - AllocHostPtr, - } MemAllocMode; - - native_type get() const noexcept { return Ptr; } - - size_t getSize() const noexcept { return Size; } - - void *getMapPtr() const noexcept { return MapPtr; } - - size_t getMapSize() const noexcept { return MapSize; } - - size_t getMapOffset() const noexcept { return MapOffset; } - - /// Returns a pointer to data visible on the host that contains - /// the data on the device associated with this allocation. - /// The offset is used to index into the CUDA allocation. - void *mapToPtr(size_t Size, size_t Offset, - ur_map_flags_t Flags) noexcept { - assert(MapPtr == nullptr); - MapSize = Size; - MapOffset = Offset; - MapFlags = Flags; - if (HostPtr) { - MapPtr = static_cast(HostPtr) + Offset; - } else { - // TODO: Allocate only what is needed based on the offset - MapPtr = static_cast(malloc(this->getSize())); - } - return MapPtr; - } - - /// Detach the allocation from the host memory. - void unmap(void *) noexcept { - assert(MapPtr != nullptr); - - if (MapPtr != HostPtr) { - free(MapPtr); - } - MapPtr = nullptr; - MapSize = 0; - MapOffset = 0; - } - - ur_map_flags_t getMapFlags() const noexcept { - assert(MapPtr != nullptr); - return MapFlags; - } - } BufferMem; - - // Handler data for surface object (i.e. 
Images) - struct SurfaceMem { - CUarray Array; - CUsurfObject SurfObj; - ur_mem_type_t ImageType; - - CUarray getArray() const noexcept { return Array; } - - CUsurfObject getSurface() const noexcept { return SurfObj; } - - ur_mem_type_t getImageType() const noexcept { return ImageType; } - } SurfaceMem; - - struct ImageMem { - CUarray Array; - void *Handle; - ur_mem_type_t ImageType; - ur_sampler_handle_t Sampler; - - CUarray get_array() const noexcept { return Array; } - - void *get_handle() const noexcept { return Handle; } - - ur_mem_type_t get_image_type() const noexcept { return ImageType; } - - ur_sampler_handle_t get_sampler() const noexcept { return Sampler; } - } ImageMem; - } Mem; - - /// Constructs the UR mem handler for a non-typed allocation ("buffer") - ur_mem_handle_t_(ur_context_handle_t Context, ur_mem_handle_t Parent, - ur_mem_flags_t MemFlags, MemImpl::BufferMem::AllocMode Mode, - CUdeviceptr Ptr, void *HostPtr, size_t Size) - : Context{Context}, RefCount{1}, MemType{Type::Buffer}, - MemFlags{MemFlags} { - Mem.BufferMem.Ptr = Ptr; - Mem.BufferMem.Parent = Parent; - Mem.BufferMem.HostPtr = HostPtr; - Mem.BufferMem.Size = Size; - Mem.BufferMem.MapSize = 0; - Mem.BufferMem.MapOffset = 0; - Mem.BufferMem.MapPtr = nullptr; - Mem.BufferMem.MapFlags = UR_MAP_FLAG_WRITE; - Mem.BufferMem.MemAllocMode = Mode; - if (isSubBuffer()) { - urMemRetain(Mem.BufferMem.Parent); - } else { - urContextRetain(Context); - } - }; - - /// Constructs the UR allocation for an Image object (surface in CUDA) - ur_mem_handle_t_(ur_context_handle_t Context, CUarray Array, - CUsurfObject Surf, ur_mem_flags_t MemFlags, - ur_mem_type_t ImageType, void *HostPtr) - : Context{Context}, RefCount{1}, MemType{Type::Surface}, - MemFlags{MemFlags} { - (void)HostPtr; - - Mem.SurfaceMem.Array = Array; - Mem.SurfaceMem.SurfObj = Surf; - Mem.SurfaceMem.ImageType = ImageType; - urContextRetain(Context); - } - - /// Constructs the UR allocation for an unsampled image object - ur_mem_handle_t_(ur_context_handle_t Context, CUarray Array, - CUsurfObject Surf, ur_mem_type_t ImageType) - : Context{Context}, RefCount{1}, MemType{Type::Surface} { - - Mem.ImageMem.Array = Array; - Mem.ImageMem.Handle = (void *)Surf; - Mem.ImageMem.ImageType = ImageType; - Mem.ImageMem.Sampler = nullptr; - urContextRetain(Context); - } - - /// Constructs the UR allocation for a sampled image object - ur_mem_handle_t_(ur_context_handle_t Context, CUarray Array, CUtexObject Tex, - ur_sampler_handle_t Sampler, ur_mem_type_t ImageType) - : Context{Context}, RefCount{1}, MemType{Type::Texture} { - - Mem.ImageMem.Array = Array; - Mem.ImageMem.Handle = (void *)Tex; - Mem.ImageMem.ImageType = ImageType; - Mem.ImageMem.Sampler = Sampler; - urContextRetain(Context); - } - - ~ur_mem_handle_t_() { - if (isBuffer() && isSubBuffer()) { - urMemRelease(Mem.BufferMem.Parent); - return; - } - urContextRelease(Context); - } - - bool isBuffer() const noexcept { return MemType == Type::Buffer; } - - bool isSubBuffer() const noexcept { - return (isBuffer() && (Mem.BufferMem.Parent != nullptr)); - } - - bool isImage() const noexcept { return MemType == Type::Surface; } - - ur_context_handle_t getContext() const noexcept { return Context; } - - uint32_t incrementReferenceCount() noexcept { return ++RefCount; } - - uint32_t decrementReferenceCount() noexcept { return --RefCount; } - - uint32_t getReferenceCount() const noexcept { return RefCount; } -}; diff --git a/sycl/plugins/unified_runtime/ur/adapters/cuda/platform.cpp 
b/sycl/plugins/unified_runtime/ur/adapters/cuda/platform.cpp deleted file mode 100644 index 876f83921de23..0000000000000 --- a/sycl/plugins/unified_runtime/ur/adapters/cuda/platform.cpp +++ /dev/null @@ -1,195 +0,0 @@ -//===--------- platform.cpp - CUDA Adapter --------------------------------===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// - -#include "platform.hpp" -#include "common.hpp" -#include "context.hpp" -#include "device.hpp" - -#include -#include -#include - -UR_APIEXPORT ur_result_t UR_APICALL urPlatformGetInfo( - ur_platform_handle_t hPlatform, ur_platform_info_t PlatformInfoType, - size_t Size, void *pPlatformInfo, size_t *pSizeRet) { - - UR_ASSERT(hPlatform, UR_RESULT_ERROR_INVALID_NULL_HANDLE); - UrReturnHelper ReturnValue(Size, pPlatformInfo, pSizeRet); - - switch (PlatformInfoType) { - case UR_PLATFORM_INFO_NAME: - return ReturnValue("NVIDIA CUDA BACKEND"); - case UR_PLATFORM_INFO_VENDOR_NAME: - return ReturnValue("NVIDIA Corporation"); - case UR_PLATFORM_INFO_PROFILE: - return ReturnValue("FULL PROFILE"); - case UR_PLATFORM_INFO_VERSION: { - auto Version = getCudaVersionString(); - return ReturnValue(Version.c_str()); - } - case UR_PLATFORM_INFO_EXTENSIONS: { - return ReturnValue(""); - } - case UR_PLATFORM_INFO_BACKEND: { - return ReturnValue(UR_PLATFORM_BACKEND_CUDA); - } - default: - return UR_RESULT_ERROR_INVALID_ENUMERATION; - } - - return UR_RESULT_SUCCESS; -} - -/// Obtains the CUDA platform. -/// There is only one CUDA platform, and contains all devices on the system. -/// Triggers the CUDA Driver initialization (cuInit) the first time, so this -/// must be the first PI API called. -/// -/// However because multiple devices in a context is not currently supported, -/// place each device in a separate platform. 
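[Editor's note: the platform-per-device model described above rests on plain device enumeration. A standalone sketch of that shape, with each ordinal reported as its own "platform"; the printed labels are illustrative only.]

```cpp
#include <cuda.h>
#include <cstdio>

int main() {
  if (cuInit(0) != CUDA_SUCCESS) // first driver-API call, as noted above
    return 1;
  int NumDevices = 0;
  cuDeviceGetCount(&NumDevices);
  for (int I = 0; I < NumDevices; ++I) {
    CUdevice Dev;
    cuDeviceGet(&Dev, I);
    char Name[256];
    cuDeviceGetName(Name, static_cast<int>(sizeof(Name)), Dev);
    std::printf("platform %d: %s\n", I, Name); // one platform wraps one device
  }
  return 0;
}
```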
-UR_APIEXPORT ur_result_t UR_APICALL -urPlatformGet(ur_adapter_handle_t *, uint32_t, uint32_t NumEntries, - ur_platform_handle_t *phPlatforms, uint32_t *pNumPlatforms) { - - try { - static std::once_flag InitFlag; - static uint32_t NumPlatforms = 1; - static std::vector Platforms; - - UR_ASSERT(phPlatforms || pNumPlatforms, UR_RESULT_ERROR_INVALID_VALUE); - UR_ASSERT(!phPlatforms || NumEntries > 0, UR_RESULT_ERROR_INVALID_SIZE); - - ur_result_t Result = UR_RESULT_SUCCESS; - - std::call_once( - InitFlag, - [](ur_result_t &Result) { - UR_CHECK_ERROR(cuInit(0)); - int NumDevices = 0; - UR_CHECK_ERROR(cuDeviceGetCount(&NumDevices)); - try { - // make one platform per device - NumPlatforms = NumDevices; - Platforms.resize(NumDevices); - - for (int i = 0; i < NumDevices; ++i) { - CUdevice Device; - UR_CHECK_ERROR(cuDeviceGet(&Device, i)); - CUcontext Context; - UR_CHECK_ERROR(cuDevicePrimaryCtxRetain(&Context, Device)); - - ScopedContext active(Context); - CUevent EvBase; - UR_CHECK_ERROR(cuEventCreate(&EvBase, CU_EVENT_DEFAULT)); - - // Use default stream to record base event counter - UR_CHECK_ERROR(cuEventRecord(EvBase, 0)); - - Platforms[i].Devices.emplace_back(new ur_device_handle_t_{ - Device, Context, EvBase, &Platforms[i]}); - { - const auto &Dev = Platforms[i].Devices.back().get(); - size_t MaxWorkGroupSize = 0u; - size_t MaxThreadsPerBlock[3] = {}; - UR_CHECK_ERROR(urDeviceGetInfo( - Dev, UR_DEVICE_INFO_MAX_WORK_ITEM_SIZES, - sizeof(MaxThreadsPerBlock), MaxThreadsPerBlock, nullptr)); - - UR_CHECK_ERROR(urDeviceGetInfo( - Dev, UR_DEVICE_INFO_MAX_WORK_GROUP_SIZE, - sizeof(MaxWorkGroupSize), &MaxWorkGroupSize, nullptr)); - - Dev->saveMaxWorkItemSizes(sizeof(MaxThreadsPerBlock), - MaxThreadsPerBlock); - Dev->saveMaxWorkGroupSize(MaxWorkGroupSize); - } - } - } catch (const std::bad_alloc &) { - // Signal out-of-memory situation - for (int i = 0; i < NumDevices; ++i) { - Platforms[i].Devices.clear(); - } - Platforms.clear(); - Result = UR_RESULT_ERROR_OUT_OF_HOST_MEMORY; - } catch (ur_result_t Err) { - // Clear and rethrow to allow retry - for (int i = 0; i < NumDevices; ++i) { - Platforms[i].Devices.clear(); - } - Platforms.clear(); - Result = Err; - throw Err; - } catch (...) { - Result = UR_RESULT_ERROR_OUT_OF_RESOURCES; - throw; - } - }, - Result); - - if (pNumPlatforms != nullptr) { - *pNumPlatforms = NumPlatforms; - } - - if (phPlatforms != nullptr) { - for (unsigned i = 0; i < std::min(NumEntries, NumPlatforms); ++i) { - phPlatforms[i] = &Platforms[i]; - } - } - - return Result; - } catch (ur_result_t Err) { - return Err; - } catch (...) { - return UR_RESULT_ERROR_OUT_OF_RESOURCES; - } -} - -UR_APIEXPORT ur_result_t UR_APICALL urPlatformGetApiVersion( - ur_platform_handle_t hDriver, ur_api_version_t *pVersion) { - std::ignore = hDriver; - *pVersion = UR_API_VERSION_CURRENT; - return UR_RESULT_SUCCESS; -} - -UR_APIEXPORT ur_result_t UR_APICALL urPlatformGetNativeHandle( - ur_platform_handle_t hPlatform, ur_native_handle_t *phNativePlatform) { - std::ignore = hPlatform; - std::ignore = phNativePlatform; - return UR_RESULT_ERROR_UNSUPPORTED_FEATURE; -} - -UR_APIEXPORT ur_result_t UR_APICALL urPlatformCreateWithNativeHandle( - ur_native_handle_t hNativePlatform, - const ur_platform_native_properties_t *pProperties, - ur_platform_handle_t *phPlatform) { - std::ignore = hNativePlatform; - std::ignore = pProperties; - std::ignore = phPlatform; - return UR_RESULT_ERROR_UNSUPPORTED_FEATURE; -} - -// Get CUDA plugin specific backend option. -// Current support is only for optimization options. 
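[Editor's note: the initialization in urPlatformGet above leans on the std::call_once idiom, with the result smuggled out through a reference argument. The shape, isolated, with illustrative names; note that once the flag is set the callable is skipped, so only the call that actually runs the body can write an error through the reference.]

```cpp
#include <mutex>

static std::once_flag InitFlag;

int Initialize() {
  int Result = 0; // 0 stands in for "success"
  std::call_once(
      InitFlag,
      [](int &Res) {
        // One-time setup (cuInit, device discovery) goes here; failures are
        // reported through the reference so the first caller can observe them.
        Res = 0;
      },
      Result); // call_once forwards the lvalue, binding it to int&
  return Result;
}
```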
-// Return empty string for cuda. -// TODO: Determine correct string to be passed. -UR_APIEXPORT ur_result_t UR_APICALL urPlatformGetBackendOption( - ur_platform_handle_t hPlatform, const char *pFrontendOption, - const char **ppPlatformOption) { - std::ignore = hPlatform; - using namespace std::literals; - if (pFrontendOption == nullptr) - return UR_RESULT_ERROR_INVALID_NULL_POINTER; - if (pFrontendOption == "-O0"sv || pFrontendOption == "-O1"sv || - pFrontendOption == "-O2"sv || pFrontendOption == "-O3"sv || - pFrontendOption == ""sv) { - *ppPlatformOption = ""; - return UR_RESULT_SUCCESS; - } - return UR_RESULT_ERROR_INVALID_VALUE; -} diff --git a/sycl/plugins/unified_runtime/ur/adapters/cuda/platform.hpp b/sycl/plugins/unified_runtime/ur/adapters/cuda/platform.hpp deleted file mode 100644 index c9b6550610eb8..0000000000000 --- a/sycl/plugins/unified_runtime/ur/adapters/cuda/platform.hpp +++ /dev/null @@ -1,15 +0,0 @@ -//===--------- platform.hpp - CUDA Adapter --------------------------------===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// -#pragma once - -#include -#include - -struct ur_platform_handle_t_ { - std::vector> Devices; -}; diff --git a/sycl/plugins/unified_runtime/ur/adapters/cuda/program.cpp b/sycl/plugins/unified_runtime/ur/adapters/cuda/program.cpp deleted file mode 100644 index 7e238dd7fe22b..0000000000000 --- a/sycl/plugins/unified_runtime/ur/adapters/cuda/program.cpp +++ /dev/null @@ -1,452 +0,0 @@ -//===--------- program.cpp - CUDA Adapter ---------------------------------===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. 
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// - -#include "program.hpp" - -bool getMaxRegistersJitOptionValue(const std::string &BuildOptions, - unsigned int &Value) { - using namespace std::string_view_literals; - const std::size_t OptionPos = BuildOptions.find_first_of("maxrregcount"sv); - if (OptionPos == std::string::npos) { - return false; - } - - const std::size_t DelimPos = BuildOptions.find('=', OptionPos + 1u); - if (DelimPos == std::string::npos) { - return false; - } - - const std::size_t Length = BuildOptions.length(); - const std::size_t StartPos = DelimPos + 1u; - if (DelimPos == std::string::npos || StartPos >= Length) { - return false; - } - - std::size_t Pos = StartPos; - while (Pos < Length && - std::isdigit(static_cast(BuildOptions[Pos]))) { - Pos++; - } - - const std::string ValueString = BuildOptions.substr(StartPos, Pos - StartPos); - if (ValueString.empty()) { - return false; - } - - Value = static_cast(std::stoi(ValueString)); - return true; -} - -ur_program_handle_t_::ur_program_handle_t_(ur_context_handle_t Context) - : Module{nullptr}, Binary{}, BinarySizeInBytes{0}, RefCount{1}, - Context{Context}, KernelReqdWorkGroupSizeMD{} { - urContextRetain(Context); -} - -ur_program_handle_t_::~ur_program_handle_t_() { urContextRelease(Context); } - -std::pair -splitMetadataName(const std::string &metadataName) { - size_t splitPos = metadataName.rfind('@'); - if (splitPos == std::string::npos) - return std::make_pair(metadataName, std::string{}); - return std::make_pair(metadataName.substr(0, splitPos), - metadataName.substr(splitPos, metadataName.length())); -} - -ur_result_t -ur_program_handle_t_::setMetadata(const ur_program_metadata_t *Metadata, - size_t Length) { - for (size_t i = 0; i < Length; ++i) { - const ur_program_metadata_t MetadataElement = Metadata[i]; - std::string MetadataElementName{MetadataElement.pName}; - - auto [Prefix, Tag] = splitMetadataName(MetadataElementName); - - if (Tag == __SYCL_UR_PROGRAM_METADATA_TAG_REQD_WORK_GROUP_SIZE) { - // If metadata is reqd_work_group_size, record it for the corresponding - // kernel name. - size_t MDElemsSize = MetadataElement.size - sizeof(std::uint64_t); - - // Expect between 1 and 3 32-bit integer values. - UR_ASSERT(MDElemsSize >= sizeof(std::uint32_t) && - MDElemsSize <= sizeof(std::uint32_t) * 3, - UR_RESULT_ERROR_INVALID_WORK_GROUP_SIZE); - - // Get pointer to data, skipping 64-bit size at the start of the data. - const char *ValuePtr = - reinterpret_cast(MetadataElement.value.pData) + - sizeof(std::uint64_t); - // Read values and pad with 1's for values not present. - std::uint32_t ReqdWorkGroupElements[] = {1, 1, 1}; - std::memcpy(ReqdWorkGroupElements, ValuePtr, MDElemsSize); - KernelReqdWorkGroupSizeMD[Prefix] = - std::make_tuple(ReqdWorkGroupElements[0], ReqdWorkGroupElements[1], - ReqdWorkGroupElements[2]); - } else if (Tag == __SYCL_UR_PROGRAM_METADATA_GLOBAL_ID_MAPPING) { - const char *MetadataValPtr = - reinterpret_cast(MetadataElement.value.pData) + - sizeof(std::uint64_t); - const char *MetadataValPtrEnd = - MetadataValPtr + MetadataElement.size - sizeof(std::uint64_t); - GlobalIDMD[Prefix] = std::string{MetadataValPtr, MetadataValPtrEnd}; - } - } - return UR_RESULT_SUCCESS; -} - -ur_result_t ur_program_handle_t_::setBinary(const char *Source, size_t Length) { - // Do not re-set program binary data which has already been set as that will - // delete the old binary data. 
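[Editor's note: the option scan in getMaxRegistersJitOptionValue above is: locate the token, skip to '=', consume digits. A compact restatement; it deliberately uses std::string::find to match the whole "maxrregcount" token, where the helper above uses find_first_of, whose any-of-these-characters semantics can match earlier than intended.]

```cpp
#include <cctype>
#include <string>

bool ParseMaxRegisters(const std::string &Options, unsigned &Value) {
  const size_t Opt = Options.find("maxrregcount"); // whole-token search
  if (Opt == std::string::npos)
    return false;
  const size_t Eq = Options.find('=', Opt);
  if (Eq == std::string::npos || Eq + 1 >= Options.size())
    return false;
  size_t Pos = Eq + 1;
  unsigned Parsed = 0;
  bool AnyDigit = false;
  while (Pos < Options.size() &&
         std::isdigit(static_cast<unsigned char>(Options[Pos]))) {
    Parsed = Parsed * 10 + static_cast<unsigned>(Options[Pos] - '0');
    ++Pos;
    AnyDigit = true;
  }
  if (!AnyDigit)
    return false;
  Value = Parsed;
  return true;
}

// e.g. ParseMaxRegisters("-foo maxrregcount=32", V) sets V to 32.
```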
- UR_ASSERT(Binary == nullptr && BinarySizeInBytes == 0, - UR_RESULT_ERROR_INVALID_OPERATION); - Binary = Source; - BinarySizeInBytes = Length; - return UR_RESULT_SUCCESS; -} - -ur_result_t ur_program_handle_t_::buildProgram(const char *BuildOptions) { - if (BuildOptions) { - this->BuildOptions = BuildOptions; - } - - constexpr const unsigned int NumberOfOptions = 4u; - - std::vector Options(NumberOfOptions); - std::vector OptionVals(NumberOfOptions); - - // Pass a buffer for info messages - Options[0] = CU_JIT_INFO_LOG_BUFFER; - OptionVals[0] = (void *)InfoLog; - // Pass the size of the info buffer - Options[1] = CU_JIT_INFO_LOG_BUFFER_SIZE_BYTES; - OptionVals[1] = (void *)(long)MaxLogSize; - // Pass a buffer for error message - Options[2] = CU_JIT_ERROR_LOG_BUFFER; - OptionVals[2] = (void *)ErrorLog; - // Pass the size of the error buffer - Options[3] = CU_JIT_ERROR_LOG_BUFFER_SIZE_BYTES; - OptionVals[3] = (void *)(long)MaxLogSize; - - if (!this->BuildOptions.empty()) { - unsigned int MaxRegs; - bool Valid = getMaxRegistersJitOptionValue(BuildOptions, MaxRegs); - if (Valid) { - Options.push_back(CU_JIT_MAX_REGISTERS); - OptionVals.push_back(reinterpret_cast(MaxRegs)); - } - } - - UR_CHECK_ERROR(cuModuleLoadDataEx(&Module, static_cast(Binary), - Options.size(), Options.data(), - OptionVals.data())); - - BuildStatus = UR_PROGRAM_BUILD_STATUS_SUCCESS; - - // If no exception, result is correct - return UR_RESULT_SUCCESS; -} - -/// Finds kernel names by searching for entry points in the PTX source, as the -/// CUDA driver API doesn't expose an operation for this. -/// Note: This is currently only being used by the SYCL program class for the -/// has_kernel method, so an alternative would be to move the has_kernel -/// query to UR and use cuModuleGetFunction to check for a kernel. -/// Note: Another alternative is to add kernel names as metadata, like with -/// reqd_work_group_size. -ur_result_t getKernelNames(ur_program_handle_t) { - return UR_RESULT_ERROR_UNSUPPORTED_FEATURE; -} - -/// CUDA will handle the PTX/CUBIN binaries internally through CUmodule object. -/// So, urProgramCreateWithIL and urProgramCreateWithBinary are equivalent in -/// terms of CUDA adapter. See \ref urProgramCreateWithBinary. -UR_APIEXPORT ur_result_t UR_APICALL -urProgramCreateWithIL(ur_context_handle_t hContext, const void *pIL, - size_t length, const ur_program_properties_t *pProperties, - ur_program_handle_t *phProgram) { - ur_device_handle_t hDevice = hContext->getDevice(); - auto pBinary = reinterpret_cast(pIL); - - return urProgramCreateWithBinary(hContext, hDevice, length, pBinary, - pProperties, phProgram); -} - -/// CUDA will handle the PTX/CUBIN binaries internally through a call to -/// cuModuleLoadDataEx. So, urProgramCompile and urProgramBuild are equivalent -/// in terms of CUDA adapter. \TODO Implement asynchronous compilation -UR_APIEXPORT ur_result_t UR_APICALL -urProgramCompile(ur_context_handle_t hContext, ur_program_handle_t hProgram, - const char *pOptions) { - return urProgramBuild(hContext, hProgram, pOptions); -} - -/// Loads the images from a UR program into a CUmodule that can be -/// used later on to extract functions (kernels). -/// See \ref ur_program_handle_t for implementation details. 
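[Editor's note: the JIT-option plumbing buildProgram sets up above is easy to misread, since scalar values travel through the void* slots by value. A standalone sketch of loading PTX with fixed log buffers; LoadPtx is an illustrative name, and Ptx is assumed to hold a NUL-terminated module valid for the current context's device.]

```cpp
#include <cuda.h>
#include <cstdint>
#include <cstdio>

constexpr size_t MaxLogSize = 8192;
static char InfoLog[MaxLogSize];
static char ErrorLog[MaxLogSize];

CUresult LoadPtx(CUmodule *Module, const char *Ptx) {
  CUjit_option Options[] = {CU_JIT_INFO_LOG_BUFFER,
                            CU_JIT_INFO_LOG_BUFFER_SIZE_BYTES,
                            CU_JIT_ERROR_LOG_BUFFER,
                            CU_JIT_ERROR_LOG_BUFFER_SIZE_BYTES};
  // Pointers are passed as-is; sizes are passed by value inside the void*.
  void *Values[] = {InfoLog, (void *)(uintptr_t)MaxLogSize, ErrorLog,
                    (void *)(uintptr_t)MaxLogSize};
  CUresult Res = cuModuleLoadDataEx(Module, Ptx, 4, Options, Values);
  if (Res != CUDA_SUCCESS)
    std::fprintf(stderr, "JIT failed: %s\n", ErrorLog); // driver fills the log
  return Res;
}
```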
-UR_APIEXPORT ur_result_t UR_APICALL urProgramBuild(ur_context_handle_t hContext, - ur_program_handle_t hProgram, - const char *pOptions) { - std::ignore = hContext; - - ur_result_t Result = UR_RESULT_SUCCESS; - - try { - ScopedContext Active(hProgram->getContext()); - - hProgram->buildProgram(pOptions); - - } catch (ur_result_t Err) { - Result = Err; - } - return Result; -} - -/// Creates a new UR program object that is the outcome of linking all input -/// programs. -/// \TODO Implement linker options, requires mapping of OpenCL to CUDA -UR_APIEXPORT ur_result_t UR_APICALL -urProgramLink(ur_context_handle_t hContext, uint32_t count, - const ur_program_handle_t *phPrograms, const char *pOptions, - ur_program_handle_t *phProgram) { - ur_result_t Result = UR_RESULT_SUCCESS; - - try { - ScopedContext Active(hContext); - - CUlinkState State; - std::unique_ptr RetProgram{ - new ur_program_handle_t_{hContext}}; - - UR_CHECK_ERROR(cuLinkCreate(0, nullptr, nullptr, &State)); - try { - for (size_t i = 0; i < count; ++i) { - ur_program_handle_t Program = phPrograms[i]; - UR_CHECK_ERROR(cuLinkAddData( - State, CU_JIT_INPUT_PTX, const_cast(Program->Binary), - Program->BinarySizeInBytes, nullptr, 0, nullptr, nullptr)); - } - void *CuBin = nullptr; - size_t CuBinSize = 0; - UR_CHECK_ERROR(cuLinkComplete(State, &CuBin, &CuBinSize)); - - Result = - RetProgram->setBinary(static_cast(CuBin), CuBinSize); - - Result = RetProgram->buildProgram(pOptions); - } catch (...) { - // Upon error attempt cleanup - UR_CHECK_ERROR(cuLinkDestroy(State)); - throw; - } - - UR_CHECK_ERROR(cuLinkDestroy(State)); - *phProgram = RetProgram.release(); - - } catch (ur_result_t Err) { - Result = Err; - } - return Result; -} - -/// Created a UR program object from a CUDA program handle. -/// TODO: Implement this. -/// NOTE: The created UR object takes ownership of the native handle. -/// -/// \param[in] nativeHandle The native handle to create UR program object from. -/// \param[in] context The UR context of the program. -/// \param[out] program Set to the UR program object created from native handle. 
-/// -/// \return TBD -UR_APIEXPORT ur_result_t UR_APICALL urProgramCreateWithNativeHandle( - ur_native_handle_t, ur_context_handle_t, - const ur_program_native_properties_t *, ur_program_handle_t *) { - return UR_RESULT_ERROR_UNSUPPORTED_FEATURE; -} - -UR_APIEXPORT ur_result_t UR_APICALL -urProgramGetBuildInfo(ur_program_handle_t hProgram, ur_device_handle_t hDevice, - ur_program_build_info_t propName, size_t propSize, - void *pPropValue, size_t *pPropSizeRet) { - std::ignore = hDevice; - - UrReturnHelper ReturnValue(propSize, pPropValue, pPropSizeRet); - - switch (propName) { - case UR_PROGRAM_BUILD_INFO_STATUS: { - return ReturnValue(hProgram->BuildStatus); - } - case UR_PROGRAM_BUILD_INFO_OPTIONS: - return ReturnValue(hProgram->BuildOptions.c_str()); - case UR_PROGRAM_BUILD_INFO_LOG: - return ReturnValue(hProgram->InfoLog, hProgram->MaxLogSize); - default: - break; - } - return UR_RESULT_ERROR_INVALID_ENUMERATION; -} - -UR_APIEXPORT ur_result_t UR_APICALL -urProgramGetInfo(ur_program_handle_t hProgram, ur_program_info_t propName, - size_t propSize, void *pProgramInfo, size_t *pPropSizeRet) { - UrReturnHelper ReturnValue(propSize, pProgramInfo, pPropSizeRet); - - switch (propName) { - case UR_PROGRAM_INFO_REFERENCE_COUNT: - return ReturnValue(hProgram->getReferenceCount()); - case UR_PROGRAM_INFO_CONTEXT: - return ReturnValue(hProgram->Context); - case UR_PROGRAM_INFO_NUM_DEVICES: - return ReturnValue(1u); - case UR_PROGRAM_INFO_DEVICES: - return ReturnValue(&hProgram->Context->DeviceID, 1); - case UR_PROGRAM_INFO_SOURCE: - return ReturnValue(hProgram->Binary); - case UR_PROGRAM_INFO_BINARY_SIZES: - return ReturnValue(&hProgram->BinarySizeInBytes, 1); - case UR_PROGRAM_INFO_BINARIES: - return ReturnValue(&hProgram->Binary, 1); - case UR_PROGRAM_INFO_KERNEL_NAMES: - /* TODO: Add implementation for getKernelNames */ - UR_ASSERT(getKernelNames(hProgram), UR_RESULT_ERROR_UNSUPPORTED_FEATURE); - return UR_RESULT_ERROR_UNSUPPORTED_ENUMERATION; - case UR_PROGRAM_INFO_NUM_KERNELS: - return UR_RESULT_ERROR_UNSUPPORTED_ENUMERATION; - default: - break; - } - return UR_RESULT_ERROR_INVALID_ENUMERATION; -} - -UR_APIEXPORT ur_result_t UR_APICALL -urProgramRetain(ur_program_handle_t hProgram) { - UR_ASSERT(hProgram->getReferenceCount() > 0, UR_RESULT_ERROR_INVALID_PROGRAM); - hProgram->incrementReferenceCount(); - return UR_RESULT_SUCCESS; -} - -/// Decreases the reference count of a ur_program_handle_t object. -/// When the reference count reaches 0, it unloads the module from -/// the context. -UR_APIEXPORT ur_result_t UR_APICALL -urProgramRelease(ur_program_handle_t hProgram) { - // double delete or someone is messing with the ref count. - // either way, cannot safely proceed. - UR_ASSERT(hProgram->getReferenceCount() != 0, - UR_RESULT_ERROR_INVALID_PROGRAM); - - // decrement ref count. If it is 0, delete the program. - if (hProgram->decrementReferenceCount() == 0) { - - std::unique_ptr ProgramPtr{hProgram}; - - ur_result_t Result = UR_RESULT_ERROR_INVALID_PROGRAM; - - try { - ScopedContext Active(hProgram->getContext()); - auto cuModule = hProgram->get(); - // "0" is a valid handle for a cuModule, so the best way to check if we - // actually loaded a module and need to unload it is to look at the build - // status. - if (hProgram->BuildStatus == UR_PROGRAM_BUILD_STATUS_SUCCESS) { - UR_CHECK_ERROR(cuModuleUnload(cuModule)); - Result = UR_RESULT_SUCCESS; - } else if (hProgram->BuildStatus == UR_PROGRAM_BUILD_STATUS_NONE) { - // Nothing to free. - Result = UR_RESULT_SUCCESS; - } - } catch (...) 
{ - Result = UR_RESULT_ERROR_OUT_OF_RESOURCES; - } - - return Result; - } - - return UR_RESULT_SUCCESS; -} - -/// Gets the native CUDA handle of a UR program object -/// -/// \param[in] program The UR program handle to get the native CUDA object of. -/// \param[out] nativeHandle Set to the native handle of the UR program object. -/// -/// \return ur_result_t -UR_APIEXPORT ur_result_t UR_APICALL urProgramGetNativeHandle( - ur_program_handle_t hProgram, ur_native_handle_t *nativeHandle) { - *nativeHandle = reinterpret_cast(hProgram->get()); - return UR_RESULT_SUCCESS; -} - -/// Loads images from a list of PTX or CUBIN binaries. -/// Note: No calls to CUDA driver API in this function, only store binaries -/// for later. -/// -/// Note: Only supports one device -/// -UR_APIEXPORT ur_result_t UR_APICALL urProgramCreateWithBinary( - ur_context_handle_t hContext, ur_device_handle_t hDevice, size_t size, - const uint8_t *pBinary, const ur_program_properties_t *pProperties, - ur_program_handle_t *phProgram) { - UR_ASSERT(hContext->getDevice()->get() == hDevice->get(), - UR_RESULT_ERROR_INVALID_CONTEXT); - UR_ASSERT(size, UR_RESULT_ERROR_INVALID_SIZE); - - ur_result_t Result = UR_RESULT_SUCCESS; - - std::unique_ptr RetProgram{ - new ur_program_handle_t_{hContext}}; - - if (pProperties) { - if (pProperties->count > 0 && pProperties->pMetadatas == nullptr) { - return UR_RESULT_ERROR_INVALID_NULL_POINTER; - } else if (pProperties->count == 0 && pProperties->pMetadatas != nullptr) { - return UR_RESULT_ERROR_INVALID_SIZE; - } - Result = - RetProgram->setMetadata(pProperties->pMetadatas, pProperties->count); - } - UR_ASSERT(Result == UR_RESULT_SUCCESS, Result); - - auto pBinary_string = reinterpret_cast(pBinary); - - Result = RetProgram->setBinary(pBinary_string, size); - UR_ASSERT(Result == UR_RESULT_SUCCESS, Result); - - *phProgram = RetProgram.release(); - - return Result; -} - -// This entry point is only used for native specialization constants (SPIR-V), -// and the CUDA plugin is AOT only so this entry point is not supported. -UR_APIEXPORT ur_result_t UR_APICALL urProgramSetSpecializationConstants( - ur_program_handle_t, uint32_t, const ur_specialization_constant_info_t *) { - return UR_RESULT_ERROR_UNSUPPORTED_FEATURE; -} - -UR_APIEXPORT ur_result_t UR_APICALL urProgramGetFunctionPointer( - ur_device_handle_t hDevice, ur_program_handle_t hProgram, - const char *pFunctionName, void **ppFunctionPointer) { - // Check if device passed is the same the device bound to the context - UR_ASSERT(hDevice == hProgram->getContext()->getDevice(), - UR_RESULT_ERROR_INVALID_DEVICE); - - CUfunction Func; - CUresult Ret = cuModuleGetFunction(&Func, hProgram->get(), pFunctionName); - *ppFunctionPointer = Func; - ur_result_t Result = UR_RESULT_SUCCESS; - - if (Ret != CUDA_SUCCESS && Ret != CUDA_ERROR_NOT_FOUND) - UR_CHECK_ERROR(Ret); - if (Ret == CUDA_ERROR_NOT_FOUND) { - *ppFunctionPointer = 0; - Result = UR_RESULT_ERROR_INVALID_FUNCTION_NAME; - } - - return Result; -} diff --git a/sycl/plugins/unified_runtime/ur/adapters/cuda/program.hpp b/sycl/plugins/unified_runtime/ur/adapters/cuda/program.hpp deleted file mode 100644 index 99ed9a3862917..0000000000000 --- a/sycl/plugins/unified_runtime/ur/adapters/cuda/program.hpp +++ /dev/null @@ -1,54 +0,0 @@ -//===--------- program.hpp - CUDA Adapter ---------------------------------===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. 
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// -#pragma once - -#include -#include - -#include -#include - -#include "context.hpp" - -struct ur_program_handle_t_ { - using native_type = CUmodule; - native_type Module; - const char *Binary; - size_t BinarySizeInBytes; - std::atomic_uint32_t RefCount; - ur_context_handle_t Context; - - // Metadata - std::unordered_map> - KernelReqdWorkGroupSizeMD; - std::unordered_map GlobalIDMD; - - constexpr static size_t MaxLogSize = 8192u; - - char ErrorLog[MaxLogSize], InfoLog[MaxLogSize]; - std::string BuildOptions; - ur_program_build_status_t BuildStatus = UR_PROGRAM_BUILD_STATUS_NONE; - - ur_program_handle_t_(ur_context_handle_t Context); - ~ur_program_handle_t_(); - - ur_result_t setMetadata(const ur_program_metadata_t *Metadata, size_t Length); - - ur_result_t setBinary(const char *Binary, size_t BinarySizeInBytes); - - ur_result_t buildProgram(const char *BuildOptions); - ur_context_handle_t getContext() const { return Context; }; - - native_type get() const noexcept { return Module; }; - - uint32_t incrementReferenceCount() noexcept { return ++RefCount; } - - uint32_t decrementReferenceCount() noexcept { return --RefCount; } - - uint32_t getReferenceCount() const noexcept { return RefCount; } -}; diff --git a/sycl/plugins/unified_runtime/ur/adapters/cuda/queue.cpp b/sycl/plugins/unified_runtime/ur/adapters/cuda/queue.cpp deleted file mode 100644 index 2a3d18994991c..0000000000000 --- a/sycl/plugins/unified_runtime/ur/adapters/cuda/queue.cpp +++ /dev/null @@ -1,328 +0,0 @@ -//===--------- queue.cpp - CUDA Adapter -----------------------------------===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// - -#include "queue.hpp" -#include "common.hpp" -#include "context.hpp" -#include "event.hpp" - -#include -#include - -void ur_queue_handle_t_::computeStreamWaitForBarrierIfNeeded(CUstream Stream, - uint32_t StreamI) { - if (BarrierEvent && !ComputeAppliedBarrier[StreamI]) { - UR_CHECK_ERROR(cuStreamWaitEvent(Stream, BarrierEvent, 0)); - ComputeAppliedBarrier[StreamI] = true; - } -} - -void ur_queue_handle_t_::transferStreamWaitForBarrierIfNeeded( - CUstream Stream, uint32_t StreamI) { - if (BarrierEvent && !TransferAppliedBarrier[StreamI]) { - UR_CHECK_ERROR(cuStreamWaitEvent(Stream, BarrierEvent, 0)); - TransferAppliedBarrier[StreamI] = true; - } -} - -CUstream ur_queue_handle_t_::getNextComputeStream(uint32_t *StreamToken) { - uint32_t StreamI; - uint32_t Token; - while (true) { - if (NumComputeStreams < ComputeStreams.size()) { - // the check above is for performance - so as not to lock mutex every time - std::lock_guard guard(ComputeStreamMutex); - // The second check is done after mutex is locked so other threads can not - // change NumComputeStreams after that - if (NumComputeStreams < ComputeStreams.size()) { - UR_CHECK_ERROR(cuStreamCreateWithPriority( - &ComputeStreams[NumComputeStreams++], Flags, Priority)); - } - } - Token = ComputeStreamIndex++; - StreamI = Token % ComputeStreams.size(); - // if a stream has been reused before it was next selected round-robin - // fashion, we want to delay its next use and instead select another one - // that is more likely to have completed all the enqueued work. 
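[Editor's note: the stream selection that continues below combines lazy pool growth under double-checked locking with a round-robin index. Isolated as a sketch, under the same invariant as the adapter code: every caller completes its create step before taking a token, so the slot a token lands on is always initialized. StreamPool is an illustrative type; stream-creation errors are elided.]

```cpp
#include <cuda.h>
#include <atomic>
#include <mutex>
#include <vector>

struct StreamPool {
  std::vector<CUstream> Streams;
  std::atomic<unsigned> NumCreated{0};
  std::atomic<unsigned> Index{0};
  std::mutex Mutex;

  explicit StreamPool(unsigned Capacity) : Streams(Capacity) {}

  CUstream Next() {
    if (NumCreated < Streams.size()) { // cheap check, no lock on the hot path
      std::lock_guard<std::mutex> Lock(Mutex);
      // Re-check under the lock: another thread may have grown the pool.
      if (NumCreated < Streams.size())
        cuStreamCreateWithPriority(&Streams[NumCreated++],
                                   CU_STREAM_NON_BLOCKING, /*priority=*/0);
    }
    // Round-robin token over the fixed capacity.
    return Streams[Index++ % Streams.size()];
  }
};
```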
- if (DelayCompute[StreamI]) { - DelayCompute[StreamI] = false; - } else { - break; - } - } - if (StreamToken) { - *StreamToken = Token; - } - CUstream res = ComputeStreams[StreamI]; - computeStreamWaitForBarrierIfNeeded(res, StreamI); - return res; -} - -CUstream ur_queue_handle_t_::getNextComputeStream( - uint32_t NumEventsInWaitList, const ur_event_handle_t *EventWaitList, - ur_stream_guard_ &Guard, uint32_t *StreamToken) { - for (uint32_t i = 0; i < NumEventsInWaitList; i++) { - uint32_t Token = EventWaitList[i]->getComputeStreamToken(); - if (reinterpret_cast(EventWaitList[i]->getQueue()) == - this && - canReuseStream(Token)) { - std::unique_lock ComputeSyncGuard(ComputeStreamSyncMutex); - // redo the check after lock to avoid data races on - // LastSyncComputeStreams - if (canReuseStream(Token)) { - uint32_t StreamI = Token % DelayCompute.size(); - DelayCompute[StreamI] = true; - if (StreamToken) { - *StreamToken = Token; - } - Guard = ur_stream_guard_{std::move(ComputeSyncGuard)}; - CUstream Result = EventWaitList[i]->getStream(); - computeStreamWaitForBarrierIfNeeded(Result, StreamI); - return Result; - } - } - } - Guard = {}; - return getNextComputeStream(StreamToken); -} - -CUstream ur_queue_handle_t_::getNextTransferStream() { - if (TransferStreams.empty()) { // for example in in-order queue - return getNextComputeStream(); - } - if (NumTransferStreams < TransferStreams.size()) { - // the check above is for performance - so as not to lock mutex every time - std::lock_guard Guard(TransferStreamMutex); - // The second check is done after mutex is locked so other threads can not - // change NumTransferStreams after that - if (NumTransferStreams < TransferStreams.size()) { - UR_CHECK_ERROR(cuStreamCreateWithPriority( - &TransferStreams[NumTransferStreams++], Flags, Priority)); - } - } - uint32_t StreamI = TransferStreamIndex++ % TransferStreams.size(); - CUstream Result = TransferStreams[StreamI]; - transferStreamWaitForBarrierIfNeeded(Result, StreamI); - return Result; -} - -/// Creates a `ur_queue_handle_t` object on the CUDA backend. -/// Valid properties -/// * __SYCL_PI_CUDA_USE_DEFAULT_STREAM -> CU_STREAM_DEFAULT -/// * __SYCL_PI_CUDA_SYNC_WITH_DEFAULT -> CU_STREAM_NON_BLOCKING -UR_APIEXPORT ur_result_t UR_APICALL -urQueueCreate(ur_context_handle_t hContext, ur_device_handle_t hDevice, - const ur_queue_properties_t *pProps, ur_queue_handle_t *phQueue) { - try { - std::unique_ptr Queue{nullptr}; - - if (hContext->getDevice() != hDevice) { - *phQueue = nullptr; - return UR_RESULT_ERROR_INVALID_DEVICE; - } - - unsigned int Flags = CU_STREAM_NON_BLOCKING; - ur_queue_flags_t URFlags = 0; - // '0' is the default priority, per CUDA Toolkit 12.2 and earlier - int Priority = 0; - bool IsOutOfOrder = false; - if (pProps && pProps->stype == UR_STRUCTURE_TYPE_QUEUE_PROPERTIES) { - URFlags = pProps->flags; - if (URFlags == UR_QUEUE_FLAG_USE_DEFAULT_STREAM) { - Flags = CU_STREAM_DEFAULT; - } else if (URFlags == UR_QUEUE_FLAG_SYNC_WITH_DEFAULT_STREAM) { - Flags = 0; - } - - if (URFlags & UR_QUEUE_FLAG_OUT_OF_ORDER_EXEC_MODE_ENABLE) { - IsOutOfOrder = true; - } - if (URFlags & UR_QUEUE_FLAG_PRIORITY_HIGH) { - ScopedContext Active(hContext); - UR_CHECK_ERROR(cuCtxGetStreamPriorityRange(nullptr, &Priority)); - } else if (URFlags & UR_QUEUE_FLAG_PRIORITY_LOW) { - ScopedContext Active(hContext); - UR_CHECK_ERROR(cuCtxGetStreamPriorityRange(&Priority, nullptr)); - } - } - - std::vector ComputeCuStreams( - IsOutOfOrder ?
ur_queue_handle_t_::DefaultNumComputeStreams : 1); - std::vector TransferCuStreams( - IsOutOfOrder ? ur_queue_handle_t_::DefaultNumTransferStreams : 0); - - Queue = std::unique_ptr(new ur_queue_handle_t_{ - std::move(ComputeCuStreams), std::move(TransferCuStreams), hContext, - hDevice, Flags, URFlags, Priority}); - - *phQueue = Queue.release(); - - return UR_RESULT_SUCCESS; - } catch (ur_result_t Err) { - - return Err; - - } catch (...) { - - return UR_RESULT_ERROR_OUT_OF_RESOURCES; - } -} - -UR_APIEXPORT ur_result_t UR_APICALL urQueueRetain(ur_queue_handle_t hQueue) { - assert(hQueue->getReferenceCount() > 0); - - hQueue->incrementReferenceCount(); - return UR_RESULT_SUCCESS; -} - -UR_APIEXPORT ur_result_t UR_APICALL urQueueRelease(ur_queue_handle_t hQueue) { - if (hQueue->decrementReferenceCount() > 0) { - return UR_RESULT_SUCCESS; - } - - try { - std::unique_ptr Queue(hQueue); - - if (!hQueue->backendHasOwnership()) - return UR_RESULT_SUCCESS; - - ScopedContext Active(hQueue->getContext()); - - hQueue->forEachStream([](CUstream S) { - UR_CHECK_ERROR(cuStreamSynchronize(S)); - UR_CHECK_ERROR(cuStreamDestroy(S)); - }); - - return UR_RESULT_SUCCESS; - } catch (ur_result_t Err) { - return Err; - } catch (...) { - return UR_RESULT_ERROR_OUT_OF_RESOURCES; - } -} - -UR_APIEXPORT ur_result_t UR_APICALL urQueueFinish(ur_queue_handle_t hQueue) { - ur_result_t Result = UR_RESULT_SUCCESS; - - try { - ScopedContext active(hQueue->getContext()); - - hQueue->syncStreams( - [](CUstream s) { UR_CHECK_ERROR(cuStreamSynchronize(s)); }); - - } catch (ur_result_t Err) { - - Result = Err; - - } catch (...) { - - Result = UR_RESULT_ERROR_OUT_OF_RESOURCES; - } - - return Result; -} - -// There is no CUDA counterpart for queue flushing and we don't run into the -// same problem of having to flush cross-queue dependencies as some of the -// other plugins, so it can be left as no-op. 
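[Editor's note: the flag translation performed by urQueueCreate above can be read in isolation: the default-stream and sync-with-default requests pick the CUstream flags, and the priority hints query the context's allowed range. A sketch under the assumption that a context is current; error handling elided, names illustrative.]

```cpp
#include <cuda.h>

struct StreamConfig {
  unsigned int Flags;
  int Priority;
};

StreamConfig TranslateQueueFlags(bool UseDefaultStream, bool SyncWithDefault,
                                 bool HighPriority, bool LowPriority) {
  StreamConfig Cfg{CU_STREAM_NON_BLOCKING, 0}; // the defaults, as above
  if (UseDefaultStream)
    Cfg.Flags = CU_STREAM_DEFAULT;
  else if (SyncWithDefault)
    Cfg.Flags = 0; // blocking stream that synchronizes with the default one
  int Least = 0, Greatest = 0;
  cuCtxGetStreamPriorityRange(&Least, &Greatest);
  if (HighPriority)
    Cfg.Priority = Greatest; // numerically lowest value = highest priority
  else if (LowPriority)
    Cfg.Priority = Least;
  return Cfg;
}
```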
-UR_APIEXPORT ur_result_t UR_APICALL urQueueFlush(ur_queue_handle_t hQueue) { - std::ignore = hQueue; - return UR_RESULT_SUCCESS; -} - -UR_APIEXPORT ur_result_t UR_APICALL -urQueueGetNativeHandle(ur_queue_handle_t hQueue, ur_queue_native_desc_t *pDesc, - ur_native_handle_t *phNativeQueue) { - std::ignore = pDesc; - - ScopedContext Active(hQueue->getContext()); - *phNativeQueue = - reinterpret_cast(hQueue->getNextComputeStream()); - return UR_RESULT_SUCCESS; -} - -UR_APIEXPORT ur_result_t UR_APICALL urQueueCreateWithNativeHandle( - ur_native_handle_t hNativeQueue, ur_context_handle_t hContext, - ur_device_handle_t hDevice, const ur_queue_native_properties_t *pProperties, - ur_queue_handle_t *phQueue) { - (void)hDevice; - - unsigned int CuFlags; - CUstream CuStream = reinterpret_cast(hNativeQueue); - - UR_CHECK_ERROR(cuStreamGetFlags(CuStream, &CuFlags)); - - ur_queue_flags_t Flags = 0; - if (CuFlags == CU_STREAM_DEFAULT) - Flags = UR_QUEUE_FLAG_USE_DEFAULT_STREAM; - else if (CuFlags == CU_STREAM_NON_BLOCKING) - Flags = UR_QUEUE_FLAG_SYNC_WITH_DEFAULT_STREAM; - else - detail::ur::die("Unknown cuda stream"); - - std::vector ComputeCuStreams(1, CuStream); - std::vector TransferCuStreams(0); - - // Create queue and set num_compute_streams to 1, as computeCuStreams has - // valid stream - *phQueue = - new ur_queue_handle_t_{std::move(ComputeCuStreams), - std::move(TransferCuStreams), - hContext, - hContext->getDevice(), - CuFlags, - Flags, - /*backend_owns*/ pProperties->isNativeHandleOwned}; - (*phQueue)->NumComputeStreams = 1; - - return UR_RESULT_SUCCESS; -} - -UR_APIEXPORT ur_result_t UR_APICALL urQueueGetInfo(ur_queue_handle_t hQueue, - ur_queue_info_t propName, - size_t propValueSize, - void *pPropValue, - size_t *pPropSizeRet) { - UrReturnHelper ReturnValue(propValueSize, pPropValue, pPropSizeRet); - - switch (propName) { - case UR_QUEUE_INFO_CONTEXT: - return ReturnValue(hQueue->Context); - case UR_QUEUE_INFO_DEVICE: - return ReturnValue(hQueue->Device); - case UR_QUEUE_INFO_REFERENCE_COUNT: - return ReturnValue(hQueue->getReferenceCount()); - case UR_QUEUE_INFO_FLAGS: - return ReturnValue(hQueue->URFlags); - case UR_QUEUE_INFO_EMPTY: { - try { - bool IsReady = hQueue->allOf([](CUstream S) -> bool { - const CUresult Ret = cuStreamQuery(S); - if (Ret == CUDA_SUCCESS) - return true; - - if (Ret == CUDA_ERROR_NOT_READY) - return false; - - UR_CHECK_ERROR(Ret); - return false; - }); - return ReturnValue(IsReady); - } catch (ur_result_t Err) { - return Err; - } catch (...) { - return UR_RESULT_ERROR_OUT_OF_RESOURCES; - } - } - case UR_QUEUE_INFO_DEVICE_DEFAULT: - case UR_QUEUE_INFO_SIZE: - return UR_RESULT_ERROR_UNSUPPORTED_ENUMERATION; - default: - return UR_RESULT_ERROR_INVALID_ENUMERATION; - } -} diff --git a/sycl/plugins/unified_runtime/ur/adapters/cuda/queue.hpp b/sycl/plugins/unified_runtime/ur/adapters/cuda/queue.hpp deleted file mode 100644 index 4f2721b13aed6..0000000000000 --- a/sycl/plugins/unified_runtime/ur/adapters/cuda/queue.hpp +++ /dev/null @@ -1,246 +0,0 @@ -//===--------- queue.hpp - CUDA Adapter -----------------------------------===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// -#pragma once - -#include - -#include -#include -#include - -using ur_stream_guard_ = std::unique_lock; - -/// UR queue mapping on to CUstream objects. 
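[Editor's note: the emptiness probe behind UR_QUEUE_INFO_EMPTY above hinges on cuStreamQuery answering without blocking, with anything other than the two expected codes treated as a real error. A free-function sketch over a caller-provided stream list.]

```cpp
#include <cuda.h>
#include <vector>

// Returns 1 if every stream has drained, 0 if work is pending, -1 on error.
int QueueIsEmpty(const std::vector<CUstream> &Streams) {
  for (CUstream S : Streams) {
    const CUresult Ret = cuStreamQuery(S); // non-blocking status check
    if (Ret == CUDA_ERROR_NOT_READY)
      return 0;
    if (Ret != CUDA_SUCCESS)
      return -1;
  }
  return 1;
}
```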
-/// -struct ur_queue_handle_t_ { - - using native_type = CUstream; - static constexpr int DefaultNumComputeStreams = 128; - static constexpr int DefaultNumTransferStreams = 64; - - std::vector ComputeStreams; - std::vector TransferStreams; - // DelayCompute keeps track of which streams have been recently reused and - // their next use should be delayed. If a stream has been recently reused it - // will be skipped the next time it would be selected round-robin style. When - // skipped, its delay flag is cleared. - std::vector DelayCompute; - // keep track of which streams have the barrier applied - std::vector ComputeAppliedBarrier; - std::vector TransferAppliedBarrier; - ur_context_handle_t_ *Context; - ur_device_handle_t_ *Device; - CUevent BarrierEvent = nullptr; - CUevent BarrierTmpEvent = nullptr; - std::atomic_uint32_t RefCount; - std::atomic_uint32_t EventCount; - std::atomic_uint32_t ComputeStreamIndex; - std::atomic_uint32_t TransferStreamIndex; - unsigned int NumComputeStreams; - unsigned int NumTransferStreams; - unsigned int LastSyncComputeStreams; - unsigned int LastSyncTransferStreams; - unsigned int Flags; - ur_queue_flags_t URFlags; - int Priority; - // When ComputeStreamSyncMutex and ComputeStreamMutex both need to be - // locked at the same time, ComputeStreamSyncMutex should be locked first - // to avoid deadlocks - std::mutex ComputeStreamSyncMutex; - std::mutex ComputeStreamMutex; - std::mutex TransferStreamMutex; - std::mutex BarrierMutex; - bool HasOwnership; - - ur_queue_handle_t_(std::vector &&ComputeStreams, - std::vector &&TransferStreams, - ur_context_handle_t_ *Context, ur_device_handle_t_ *Device, - unsigned int Flags, ur_queue_flags_t URFlags, int Priority, - bool BackendOwns = true) - : ComputeStreams{std::move(ComputeStreams)}, - TransferStreams{std::move(TransferStreams)}, - DelayCompute(this->ComputeStreams.size(), false), - ComputeAppliedBarrier(this->ComputeStreams.size()), - TransferAppliedBarrier(this->TransferStreams.size()), Context{Context}, - Device{Device}, RefCount{1}, EventCount{0}, ComputeStreamIndex{0}, - TransferStreamIndex{0}, NumComputeStreams{0}, NumTransferStreams{0}, - LastSyncComputeStreams{0}, LastSyncTransferStreams{0}, Flags(Flags), - URFlags(URFlags), Priority(Priority), HasOwnership{BackendOwns} { - urContextRetain(Context); - urDeviceRetain(Device); - } - - ~ur_queue_handle_t_() { - urContextRelease(Context); - urDeviceRelease(Device); - } - - void computeStreamWaitForBarrierIfNeeded(CUstream Stream, uint32_t StreamI); - void transferStreamWaitForBarrierIfNeeded(CUstream Stream, uint32_t StreamI); - - // getNextComputeStream/getNextTransferStream() functions return streams from - // appropriate pools in round-robin fashion - native_type getNextComputeStream(uint32_t *StreamToken = nullptr); - // this overload tries to select a stream that was used by one of the dependencies. - // If that is not possible returns a new stream.
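[Editor's note: the applied-barrier bookkeeping declared above reduces to one helper: each stream waits on the barrier event at most once, recorded in a per-stream flag. A sketch with illustrative names, assuming Barrier was recorded earlier with cuEventRecord.]

```cpp
#include <cuda.h>
#include <cstddef>
#include <vector>

void WaitForBarrierOnce(CUstream Stream, size_t StreamI, CUevent Barrier,
                        std::vector<bool> &Applied) {
  if (Barrier && !Applied[StreamI]) {
    cuStreamWaitEvent(Stream, Barrier, 0); // future work waits; host does not
    Applied[StreamI] = true;               // do not re-arm on the next pick
  }
}
```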
If a stream is reused it - // returns a lock that needs to remain locked as long as the stream is in use - native_type getNextComputeStream(uint32_t NumEventsInWaitList, - const ur_event_handle_t *EventWaitList, - ur_stream_guard_ &Guard, - uint32_t *StreamToken = nullptr); - native_type getNextTransferStream(); - native_type get() { return getNextComputeStream(); }; - - bool hasBeenSynchronized(uint32_t StreamToken) { - // stream token not associated with one of the compute streams - if (StreamToken == std::numeric_limits::max()) { - return false; - } - return LastSyncComputeStreams > StreamToken; - } - - bool canReuseStream(uint32_t StreamToken) { - // stream token not associated with one of the compute streams - if (StreamToken == std::numeric_limits::max()) { - return false; - } - // If the command represented by the stream token was not the last command - // enqueued to the stream we can not reuse the stream - we need to allow for - // commands enqueued after it and the one we are about to enqueue to run - // concurrently - bool IsLastCommand = - (ComputeStreamIndex - StreamToken) <= ComputeStreams.size(); - // If there was a barrier enqueued to the queue after the command - // represented by the stream token we should not reuse the stream, as we can - // not take that stream into account for the bookkeeping for the next - // barrier - such a stream would not be synchronized with. Performance-wise - // it does not matter that we do not reuse the stream, as the work - // represented by the stream token is guaranteed to be complete by the - // barrier before any work we are about to enqueue to the stream will start, - // so the event does not need to be synchronized with. - return IsLastCommand && !hasBeenSynchronized(StreamToken); - } - - template bool allOf(T &&F) { - { - std::lock_guard ComputeGuard(ComputeStreamMutex); - unsigned int End = std::min( - static_cast(ComputeStreams.size()), NumComputeStreams); - if (!std::all_of(ComputeStreams.begin(), ComputeStreams.begin() + End, F)) - return false; - } - { - std::lock_guard TransferGuard(TransferStreamMutex); - unsigned int End = - std::min(static_cast(TransferStreams.size()), - NumTransferStreams); - if (!std::all_of(TransferStreams.begin(), TransferStreams.begin() + End, - F)) - return false; - } - return true; - } - - template void forEachStream(T &&F) { - { - std::lock_guard compute_guard(ComputeStreamMutex); - unsigned int End = std::min( - static_cast(ComputeStreams.size()), NumComputeStreams); - for (unsigned int i = 0; i < End; i++) { - F(ComputeStreams[i]); - } - } - { - std::lock_guard transfer_guard(TransferStreamMutex); - unsigned int End = - std::min(static_cast(TransferStreams.size()), - NumTransferStreams); - for (unsigned int i = 0; i < End; i++) { - F(TransferStreams[i]); - } - } - } - - template void syncStreams(T &&F) { - auto SyncCompute = [&F, &Streams = ComputeStreams, &Delay = DelayCompute]( - unsigned int Start, unsigned int Stop) { - for (unsigned int i = Start; i < Stop; i++) { - F(Streams[i]); - Delay[i] = false; - } - }; - auto SyncTransfer = [&F, &streams = TransferStreams](unsigned int Start, - unsigned int Stop) { - for (unsigned int i = Start; i < Stop; i++) { - F(streams[i]); - } - }; - { - unsigned int Size = static_cast(ComputeStreams.size()); - std::lock_guard ComputeSyncGuard(ComputeStreamSyncMutex); - std::lock_guard ComputeGuard(ComputeStreamMutex); - unsigned int Start = LastSyncComputeStreams; - unsigned int End = NumComputeStreams < Size ? 
NumComputeStreams - : ComputeStreamIndex.load(); - if (ResetUsed) { - LastSyncComputeStreams = End; - } - if (End - Start >= Size) { - SyncCompute(0, Size); - } else { - Start %= Size; - End %= Size; - if (Start <= End) { - SyncCompute(Start, End); - } else { - SyncCompute(Start, Size); - SyncCompute(0, End); - } - } - } - { - unsigned int Size = static_cast(TransferStreams.size()); - if (!Size) { - return; - } - std::lock_guard TransferGuard(TransferStreamMutex); - unsigned int Start = LastSyncTransferStreams; - unsigned int End = NumTransferStreams < Size ? NumTransferStreams - : TransferStreamIndex.load(); - if (ResetUsed) { - LastSyncTransferStreams = End; - } - if (End - Start >= Size) { - SyncTransfer(0, Size); - } else { - Start %= Size; - End %= Size; - if (Start <= End) { - SyncTransfer(Start, End); - } else { - SyncTransfer(Start, Size); - SyncTransfer(0, End); - } - } - } - } - - ur_context_handle_t_ *getContext() const { return Context; }; - - ur_device_handle_t_ *get_device() const { return Device; }; - - uint32_t incrementReferenceCount() noexcept { return ++RefCount; } - - uint32_t decrementReferenceCount() noexcept { return --RefCount; } - - uint32_t getReferenceCount() const noexcept { return RefCount; } - - uint32_t getNextEventID() noexcept { return ++EventCount; } - - bool backendHasOwnership() const noexcept { return HasOwnership; } -}; diff --git a/sycl/plugins/unified_runtime/ur/adapters/cuda/sampler.cpp b/sycl/plugins/unified_runtime/ur/adapters/cuda/sampler.cpp deleted file mode 100644 index e561f4902b1d5..0000000000000 --- a/sycl/plugins/unified_runtime/ur/adapters/cuda/sampler.cpp +++ /dev/null @@ -1,106 +0,0 @@ -//===--------- sampler.cpp - CUDA Adapter ---------------------------------===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. 
diff --git a/sycl/plugins/unified_runtime/ur/adapters/cuda/sampler.cpp b/sycl/plugins/unified_runtime/ur/adapters/cuda/sampler.cpp deleted file mode 100644 index e561f4902b1d5..0000000000000 --- a/sycl/plugins/unified_runtime/ur/adapters/cuda/sampler.cpp +++ /dev/null @@ -1,106 +0,0 @@ -//===--------- sampler.cpp - CUDA Adapter ---------------------------------===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// - -#include "sampler.hpp" -#include "common.hpp" - -UR_APIEXPORT ur_result_t UR_APICALL -urSamplerCreate(ur_context_handle_t hContext, const ur_sampler_desc_t *pDesc, - ur_sampler_handle_t *phSampler) { - std::unique_ptr<ur_sampler_handle_t_> Sampler{ - new ur_sampler_handle_t_(hContext)}; - - if (pDesc && pDesc->stype == UR_STRUCTURE_TYPE_SAMPLER_DESC) { - Sampler->Props |= pDesc->normalizedCoords; - Sampler->Props |= pDesc->filterMode << 1; - Sampler->Props |= pDesc->addressingMode << 2; - } else { - // Set default values - Sampler->Props |= true; // Normalized Coords - Sampler->Props |= UR_SAMPLER_ADDRESSING_MODE_CLAMP << 2; - } - - void *pNext = const_cast<void *>(pDesc->pNext); - while (pNext != nullptr) { - const ur_base_desc_t *BaseDesc = - reinterpret_cast<const ur_base_desc_t *>(pNext); - if (BaseDesc->stype == UR_STRUCTURE_TYPE_EXP_SAMPLER_MIP_PROPERTIES) { - const ur_exp_sampler_mip_properties_t *SamplerMipProperties = - reinterpret_cast<const ur_exp_sampler_mip_properties_t *>(pNext); - Sampler->MaxMipmapLevelClamp = SamplerMipProperties->maxMipmapLevelClamp; - Sampler->MinMipmapLevelClamp = SamplerMipProperties->minMipmapLevelClamp; - Sampler->MaxAnisotropy = SamplerMipProperties->maxAnisotropy; - Sampler->Props |= SamplerMipProperties->mipFilterMode << 5; - } - pNext = const_cast<void *>(BaseDesc->pNext); - } - - *phSampler = Sampler.release(); - return UR_RESULT_SUCCESS; -} - -UR_APIEXPORT ur_result_t UR_APICALL -urSamplerGetInfo(ur_sampler_handle_t hSampler, ur_sampler_info_t propName, - size_t propValueSize, void *pPropValue, size_t *pPropSizeRet) { - UrReturnHelper ReturnValue(propValueSize, pPropValue, pPropSizeRet); - - switch (propName) { - case UR_SAMPLER_INFO_REFERENCE_COUNT: - return ReturnValue(hSampler->getReferenceCount()); - case UR_SAMPLER_INFO_CONTEXT: - return ReturnValue(hSampler->Context); - case UR_SAMPLER_INFO_NORMALIZED_COORDS: { - bool NormCoordsProp = hSampler->isNormalizedCoords(); - return ReturnValue(NormCoordsProp); - } - case UR_SAMPLER_INFO_FILTER_MODE: { - ur_sampler_filter_mode_t FilterProp = hSampler->getFilterMode(); - return ReturnValue(FilterProp); - } - case UR_SAMPLER_INFO_ADDRESSING_MODE: { - ur_sampler_addressing_mode_t AddressingProp = hSampler->getAddressingMode(); - return ReturnValue(AddressingProp); - } - default: - return UR_RESULT_ERROR_UNSUPPORTED_ENUMERATION; - } - return {}; -} - -UR_APIEXPORT ur_result_t UR_APICALL -urSamplerRetain(ur_sampler_handle_t hSampler) { - hSampler->incrementReferenceCount(); - return UR_RESULT_SUCCESS; -} - -UR_APIEXPORT ur_result_t UR_APICALL -urSamplerRelease(ur_sampler_handle_t hSampler) { - // double delete or someone is messing with the ref count. - // either way, cannot safely proceed. - detail::ur::assertion( - hSampler->getReferenceCount() != 0, - "Reference count overflow detected in urSamplerRelease."); - - // decrement ref count. If it is 0, delete the sampler.
- if (hSampler->decrementReferenceCount() == 0) { - delete hSampler; - } - - return UR_RESULT_SUCCESS; -} - -UR_APIEXPORT ur_result_t UR_APICALL -urSamplerGetNativeHandle(ur_sampler_handle_t, ur_native_handle_t *) { - return UR_RESULT_ERROR_UNSUPPORTED_FEATURE; -} - -UR_APIEXPORT ur_result_t UR_APICALL urSamplerCreateWithNativeHandle( - ur_native_handle_t, ur_context_handle_t, - const ur_sampler_native_properties_t *, ur_sampler_handle_t *) { - return UR_RESULT_ERROR_UNSUPPORTED_FEATURE; -} diff --git a/sycl/plugins/unified_runtime/ur/adapters/cuda/sampler.hpp b/sycl/plugins/unified_runtime/ur/adapters/cuda/sampler.hpp deleted file mode 100644 index 8c362b98c9e80..0000000000000 --- a/sycl/plugins/unified_runtime/ur/adapters/cuda/sampler.hpp +++ /dev/null @@ -1,54 +0,0 @@ -//===--------- sampler.hpp - CUDA Adapter ---------------------------------===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// - -#include <ur/ur.hpp> - -/// Implementation of samplers for CUDA -/// -/// Sampler property layout: -/// | <bits> | <usage> -/// ----------------------------------- -/// | 31 30 ... 6 | N/A -/// | 5 | mip filter mode -/// | 4 3 2 | addressing mode -/// | 1 | filter mode -/// | 0 | normalize coords -struct ur_sampler_handle_t_ { - std::atomic_uint32_t RefCount; - uint32_t Props; - float MinMipmapLevelClamp; - float MaxMipmapLevelClamp; - float MaxAnisotropy; - ur_context_handle_t Context; - - ur_sampler_handle_t_(ur_context_handle_t Context) - : RefCount(1), Props(0), MinMipmapLevelClamp(0.0f), - MaxMipmapLevelClamp(0.0f), MaxAnisotropy(0.0f), Context(Context) {} - - uint32_t incrementReferenceCount() noexcept { return ++RefCount; } - - uint32_t decrementReferenceCount() noexcept { return --RefCount; } - - uint32_t getReferenceCount() const noexcept { return RefCount; } - - ur_bool_t isNormalizedCoords() const noexcept { - return static_cast<ur_bool_t>(Props & 0b1); - } - - ur_sampler_filter_mode_t getFilterMode() const noexcept { - return static_cast<ur_sampler_filter_mode_t>((Props >> 1) & 0b1); - } - - ur_sampler_addressing_mode_t getAddressingMode() const noexcept { - return static_cast<ur_sampler_addressing_mode_t>((Props >> 2) & 0b111); - } - - ur_sampler_filter_mode_t getMipFilterMode() const noexcept { - return static_cast<ur_sampler_filter_mode_t>((Props >> 5) & 0b1); - } -};
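
The bit layout documented in the header above packs all sampler state into the single 32-bit Props word, and the getters recover each field with a shift and a mask, mirroring the shifts applied in urSamplerCreate. A self-contained example of the same packing and unpacking (the field values are illustrative only):

    #include <cassert>
    #include <cstdint>

    int main() {
      uint32_t Props = 0;
      Props |= 1u;      // bit 0: normalized coords
      Props |= 1u << 1; // bit 1: filter mode
      Props |= 2u << 2; // bits 2-4: addressing mode
      Props |= 1u << 5; // bit 5: mip filter mode

      assert((Props & 0b1) == 1);          // isNormalizedCoords
      assert(((Props >> 1) & 0b1) == 1);   // getFilterMode
      assert(((Props >> 2) & 0b111) == 2); // getAddressingMode
      assert(((Props >> 5) & 0b1) == 1);   // getMipFilterMode
      return 0;
    }
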
diff --git a/sycl/plugins/unified_runtime/ur/adapters/cuda/tracing.cpp b/sycl/plugins/unified_runtime/ur/adapters/cuda/tracing.cpp deleted file mode 100644 index 9c0183960eebb..0000000000000 --- a/sycl/plugins/unified_runtime/ur/adapters/cuda/tracing.cpp +++ /dev/null @@ -1,109 +0,0 @@ -//===-------------- tracing.cpp - CUDA Host API Tracing --------------------==// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// - -#ifdef XPTI_ENABLE_INSTRUMENTATION -#include <xpti/xpti_data_types.h> -#include <xpti/xpti_trace_framework.h> -#endif - -#include <cuda.h> -#ifdef XPTI_ENABLE_INSTRUMENTATION -#include <cupti.h> -#endif // XPTI_ENABLE_INSTRUMENTATION - -#include <exception> -#include <iostream> - -#ifdef XPTI_ENABLE_INSTRUMENTATION -constexpr auto CUDA_CALL_STREAM_NAME = "sycl.experimental.cuda.call"; -constexpr auto CUDA_DEBUG_STREAM_NAME = "sycl.experimental.cuda.debug"; - -thread_local uint64_t CallCorrelationID = 0; -thread_local uint64_t DebugCorrelationID = 0; - -static xpti_td *GCallEvent = nullptr; -static xpti_td *GDebugEvent = nullptr; - -constexpr auto GVerStr = "0.1"; -constexpr int GMajVer = 0; -constexpr int GMinVer = 1; - -static void cuptiCallback(void *, CUpti_CallbackDomain, CUpti_CallbackId CBID, - const void *CBData) { - if (xptiTraceEnabled()) { - const auto *CBInfo = static_cast<const CUpti_CallbackData *>(CBData); - - if (CBInfo->callbackSite == CUPTI_API_ENTER) { - CallCorrelationID = xptiGetUniqueId(); - DebugCorrelationID = xptiGetUniqueId(); - } - - const char *FuncName = CBInfo->functionName; - uint32_t FuncID = static_cast<uint32_t>(CBID); - uint16_t TraceTypeArgs = CBInfo->callbackSite == CUPTI_API_ENTER - ? xpti::trace_function_with_args_begin - : xpti::trace_function_with_args_end; - uint16_t TraceType = CBInfo->callbackSite == CUPTI_API_ENTER - ? xpti::trace_function_begin - : xpti::trace_function_end; - - uint8_t CallStreamID = xptiRegisterStream(CUDA_CALL_STREAM_NAME); - uint8_t DebugStreamID = xptiRegisterStream(CUDA_DEBUG_STREAM_NAME); - - xptiNotifySubscribers(CallStreamID, TraceType, GCallEvent, nullptr, - CallCorrelationID, FuncName); - - xpti::function_with_args_t Payload{ - FuncID, FuncName, const_cast<void *>(CBInfo->functionParams), - CBInfo->functionReturnValue, CBInfo->context}; - xptiNotifySubscribers(DebugStreamID, TraceTypeArgs, GDebugEvent, nullptr, - DebugCorrelationID, &Payload); - } -} -#endif - -void enableCUDATracing() { -#ifdef XPTI_ENABLE_INSTRUMENTATION - if (!xptiTraceEnabled()) - return; - - xptiRegisterStream(CUDA_CALL_STREAM_NAME); - xptiInitialize(CUDA_CALL_STREAM_NAME, GMajVer, GMinVer, GVerStr); - xptiRegisterStream(CUDA_DEBUG_STREAM_NAME); - xptiInitialize(CUDA_DEBUG_STREAM_NAME, GMajVer, GMinVer, GVerStr); - - uint64_t Dummy; - xpti::payload_t CUDAPayload("CUDA Plugin Layer"); - GCallEvent = - xptiMakeEvent("CUDA Plugin Layer", &CUDAPayload, - xpti::trace_algorithm_event, xpti_at::active, &Dummy); - - xpti::payload_t CUDADebugPayload("CUDA Plugin Debug Layer"); - GDebugEvent = - xptiMakeEvent("CUDA Plugin Debug Layer", &CUDADebugPayload, - xpti::trace_algorithm_event, xpti_at::active, &Dummy); - - CUpti_SubscriberHandle Subscriber; - cuptiSubscribe(&Subscriber, cuptiCallback, nullptr); - cuptiEnableDomain(1, Subscriber, CUPTI_CB_DOMAIN_DRIVER_API); - cuptiEnableCallback(0, Subscriber, CUPTI_CB_DOMAIN_DRIVER_API, - CUPTI_DRIVER_TRACE_CBID_cuGetErrorString); - cuptiEnableCallback(0, Subscriber, CUPTI_CB_DOMAIN_DRIVER_API, - CUPTI_DRIVER_TRACE_CBID_cuGetErrorName); -#endif -} - -void disableCUDATracing() { -#ifdef XPTI_ENABLE_INSTRUMENTATION - if (!xptiTraceEnabled()) - return; - - xptiFinalize(CUDA_CALL_STREAM_NAME); - xptiFinalize(CUDA_DEBUG_STREAM_NAME); -#endif // XPTI_ENABLE_INSTRUMENTATION -}
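
Each traced driver-API call in the deleted tracing.cpp produces a begin/end pair of XPTI notifications, correlated by IDs captured when the call is entered. The control flow of cuptiCallback reduces to the following standalone sketch (CallbackSite, traceCallback, and the console output are illustrative stand-ins, not the CUPTI/XPTI APIs):

    #include <cstdint>
    #include <iostream>

    enum CallbackSite { API_ENTER, API_EXIT };

    thread_local uint64_t CorrelationID = 0;

    void traceCallback(CallbackSite Site, const char *FuncName) {
      if (Site == API_ENTER) {
        static uint64_t NextID = 0;
        CorrelationID = ++NextID; // fresh ID captured on entry
        std::cout << "begin " << FuncName << " #" << CorrelationID << "\n";
      } else {
        // Exit reuses the ID captured at entry on the same thread.
        std::cout << "end " << FuncName << " #" << CorrelationID << "\n";
      }
    }

    int main() {
      traceCallback(API_ENTER, "cuLaunchKernel");
      traceCallback(API_EXIT, "cuLaunchKernel");
      return 0;
    }
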
diff --git a/sycl/plugins/unified_runtime/ur/adapters/cuda/ur_interface_loader.cpp b/sycl/plugins/unified_runtime/ur/adapters/cuda/ur_interface_loader.cpp deleted file mode 100644 index 73eace5818dfd..0000000000000 --- a/sycl/plugins/unified_runtime/ur/adapters/cuda/ur_interface_loader.cpp +++ /dev/null @@ -1,355 +0,0 @@ -//===--------- ur_interface_loader.cpp - Unified Runtime -----------------===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// - -#include <ur_api.h> -#include <ur_ddi.h> - -namespace { - -// TODO - this is a duplicate of what is in the L0 plugin -// We should move this to somewhere common -ur_result_t validateProcInputs(ur_api_version_t version, void *pDdiTable) { - if (pDdiTable == nullptr) { - return UR_RESULT_ERROR_INVALID_NULL_POINTER; - } - // Pre 1.0 we enforce that loader and adapter must have the same version. - // Post 1.0 only a major version match should be required. - if (version != UR_API_VERSION_CURRENT) { - return UR_RESULT_ERROR_UNSUPPORTED_VERSION; - } - return UR_RESULT_SUCCESS; -} -} // namespace - -#if defined(__cplusplus) -extern "C" { -#endif - -UR_DLLEXPORT ur_result_t UR_APICALL urGetPlatformProcAddrTable( - ur_api_version_t version, ur_platform_dditable_t *pDdiTable) { - auto result = validateProcInputs(version, pDdiTable); - if (UR_RESULT_SUCCESS != result) { - return result; - } - pDdiTable->pfnCreateWithNativeHandle = nullptr; - pDdiTable->pfnGet = urPlatformGet; - pDdiTable->pfnGetApiVersion = urPlatformGetApiVersion; - pDdiTable->pfnGetInfo = urPlatformGetInfo; - pDdiTable->pfnGetNativeHandle = urPlatformGetNativeHandle; - pDdiTable->pfnGetBackendOption = urPlatformGetBackendOption; - return UR_RESULT_SUCCESS; -} - -UR_DLLEXPORT ur_result_t UR_APICALL urGetContextProcAddrTable( - ur_api_version_t version, ur_context_dditable_t *pDdiTable) { - auto result = validateProcInputs(version, pDdiTable); - if (UR_RESULT_SUCCESS != result) { - return result; - } - pDdiTable->pfnCreate = urContextCreate; - pDdiTable->pfnCreateWithNativeHandle = urContextCreateWithNativeHandle; - pDdiTable->pfnGetInfo = urContextGetInfo; - pDdiTable->pfnGetNativeHandle = urContextGetNativeHandle; - pDdiTable->pfnRelease = urContextRelease; - pDdiTable->pfnRetain = urContextRetain; - pDdiTable->pfnSetExtendedDeleter = urContextSetExtendedDeleter; - return UR_RESULT_SUCCESS; -} - -UR_DLLEXPORT ur_result_t UR_APICALL urGetEventProcAddrTable( - ur_api_version_t version, ur_event_dditable_t *pDdiTable) { - auto result = validateProcInputs(version, pDdiTable); - if (UR_RESULT_SUCCESS != result) { - return result; - } - pDdiTable->pfnCreateWithNativeHandle = urEventCreateWithNativeHandle; - pDdiTable->pfnGetInfo = urEventGetInfo; - pDdiTable->pfnGetNativeHandle = urEventGetNativeHandle; - pDdiTable->pfnGetProfilingInfo = urEventGetProfilingInfo; - pDdiTable->pfnRelease = urEventRelease; - pDdiTable->pfnRetain = urEventRetain; - pDdiTable->pfnSetCallback = urEventSetCallback; - pDdiTable->pfnWait = urEventWait; - return UR_RESULT_SUCCESS; -} - -UR_DLLEXPORT ur_result_t UR_APICALL urGetProgramProcAddrTable( - ur_api_version_t version, ur_program_dditable_t *pDdiTable) { - auto result = validateProcInputs(version, pDdiTable); - if (UR_RESULT_SUCCESS != result) { - return result; - } - pDdiTable->pfnBuild = urProgramBuild; - pDdiTable->pfnCompile = urProgramCompile; - pDdiTable->pfnCreateWithBinary = urProgramCreateWithBinary; - pDdiTable->pfnCreateWithIL = urProgramCreateWithIL; - pDdiTable->pfnCreateWithNativeHandle = urProgramCreateWithNativeHandle; -
pDdiTable->pfnGetBuildInfo = urProgramGetBuildInfo; - pDdiTable->pfnGetFunctionPointer = urProgramGetFunctionPointer; - pDdiTable->pfnGetInfo = urProgramGetInfo; - pDdiTable->pfnGetNativeHandle = urProgramGetNativeHandle; - pDdiTable->pfnLink = urProgramLink; - pDdiTable->pfnRelease = urProgramRelease; - pDdiTable->pfnRetain = urProgramRetain; - pDdiTable->pfnSetSpecializationConstants = - urProgramSetSpecializationConstants; - return UR_RESULT_SUCCESS; -} - -UR_DLLEXPORT ur_result_t UR_APICALL urGetKernelProcAddrTable( - ur_api_version_t version, ur_kernel_dditable_t *pDdiTable) { - auto result = validateProcInputs(version, pDdiTable); - if (UR_RESULT_SUCCESS != result) { - return result; - } - pDdiTable->pfnCreate = urKernelCreate; - pDdiTable->pfnCreateWithNativeHandle = urKernelCreateWithNativeHandle; - pDdiTable->pfnGetGroupInfo = urKernelGetGroupInfo; - pDdiTable->pfnGetInfo = urKernelGetInfo; - pDdiTable->pfnGetNativeHandle = urKernelGetNativeHandle; - pDdiTable->pfnGetSubGroupInfo = urKernelGetSubGroupInfo; - pDdiTable->pfnRelease = urKernelRelease; - pDdiTable->pfnRetain = urKernelRetain; - pDdiTable->pfnSetArgLocal = urKernelSetArgLocal; - pDdiTable->pfnSetArgMemObj = urKernelSetArgMemObj; - pDdiTable->pfnSetArgPointer = urKernelSetArgPointer; - pDdiTable->pfnSetArgSampler = urKernelSetArgSampler; - pDdiTable->pfnSetArgValue = urKernelSetArgValue; - pDdiTable->pfnSetExecInfo = urKernelSetExecInfo; - pDdiTable->pfnSetSpecializationConstants = nullptr; - return UR_RESULT_SUCCESS; -} - -UR_DLLEXPORT ur_result_t UR_APICALL urGetSamplerProcAddrTable( - ur_api_version_t version, ur_sampler_dditable_t *pDdiTable) { - auto result = validateProcInputs(version, pDdiTable); - if (UR_RESULT_SUCCESS != result) { - return result; - } - pDdiTable->pfnCreate = urSamplerCreate; - pDdiTable->pfnCreateWithNativeHandle = urSamplerCreateWithNativeHandle; - pDdiTable->pfnGetInfo = urSamplerGetInfo; - pDdiTable->pfnGetNativeHandle = urSamplerGetNativeHandle; - pDdiTable->pfnRelease = urSamplerRelease; - pDdiTable->pfnRetain = urSamplerRetain; - return UR_RESULT_SUCCESS; -} - -UR_DLLEXPORT ur_result_t UR_APICALL -urGetMemProcAddrTable(ur_api_version_t version, ur_mem_dditable_t *pDdiTable) { - auto result = validateProcInputs(version, pDdiTable); - if (UR_RESULT_SUCCESS != result) { - return result; - } - pDdiTable->pfnBufferCreate = urMemBufferCreate; - pDdiTable->pfnBufferPartition = urMemBufferPartition; - pDdiTable->pfnBufferCreateWithNativeHandle = - urMemBufferCreateWithNativeHandle; - pDdiTable->pfnImageCreateWithNativeHandle = urMemImageCreateWithNativeHandle; - pDdiTable->pfnGetInfo = urMemGetInfo; - pDdiTable->pfnGetNativeHandle = urMemGetNativeHandle; - pDdiTable->pfnImageCreate = urMemImageCreate; - pDdiTable->pfnImageGetInfo = urMemImageGetInfo; - pDdiTable->pfnRelease = urMemRelease; - pDdiTable->pfnRetain = urMemRetain; - return UR_RESULT_SUCCESS; -} - -UR_DLLEXPORT ur_result_t UR_APICALL urGetEnqueueProcAddrTable( - ur_api_version_t version, ur_enqueue_dditable_t *pDdiTable) { - auto result = validateProcInputs(version, pDdiTable); - if (UR_RESULT_SUCCESS != result) { - return result; - } - pDdiTable->pfnDeviceGlobalVariableRead = urEnqueueDeviceGlobalVariableRead; - pDdiTable->pfnDeviceGlobalVariableWrite = urEnqueueDeviceGlobalVariableWrite; - pDdiTable->pfnEventsWait = urEnqueueEventsWait; - pDdiTable->pfnEventsWaitWithBarrier = urEnqueueEventsWaitWithBarrier; - pDdiTable->pfnKernelLaunch = urEnqueueKernelLaunch; - pDdiTable->pfnMemBufferCopy = urEnqueueMemBufferCopy; - 
pDdiTable->pfnMemBufferCopyRect = urEnqueueMemBufferCopyRect; - pDdiTable->pfnMemBufferFill = urEnqueueMemBufferFill; - pDdiTable->pfnMemBufferMap = urEnqueueMemBufferMap; - pDdiTable->pfnMemBufferRead = urEnqueueMemBufferRead; - pDdiTable->pfnMemBufferReadRect = urEnqueueMemBufferReadRect; - pDdiTable->pfnMemBufferWrite = urEnqueueMemBufferWrite; - pDdiTable->pfnMemBufferWriteRect = urEnqueueMemBufferWriteRect; - pDdiTable->pfnMemImageCopy = urEnqueueMemImageCopy; - pDdiTable->pfnMemImageRead = urEnqueueMemImageRead; - pDdiTable->pfnMemImageWrite = urEnqueueMemImageWrite; - pDdiTable->pfnMemUnmap = urEnqueueMemUnmap; - pDdiTable->pfnUSMFill2D = urEnqueueUSMFill2D; - pDdiTable->pfnUSMFill = urEnqueueUSMFill; - pDdiTable->pfnUSMAdvise = urEnqueueUSMAdvise; - pDdiTable->pfnUSMMemcpy2D = urEnqueueUSMMemcpy2D; - pDdiTable->pfnUSMMemcpy = urEnqueueUSMMemcpy; - pDdiTable->pfnUSMPrefetch = urEnqueueUSMPrefetch; - pDdiTable->pfnReadHostPipe = urEnqueueReadHostPipe; - pDdiTable->pfnWriteHostPipe = urEnqueueWriteHostPipe; - return UR_RESULT_SUCCESS; -} - -UR_DLLEXPORT ur_result_t UR_APICALL urGetGlobalProcAddrTable( - ur_api_version_t version, ur_global_dditable_t *pDdiTable) { - auto result = validateProcInputs(version, pDdiTable); - if (UR_RESULT_SUCCESS != result) { - return result; - } - pDdiTable->pfnInit = urInit; - pDdiTable->pfnTearDown = urTearDown; - pDdiTable->pfnAdapterGet = urAdapterGet; - pDdiTable->pfnAdapterRelease = urAdapterRelease; - pDdiTable->pfnAdapterRetain = urAdapterRetain; - pDdiTable->pfnAdapterGetLastError = urAdapterGetLastError; - pDdiTable->pfnAdapterGetInfo = urAdapterGetInfo; - - return UR_RESULT_SUCCESS; -} - -UR_DLLEXPORT ur_result_t UR_APICALL urGetQueueProcAddrTable( - ur_api_version_t version, ur_queue_dditable_t *pDdiTable) { - auto result = validateProcInputs(version, pDdiTable); - if (UR_RESULT_SUCCESS != result) { - return result; - } - pDdiTable->pfnCreate = urQueueCreate; - pDdiTable->pfnCreateWithNativeHandle = urQueueCreateWithNativeHandle; - pDdiTable->pfnFinish = urQueueFinish; - pDdiTable->pfnFlush = urQueueFlush; - pDdiTable->pfnGetInfo = urQueueGetInfo; - pDdiTable->pfnGetNativeHandle = urQueueGetNativeHandle; - pDdiTable->pfnRelease = urQueueRelease; - pDdiTable->pfnRetain = urQueueRetain; - return UR_RESULT_SUCCESS; -} - -UR_DLLEXPORT ur_result_t UR_APICALL -urGetUSMProcAddrTable(ur_api_version_t version, ur_usm_dditable_t *pDdiTable) { - auto result = validateProcInputs(version, pDdiTable); - if (UR_RESULT_SUCCESS != result) { - return result; - } - pDdiTable->pfnDeviceAlloc = urUSMDeviceAlloc; - pDdiTable->pfnFree = urUSMFree; - pDdiTable->pfnGetMemAllocInfo = urUSMGetMemAllocInfo; - pDdiTable->pfnHostAlloc = urUSMHostAlloc; - pDdiTable->pfnPoolCreate = urUSMPoolCreate; - pDdiTable->pfnPoolRetain = urUSMPoolRetain; - pDdiTable->pfnPoolRelease = urUSMPoolRelease; - pDdiTable->pfnPoolGetInfo = urUSMPoolGetInfo; - pDdiTable->pfnSharedAlloc = urUSMSharedAlloc; - return UR_RESULT_SUCCESS; -} - -UR_DLLEXPORT ur_result_t UR_APICALL urGetDeviceProcAddrTable( - ur_api_version_t version, ur_device_dditable_t *pDdiTable) { - auto result = validateProcInputs(version, pDdiTable); - if (UR_RESULT_SUCCESS != result) { - return result; - } - pDdiTable->pfnCreateWithNativeHandle = urDeviceCreateWithNativeHandle; - pDdiTable->pfnGet = urDeviceGet; - pDdiTable->pfnGetGlobalTimestamps = urDeviceGetGlobalTimestamps; - pDdiTable->pfnGetInfo = urDeviceGetInfo; - pDdiTable->pfnGetNativeHandle = urDeviceGetNativeHandle; - pDdiTable->pfnPartition = urDevicePartition; - 
pDdiTable->pfnRelease = urDeviceRelease; - pDdiTable->pfnRetain = urDeviceRetain; - pDdiTable->pfnSelectBinary = urDeviceSelectBinary; - return UR_RESULT_SUCCESS; -} - -UR_DLLEXPORT ur_result_t UR_APICALL urGetCommandBufferExpProcAddrTable( - ur_api_version_t version, ///< [in] API version requested - ur_command_buffer_exp_dditable_t - *pDdiTable ///< [in,out] pointer to table of DDI function pointers -) { - auto retVal = validateProcInputs(version, pDdiTable); - if (UR_RESULT_SUCCESS != retVal) { - return retVal; - } - pDdiTable->pfnCreateExp = urCommandBufferCreateExp; - pDdiTable->pfnRetainExp = urCommandBufferRetainExp; - pDdiTable->pfnReleaseExp = urCommandBufferReleaseExp; - pDdiTable->pfnFinalizeExp = urCommandBufferFinalizeExp; - pDdiTable->pfnAppendKernelLaunchExp = urCommandBufferAppendKernelLaunchExp; - pDdiTable->pfnAppendMemcpyUSMExp = urCommandBufferAppendMemcpyUSMExp; - pDdiTable->pfnAppendMembufferCopyExp = urCommandBufferAppendMembufferCopyExp; - pDdiTable->pfnAppendMembufferCopyRectExp = - urCommandBufferAppendMembufferCopyRectExp; - pDdiTable->pfnAppendMembufferReadExp = urCommandBufferAppendMembufferReadExp; - pDdiTable->pfnAppendMembufferReadRectExp = - urCommandBufferAppendMembufferReadRectExp; - pDdiTable->pfnAppendMembufferWriteExp = - urCommandBufferAppendMembufferWriteExp; - pDdiTable->pfnAppendMembufferWriteRectExp = - urCommandBufferAppendMembufferWriteRectExp; - pDdiTable->pfnEnqueueExp = urCommandBufferEnqueueExp; - - return retVal; -} - -UR_DLLEXPORT ur_result_t UR_APICALL urGetUsmP2PExpProcAddrTable( - ur_api_version_t version, ur_usm_p2p_exp_dditable_t *pDdiTable) { - auto retVal = validateProcInputs(version, pDdiTable); - if (UR_RESULT_SUCCESS != retVal) { - return retVal; - } - pDdiTable->pfnEnablePeerAccessExp = urUsmP2PEnablePeerAccessExp; - pDdiTable->pfnDisablePeerAccessExp = urUsmP2PDisablePeerAccessExp; - pDdiTable->pfnPeerAccessGetInfoExp = urUsmP2PPeerAccessGetInfoExp; - - return retVal; -} - -UR_DLLEXPORT ur_result_t UR_APICALL urGetBindlessImagesExpProcAddrTable( - ur_api_version_t version, ur_bindless_images_exp_dditable_t *pDdiTable) { - auto result = validateProcInputs(version, pDdiTable); - if (UR_RESULT_SUCCESS != result) { - return result; - } - pDdiTable->pfnUnsampledImageHandleDestroyExp = - urBindlessImagesUnsampledImageHandleDestroyExp; - pDdiTable->pfnSampledImageHandleDestroyExp = - urBindlessImagesSampledImageHandleDestroyExp; - pDdiTable->pfnImageAllocateExp = urBindlessImagesImageAllocateExp; - pDdiTable->pfnImageFreeExp = urBindlessImagesImageFreeExp; - pDdiTable->pfnUnsampledImageCreateExp = - urBindlessImagesUnsampledImageCreateExp; - pDdiTable->pfnSampledImageCreateExp = urBindlessImagesSampledImageCreateExp; - pDdiTable->pfnImageCopyExp = urBindlessImagesImageCopyExp; - pDdiTable->pfnImageGetInfoExp = urBindlessImagesImageGetInfoExp; - pDdiTable->pfnMipmapGetLevelExp = urBindlessImagesMipmapGetLevelExp; - pDdiTable->pfnMipmapFreeExp = urBindlessImagesMipmapFreeExp; - pDdiTable->pfnImportOpaqueFDExp = urBindlessImagesImportOpaqueFDExp; - pDdiTable->pfnMapExternalArrayExp = urBindlessImagesMapExternalArrayExp; - pDdiTable->pfnReleaseInteropExp = urBindlessImagesReleaseInteropExp; - pDdiTable->pfnImportExternalSemaphoreOpaqueFDExp = - urBindlessImagesImportExternalSemaphoreOpaqueFDExp; - pDdiTable->pfnDestroyExternalSemaphoreExp = - urBindlessImagesDestroyExternalSemaphoreExp; - pDdiTable->pfnWaitExternalSemaphoreExp = - urBindlessImagesWaitExternalSemaphoreExp; - pDdiTable->pfnSignalExternalSemaphoreExp = - 
urBindlessImagesSignalExternalSemaphoreExp; - return UR_RESULT_SUCCESS; -} - -UR_DLLEXPORT ur_result_t UR_APICALL urGetUSMExpProcAddrTable( - ur_api_version_t version, ur_usm_exp_dditable_t *pDdiTable) { - auto result = validateProcInputs(version, pDdiTable); - if (UR_RESULT_SUCCESS != result) { - return result; - } - pDdiTable->pfnPitchedAllocExp = urUSMPitchedAllocExp; - return UR_RESULT_SUCCESS; -} - -#if defined(__cplusplus) -} // extern "C" -#endif
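
The tables in the deleted loader above are how the Unified Runtime loader discovers an adapter's entry points: it requests one DDI table per API area, then dispatches every call through function pointers. A sketch of the consuming side, assuming the same <ur_api.h>/<ur_ddi.h> headers the loader uses (error handling simplified; this is not the loader's actual code):

    #include <ur_api.h>
    #include <ur_ddi.h>

    // Ask the adapter to fill in its queue entry points for this API version,
    // then dispatch through the table instead of calling functions directly.
    ur_result_t wireQueueTable(ur_queue_dditable_t &Table) {
      ur_result_t Res = urGetQueueProcAddrTable(UR_API_VERSION_CURRENT, &Table);
      if (Res != UR_RESULT_SUCCESS)
        return Res; // null table or version mismatch, per validateProcInputs
      // Example dispatch: Table.pfnFinish(Queue) instead of urQueueFinish(Queue).
      return UR_RESULT_SUCCESS;
    }
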
diff --git a/sycl/plugins/unified_runtime/ur/adapters/cuda/usm.cpp b/sycl/plugins/unified_runtime/ur/adapters/cuda/usm.cpp deleted file mode 100644 index d272a836e600a..0000000000000 --- a/sycl/plugins/unified_runtime/ur/adapters/cuda/usm.cpp +++ /dev/null @@ -1,503 +0,0 @@ -//===--------- usm.cpp - CUDA Adapter -------------------------------------===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// - -#include <cassert> - -#include "adapter.hpp" -#include "common.hpp" -#include "context.hpp" -#include "device.hpp" -#include "event.hpp" -#include "platform.hpp" -#include "queue.hpp" -#include "usm.hpp" - -#include <cuda.h> - -/// USM: Implements USM Host allocations using CUDA Pinned Memory -/// https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#page-locked-host-memory -UR_APIEXPORT ur_result_t UR_APICALL -urUSMHostAlloc(ur_context_handle_t hContext, const ur_usm_desc_t *pUSMDesc, - ur_usm_pool_handle_t hPool, size_t size, void **ppMem) { - auto alignment = pUSMDesc ? pUSMDesc->align : 0u; - UR_ASSERT(!pUSMDesc || - (alignment == 0 || ((alignment & (alignment - 1)) == 0)), - UR_RESULT_ERROR_INVALID_VALUE); - - if (!hPool) { - return USMHostAllocImpl(ppMem, hContext, nullptr, size, alignment); - } - - auto UMFPool = hPool->HostMemPool.get(); - *ppMem = umfPoolAlignedMalloc(UMFPool, size, alignment); - if (*ppMem == nullptr) { - auto umfErr = umfPoolGetLastAllocationError(UMFPool); - return umf::umf2urResult(umfErr); - } - return UR_RESULT_SUCCESS; -} - -/// USM: Implements USM device allocations using a normal CUDA device pointer -/// -UR_APIEXPORT ur_result_t UR_APICALL -urUSMDeviceAlloc(ur_context_handle_t hContext, ur_device_handle_t hDevice, - const ur_usm_desc_t *pUSMDesc, ur_usm_pool_handle_t hPool, - size_t size, void **ppMem) { - auto alignment = pUSMDesc ? pUSMDesc->align : 0u; - UR_ASSERT(!pUSMDesc || - (alignment == 0 || ((alignment & (alignment - 1)) == 0)), - UR_RESULT_ERROR_INVALID_VALUE); - - if (!hPool) { - return USMDeviceAllocImpl(ppMem, hContext, hDevice, nullptr, size, - alignment); - } - - auto UMFPool = hPool->DeviceMemPool.get(); - *ppMem = umfPoolAlignedMalloc(UMFPool, size, alignment); - if (*ppMem == nullptr) { - auto umfErr = umfPoolGetLastAllocationError(UMFPool); - return umf::umf2urResult(umfErr); - } - return UR_RESULT_SUCCESS; -} - -/// USM: Implements USM Shared allocations using CUDA Managed Memory -/// -UR_APIEXPORT ur_result_t UR_APICALL -urUSMSharedAlloc(ur_context_handle_t hContext, ur_device_handle_t hDevice, - const ur_usm_desc_t *pUSMDesc, ur_usm_pool_handle_t hPool, - size_t size, void **ppMem) { - auto alignment = pUSMDesc ? pUSMDesc->align : 0u; - UR_ASSERT(!pUSMDesc || - (alignment == 0 || ((alignment & (alignment - 1)) == 0)), - UR_RESULT_ERROR_INVALID_VALUE); - - if (!hPool) { - return USMSharedAllocImpl(ppMem, hContext, hDevice, nullptr, nullptr, size, - alignment); - } - - auto UMFPool = hPool->SharedMemPool.get(); - *ppMem = umfPoolAlignedMalloc(UMFPool, size, alignment); - if (*ppMem == nullptr) { - auto umfErr = umfPoolGetLastAllocationError(UMFPool); - return umf::umf2urResult(umfErr); - } - return UR_RESULT_SUCCESS; -} - -ur_result_t USMFreeImpl(ur_context_handle_t Context, void *Pointer) { - ur_result_t Result = UR_RESULT_SUCCESS; - try { - ScopedContext Active(Context); - bool IsManaged; - unsigned int Type; - void *AttributeValues[2] = {&IsManaged, &Type}; - CUpointer_attribute Attributes[2] = {CU_POINTER_ATTRIBUTE_IS_MANAGED, - CU_POINTER_ATTRIBUTE_MEMORY_TYPE}; - UR_CHECK_ERROR(cuPointerGetAttributes(2, Attributes, AttributeValues, - (CUdeviceptr)Pointer)); - UR_ASSERT(Type == CU_MEMORYTYPE_DEVICE || Type == CU_MEMORYTYPE_HOST, - UR_RESULT_ERROR_INVALID_MEM_OBJECT); - if (IsManaged || Type == CU_MEMORYTYPE_DEVICE) { - // Memory allocated with cuMemAlloc and cuMemAllocManaged must be freed - // with cuMemFree - UR_CHECK_ERROR(cuMemFree((CUdeviceptr)Pointer)); - } else { - // Memory allocated with cuMemAllocHost must be freed with cuMemFreeHost - UR_CHECK_ERROR(cuMemFreeHost(Pointer)); - } - } catch (ur_result_t Err) { - Result = Err; - } - return Result; -} - -/// USM: Frees the given USM pointer associated with the context. -/// -UR_APIEXPORT ur_result_t UR_APICALL urUSMFree(ur_context_handle_t hContext, - void *pMem) { - if (auto Pool = umfPoolByPtr(pMem)) - return umf::umf2urResult(umfPoolFree(Pool, pMem)); - return USMFreeImpl(hContext, pMem); -} - -ur_result_t USMDeviceAllocImpl(void **ResultPtr, ur_context_handle_t Context, - ur_device_handle_t, ur_usm_device_mem_flags_t *, - size_t Size, uint32_t Alignment) { - try { - ScopedContext Active(Context); - UR_CHECK_ERROR(cuMemAlloc((CUdeviceptr *)ResultPtr, Size)); - } catch (ur_result_t Err) { - return Err; - } - -#ifdef NDEBUG - std::ignore = Alignment; -#else - assert((Alignment == 0 || - reinterpret_cast<std::uintptr_t>(*ResultPtr) % Alignment == 0)); -#endif - return UR_RESULT_SUCCESS; -} - -ur_result_t USMSharedAllocImpl(void **ResultPtr, ur_context_handle_t Context, - ur_device_handle_t, ur_usm_host_mem_flags_t *, - ur_usm_device_mem_flags_t *, size_t Size, - uint32_t Alignment) { - try { - ScopedContext Active(Context); - UR_CHECK_ERROR(cuMemAllocManaged((CUdeviceptr *)ResultPtr, Size, - CU_MEM_ATTACH_GLOBAL)); - } catch (ur_result_t Err) { - return Err; - } - -#ifdef NDEBUG - std::ignore = Alignment; -#else - assert((Alignment == 0 || - reinterpret_cast<std::uintptr_t>(*ResultPtr) % Alignment == 0)); -#endif - return UR_RESULT_SUCCESS; -} - -ur_result_t USMHostAllocImpl(void **ResultPtr, ur_context_handle_t Context, - ur_usm_host_mem_flags_t *, size_t Size, - uint32_t Alignment) { - try { - ScopedContext Active(Context); - UR_CHECK_ERROR(cuMemAllocHost(ResultPtr, Size)); - } catch (ur_result_t Err) { - return Err; - } - -#ifdef NDEBUG - std::ignore = Alignment; -#else - assert((Alignment == 0 || - reinterpret_cast<std::uintptr_t>(*ResultPtr) % Alignment == 0)); -#endif - return UR_RESULT_SUCCESS; -}
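
All three allocation entry points above validate alignment with the same bit trick: alignment & (alignment - 1) is zero exactly when alignment is zero or a power of two, because subtracting one flips every bit at and below the lowest set bit. A worked example:

    #include <cstdint>

    constexpr bool isPowerOfTwoOrZero(uint32_t A) { return (A & (A - 1)) == 0; }

    // 64 = 0b1000000 and 63 = 0b0111111 share no bits, so 64 passes.
    static_assert(isPowerOfTwoOrZero(0)); // 0 means "default alignment"
    static_assert(isPowerOfTwoOrZero(64));
    // 48 = 0b110000 and 47 = 0b101111 share bit 5, so 48 is rejected.
    static_assert(!isPowerOfTwoOrZero(48));

    int main() { return 0; }
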
UR_APIEXPORT ur_result_t UR_APICALL -urUSMGetMemAllocInfo(ur_context_handle_t hContext, const void *pMem, - ur_usm_alloc_info_t propName, size_t propValueSize, - void *pPropValue, size_t *pPropValueSizeRet) { - ur_result_t Result = UR_RESULT_SUCCESS; - - UrReturnHelper ReturnValue(propValueSize, pPropValue, pPropValueSizeRet); - - try { - ScopedContext Active(hContext); - switch (propName) { - case UR_USM_ALLOC_INFO_TYPE: { - unsigned int Value; - // do not throw if cuPointerGetAttribute returns CUDA_ERROR_INVALID_VALUE - CUresult Ret = cuPointerGetAttribute( - &Value, CU_POINTER_ATTRIBUTE_IS_MANAGED, (CUdeviceptr)pMem); - if (Ret == CUDA_ERROR_INVALID_VALUE) { - // pointer not known to the CUDA subsystem - return ReturnValue(UR_USM_TYPE_UNKNOWN); - } - checkErrorUR(Ret, __func__, __LINE__ - 5, __FILE__); - if (Value) { - // pointer to managed memory - return ReturnValue(UR_USM_TYPE_SHARED); - } - UR_CHECK_ERROR(cuPointerGetAttribute( - &Value, CU_POINTER_ATTRIBUTE_MEMORY_TYPE, (CUdeviceptr)pMem)); - UR_ASSERT(Value == CU_MEMORYTYPE_DEVICE || Value == CU_MEMORYTYPE_HOST, - UR_RESULT_ERROR_INVALID_MEM_OBJECT); - if (Value == CU_MEMORYTYPE_DEVICE) { - // pointer to device memory - return ReturnValue(UR_USM_TYPE_DEVICE); - } - if (Value == CU_MEMORYTYPE_HOST) { - // pointer to host memory - return ReturnValue(UR_USM_TYPE_HOST); - } - // should never get here -#ifdef _MSC_VER - __assume(0); -#else - __builtin_unreachable(); -#endif - } - case UR_USM_ALLOC_INFO_BASE_PTR: { -#if CUDA_VERSION >= 10020 - // CU_POINTER_ATTRIBUTE_RANGE_START_ADDR was introduced in CUDA 10.2 - void *Base; - UR_CHECK_ERROR(cuPointerGetAttribute( - &Base, CU_POINTER_ATTRIBUTE_RANGE_START_ADDR, (CUdeviceptr)pMem)); - return ReturnValue(Base); -#else - return UR_RESULT_ERROR_INVALID_VALUE; -#endif - } - case UR_USM_ALLOC_INFO_SIZE: { -#if CUDA_VERSION >= 10020 - // CU_POINTER_ATTRIBUTE_RANGE_SIZE was introduced in CUDA 10.2 - size_t Value; - UR_CHECK_ERROR(cuPointerGetAttribute( - &Value, CU_POINTER_ATTRIBUTE_RANGE_SIZE, (CUdeviceptr)pMem)); - return ReturnValue(Value); -#else - return UR_RESULT_ERROR_INVALID_VALUE; -#endif - } - case UR_USM_ALLOC_INFO_DEVICE: { - // get device index associated with this pointer - unsigned int DeviceIndex; - UR_CHECK_ERROR(cuPointerGetAttribute(&DeviceIndex, - CU_POINTER_ATTRIBUTE_DEVICE_ORDINAL, - (CUdeviceptr)pMem)); - - // currently each device is in its own platform, so find the platform at - // the same index - std::vector<ur_platform_handle_t> Platforms; - Platforms.resize(DeviceIndex + 1); - ur_adapter_handle_t AdapterHandle = &adapter; - Result = urPlatformGet(&AdapterHandle, 1, DeviceIndex + 1, - Platforms.data(), nullptr); - - // get the device from the platform - ur_device_handle_t Device = Platforms[DeviceIndex]->Devices[0].get(); - return ReturnValue(Device); - } - case UR_USM_ALLOC_INFO_POOL: { - auto UMFPool = umfPoolByPtr(pMem); - if (!UMFPool) { - return UR_RESULT_ERROR_INVALID_VALUE; - } - ur_usm_pool_handle_t Pool = hContext->getOwningURPool(UMFPool); - if (!Pool) { - return UR_RESULT_ERROR_INVALID_VALUE; - } - return ReturnValue(Pool); - } - default: - return UR_RESULT_ERROR_UNSUPPORTED_ENUMERATION; - } - } catch (ur_result_t Err) { - Result = Err; - } - return Result; -} - -UR_APIEXPORT ur_result_t UR_APICALL urUSMImportExp(ur_context_handle_t Context, - void *HostPtr, size_t Size) { - UR_ASSERT(Context, UR_RESULT_ERROR_INVALID_CONTEXT); - UR_ASSERT(!HostPtr, UR_RESULT_ERROR_INVALID_VALUE); - UR_ASSERT(Size > 0, UR_RESULT_ERROR_INVALID_VALUE); - return UR_RESULT_SUCCESS; -} - -UR_APIEXPORT ur_result_t UR_APICALL urUSMReleaseExp(ur_context_handle_t Context, - void *HostPtr) { - UR_ASSERT(Context, UR_RESULT_ERROR_INVALID_CONTEXT); - UR_ASSERT(!HostPtr, UR_RESULT_ERROR_INVALID_VALUE); - return UR_RESULT_SUCCESS; -}
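
urUSMGetMemAllocInfo lets a caller classify any pointer after the fact, which is how a runtime can distinguish host, device, shared, and unknown allocations. A sketch of a client-side query against the entry point above (the context handle is assumed to exist elsewhere, and the return-code check is elided for brevity):

    #include <ur_api.h>

    ur_usm_type_t classify(ur_context_handle_t hContext, const void *Ptr) {
      ur_usm_type_t Type = UR_USM_TYPE_UNKNOWN;
      // Yields UR_USM_TYPE_UNKNOWN for pointers CUDA has never seen, per the
      // CUDA_ERROR_INVALID_VALUE handling above.
      urUSMGetMemAllocInfo(hContext, Ptr, UR_USM_ALLOC_INFO_TYPE, sizeof(Type),
                           &Type, nullptr);
      return Type;
    }
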
umf_result_t USMMemoryProvider::initialize(ur_context_handle_t Ctx, - ur_device_handle_t Dev) { - Context = Ctx; - Device = Dev; - // There isn't a way to query this in cuda, and there isn't much info on - // cuda's approach to alignment or transfer granularity between host and - // device. Within UMF this is only used to influence alignment, and since we - // discard that in our alloc implementations it seems we can safely ignore - // this as well, for now. - MinPageSize = 0; - - return UMF_RESULT_SUCCESS; -} - -enum umf_result_t USMMemoryProvider::alloc(size_t Size, size_t Align, - void **Ptr) { - auto Res = allocateImpl(Ptr, Size, Align); - if (Res != UR_RESULT_SUCCESS) { - getLastStatusRef() = Res; - return UMF_RESULT_ERROR_MEMORY_PROVIDER_SPECIFIC; - } - - return UMF_RESULT_SUCCESS; -} - -enum umf_result_t USMMemoryProvider::free(void *Ptr, size_t Size) { - (void)Size; - - auto Res = USMFreeImpl(Context, Ptr); - if (Res != UR_RESULT_SUCCESS) { - getLastStatusRef() = Res; - return UMF_RESULT_ERROR_MEMORY_PROVIDER_SPECIFIC; - } - - return UMF_RESULT_SUCCESS; -} - -void USMMemoryProvider::get_last_native_error(const char **ErrMsg, - int32_t *ErrCode) { - (void)ErrMsg; - *ErrCode = static_cast<int32_t>(getLastStatusRef()); -} - -umf_result_t USMMemoryProvider::get_min_page_size(void *Ptr, size_t *PageSize) { - (void)Ptr; - *PageSize = MinPageSize; - - return UMF_RESULT_SUCCESS; -} - -ur_result_t USMSharedMemoryProvider::allocateImpl(void **ResultPtr, size_t Size, - uint32_t Alignment) { - return USMSharedAllocImpl(ResultPtr, Context, Device, nullptr, nullptr, Size, - Alignment); -} - -ur_result_t USMDeviceMemoryProvider::allocateImpl(void **ResultPtr, size_t Size, - uint32_t Alignment) { - return USMDeviceAllocImpl(ResultPtr, Context, Device, nullptr, Size, - Alignment); -} - -ur_result_t USMHostMemoryProvider::allocateImpl(void **ResultPtr, size_t Size, - uint32_t Alignment) { - return USMHostAllocImpl(ResultPtr, Context, nullptr, Size, Alignment); -} - -ur_usm_pool_handle_t_::ur_usm_pool_handle_t_(ur_context_handle_t Context, - ur_usm_pool_desc_t *PoolDesc) - : Context(Context) { - const void *pNext = PoolDesc->pNext; - while (pNext != nullptr) { - const ur_base_desc_t *BaseDesc = static_cast<const ur_base_desc_t *>(pNext); - switch (BaseDesc->stype) { - case UR_STRUCTURE_TYPE_USM_POOL_LIMITS_DESC: { - const ur_usm_pool_limits_desc_t *Limits = - reinterpret_cast<const ur_usm_pool_limits_desc_t *>(BaseDesc); - for (auto &config : DisjointPoolConfigs.Configs) { - config.MaxPoolableSize = Limits->maxPoolableSize; - config.SlabMinSize = Limits->minDriverAllocSize; - } - break; - } - default: { - throw UsmAllocationException(UR_RESULT_ERROR_INVALID_ARGUMENT); - } - } - pNext = BaseDesc->pNext; - } - - auto MemProvider = - umf::memoryProviderMakeUnique<USMHostMemoryProvider>(Context, nullptr) - .second; - - HostMemPool = - umf::poolMakeUnique<usm::DisjointPool, 1>( - {std::move(MemProvider)}, - this->DisjointPoolConfigs.Configs[usm::DisjointPoolMemType::Host]) - .second; - - auto Device = Context->DeviceID; - MemProvider = - umf::memoryProviderMakeUnique<USMDeviceMemoryProvider>(Context, Device) - .second; - DeviceMemPool = - umf::poolMakeUnique<usm::DisjointPool, 1>( - {std::move(MemProvider)}, - this->DisjointPoolConfigs.Configs[usm::DisjointPoolMemType::Device]) - .second; - - MemProvider = - umf::memoryProviderMakeUnique<USMSharedMemoryProvider>(Context, Device) - .second; - SharedMemPool = - umf::poolMakeUnique<usm::DisjointPool, 1>( - {std::move(MemProvider)}, - this->DisjointPoolConfigs.Configs[usm::DisjointPoolMemType::Shared]) - .second; - Context->addPool(this); -}
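
The pool constructor above accepts exactly one chained extension, ur_usm_pool_limits_desc_t, and throws on anything else. A sketch of how a caller would build that pNext chain (the limit values are illustrative only):

    #include <ur_api.h>

    ur_usm_pool_desc_t makePoolDesc(ur_usm_pool_limits_desc_t &Limits) {
      Limits.stype = UR_STRUCTURE_TYPE_USM_POOL_LIMITS_DESC;
      Limits.pNext = nullptr;
      Limits.maxPoolableSize = 1 << 20;     // pool allocations up to 1 MiB
      Limits.minDriverAllocSize = 64 << 10; // request 64 KiB slabs at minimum

      ur_usm_pool_desc_t Desc{};
      Desc.stype = UR_STRUCTURE_TYPE_USM_POOL_DESC;
      Desc.pNext = &Limits; // walked by the constructor's while loop above
      return Desc;
    }
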
bool ur_usm_pool_handle_t_::hasUMFPool(umf_memory_pool_t *umf_pool) { - return DeviceMemPool.get() == umf_pool || SharedMemPool.get() == umf_pool || - HostMemPool.get() == umf_pool; -} - -UR_APIEXPORT ur_result_t UR_APICALL urUSMPoolCreate( - ur_context_handle_t Context, ///< [in] handle of the context object - ur_usm_pool_desc_t - *PoolDesc, ///< [in] pointer to USM pool descriptor. Can be chained with - ///< ::ur_usm_pool_limits_desc_t - ur_usm_pool_handle_t *Pool ///< [out] pointer to USM memory pool -) { - // Without pool tracking we can't free pool allocations. -#ifdef UMF_ENABLE_POOL_TRACKING - if (PoolDesc->flags & UR_USM_POOL_FLAG_ZERO_INITIALIZE_BLOCK) { - return UR_RESULT_ERROR_UNSUPPORTED_FEATURE; - } - try { - *Pool = reinterpret_cast<ur_usm_pool_handle_t>( - new ur_usm_pool_handle_t_(Context, PoolDesc)); - } catch (const UsmAllocationException &Ex) { - return Ex.getError(); - } - return UR_RESULT_SUCCESS; -#else - std::ignore = Context; - std::ignore = PoolDesc; - std::ignore = Pool; - return UR_RESULT_ERROR_UNSUPPORTED_FEATURE; -#endif -} - -UR_APIEXPORT ur_result_t UR_APICALL urUSMPoolRetain( - ur_usm_pool_handle_t Pool ///< [in] pointer to USM memory pool -) { - Pool->incrementReferenceCount(); - return UR_RESULT_SUCCESS; -} - -UR_APIEXPORT ur_result_t UR_APICALL urUSMPoolRelease( - ur_usm_pool_handle_t Pool ///< [in] pointer to USM memory pool -) { - if (Pool->decrementReferenceCount() > 0) { - return UR_RESULT_SUCCESS; - } - Pool->Context->removePool(Pool); - delete Pool; - return UR_RESULT_SUCCESS; -} - -UR_APIEXPORT ur_result_t UR_APICALL urUSMPoolGetInfo( - ur_usm_pool_handle_t hPool, ///< [in] handle of the USM memory pool - ur_usm_pool_info_t propName, ///< [in] name of the pool property to query - size_t propSize, ///< [in] size in bytes of the pool property value provided - void *pPropValue, ///< [out][optional][typename(propName, propSize)] value - ///< of the pool property - size_t *pPropSizeRet ///< [out][optional] size in bytes returned in pool - ///< property value -) { - UrReturnHelper ReturnValue(propSize, pPropValue, pPropSizeRet); - - switch (propName) { - case UR_USM_POOL_INFO_REFERENCE_COUNT: { - return ReturnValue(hPool->getReferenceCount()); - } - case UR_USM_POOL_INFO_CONTEXT: { - return ReturnValue(hPool->Context); - } - default: { - return UR_RESULT_ERROR_UNSUPPORTED_ENUMERATION; - } - } -}
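
Taken together, the entry points above give pools a simple lifecycle: create with a descriptor, allocate by passing the pool handle, free (which routes back to the owning UMF pool), and release. A sketch of that round trip (note that creation only succeeds when the adapter was built with UMF_ENABLE_POOL_TRACKING, and most error handling is elided):

    #include <ur_api.h>

    void poolRoundTrip(ur_context_handle_t hContext) {
      ur_usm_pool_desc_t Desc{};
      Desc.stype = UR_STRUCTURE_TYPE_USM_POOL_DESC;

      ur_usm_pool_handle_t Pool = nullptr;
      if (urUSMPoolCreate(hContext, &Desc, &Pool) != UR_RESULT_SUCCESS)
        return; // e.g. pool tracking disabled at build time

      void *Mem = nullptr;
      urUSMHostAlloc(hContext, /*pUSMDesc=*/nullptr, Pool, 1024, &Mem);
      urUSMFree(hContext, Mem); // umfPoolByPtr finds the owning pool
      urUSMPoolRelease(Pool);   // drops the reference taken at creation
    }
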
diff --git a/sycl/plugins/unified_runtime/ur/adapters/cuda/usm.hpp b/sycl/plugins/unified_runtime/ur/adapters/cuda/usm.hpp deleted file mode 100644 index d4cfba7641f30..0000000000000 --- a/sycl/plugins/unified_runtime/ur/adapters/cuda/usm.hpp +++ /dev/null @@ -1,130 +0,0 @@ -//===--------- usm.hpp - CUDA Adapter -------------------------------------===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// - -#include "common.hpp" - -#include <umf_helpers.hpp> -#include <ur_pool_manager.hpp> - -usm::DisjointPoolAllConfigs InitializeDisjointPoolConfig(); - -struct ur_usm_pool_handle_t_ { - std::atomic_uint32_t RefCount = 1; - - ur_context_handle_t Context = nullptr; - - usm::DisjointPoolAllConfigs DisjointPoolConfigs = - usm::DisjointPoolAllConfigs(); - - umf::pool_unique_handle_t DeviceMemPool; - umf::pool_unique_handle_t SharedMemPool; - umf::pool_unique_handle_t HostMemPool; - - ur_usm_pool_handle_t_(ur_context_handle_t Context, - ur_usm_pool_desc_t *PoolDesc); - - uint32_t incrementReferenceCount() noexcept { return ++RefCount; } - - uint32_t decrementReferenceCount() noexcept { return --RefCount; } - - uint32_t getReferenceCount() const noexcept { return RefCount; } - - bool hasUMFPool(umf_memory_pool_t *umf_pool); -}; - -// Exception type to pass allocation errors -class UsmAllocationException { - const ur_result_t Error; - -public: - UsmAllocationException(ur_result_t Err) : Error{Err} {} - ur_result_t getError() const { return Error; } -}; - -// Implements memory allocation via driver API for USM allocator interface. -class USMMemoryProvider { -private: - ur_result_t &getLastStatusRef() { - static thread_local ur_result_t LastStatus = UR_RESULT_SUCCESS; - return LastStatus; - } - -protected: - ur_context_handle_t Context; - ur_device_handle_t Device; - size_t MinPageSize; - - // Internal allocation routine which must be implemented for each allocation - // type - virtual ur_result_t allocateImpl(void **ResultPtr, size_t Size, - uint32_t Alignment) = 0; - -public: - umf_result_t initialize(ur_context_handle_t Ctx, ur_device_handle_t Dev); - umf_result_t alloc(size_t Size, size_t Align, void **Ptr); - umf_result_t free(void *Ptr, size_t Size); - void get_last_native_error(const char **ErrMsg, int32_t *ErrCode); - umf_result_t get_min_page_size(void *, size_t *); - umf_result_t get_recommended_page_size(size_t, size_t *) { - return UMF_RESULT_ERROR_NOT_SUPPORTED; - }; - umf_result_t purge_lazy(void *, size_t) { - return UMF_RESULT_ERROR_NOT_SUPPORTED; - }; - umf_result_t purge_force(void *, size_t) { - return UMF_RESULT_ERROR_NOT_SUPPORTED; - }; - virtual const char *get_name() = 0; - - virtual ~USMMemoryProvider() = default; -}; - -// Allocation routines for shared memory type -class USMSharedMemoryProvider final : public USMMemoryProvider { -public: - const char *get_name() override { return "USMSharedMemoryProvider"; } - -protected: - ur_result_t allocateImpl(void **ResultPtr, size_t Size, - uint32_t Alignment) override; -}; - -// Allocation routines for device memory type -class USMDeviceMemoryProvider final : public USMMemoryProvider { -public: - const char *get_name() override { return "USMDeviceMemoryProvider"; } - -protected: - ur_result_t allocateImpl(void **ResultPtr, size_t Size, - uint32_t Alignment) override; -}; - -// Allocation routines for host memory type -class USMHostMemoryProvider final : public USMMemoryProvider { -public: - const char *get_name() override { return "USMHostMemoryProvider"; } - -protected: - ur_result_t allocateImpl(void **ResultPtr, size_t Size, - uint32_t Alignment) override; -}; - -ur_result_t USMDeviceAllocImpl(void **ResultPtr, ur_context_handle_t Context, - ur_device_handle_t Device, - ur_usm_device_mem_flags_t *Flags, size_t Size, - uint32_t Alignment); - -ur_result_t USMSharedAllocImpl(void **ResultPtr, ur_context_handle_t Context, -
ur_device_handle_t Device, - ur_usm_host_mem_flags_t *, - ur_usm_device_mem_flags_t *, size_t Size, - uint32_t Alignment); - -ur_result_t USMHostAllocImpl(void **ResultPtr, ur_context_handle_t Context, - ur_usm_host_mem_flags_t *Flags, size_t Size, - uint32_t Alignment); diff --git a/sycl/plugins/unified_runtime/ur/adapters/cuda/usm_p2p.cpp b/sycl/plugins/unified_runtime/ur/adapters/cuda/usm_p2p.cpp deleted file mode 100644 index ed580dd5d8065..0000000000000 --- a/sycl/plugins/unified_runtime/ur/adapters/cuda/usm_p2p.cpp +++ /dev/null @@ -1,69 +0,0 @@ -//===--------- usm_p2p.cpp - CUDA Adapter----------------------------------===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// - -#include "common.hpp" -#include "context.hpp" - -UR_APIEXPORT ur_result_t UR_APICALL urUsmP2PEnablePeerAccessExp( - ur_device_handle_t commandDevice, ur_device_handle_t peerDevice) { - - ur_result_t result = UR_RESULT_SUCCESS; - try { - ScopedContext active(commandDevice->getContext()); - UR_CHECK_ERROR(cuCtxEnablePeerAccess(peerDevice->getContext(), 0)); - } catch (ur_result_t err) { - result = err; - } - return result; -} - -UR_APIEXPORT ur_result_t UR_APICALL urUsmP2PDisablePeerAccessExp( - ur_device_handle_t commandDevice, ur_device_handle_t peerDevice) { - - ur_result_t result = UR_RESULT_SUCCESS; - try { - ScopedContext active(commandDevice->getContext()); - UR_CHECK_ERROR(cuCtxDisablePeerAccess(peerDevice->getContext())); - } catch (ur_result_t err) { - result = err; - } - return result; -} - -UR_APIEXPORT ur_result_t UR_APICALL urUsmP2PPeerAccessGetInfoExp( - ur_device_handle_t commandDevice, ur_device_handle_t peerDevice, - ur_exp_peer_info_t propName, size_t propSize, void *pPropValue, - size_t *pPropSizeRet) { - - UrReturnHelper ReturnValue(propSize, pPropValue, pPropSizeRet); - - int value; - CUdevice_P2PAttribute cu_attr; - try { - ScopedContext active(commandDevice->getContext()); - switch (propName) { - case UR_EXP_PEER_INFO_UR_PEER_ACCESS_SUPPORTED: { - cu_attr = CU_DEVICE_P2P_ATTRIBUTE_ACCESS_SUPPORTED; - break; - } - case UR_EXP_PEER_INFO_UR_PEER_ATOMICS_SUPPORTED: { - cu_attr = CU_DEVICE_P2P_ATTRIBUTE_NATIVE_ATOMIC_SUPPORTED; - break; - } - default: { - return UR_RESULT_ERROR_INVALID_ENUMERATION; - } - } - - UR_CHECK_ERROR(cuDeviceGetP2PAttribute( - &value, cu_attr, commandDevice->get(), peerDevice->get())); - } catch (ur_result_t err) { - return err; - } - return ReturnValue(value); -} From 345a913cdc908be75bc3d188ff9c1d5eb6570795 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?F=C3=A1bio=20Mestre?= Date: Mon, 2 Oct 2023 14:01:21 +0100 Subject: [PATCH 2/2] Update UR commit --- sycl/plugins/unified_runtime/CMakeLists.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sycl/plugins/unified_runtime/CMakeLists.txt b/sycl/plugins/unified_runtime/CMakeLists.txt index df97b6eb07812..7f8059df7c9c6 100644 --- a/sycl/plugins/unified_runtime/CMakeLists.txt +++ b/sycl/plugins/unified_runtime/CMakeLists.txt @@ -4,7 +4,7 @@ if (NOT DEFINED UNIFIED_RUNTIME_LIBRARY OR NOT DEFINED UNIFIED_RUNTIME_INCLUDE_D include(FetchContent) set(UNIFIED_RUNTIME_REPO "https://github.com/oneapi-src/unified-runtime.git") - set(UNIFIED_RUNTIME_TAG 00c7edb98f0c57ad968196a9cef393c380b6d6f7) + set(UNIFIED_RUNTIME_TAG 6a0eb7eff8a955fcd53edc79653f6bc85ef922f9) 
set(UR_BUILD_ADAPTER_L0 ON)