diff --git a/source/adapters/cuda/command_buffer.cpp b/source/adapters/cuda/command_buffer.cpp
index 9570baad23..ad46884bf1 100644
--- a/source/adapters/cuda/command_buffer.cpp
+++ b/source/adapters/cuda/command_buffer.cpp
@@ -9,48 +9,143 @@
 //===----------------------------------------------------------------------===//
 
 #include "command_buffer.hpp"
+
 #include "common.hpp"
+#include "enqueue.hpp"
+#include "event.hpp"
+#include "kernel.hpp"
+#include "memory.hpp"
+#include "queue.hpp"
+
+#include <cstring>
+
+ur_exp_command_buffer_handle_t_::ur_exp_command_buffer_handle_t_(
+    ur_context_handle_t hContext, ur_device_handle_t hDevice)
+    : Context(hContext),
+      Device(hDevice), CudaGraph{nullptr}, CudaGraphExec{nullptr}, RefCount{1} {
+  urContextRetain(hContext);
+  urDeviceRetain(hDevice);
+}
+
+/// The ur_exp_command_buffer_handle_t_ destructor releases
+/// all the memory objects allocated for command_buffer management
+ur_exp_command_buffer_handle_t_::~ur_exp_command_buffer_handle_t_() {
+  // Release the memory allocated to the Context stored in the command_buffer
+  UR_TRACE(urContextRelease(Context));
+
+  // Release the device
+  UR_TRACE(urDeviceRelease(Device));
+
+  // Release the memory allocated to the CudaGraph
+  cuGraphDestroy(CudaGraph);
+
+  // Release the memory allocated to the CudaGraphExec
+  cuGraphExecDestroy(CudaGraphExec);
+}
 
-/// Stub implementations of UR experimental feature command-buffers
+/// Helper function for finding the CUDA graph nodes associated with the
+/// commands in a command-buffer; each node is pointed to by a sync-point in
+/// the wait list.
+///
+/// @param[in] CommandBuffer to lookup the nodes from.
+/// @param[in] NumSyncPointsInWaitList Length of \p SyncPointWaitList.
+/// @param[in] SyncPointWaitList List of sync points in \p CommandBuffer
+/// to find the nodes for.
+/// @param[out] CuNodesList Return parameter for the Cuda Nodes associated with
+/// each sync-point in \p SyncPointWaitList.
+///
+/// @return UR_RESULT_SUCCESS or an error code on failure
+static ur_result_t getNodesFromSyncPoints(
+    const ur_exp_command_buffer_handle_t &CommandBuffer,
+    size_t NumSyncPointsInWaitList,
+    const ur_exp_command_buffer_sync_point_t *SyncPointWaitList,
+    std::vector<CUgraphNode> &CuNodesList) {
+  // Map of ur_exp_command_buffer_sync_point_t to CUgraphNode defining
+  // the node associated with each sync-point
+  auto SyncPoints = CommandBuffer->SyncPoints;
+
+  // For each sync-point add associated CUDA graph node to the return list.
+  for (size_t i = 0; i < NumSyncPointsInWaitList; i++) {
+    if (auto NodeHandle = SyncPoints.find(SyncPointWaitList[i]);
+        NodeHandle != SyncPoints.end()) {
+      CuNodesList.push_back(*NodeHandle->second.get());
+    } else {
+      return UR_RESULT_ERROR_INVALID_VALUE;
+    }
+  }
+  return UR_RESULT_SUCCESS;
+}
+
+/// Set parameters for a general 1D memory copy.
+/// If the source and/or destination is on the device, SrcPtr and/or DstPtr
+/// must be a pointer to a CUdeviceptr
+static void setCopyParams(const void *SrcPtr, const CUmemorytype_enum SrcType,
+                          void *DstPtr, const CUmemorytype_enum DstType,
+                          size_t Size, CUDA_MEMCPY3D &Params) {
+  // Set all params to 0 first
+  std::memset(&Params, 0, sizeof(CUDA_MEMCPY3D));
+
+  Params.srcMemoryType = SrcType;
+  Params.srcDevice = SrcType == CU_MEMORYTYPE_DEVICE
+                         ? *static_cast<const CUdeviceptr *>(SrcPtr)
+                         : 0;
+  Params.srcHost = SrcType == CU_MEMORYTYPE_HOST ? SrcPtr : nullptr;
+  Params.dstMemoryType = DstType;
+  Params.dstDevice =
+      DstType == CU_MEMORYTYPE_DEVICE ? *static_cast<CUdeviceptr *>(DstPtr) : 0;
+  Params.dstHost = DstType == CU_MEMORYTYPE_HOST ? DstPtr : nullptr;
+  Params.WidthInBytes = Size;
+  Params.Height = 1;
+  Params.Depth = 1;
+}
 
 UR_APIEXPORT ur_result_t UR_APICALL urCommandBufferCreateExp(
     ur_context_handle_t hContext, ur_device_handle_t hDevice,
     const ur_exp_command_buffer_desc_t *pCommandBufferDesc,
     ur_exp_command_buffer_handle_t *phCommandBuffer) {
-  (void)hContext;
-  (void)hDevice;
   (void)pCommandBufferDesc;
-  (void)phCommandBuffer;
-  detail::ur::die("Experimental Command-buffer feature is not "
-                  "implemented for CUDA adapter.");
-  return UR_RESULT_ERROR_UNSUPPORTED_FEATURE;
+
+  try {
+    *phCommandBuffer = new ur_exp_command_buffer_handle_t_(hContext, hDevice);
+  } catch (const std::bad_alloc &) {
+    return UR_RESULT_ERROR_OUT_OF_HOST_MEMORY;
+  } catch (...) {
+    return UR_RESULT_ERROR_UNKNOWN;
+  }
+
+  try {
+    UR_CHECK_ERROR(cuGraphCreate(&(*phCommandBuffer)->CudaGraph, 0));
+  } catch (...) {
+    return UR_RESULT_ERROR_OUT_OF_RESOURCES;
+  }
+
+  return UR_RESULT_SUCCESS;
 }
 
 UR_APIEXPORT ur_result_t UR_APICALL
 urCommandBufferRetainExp(ur_exp_command_buffer_handle_t hCommandBuffer) {
-  (void)hCommandBuffer;
-
-  detail::ur::die("Experimental Command-buffer feature is not "
-                  "implemented for CUDA adapter.");
-  return UR_RESULT_ERROR_UNSUPPORTED_FEATURE;
+  hCommandBuffer->incrementReferenceCount();
+  return UR_RESULT_SUCCESS;
 }
 
 UR_APIEXPORT ur_result_t UR_APICALL
 urCommandBufferReleaseExp(ur_exp_command_buffer_handle_t hCommandBuffer) {
-  (void)hCommandBuffer;
+  if (hCommandBuffer->decrementReferenceCount() != 0)
+    return UR_RESULT_SUCCESS;
 
-  detail::ur::die("Experimental Command-buffer feature is not "
-                  "implemented for CUDA adapter.");
-  return UR_RESULT_ERROR_UNSUPPORTED_FEATURE;
+  delete hCommandBuffer;
+  return UR_RESULT_SUCCESS;
 }
 
 UR_APIEXPORT ur_result_t UR_APICALL
 urCommandBufferFinalizeExp(ur_exp_command_buffer_handle_t hCommandBuffer) {
-  (void)hCommandBuffer;
-
-  detail::ur::die("Experimental Command-buffer feature is not "
-                  "implemented for CUDA adapter.");
-  return UR_RESULT_ERROR_UNSUPPORTED_FEATURE;
+  try {
+    UR_CHECK_ERROR(cuGraphInstantiate(&hCommandBuffer->CudaGraphExec,
+                                      hCommandBuffer->CudaGraph, 0));
+  } catch (...) {
+    return UR_RESULT_ERROR_UNKNOWN;
+  }
+  return UR_RESULT_SUCCESS;
 }
 
 UR_APIEXPORT ur_result_t UR_APICALL urCommandBufferAppendKernelLaunchExp(
@@ -60,19 +155,85 @@ UR_APIEXPORT ur_result_t UR_APICALL urCommandBufferAppendKernelLaunchExp(
     uint32_t numSyncPointsInWaitList,
     const ur_exp_command_buffer_sync_point_t *pSyncPointWaitList,
     ur_exp_command_buffer_sync_point_t *pSyncPoint) {
-  (void)hCommandBuffer;
-  (void)hKernel;
-  (void)workDim;
-  (void)pGlobalWorkOffset;
-  (void)pGlobalWorkSize;
-  (void)pLocalWorkSize;
-  (void)numSyncPointsInWaitList;
-  (void)pSyncPointWaitList;
-  (void)pSyncPoint;
+  // Preconditions
+  UR_ASSERT(hCommandBuffer->Context == hKernel->getContext(),
+            UR_RESULT_ERROR_INVALID_KERNEL);
+  UR_ASSERT(workDim > 0, UR_RESULT_ERROR_INVALID_WORK_DIMENSION);
+  UR_ASSERT(workDim < 4, UR_RESULT_ERROR_INVALID_WORK_DIMENSION);
+
+  ur_result_t Result = UR_RESULT_SUCCESS;
+  CUgraphNode GraphNode;
+
+  std::vector<CUgraphNode> DepsList;
+
+  UR_CALL(getNodesFromSyncPoints(hCommandBuffer, numSyncPointsInWaitList,
+                                 pSyncPointWaitList, DepsList),
+          Result);
+
+  if (Result != UR_RESULT_SUCCESS) {
+    return Result;
+  }
+
+  if (*pGlobalWorkSize == 0) {
+    try {
+      // Create an empty node if the kernel workload size is zero
+      UR_CHECK_ERROR(cuGraphAddEmptyNode(&GraphNode, hCommandBuffer->CudaGraph,
+                                         DepsList.data(), DepsList.size()));
+
+      // Get sync point and register the cuNode with it.
+      *pSyncPoint = hCommandBuffer->AddSyncPoint(
+          std::make_shared<CUgraphNode>(GraphNode));
+    } catch (ur_result_t Err) {
+      Result = Err;
+    }
+    return Result;
+  }
+
+  // Set the number of threads per block to the number of threads per warp
+  // by default unless user has provided a better number
+  size_t ThreadsPerBlock[3] = {32u, 1u, 1u};
+  size_t BlocksPerGrid[3] = {1u, 1u, 1u};
 
-  detail::ur::die("Experimental Command-buffer feature is not "
-                  "implemented for CUDA adapter.");
-  return UR_RESULT_ERROR_UNSUPPORTED_FEATURE;
+  uint32_t LocalSize = hKernel->getLocalSize();
+  CUfunction CuFunc = hKernel->get();
+  Result =
+      setKernelParams(hCommandBuffer->Context, hCommandBuffer->Device, workDim,
+                      pGlobalWorkOffset, pGlobalWorkSize, pLocalWorkSize,
+                      hKernel, CuFunc, ThreadsPerBlock, BlocksPerGrid);
+  if (Result != UR_RESULT_SUCCESS) {
+    return Result;
+  }
+
+  try {
+    // Set node param structure with the kernel related data
+    auto &ArgIndices = hKernel->getArgIndices();
+    CUDA_KERNEL_NODE_PARAMS NodeParams;
+    NodeParams.func = CuFunc;
+    NodeParams.gridDimX = BlocksPerGrid[0];
+    NodeParams.gridDimY = BlocksPerGrid[1];
+    NodeParams.gridDimZ = BlocksPerGrid[2];
+    NodeParams.blockDimX = ThreadsPerBlock[0];
+    NodeParams.blockDimY = ThreadsPerBlock[1];
+    NodeParams.blockDimZ = ThreadsPerBlock[2];
+    NodeParams.sharedMemBytes = LocalSize;
+    NodeParams.kernelParams = const_cast<void **>(ArgIndices.data());
+    NodeParams.extra = nullptr;
+
+    // Create and add a new kernel node to the Cuda graph
+    UR_CHECK_ERROR(cuGraphAddKernelNode(&GraphNode, hCommandBuffer->CudaGraph,
+                                        DepsList.data(), DepsList.size(),
+                                        &NodeParams));
+
+    if (LocalSize != 0)
+      hKernel->clearLocalSize();
+
+    // Get sync point and register the cuNode with it.
+    *pSyncPoint =
+        hCommandBuffer->AddSyncPoint(std::make_shared<CUgraphNode>(GraphNode));
+  } catch (ur_result_t Err) {
+    Result = Err;
+  }
+  return Result;
 }
 
 UR_APIEXPORT ur_result_t UR_APICALL urCommandBufferAppendMemcpyUSMExp(
@@ -80,17 +241,33 @@ UR_APIEXPORT ur_result_t UR_APICALL urCommandBufferAppendMemcpyUSMExp(
     size_t size, uint32_t numSyncPointsInWaitList,
     const ur_exp_command_buffer_sync_point_t *pSyncPointWaitList,
    ur_exp_command_buffer_sync_point_t *pSyncPoint) {
-  (void)hCommandBuffer;
-  (void)pDst;
-  (void)pSrc;
-  (void)size;
-  (void)numSyncPointsInWaitList;
-  (void)pSyncPointWaitList;
-  (void)pSyncPoint;
+  ur_result_t Result = UR_RESULT_SUCCESS;
+  CUgraphNode GraphNode;
+  std::vector<CUgraphNode> DepsList;
+  UR_CALL(getNodesFromSyncPoints(hCommandBuffer, numSyncPointsInWaitList,
+                                 pSyncPointWaitList, DepsList),
+          Result);
 
-  detail::ur::die("Experimental Command-buffer feature is not "
-                  "implemented for CUDA adapter.");
-  return UR_RESULT_ERROR_UNSUPPORTED_FEATURE;
+  if (Result != UR_RESULT_SUCCESS) {
+    return Result;
+  }
+
+  try {
+    CUDA_MEMCPY3D NodeParams = {};
+    setCopyParams(pSrc, CU_MEMORYTYPE_HOST, pDst, CU_MEMORYTYPE_HOST, size,
+                  NodeParams);
+
+    UR_CHECK_ERROR(cuGraphAddMemcpyNode(
+        &GraphNode, hCommandBuffer->CudaGraph, DepsList.data(), DepsList.size(),
+        &NodeParams, hCommandBuffer->Device->getContext()));
+
+    // Get sync point and register the cuNode with it.
+    *pSyncPoint =
+        hCommandBuffer->AddSyncPoint(std::make_shared<CUgraphNode>(GraphNode));
+  } catch (ur_result_t Err) {
+    Result = Err;
+  }
+  return Result;
 }
 
 UR_APIEXPORT ur_result_t UR_APICALL urCommandBufferAppendMembufferCopyExp(
@@ -99,19 +276,42 @@ UR_APIEXPORT ur_result_t UR_APICALL urCommandBufferAppendMembufferCopyExp(
     uint32_t numSyncPointsInWaitList,
     const ur_exp_command_buffer_sync_point_t *pSyncPointWaitList,
     ur_exp_command_buffer_sync_point_t *pSyncPoint) {
-  (void)hCommandBuffer;
-  (void)hSrcMem;
-  (void)hDstMem;
-  (void)srcOffset;
-  (void)dstOffset;
-  (void)size;
-  (void)numSyncPointsInWaitList;
-  (void)pSyncPointWaitList;
-  (void)pSyncPoint;
+  ur_result_t Result = UR_RESULT_SUCCESS;
+  CUgraphNode GraphNode;
+  std::vector<CUgraphNode> DepsList;
+
+  UR_ASSERT(size + dstOffset <= std::get<BufferMem>(hDstMem->Mem).getSize(),
+            UR_RESULT_ERROR_INVALID_SIZE);
+  UR_ASSERT(size + srcOffset <= std::get<BufferMem>(hSrcMem->Mem).getSize(),
+            UR_RESULT_ERROR_INVALID_SIZE);
 
-  detail::ur::die("Experimental Command-buffer feature is not "
-                  "implemented for CUDA adapter.");
-  return UR_RESULT_ERROR_UNSUPPORTED_FEATURE;
+  UR_CALL(getNodesFromSyncPoints(hCommandBuffer, numSyncPointsInWaitList,
+                                 pSyncPointWaitList, DepsList),
+          Result);
+
+  if (Result != UR_RESULT_SUCCESS) {
+    return Result;
+  }
+
+  try {
+    auto Src = std::get<BufferMem>(hSrcMem->Mem).get() + srcOffset;
+    auto Dst = std::get<BufferMem>(hDstMem->Mem).get() + dstOffset;
+
+    CUDA_MEMCPY3D NodeParams = {};
+    setCopyParams(&Src, CU_MEMORYTYPE_DEVICE, &Dst, CU_MEMORYTYPE_DEVICE, size,
+                  NodeParams);
+
+    UR_CHECK_ERROR(cuGraphAddMemcpyNode(
+        &GraphNode, hCommandBuffer->CudaGraph, DepsList.data(), DepsList.size(),
+        &NodeParams, hCommandBuffer->Device->getContext()));
+
+    // Get sync point and register the cuNode with it.
+ *pSyncPoint = + hCommandBuffer->AddSyncPoint(std::make_shared(GraphNode)); + } catch (ur_result_t Err) { + Result = Err; + } + return Result; } UR_APIEXPORT ur_result_t UR_APICALL urCommandBufferAppendMembufferCopyRectExp( @@ -122,23 +322,37 @@ UR_APIEXPORT ur_result_t UR_APICALL urCommandBufferAppendMembufferCopyRectExp( uint32_t numSyncPointsInWaitList, const ur_exp_command_buffer_sync_point_t *pSyncPointWaitList, ur_exp_command_buffer_sync_point_t *pSyncPoint) { - (void)hCommandBuffer; - (void)hSrcMem; - (void)hDstMem; - (void)srcOrigin; - (void)dstOrigin; - (void)region; - (void)srcRowPitch; - (void)srcSlicePitch; - (void)dstRowPitch; - (void)dstSlicePitch; - (void)numSyncPointsInWaitList; - (void)pSyncPointWaitList; - (void)pSyncPoint; - - detail::ur::die("Experimental Command-buffer feature is not " - "implemented for CUDA adapter."); - return UR_RESULT_ERROR_UNSUPPORTED_FEATURE; + ur_result_t Result = UR_RESULT_SUCCESS; + CUgraphNode GraphNode; + std::vector DepsList; + UR_CALL(getNodesFromSyncPoints(hCommandBuffer, numSyncPointsInWaitList, + pSyncPointWaitList, DepsList), + Result); + + if (Result != UR_RESULT_SUCCESS) { + return Result; + } + + try { + CUdeviceptr SrcPtr = std::get(hSrcMem->Mem).get(); + CUdeviceptr DstPtr = std::get(hDstMem->Mem).get(); + CUDA_MEMCPY3D NodeParams = {}; + + setCopyRectParams(region, &SrcPtr, CU_MEMORYTYPE_DEVICE, srcOrigin, + srcRowPitch, srcSlicePitch, &DstPtr, CU_MEMORYTYPE_DEVICE, + dstOrigin, dstRowPitch, dstSlicePitch, NodeParams); + + UR_CHECK_ERROR(cuGraphAddMemcpyNode( + &GraphNode, hCommandBuffer->CudaGraph, DepsList.data(), DepsList.size(), + &NodeParams, hCommandBuffer->Device->getContext())); + + // Get sync point and register the cuNode with it. + *pSyncPoint = + hCommandBuffer->AddSyncPoint(std::make_shared(GraphNode)); + } catch (ur_result_t Err) { + Result = Err; + } + return Result; } UR_APIEXPORT @@ -148,18 +362,35 @@ ur_result_t UR_APICALL urCommandBufferAppendMembufferWriteExp( uint32_t numSyncPointsInWaitList, const ur_exp_command_buffer_sync_point_t *pSyncPointWaitList, ur_exp_command_buffer_sync_point_t *pSyncPoint) { - (void)hCommandBuffer; - (void)hBuffer; - (void)offset; - (void)size; - (void)pSrc; - (void)numSyncPointsInWaitList; - (void)pSyncPointWaitList; - (void)pSyncPoint; + ur_result_t Result = UR_RESULT_SUCCESS; + CUgraphNode GraphNode; + std::vector DepsList; + UR_CALL(getNodesFromSyncPoints(hCommandBuffer, numSyncPointsInWaitList, + pSyncPointWaitList, DepsList), + Result); + + if (Result != UR_RESULT_SUCCESS) { + return Result; + } + + try { + auto Dst = std::get(hBuffer->Mem).get() + offset; + + CUDA_MEMCPY3D NodeParams = {}; + setCopyParams(pSrc, CU_MEMORYTYPE_HOST, &Dst, CU_MEMORYTYPE_DEVICE, size, + NodeParams); - detail::ur::die("Experimental Command-buffer feature is not " - "implemented for CUDA adapter."); - return UR_RESULT_ERROR_UNSUPPORTED_FEATURE; + UR_CHECK_ERROR(cuGraphAddMemcpyNode( + &GraphNode, hCommandBuffer->CudaGraph, DepsList.data(), DepsList.size(), + &NodeParams, hCommandBuffer->Device->getContext())); + + // Get sync point and register the cuNode with it. 
+ *pSyncPoint = + hCommandBuffer->AddSyncPoint(std::make_shared(GraphNode)); + } catch (ur_result_t Err) { + Result = Err; + } + return Result; } UR_APIEXPORT @@ -168,18 +399,35 @@ ur_result_t UR_APICALL urCommandBufferAppendMembufferReadExp( size_t offset, size_t size, void *pDst, uint32_t numSyncPointsInWaitList, const ur_exp_command_buffer_sync_point_t *pSyncPointWaitList, ur_exp_command_buffer_sync_point_t *pSyncPoint) { - (void)hCommandBuffer; - (void)hBuffer; - (void)offset; - (void)size; - (void)pDst; - (void)numSyncPointsInWaitList; - (void)pSyncPointWaitList; - (void)pSyncPoint; + ur_result_t Result = UR_RESULT_SUCCESS; + CUgraphNode GraphNode; + std::vector DepsList; + UR_CALL(getNodesFromSyncPoints(hCommandBuffer, numSyncPointsInWaitList, + pSyncPointWaitList, DepsList), + Result); + + if (Result != UR_RESULT_SUCCESS) { + return Result; + } + + try { + auto Src = std::get(hBuffer->Mem).get() + offset; + + CUDA_MEMCPY3D NodeParams = {}; + setCopyParams(&Src, CU_MEMORYTYPE_DEVICE, pDst, CU_MEMORYTYPE_HOST, size, + NodeParams); - detail::ur::die("Experimental Command-buffer feature is not " - "implemented for CUDA adapter."); - return UR_RESULT_ERROR_UNSUPPORTED_FEATURE; + UR_CHECK_ERROR(cuGraphAddMemcpyNode( + &GraphNode, hCommandBuffer->CudaGraph, DepsList.data(), DepsList.size(), + &NodeParams, hCommandBuffer->Device->getContext())); + + // Get sync point and register the cuNode with it. + *pSyncPoint = + hCommandBuffer->AddSyncPoint(std::make_shared(GraphNode)); + } catch (ur_result_t Err) { + Result = Err; + } + return Result; } UR_APIEXPORT @@ -191,23 +439,37 @@ ur_result_t UR_APICALL urCommandBufferAppendMembufferWriteRectExp( uint32_t numSyncPointsInWaitList, const ur_exp_command_buffer_sync_point_t *pSyncPointWaitList, ur_exp_command_buffer_sync_point_t *pSyncPoint) { - (void)hCommandBuffer; - (void)hBuffer; - (void)bufferOffset; - (void)hostOffset; - (void)region; - (void)bufferRowPitch; - (void)bufferSlicePitch; - (void)hostRowPitch; - (void)hostSlicePitch; - (void)pSrc; - (void)numSyncPointsInWaitList; - (void)pSyncPointWaitList; - (void)pSyncPoint; - - detail::ur::die("Experimental Command-buffer feature is not " - "implemented for CUDA adapter."); - return UR_RESULT_ERROR_UNSUPPORTED_FEATURE; + ur_result_t Result = UR_RESULT_SUCCESS; + CUgraphNode GraphNode; + std::vector DepsList; + UR_CALL(getNodesFromSyncPoints(hCommandBuffer, numSyncPointsInWaitList, + pSyncPointWaitList, DepsList), + Result); + + if (Result != UR_RESULT_SUCCESS) { + return Result; + } + + try { + CUdeviceptr DstPtr = std::get(hBuffer->Mem).get(); + CUDA_MEMCPY3D NodeParams = {}; + + setCopyRectParams(region, pSrc, CU_MEMORYTYPE_HOST, hostOffset, + hostRowPitch, hostSlicePitch, &DstPtr, + CU_MEMORYTYPE_DEVICE, bufferOffset, bufferRowPitch, + bufferSlicePitch, NodeParams); + + UR_CHECK_ERROR(cuGraphAddMemcpyNode( + &GraphNode, hCommandBuffer->CudaGraph, DepsList.data(), DepsList.size(), + &NodeParams, hCommandBuffer->Device->getContext())); + + // Get sync point and register the cuNode with it. 
+ *pSyncPoint = + hCommandBuffer->AddSyncPoint(std::make_shared(GraphNode)); + } catch (ur_result_t Err) { + Result = Err; + } + return Result; } UR_APIEXPORT @@ -219,37 +481,75 @@ ur_result_t UR_APICALL urCommandBufferAppendMembufferReadRectExp( uint32_t numSyncPointsInWaitList, const ur_exp_command_buffer_sync_point_t *pSyncPointWaitList, ur_exp_command_buffer_sync_point_t *pSyncPoint) { - (void)hCommandBuffer; - (void)hBuffer; - (void)bufferOffset; - (void)hostOffset; - (void)region; - (void)bufferRowPitch; - (void)bufferSlicePitch; - (void)hostRowPitch; - (void)hostSlicePitch; - (void)pDst; - - (void)numSyncPointsInWaitList; - (void)pSyncPointWaitList; - (void)pSyncPoint; - - detail::ur::die("Experimental Command-buffer feature is not " - "implemented for CUDA adapter."); - return UR_RESULT_ERROR_UNSUPPORTED_FEATURE; + ur_result_t Result = UR_RESULT_SUCCESS; + CUgraphNode GraphNode; + std::vector DepsList; + UR_CALL(getNodesFromSyncPoints(hCommandBuffer, numSyncPointsInWaitList, + pSyncPointWaitList, DepsList), + Result); + + if (Result != UR_RESULT_SUCCESS) { + return Result; + } + + try { + CUdeviceptr SrcPtr = std::get(hBuffer->Mem).get(); + CUDA_MEMCPY3D NodeParams = {}; + + setCopyRectParams(region, &SrcPtr, CU_MEMORYTYPE_DEVICE, bufferOffset, + bufferRowPitch, bufferSlicePitch, pDst, + CU_MEMORYTYPE_HOST, hostOffset, hostRowPitch, + hostSlicePitch, NodeParams); + + UR_CHECK_ERROR(cuGraphAddMemcpyNode( + &GraphNode, hCommandBuffer->CudaGraph, DepsList.data(), DepsList.size(), + &NodeParams, hCommandBuffer->Device->getContext())); + + // Get sync point and register the cuNode with it. + *pSyncPoint = + hCommandBuffer->AddSyncPoint(std::make_shared(GraphNode)); + } catch (ur_result_t Err) { + Result = Err; + } + return Result; } UR_APIEXPORT ur_result_t UR_APICALL urCommandBufferEnqueueExp( ur_exp_command_buffer_handle_t hCommandBuffer, ur_queue_handle_t hQueue, uint32_t numEventsInWaitList, const ur_event_handle_t *phEventWaitList, ur_event_handle_t *phEvent) { - (void)hCommandBuffer; - (void)hQueue; - (void)numEventsInWaitList; - (void)phEventWaitList; - (void)phEvent; - - detail::ur::die("Experimental Command-buffer feature is not " - "implemented for CUDA adapter."); - return UR_RESULT_ERROR_UNSUPPORTED_FEATURE; + ur_result_t Result = UR_RESULT_SUCCESS; + + try { + std::unique_ptr RetImplEvent{nullptr}; + ScopedContext Active(hQueue->getContext()); + uint32_t StreamToken; + ur_stream_guard_ Guard; + CUstream CuStream = hQueue->getNextComputeStream( + numEventsInWaitList, phEventWaitList, Guard, &StreamToken); + + if ((Result = enqueueEventsWait(hQueue, CuStream, numEventsInWaitList, + phEventWaitList)) != UR_RESULT_SUCCESS) { + return Result; + } + + if (phEvent) { + RetImplEvent = std::unique_ptr( + ur_event_handle_t_::makeNative(UR_COMMAND_COMMAND_BUFFER_ENQUEUE_EXP, + hQueue, CuStream, StreamToken)); + UR_CHECK_ERROR(RetImplEvent->start()); + } + + // Launch graph + UR_CHECK_ERROR(cuGraphLaunch(hCommandBuffer->CudaGraphExec, CuStream)); + + if (phEvent) { + UR_CHECK_ERROR(RetImplEvent->record()); + *phEvent = RetImplEvent.release(); + } + } catch (ur_result_t Err) { + Result = Err; + } + + return Result; } diff --git a/source/adapters/cuda/command_buffer.hpp b/source/adapters/cuda/command_buffer.hpp index 0fe0be0a6d..4ceab42062 100644 --- a/source/adapters/cuda/command_buffer.hpp +++ b/source/adapters/cuda/command_buffer.hpp @@ -9,7 +9,222 @@ //===----------------------------------------------------------------------===// #include +#include -/// Stub implementation of 
command-buffers for CUDA +#include "context.hpp" +#include +#include -struct ur_exp_command_buffer_handle_t_ {}; +static inline const char *getUrResultString(ur_result_t Result) { + switch (Result) { + case UR_RESULT_SUCCESS: + return "UR_RESULT_SUCCESS"; + case UR_RESULT_ERROR_INVALID_OPERATION: + return "UR_RESULT_ERROR_INVALID_OPERATION"; + case UR_RESULT_ERROR_INVALID_QUEUE_PROPERTIES: + return "UR_RESULT_ERROR_INVALID_QUEUE_PROPERTIES"; + case UR_RESULT_ERROR_INVALID_QUEUE: + return "UR_RESULT_ERROR_INVALID_QUEUE"; + case UR_RESULT_ERROR_INVALID_VALUE: + return "UR_RESULT_ERROR_INVALID_VALUE"; + case UR_RESULT_ERROR_INVALID_CONTEXT: + return "UR_RESULT_ERROR_INVALID_CONTEXT"; + case UR_RESULT_ERROR_INVALID_PLATFORM: + return "UR_RESULT_ERROR_INVALID_PLATFORM"; + case UR_RESULT_ERROR_INVALID_BINARY: + return "UR_RESULT_ERROR_INVALID_BINARY"; + case UR_RESULT_ERROR_INVALID_PROGRAM: + return "UR_RESULT_ERROR_INVALID_PROGRAM"; + case UR_RESULT_ERROR_INVALID_SAMPLER: + return "UR_RESULT_ERROR_INVALID_SAMPLER"; + case UR_RESULT_ERROR_INVALID_BUFFER_SIZE: + return "UR_RESULT_ERROR_INVALID_BUFFER_SIZE"; + case UR_RESULT_ERROR_INVALID_MEM_OBJECT: + return "UR_RESULT_ERROR_INVALID_MEM_OBJECT"; + case UR_RESULT_ERROR_INVALID_EVENT: + return "UR_RESULT_ERROR_INVALID_EVENT"; + case UR_RESULT_ERROR_INVALID_EVENT_WAIT_LIST: + return "UR_RESULT_ERROR_INVALID_EVENT_WAIT_LIST"; + case UR_RESULT_ERROR_MISALIGNED_SUB_BUFFER_OFFSET: + return "UR_RESULT_ERROR_MISALIGNED_SUB_BUFFER_OFFSET"; + case UR_RESULT_ERROR_INVALID_WORK_GROUP_SIZE: + return "UR_RESULT_ERROR_INVALID_WORK_GROUP_SIZE"; + case UR_RESULT_ERROR_COMPILER_NOT_AVAILABLE: + return "UR_RESULT_ERROR_COMPILER_NOT_AVAILABLE"; + case UR_RESULT_ERROR_PROFILING_INFO_NOT_AVAILABLE: + return "UR_RESULT_ERROR_PROFILING_INFO_NOT_AVAILABLE"; + case UR_RESULT_ERROR_DEVICE_NOT_FOUND: + return "UR_RESULT_ERROR_DEVICE_NOT_FOUND"; + case UR_RESULT_ERROR_INVALID_DEVICE: + return "UR_RESULT_ERROR_INVALID_DEVICE"; + case UR_RESULT_ERROR_DEVICE_LOST: + return "UR_RESULT_ERROR_DEVICE_LOST"; + case UR_RESULT_ERROR_DEVICE_REQUIRES_RESET: + return "UR_RESULT_ERROR_DEVICE_REQUIRES_RESET"; + case UR_RESULT_ERROR_DEVICE_IN_LOW_POWER_STATE: + return "UR_RESULT_ERROR_DEVICE_IN_LOW_POWER_STATE"; + case UR_RESULT_ERROR_DEVICE_PARTITION_FAILED: + return "UR_RESULT_ERROR_DEVICE_PARTITION_FAILED"; + case UR_RESULT_ERROR_INVALID_DEVICE_PARTITION_COUNT: + return "UR_RESULT_ERROR_INVALID_DEVICE_PARTITION_COUNT"; + case UR_RESULT_ERROR_INVALID_WORK_ITEM_SIZE: + return "UR_RESULT_ERROR_INVALID_WORK_ITEM_SIZE"; + case UR_RESULT_ERROR_INVALID_WORK_DIMENSION: + return "UR_RESULT_ERROR_INVALID_WORK_DIMENSION"; + case UR_RESULT_ERROR_INVALID_KERNEL_ARGS: + return "UR_RESULT_ERROR_INVALID_KERNEL_ARGS"; + case UR_RESULT_ERROR_INVALID_KERNEL: + return "UR_RESULT_ERROR_INVALID_KERNEL"; + case UR_RESULT_ERROR_INVALID_KERNEL_NAME: + return "UR_RESULT_ERROR_INVALID_KERNEL_NAME"; + case UR_RESULT_ERROR_INVALID_KERNEL_ARGUMENT_INDEX: + return "UR_RESULT_ERROR_INVALID_KERNEL_ARGUMENT_INDEX"; + case UR_RESULT_ERROR_INVALID_KERNEL_ARGUMENT_SIZE: + return "UR_RESULT_ERROR_INVALID_KERNEL_ARGUMENT_SIZE"; + case UR_RESULT_ERROR_INVALID_KERNEL_ATTRIBUTE_VALUE: + return "UR_RESULT_ERROR_INVALID_KERNEL_ATTRIBUTE_VALUE"; + case UR_RESULT_ERROR_INVALID_IMAGE_SIZE: + return "UR_RESULT_ERROR_INVALID_IMAGE_SIZE"; + case UR_RESULT_ERROR_INVALID_IMAGE_FORMAT_DESCRIPTOR: + return "UR_RESULT_ERROR_INVALID_IMAGE_FORMAT_DESCRIPTOR"; + case UR_RESULT_ERROR_IMAGE_FORMAT_NOT_SUPPORTED: + return 
"UR_RESULT_ERROR_IMAGE_FORMAT_NOT_SUPPORTED"; + case UR_RESULT_ERROR_MEM_OBJECT_ALLOCATION_FAILURE: + return "UR_RESULT_ERROR_MEM_OBJECT_ALLOCATION_FAILURE"; + case UR_RESULT_ERROR_INVALID_PROGRAM_EXECUTABLE: + return "UR_RESULT_ERROR_INVALID_PROGRAM_EXECUTABLE"; + case UR_RESULT_ERROR_UNINITIALIZED: + return "UR_RESULT_ERROR_UNINITIALIZED"; + case UR_RESULT_ERROR_OUT_OF_HOST_MEMORY: + return "UR_RESULT_ERROR_OUT_OF_HOST_MEMORY"; + case UR_RESULT_ERROR_OUT_OF_DEVICE_MEMORY: + return "UR_RESULT_ERROR_OUT_OF_DEVICE_MEMORY"; + case UR_RESULT_ERROR_OUT_OF_RESOURCES: + return "UR_RESULT_ERROR_OUT_OF_RESOURCES"; + case UR_RESULT_ERROR_PROGRAM_BUILD_FAILURE: + return "UR_RESULT_ERROR_PROGRAM_BUILD_FAILURE"; + case UR_RESULT_ERROR_PROGRAM_LINK_FAILURE: + return "UR_RESULT_ERROR_PROGRAM_LINK_FAILURE"; + case UR_RESULT_ERROR_UNSUPPORTED_VERSION: + return "UR_RESULT_ERROR_UNSUPPORTED_VERSION"; + case UR_RESULT_ERROR_UNSUPPORTED_FEATURE: + return "UR_RESULT_ERROR_UNSUPPORTED_FEATURE"; + case UR_RESULT_ERROR_INVALID_ARGUMENT: + return "UR_RESULT_ERROR_INVALID_ARGUMENT"; + case UR_RESULT_ERROR_INVALID_NULL_HANDLE: + return "UR_RESULT_ERROR_INVALID_NULL_HANDLE"; + case UR_RESULT_ERROR_HANDLE_OBJECT_IN_USE: + return "UR_RESULT_ERROR_HANDLE_OBJECT_IN_USE"; + case UR_RESULT_ERROR_INVALID_NULL_POINTER: + return "UR_RESULT_ERROR_INVALID_NULL_POINTER"; + case UR_RESULT_ERROR_INVALID_SIZE: + return "UR_RESULT_ERROR_INVALID_SIZE"; + case UR_RESULT_ERROR_UNSUPPORTED_SIZE: + return "UR_RESULT_ERROR_UNSUPPORTED_SIZE"; + case UR_RESULT_ERROR_UNSUPPORTED_ALIGNMENT: + return "UR_RESULT_ERROR_UNSUPPORTED_ALIGNMENT"; + case UR_RESULT_ERROR_INVALID_SYNCHRONIZATION_OBJECT: + return "UR_RESULT_ERROR_INVALID_SYNCHRONIZATION_OBJECT"; + case UR_RESULT_ERROR_INVALID_ENUMERATION: + return "UR_RESULT_ERROR_INVALID_ENUMERATION"; + case UR_RESULT_ERROR_UNSUPPORTED_ENUMERATION: + return "UR_RESULT_ERROR_UNSUPPORTED_ENUMERATION"; + case UR_RESULT_ERROR_UNSUPPORTED_IMAGE_FORMAT: + return "UR_RESULT_ERROR_UNSUPPORTED_IMAGE_FORMAT"; + case UR_RESULT_ERROR_INVALID_NATIVE_BINARY: + return "UR_RESULT_ERROR_INVALID_NATIVE_BINARY"; + case UR_RESULT_ERROR_INVALID_GLOBAL_NAME: + return "UR_RESULT_ERROR_INVALID_GLOBAL_NAME"; + case UR_RESULT_ERROR_INVALID_FUNCTION_NAME: + return "UR_RESULT_ERROR_INVALID_FUNCTION_NAME"; + case UR_RESULT_ERROR_INVALID_GROUP_SIZE_DIMENSION: + return "UR_RESULT_ERROR_INVALID_GROUP_SIZE_DIMENSION"; + case UR_RESULT_ERROR_INVALID_GLOBAL_WIDTH_DIMENSION: + return "UR_RESULT_ERROR_INVALID_GLOBAL_WIDTH_DIMENSION"; + case UR_RESULT_ERROR_PROGRAM_UNLINKED: + return "UR_RESULT_ERROR_PROGRAM_UNLINKED"; + case UR_RESULT_ERROR_OVERLAPPING_REGIONS: + return "UR_RESULT_ERROR_OVERLAPPING_REGIONS"; + case UR_RESULT_ERROR_INVALID_HOST_PTR: + return "UR_RESULT_ERROR_INVALID_HOST_PTR"; + case UR_RESULT_ERROR_INVALID_USM_SIZE: + return "UR_RESULT_ERROR_INVALID_USM_SIZE"; + case UR_RESULT_ERROR_OBJECT_ALLOCATION_FAILURE: + return "UR_RESULT_ERROR_OBJECT_ALLOCATION_FAILURE"; + case UR_RESULT_ERROR_ADAPTER_SPECIFIC: + return "UR_RESULT_ERROR_ADAPTER_SPECIFIC"; + default: + return "UR_RESULT_ERROR_UNKNOWN"; + } +} + +// Trace an internal UR call +#define UR_TRACE(Call) \ + { \ + ur_result_t Result; \ + UR_CALL(Call, Result); \ + } + +// Trace an internal UR call and return the result to the user. 
+#define UR_CALL(Call, Result)                                                 \
+  {                                                                           \
+    if (PrintTrace)                                                           \
+      fprintf(stderr, "UR ---> %s\n", #Call);                                 \
+    Result = (Call);                                                          \
+    if (PrintTrace)                                                           \
+      fprintf(stderr, "UR <--- %s(%s)\n", #Call, getUrResultString(Result));  \
+  }
+
+struct ur_exp_command_buffer_handle_t_ {
+
+  ur_exp_command_buffer_handle_t_(ur_context_handle_t hContext,
+                                  ur_device_handle_t hDevice);
+
+  ~ur_exp_command_buffer_handle_t_();
+
+  void RegisterSyncPoint(ur_exp_command_buffer_sync_point_t SyncPoint,
+                         std::shared_ptr<CUgraphNode> CuNode) {
+    SyncPoints[SyncPoint] = CuNode;
+    NextSyncPoint++;
+  }
+
+  ur_exp_command_buffer_sync_point_t GetNextSyncPoint() const {
+    return NextSyncPoint;
+  }
+
+  // Helper to register next sync point
+  // @param CuNode Node to register as next sync point
+  // @return Sync point associated with the registered node
+  ur_exp_command_buffer_sync_point_t
+  AddSyncPoint(std::shared_ptr<CUgraphNode> CuNode) {
+    ur_exp_command_buffer_sync_point_t SyncPoint = NextSyncPoint;
+    RegisterSyncPoint(SyncPoint, CuNode);
+    return SyncPoint;
+  }
+
+  // UR context associated with this command-buffer
+  ur_context_handle_t Context;
+  // Device associated with this command buffer
+  ur_device_handle_t Device;
+  // Cuda Graph handle
+  CUgraph CudaGraph;
+  // Cuda Graph Exec handle
+  CUgraphExec CudaGraphExec;
+  // Atomic variable counting the number of references to this command_buffer
+  // using std::atomic prevents data races when incrementing/decrementing.
+  std::atomic_uint32_t RefCount;
+
+  // Map of sync_points to CUDA graph nodes
+  std::unordered_map<ur_exp_command_buffer_sync_point_t,
+                     std::shared_ptr<CUgraphNode>>
+      SyncPoints;
+  // Next sync_point value (may need to consider ways to reuse values if 32-bits
+  // is not enough)
+  ur_exp_command_buffer_sync_point_t NextSyncPoint;
+
+  // Used when retaining an object.
+  uint32_t incrementReferenceCount() noexcept { return ++RefCount; }
+  // Used when releasing an object.
+  uint32_t decrementReferenceCount() noexcept { return --RefCount; }
+  uint32_t getReferenceCount() const noexcept { return RefCount; }
+};
diff --git a/source/adapters/cuda/device.cpp b/source/adapters/cuda/device.cpp
index 0c00210eb2..76984ca744 100644
--- a/source/adapters/cuda/device.cpp
+++ b/source/adapters/cuda/device.cpp
@@ -613,6 +613,8 @@ UR_APIEXPORT ur_result_t UR_APICALL urDeviceGetInfo(ur_device_handle_t hDevice,
     std::string SupportedExtensions = "cl_khr_fp64 cl_khr_subgroups ";
     SupportedExtensions += "pi_ext_intel_devicelib_assert ";
+    // Report support for the UR command-buffer experimental feature
+    SupportedExtensions += "ur_exp_command_buffer ";
     SupportedExtensions += " ";
 
     int Major = 0;
diff --git a/source/adapters/cuda/enqueue.cpp b/source/adapters/cuda/enqueue.cpp
index 1022be1f09..6e6515908e 100644
--- a/source/adapters/cuda/enqueue.cpp
+++ b/source/adapters/cuda/enqueue.cpp
@@ -187,6 +187,146 @@ bool hasExceededMaxRegistersPerBlock(ur_device_handle_t Device,
   return BlockSize * Kernel->getRegsPerThread() > Device->getMaxRegsPerBlock();
 }
 
+// Helper to compute kernel parameters from workload
+// dimensions.
+// @param [in] Context handler to the target Context +// @param [in] Device handler to the target Device +// @param [in] WorkDim workload dimension +// @param [in] GlobalWorkOffset pointer workload global offsets +// @param [in] LocalWorkOffset pointer workload local offsets +// @param [inout] Kernel handler to the kernel +// @param [inout] CuFunc handler to the cuda function attached to the kernel +// @param [out] ThreadsPerBlock Number of threads per block we should run +// @param [out] BlocksPerGrid Number of blocks per grid we should run +ur_result_t +setKernelParams(const ur_context_handle_t Context, + const ur_device_handle_t Device, const uint32_t WorkDim, + const size_t *GlobalWorkOffset, const size_t *GlobalWorkSize, + const size_t *LocalWorkSize, ur_kernel_handle_t &Kernel, + CUfunction &CuFunc, size_t (&ThreadsPerBlock)[3], + size_t (&BlocksPerGrid)[3]) { + ur_result_t Result = UR_RESULT_SUCCESS; + size_t MaxWorkGroupSize = 0u; + size_t MaxThreadsPerBlock[3] = {}; + bool ProvidedLocalWorkGroupSize = LocalWorkSize != nullptr; + uint32_t LocalSize = Kernel->getLocalSize(); + + try { + // Set the active context here as guessLocalWorkSize needs an active context + ScopedContext Active(Context); + { + size_t *ReqdThreadsPerBlock = Kernel->ReqdThreadsPerBlock; + MaxWorkGroupSize = Device->getMaxWorkGroupSize(); + Device->getMaxWorkItemSizes(sizeof(MaxThreadsPerBlock), + MaxThreadsPerBlock); + + if (ProvidedLocalWorkGroupSize) { + auto IsValid = [&](int Dim) { + if (ReqdThreadsPerBlock[Dim] != 0 && + LocalWorkSize[Dim] != ReqdThreadsPerBlock[Dim]) + return UR_RESULT_ERROR_INVALID_WORK_GROUP_SIZE; + + if (LocalWorkSize[Dim] > MaxThreadsPerBlock[Dim]) + return UR_RESULT_ERROR_INVALID_WORK_GROUP_SIZE; + // Checks that local work sizes are a divisor of the global work sizes + // which includes that the local work sizes are neither larger than + // the global work sizes and not 0. + if (0u == LocalWorkSize[Dim]) + return UR_RESULT_ERROR_INVALID_WORK_GROUP_SIZE; + if (0u != (GlobalWorkSize[Dim] % LocalWorkSize[Dim])) + return UR_RESULT_ERROR_INVALID_WORK_GROUP_SIZE; + ThreadsPerBlock[Dim] = LocalWorkSize[Dim]; + return UR_RESULT_SUCCESS; + }; + + size_t KernelLocalWorkGroupSize = 0; + for (size_t Dim = 0; Dim < WorkDim; Dim++) { + auto Err = IsValid(Dim); + if (Err != UR_RESULT_SUCCESS) + return Err; + // If no error then sum the total local work size per dim. 
+ KernelLocalWorkGroupSize += LocalWorkSize[Dim]; + } + + if (hasExceededMaxRegistersPerBlock(Device, Kernel, + KernelLocalWorkGroupSize)) { + return UR_RESULT_ERROR_INVALID_WORK_GROUP_SIZE; + } + } else { + guessLocalWorkSize(Device, ThreadsPerBlock, GlobalWorkSize, WorkDim, + MaxThreadsPerBlock, Kernel, LocalSize); + } + } + + if (MaxWorkGroupSize < + ThreadsPerBlock[0] * ThreadsPerBlock[1] * ThreadsPerBlock[2]) { + return UR_RESULT_ERROR_INVALID_WORK_GROUP_SIZE; + } + + for (size_t i = 0; i < WorkDim; i++) { + BlocksPerGrid[i] = + (GlobalWorkSize[i] + ThreadsPerBlock[i] - 1) / ThreadsPerBlock[i]; + } + + // Set the implicit global offset parameter if kernel has offset variant + if (Kernel->get_with_offset_parameter()) { + std::uint32_t CudaImplicitOffset[3] = {0, 0, 0}; + if (GlobalWorkOffset) { + for (size_t i = 0; i < WorkDim; i++) { + CudaImplicitOffset[i] = + static_cast(GlobalWorkOffset[i]); + if (GlobalWorkOffset[i] != 0) { + CuFunc = Kernel->get_with_offset_parameter(); + } + } + } + Kernel->setImplicitOffsetArg(sizeof(CudaImplicitOffset), + CudaImplicitOffset); + } + + if (Context->getDevice()->maxLocalMemSizeChosen()) { + // Set up local memory requirements for kernel. + auto Device = Context->getDevice(); + if (Device->getMaxChosenLocalMem() < 0) { + bool EnvVarHasURPrefix = + std::getenv("UR_CUDA_MAX_LOCAL_MEM_SIZE") != nullptr; + setErrorMessage(EnvVarHasURPrefix ? "Invalid value specified for " + "UR_CUDA_MAX_LOCAL_MEM_SIZE" + : "Invalid value specified for " + "SYCL_PI_CUDA_MAX_LOCAL_MEM_SIZE", + UR_RESULT_ERROR_ADAPTER_SPECIFIC); + return UR_RESULT_ERROR_ADAPTER_SPECIFIC; + } + if (LocalSize > static_cast(Device->getMaxCapacityLocalMem())) { + setErrorMessage("Too much local memory allocated for device", + UR_RESULT_ERROR_ADAPTER_SPECIFIC); + return UR_RESULT_ERROR_ADAPTER_SPECIFIC; + } + if (LocalSize > static_cast(Device->getMaxChosenLocalMem())) { + bool EnvVarHasURPrefix = + std::getenv("UR_CUDA_MAX_LOCAL_MEM_SIZE") != nullptr; + setErrorMessage( + EnvVarHasURPrefix + ? "Local memory for kernel exceeds the amount requested using " + "UR_CUDA_MAX_LOCAL_MEM_SIZE. Try increasing the value of " + "UR_CUDA_MAX_LOCAL_MEM_SIZE." + : "Local memory for kernel exceeds the amount requested using " + "SYCL_PI_CUDA_MAX_LOCAL_MEM_SIZE. Try increasing the the " + "value of SYCL_PI_CUDA_MAX_LOCAL_MEM_SIZE.", + UR_RESULT_ERROR_ADAPTER_SPECIFIC); + return UR_RESULT_ERROR_ADAPTER_SPECIFIC; + } + UR_CHECK_ERROR(cuFuncSetAttribute( + CuFunc, CU_FUNC_ATTRIBUTE_MAX_DYNAMIC_SHARED_SIZE_BYTES, + Device->getMaxChosenLocalMem())); + } + + } catch (ur_result_t Err) { + Result = Err; + } + return Result; +} + /// Enqueues a wait on the given CUstream for all specified events (See /// \ref enqueueEventWaitWithBarrier.) If the events list is empty, the enqueued /// wait will wait on all previous events in the queue. 
@@ -291,100 +431,30 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueKernelLaunch( // Set the number of threads per block to the number of threads per warp // by default unless user has provided a better number size_t ThreadsPerBlock[3] = {32u, 1u, 1u}; - size_t MaxWorkGroupSize = 0u; - size_t MaxThreadsPerBlock[3] = {}; - bool ProvidedLocalWorkGroupSize = (pLocalWorkSize != nullptr); + size_t BlocksPerGrid[3] = {1u, 1u, 1u}; + uint32_t LocalSize = hKernel->getLocalSize(); ur_result_t Result = UR_RESULT_SUCCESS; + CUfunction CuFunc = hKernel->get(); - try { - // Set the active context here as guessLocalWorkSize needs an active context - ScopedContext Active(hQueue->getContext()); - { - size_t *ReqdThreadsPerBlock = hKernel->ReqdThreadsPerBlock; - MaxWorkGroupSize = hQueue->Device->getMaxWorkGroupSize(); - hQueue->Device->getMaxWorkItemSizes(sizeof(MaxThreadsPerBlock), - MaxThreadsPerBlock); - - if (ProvidedLocalWorkGroupSize) { - auto IsValid = [&](int Dim) { - if (ReqdThreadsPerBlock[Dim] != 0 && - pLocalWorkSize[Dim] != ReqdThreadsPerBlock[Dim]) - return UR_RESULT_ERROR_INVALID_WORK_GROUP_SIZE; - - if (pLocalWorkSize[Dim] > MaxThreadsPerBlock[Dim]) - return UR_RESULT_ERROR_INVALID_WORK_GROUP_SIZE; - // Checks that local work sizes are a divisor of the global work sizes - // which includes that the local work sizes are neither larger than - // the global work sizes and not 0. - if (0u == pLocalWorkSize[Dim]) - return UR_RESULT_ERROR_INVALID_WORK_GROUP_SIZE; - if (0u != (pGlobalWorkSize[Dim] % pLocalWorkSize[Dim])) - return UR_RESULT_ERROR_INVALID_WORK_GROUP_SIZE; - ThreadsPerBlock[Dim] = pLocalWorkSize[Dim]; - return UR_RESULT_SUCCESS; - }; - - size_t KernelLocalWorkGroupSize = 0; - for (size_t Dim = 0; Dim < workDim; Dim++) { - auto Err = IsValid(Dim); - if (Err != UR_RESULT_SUCCESS) - return Err; - // If no error then sum the total local work size per dim. 
- KernelLocalWorkGroupSize += pLocalWorkSize[Dim]; - } - - if (hasExceededMaxRegistersPerBlock(hQueue->Device, hKernel, - KernelLocalWorkGroupSize)) { - return UR_RESULT_ERROR_INVALID_WORK_GROUP_SIZE; - } - } else { - guessLocalWorkSize(hQueue->Device, ThreadsPerBlock, pGlobalWorkSize, - workDim, MaxThreadsPerBlock, hKernel, LocalSize); - } - } - - if (MaxWorkGroupSize < - ThreadsPerBlock[0] * ThreadsPerBlock[1] * ThreadsPerBlock[2]) { - return UR_RESULT_ERROR_INVALID_WORK_GROUP_SIZE; - } - - size_t BlocksPerGrid[3] = {1u, 1u, 1u}; - - for (size_t i = 0; i < workDim; i++) { - BlocksPerGrid[i] = - (pGlobalWorkSize[i] + ThreadsPerBlock[i] - 1) / ThreadsPerBlock[i]; - } + Result = setKernelParams(hQueue->getContext(), hQueue->Device, workDim, + pGlobalWorkOffset, pGlobalWorkSize, pLocalWorkSize, + hKernel, CuFunc, ThreadsPerBlock, BlocksPerGrid); + if (Result != UR_RESULT_SUCCESS) { + return Result; + } + try { std::unique_ptr RetImplEvent{nullptr}; uint32_t StreamToken; ur_stream_guard_ Guard; CUstream CuStream = hQueue->getNextComputeStream( numEventsInWaitList, phEventWaitList, Guard, &StreamToken); - CUfunction CuFunc = hKernel->get(); Result = enqueueEventsWait(hQueue, CuStream, numEventsInWaitList, phEventWaitList); - // Set the implicit global offset parameter if kernel has offset variant - if (hKernel->get_with_offset_parameter()) { - std::uint32_t CudaImplicitOffset[3] = {0, 0, 0}; - if (pGlobalWorkOffset) { - for (size_t i = 0; i < workDim; i++) { - CudaImplicitOffset[i] = - static_cast(pGlobalWorkOffset[i]); - if (pGlobalWorkOffset[i] != 0) { - CuFunc = hKernel->get_with_offset_parameter(); - } - } - } - hKernel->setImplicitOffsetArg(sizeof(CudaImplicitOffset), - CudaImplicitOffset); - } - - auto &ArgIndices = hKernel->getArgIndices(); - if (phEvent) { RetImplEvent = std::unique_ptr(ur_event_handle_t_::makeNative( @@ -392,47 +462,12 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueKernelLaunch( UR_CHECK_ERROR(RetImplEvent->start()); } - if (hQueue->getContext()->getDevice()->maxLocalMemSizeChosen()) { - // Set up local memory requirements for kernel. - auto Device = hQueue->getContext()->getDevice(); - if (Device->getMaxChosenLocalMem() < 0) { - bool EnvVarHasURPrefix = - (std::getenv("UR_CUDA_MAX_LOCAL_MEM_SIZE") != nullptr); - setErrorMessage(EnvVarHasURPrefix ? "Invalid value specified for " - "UR_CUDA_MAX_LOCAL_MEM_SIZE" - : "Invalid value specified for " - "SYCL_PI_CUDA_MAX_LOCAL_MEM_SIZE", - UR_RESULT_ERROR_ADAPTER_SPECIFIC); - return UR_RESULT_ERROR_ADAPTER_SPECIFIC; - } - if (LocalSize > static_cast(Device->getMaxCapacityLocalMem())) { - setErrorMessage("Too much local memory allocated for device", - UR_RESULT_ERROR_ADAPTER_SPECIFIC); - return UR_RESULT_ERROR_ADAPTER_SPECIFIC; - } - if (LocalSize > static_cast(Device->getMaxChosenLocalMem())) { - bool EnvVarHasURPrefix = - (std::getenv("UR_CUDA_MAX_LOCAL_MEM_SIZE") != nullptr); - setErrorMessage( - EnvVarHasURPrefix - ? "Local memory for kernel exceeds the amount requested using " - "UR_CUDA_MAX_LOCAL_MEM_SIZE. Try increasing the value of " - "UR_CUDA_MAX_LOCAL_MEM_SIZE." - : "Local memory for kernel exceeds the amount requested using " - "SYCL_PI_CUDA_MAX_LOCAL_MEM_SIZE. 
Try increasing the the " - "value of SYCL_PI_CUDA_MAX_LOCAL_MEM_SIZE.", - UR_RESULT_ERROR_ADAPTER_SPECIFIC); - return UR_RESULT_ERROR_ADAPTER_SPECIFIC; - } - UR_CHECK_ERROR(cuFuncSetAttribute( - CuFunc, CU_FUNC_ATTRIBUTE_MAX_DYNAMIC_SHARED_SIZE_BYTES, - Device->getMaxChosenLocalMem())); - } - + auto &ArgIndices = hKernel->getArgIndices(); UR_CHECK_ERROR(cuLaunchKernel( CuFunc, BlocksPerGrid[0], BlocksPerGrid[1], BlocksPerGrid[2], ThreadsPerBlock[0], ThreadsPerBlock[1], ThreadsPerBlock[2], LocalSize, CuStream, const_cast(ArgIndices.data()), nullptr)); + if (LocalSize != 0) hKernel->clearLocalSize(); @@ -440,29 +475,23 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueKernelLaunch( UR_CHECK_ERROR(RetImplEvent->record()); *phEvent = RetImplEvent.release(); } + } catch (ur_result_t Err) { Result = Err; } return Result; } -/// General 3D memory copy operation. -/// This function requires the corresponding CUDA context to be at the top of -/// the context stack +/// Set parameters for general 3D memory copy. /// If the source and/or destination is on the device, SrcPtr and/or DstPtr /// must be a pointer to a CUdeviceptr -static ur_result_t commonEnqueueMemBufferCopyRect( - CUstream cu_stream, ur_rect_region_t region, const void *SrcPtr, - const CUmemorytype_enum SrcType, ur_rect_offset_t src_offset, - size_t src_row_pitch, size_t src_slice_pitch, void *DstPtr, - const CUmemorytype_enum DstType, ur_rect_offset_t dst_offset, - size_t dst_row_pitch, size_t dst_slice_pitch) { - - UR_ASSERT(SrcType == CU_MEMORYTYPE_DEVICE || SrcType == CU_MEMORYTYPE_HOST, - UR_RESULT_ERROR_INVALID_MEM_OBJECT); - UR_ASSERT(DstType == CU_MEMORYTYPE_DEVICE || DstType == CU_MEMORYTYPE_HOST, - UR_RESULT_ERROR_INVALID_MEM_OBJECT); - +void setCopyRectParams(ur_rect_region_t region, const void *SrcPtr, + const CUmemorytype_enum SrcType, + ur_rect_offset_t src_offset, size_t src_row_pitch, + size_t src_slice_pitch, void *DstPtr, + const CUmemorytype_enum DstType, + ur_rect_offset_t dst_offset, size_t dst_row_pitch, + size_t dst_slice_pitch, CUDA_MEMCPY3D ¶ms) { src_row_pitch = (!src_row_pitch) ? region.width + src_offset.x : src_row_pitch; src_slice_pitch = (!src_slice_pitch) @@ -474,8 +503,6 @@ static ur_result_t commonEnqueueMemBufferCopyRect( ? ((region.height + dst_offset.y) * dst_row_pitch) : dst_slice_pitch; - CUDA_MEMCPY3D params = {}; - params.WidthInBytes = region.width; params.Height = region.height; params.Depth = region.depth; @@ -500,6 +527,29 @@ static ur_result_t commonEnqueueMemBufferCopyRect( params.dstZ = dst_offset.z; params.dstPitch = dst_row_pitch; params.dstHeight = dst_slice_pitch / dst_row_pitch; +} + +/// General 3D memory copy operation. 
+/// This function requires the corresponding CUDA context to be at the top of +/// the context stack +/// If the source and/or destination is on the device, SrcPtr and/or DstPtr +/// must be a pointer to a CUdeviceptr +static ur_result_t commonEnqueueMemBufferCopyRect( + CUstream cu_stream, ur_rect_region_t region, const void *SrcPtr, + const CUmemorytype_enum SrcType, ur_rect_offset_t src_offset, + size_t src_row_pitch, size_t src_slice_pitch, void *DstPtr, + const CUmemorytype_enum DstType, ur_rect_offset_t dst_offset, + size_t dst_row_pitch, size_t dst_slice_pitch) { + UR_ASSERT(SrcType == CU_MEMORYTYPE_DEVICE || SrcType == CU_MEMORYTYPE_HOST, + UR_RESULT_ERROR_INVALID_MEM_OBJECT); + UR_ASSERT(DstType == CU_MEMORYTYPE_DEVICE || DstType == CU_MEMORYTYPE_HOST, + UR_RESULT_ERROR_INVALID_MEM_OBJECT); + + CUDA_MEMCPY3D params = {}; + + setCopyRectParams(region, SrcPtr, SrcType, src_offset, src_row_pitch, + src_slice_pitch, DstPtr, DstType, dst_offset, dst_row_pitch, + dst_slice_pitch, params); UR_CHECK_ERROR(cuMemcpy3DAsync(¶ms, cu_stream)); diff --git a/source/adapters/cuda/enqueue.hpp b/source/adapters/cuda/enqueue.hpp index 7226a5536b..64c590f742 100644 --- a/source/adapters/cuda/enqueue.hpp +++ b/source/adapters/cuda/enqueue.hpp @@ -16,3 +16,28 @@ ur_result_t enqueueEventsWait(ur_queue_handle_t CommandQueue, CUstream Stream, uint32_t NumEventsInWaitList, const ur_event_handle_t *EventWaitList); + +void guessLocalWorkSize(ur_device_handle_t Device, size_t *ThreadsPerBlock, + const size_t *GlobalWorkSize, const uint32_t WorkDim, + const size_t MaxThreadsPerBlock[3], + ur_kernel_handle_t Kernel, uint32_t LocalSize); + +bool hasExceededMaxRegistersPerBlock(ur_device_handle_t Device, + ur_kernel_handle_t Kernel, + size_t BlockSize); + +ur_result_t +setKernelParams(const ur_context_handle_t Context, + const ur_device_handle_t Device, const uint32_t WorkDim, + const size_t *GlobalWorkOffset, const size_t *GlobalWorkSize, + const size_t *LocalWorkSize, ur_kernel_handle_t &Kernel, + CUfunction &CuFunc, size_t (&ThreadsPerBlock)[3], + size_t (&BlocksPerGrid)[3]); + +void setCopyRectParams(ur_rect_region_t region, const void *SrcPtr, + const CUmemorytype_enum SrcType, + ur_rect_offset_t src_offset, size_t src_row_pitch, + size_t src_slice_pitch, void *DstPtr, + const CUmemorytype_enum DstType, + ur_rect_offset_t dst_offset, size_t dst_row_pitch, + size_t dst_slice_pitch, CUDA_MEMCPY3D ¶ms);