diff --git a/sycl/doc/extensions/supported/sycl_ext_oneapi_queue_priority.asciidoc b/sycl/doc/extensions/supported/sycl_ext_oneapi_queue_priority.asciidoc index 34d6319844061..b1158d7b4ada8 100644 --- a/sycl/doc/extensions/supported/sycl_ext_oneapi_queue_priority.asciidoc +++ b/sycl/doc/extensions/supported/sycl_ext_oneapi_queue_priority.asciidoc @@ -45,7 +45,8 @@ This extension is implemented and fully supported by {dpcpp}. [NOTE] ==== Although {dpcpp} supports this extension on all backends, it is currently used -only on Level Zero. Other backends ignore the properties defined in this specification. +only on Level Zero and CUDA. +Other backends ignore the properties defined in this specification. ==== == Overview @@ -104,4 +105,4 @@ properties are hints and may safely be ignored by an implementation. It is illegal to specify multiple differrent priority hints for the same queue. Doing so causes the `queue` constructor to throw a synchronous `exception` with -the `errc::invalid` error code. \ No newline at end of file +the `errc::invalid` error code. diff --git a/sycl/plugins/unified_runtime/ur/adapters/cuda/queue.cpp b/sycl/plugins/unified_runtime/ur/adapters/cuda/queue.cpp index ed356275fe84c..2a3d18994991c 100644 --- a/sycl/plugins/unified_runtime/ur/adapters/cuda/queue.cpp +++ b/sycl/plugins/unified_runtime/ur/adapters/cuda/queue.cpp @@ -40,8 +40,8 @@ CUstream ur_queue_handle_t_::getNextComputeStream(uint32_t *StreamToken) { // The second check is done after mutex is locked so other threads can not // change NumComputeStreams after that if (NumComputeStreams < ComputeStreams.size()) { - UR_CHECK_ERROR( - cuStreamCreate(&ComputeStreams[NumComputeStreams++], Flags)); + UR_CHECK_ERROR(cuStreamCreateWithPriority( + &ComputeStreams[NumComputeStreams++], Flags, Priority)); } } Token = ComputeStreamIndex++; @@ -101,8 +101,8 @@ CUstream ur_queue_handle_t_::getNextTransferStream() { // The second check is done after mutex is locked so other threads can not // change NumTransferStreams after that if (NumTransferStreams < TransferStreams.size()) { - UR_CHECK_ERROR( - cuStreamCreate(&TransferStreams[NumTransferStreams++], Flags)); + UR_CHECK_ERROR(cuStreamCreateWithPriority( + &TransferStreams[NumTransferStreams++], Flags, Priority)); } } uint32_t StreamI = TransferStreamIndex++ % TransferStreams.size(); @@ -128,6 +128,8 @@ urQueueCreate(ur_context_handle_t hContext, ur_device_handle_t hDevice, unsigned int Flags = CU_STREAM_NON_BLOCKING; ur_queue_flags_t URFlags = 0; + // '0' is the default priority, per CUDA Toolkit 12.2 and earlier + int Priority = 0; bool IsOutOfOrder = false; if (pProps && pProps->stype == UR_STRUCTURE_TYPE_QUEUE_PROPERTIES) { URFlags = pProps->flags; @@ -140,6 +142,13 @@ urQueueCreate(ur_context_handle_t hContext, ur_device_handle_t hDevice, if (URFlags & UR_QUEUE_FLAG_OUT_OF_ORDER_EXEC_MODE_ENABLE) { IsOutOfOrder = true; } + if (URFlags & UR_QUEUE_FLAG_PRIORITY_HIGH) { + ScopedContext Active(hContext); + UR_CHECK_ERROR(cuCtxGetStreamPriorityRange(nullptr, &Priority)); + } else if (URFlags & UR_QUEUE_FLAG_PRIORITY_LOW) { + ScopedContext Active(hContext); + UR_CHECK_ERROR(cuCtxGetStreamPriorityRange(&Priority, nullptr)); + } } std::vector ComputeCuStreams( @@ -149,7 +158,7 @@ urQueueCreate(ur_context_handle_t hContext, ur_device_handle_t hDevice, Queue = std::unique_ptr(new ur_queue_handle_t_{ std::move(ComputeCuStreams), std::move(TransferCuStreams), hContext, - hDevice, Flags, URFlags}); + hDevice, Flags, URFlags, Priority}); *phQueue = Queue.release(); diff --git a/sycl/plugins/unified_runtime/ur/adapters/cuda/queue.hpp b/sycl/plugins/unified_runtime/ur/adapters/cuda/queue.hpp index f76124bb713eb..4f2721b13aed6 100644 --- a/sycl/plugins/unified_runtime/ur/adapters/cuda/queue.hpp +++ b/sycl/plugins/unified_runtime/ur/adapters/cuda/queue.hpp @@ -47,6 +47,7 @@ struct ur_queue_handle_t_ { unsigned int LastSyncTransferStreams; unsigned int Flags; ur_queue_flags_t URFlags; + int Priority; // When ComputeStreamSyncMutex and ComputeStreamMutex both need to be // locked at the same time, ComputeStreamSyncMutex should be locked first // to avoid deadlocks @@ -59,7 +60,7 @@ struct ur_queue_handle_t_ { ur_queue_handle_t_(std::vector &&ComputeStreams, std::vector &&TransferStreams, ur_context_handle_t_ *Context, ur_device_handle_t_ *Device, - unsigned int Flags, ur_queue_flags_t URFlags, + unsigned int Flags, ur_queue_flags_t URFlags, int Priority, bool BackendOwns = true) : ComputeStreams{std::move(ComputeStreams)}, TransferStreams{std::move(TransferStreams)}, @@ -69,7 +70,7 @@ struct ur_queue_handle_t_ { Device{Device}, RefCount{1}, EventCount{0}, ComputeStreamIndex{0}, TransferStreamIndex{0}, NumComputeStreams{0}, NumTransferStreams{0}, LastSyncComputeStreams{0}, LastSyncTransferStreams{0}, Flags(Flags), - URFlags(URFlags), HasOwnership{BackendOwns} { + URFlags(URFlags), Priority(Priority), HasOwnership{BackendOwns} { urContextRetain(Context); urDeviceRetain(Device); } diff --git a/sycl/test-e2e/Plugin/cuda_queue_priority.cpp b/sycl/test-e2e/Plugin/cuda_queue_priority.cpp new file mode 100644 index 0000000000000..c9e69ae6d3256 --- /dev/null +++ b/sycl/test-e2e/Plugin/cuda_queue_priority.cpp @@ -0,0 +1,47 @@ +// REQUIRES: gpu, cuda + +// RUN: %{build} %cuda_options -o %t.out +// RUN: %{run} %t.out +// +// Check that queue priority is passed to CUDA runtime +#define SYCL_EXT_ONEAPI_BACKEND_CUDA_EXPERIMENTAL 1 +#include +#include + +#include + +#include + +int get_real_priority(sycl::context &C, sycl::device &D, + sycl::property_list Props) { + sycl::queue Q(C, D, Props); + CUstream QNative = sycl::get_native(Q); + int P; + CUresult Result = cuStreamGetPriority(QNative, &P); + assert(Result == CUDA_SUCCESS && "cuStreamGetPriority call failed"); + return P; +} + +int main(int Argc, const char *Argv[]) { + sycl::device D; + sycl::context C{D}; + + int PrioDefault = get_real_priority(C, D, sycl::property_list{}); + int PrioNormal = get_real_priority( + C, D, {sycl::ext::oneapi::property::queue::priority_normal{}}); + int PrioHigh = get_real_priority( + C, D, {sycl::ext::oneapi::property::queue::priority_high{}}); + int PrioLow = get_real_priority( + C, D, {sycl::ext::oneapi::property::queue::priority_low{}}); + // Lower value means higher priority + assert(PrioDefault == PrioNormal && + "priority_normal is not the same as default"); + assert(PrioHigh <= PrioNormal && + "priority_high is lower than priority_normal"); + assert(PrioLow >= PrioNormal && + "priority_low is higher than priority_normal"); + assert(PrioLow > PrioHigh && "priority_low is the same as priority_high"); + + std::cout << "The test passed." << std::endl; + return 0; +}