intel · steffenlarsen · Oct 2, 2023 · Sep 25, 2023 · Sep 25, 2023 · Sep 28, 2023
@@ -45,7 +45,8 @@ This extension is implemented and fully supported by {dpcpp}.
 [NOTE]
 ====
 Although {dpcpp} supports this extension on all backends, it is currently used
-only on Level Zero. Other backends ignore the properties defined in this specification.
+only on Level Zero and CUDA.
+Other backends ignore the properties defined in this specification.
 ====
 
 == Overview
@@ -104,4 +105,4 @@ properties are hints and may safely be ignored by an implementation.
 
 It is illegal to specify multiple differrent priority hints for the same queue.
 Doing so causes the `queue` constructor to throw a synchronous `exception` with
-the `errc::invalid` error code.
+the `errc::invalid` error code.
@@ -40,8 +40,8 @@ CUstream ur_queue_handle_t_::getNextComputeStream(uint32_t *StreamToken) {
       // The second check is done after mutex is locked so other threads can not
       // change NumComputeStreams after that
       if (NumComputeStreams < ComputeStreams.size()) {
-        UR_CHECK_ERROR(
-            cuStreamCreate(&ComputeStreams[NumComputeStreams++], Flags));
+        UR_CHECK_ERROR(cuStreamCreateWithPriority(
+            &ComputeStreams[NumComputeStreams++], Flags, Priority));
       }
     }
     Token = ComputeStreamIndex++;
@@ -101,8 +101,8 @@ CUstream ur_queue_handle_t_::getNextTransferStream() {
     // The second check is done after mutex is locked so other threads can not
     // change NumTransferStreams after that
     if (NumTransferStreams < TransferStreams.size()) {
-      UR_CHECK_ERROR(
-          cuStreamCreate(&TransferStreams[NumTransferStreams++], Flags));
+      UR_CHECK_ERROR(cuStreamCreateWithPriority(
+          &TransferStreams[NumTransferStreams++], Flags, Priority));
     }
   }
   uint32_t StreamI = TransferStreamIndex++ % TransferStreams.size();
@@ -128,6 +128,8 @@ urQueueCreate(ur_context_handle_t hContext, ur_device_handle_t hDevice,
 
     unsigned int Flags = CU_STREAM_NON_BLOCKING;
     ur_queue_flags_t URFlags = 0;
+    // '0' is the default priority, per CUDA Toolkit 12.2 and earlier
+    int Priority = 0;
     bool IsOutOfOrder = false;
     if (pProps && pProps->stype == UR_STRUCTURE_TYPE_QUEUE_PROPERTIES) {
       URFlags = pProps->flags;
@@ -140,6 +142,13 @@ urQueueCreate(ur_context_handle_t hContext, ur_device_handle_t hDevice,
       if (URFlags & UR_QUEUE_FLAG_OUT_OF_ORDER_EXEC_MODE_ENABLE) {
         IsOutOfOrder = true;
       }
+      if (URFlags & UR_QUEUE_FLAG_PRIORITY_HIGH) {
+        ScopedContext Active(hContext);
+        UR_CHECK_ERROR(cuCtxGetStreamPriorityRange(nullptr, &Priority));
+      } else if (URFlags & UR_QUEUE_FLAG_PRIORITY_LOW) {
+        ScopedContext Active(hContext);
+        UR_CHECK_ERROR(cuCtxGetStreamPriorityRange(&Priority, nullptr));
+      }
     }
 
     std::vector<CUstream> ComputeCuStreams(
@@ -149,7 +158,7 @@ urQueueCreate(ur_context_handle_t hContext, ur_device_handle_t hDevice,
 
     Queue = std::unique_ptr<ur_queue_handle_t_>(new ur_queue_handle_t_{
         std::move(ComputeCuStreams), std::move(TransferCuStreams), hContext,
-        hDevice, Flags, URFlags});
+        hDevice, Flags, URFlags, Priority});
 
     *phQueue = Queue.release();
 

@@ -47,6 +47,7 @@ struct ur_queue_handle_t_ {
   unsigned int LastSyncTransferStreams;
   unsigned int Flags;
   ur_queue_flags_t URFlags;
+  int Priority;
   // When ComputeStreamSyncMutex and ComputeStreamMutex both need to be
   // locked at the same time, ComputeStreamSyncMutex should be locked first
   // to avoid deadlocks
@@ -59,7 +60,7 @@ struct ur_queue_handle_t_ {
   ur_queue_handle_t_(std::vector<CUstream> &&ComputeStreams,
                      std::vector<CUstream> &&TransferStreams,
                      ur_context_handle_t_ *Context, ur_device_handle_t_ *Device,
-                     unsigned int Flags, ur_queue_flags_t URFlags,
+                     unsigned int Flags, ur_queue_flags_t URFlags, int Priority,
                      bool BackendOwns = true)
       : ComputeStreams{std::move(ComputeStreams)},
         TransferStreams{std::move(TransferStreams)},
@@ -69,7 +70,7 @@ struct ur_queue_handle_t_ {
         Device{Device}, RefCount{1}, EventCount{0}, ComputeStreamIndex{0},
         TransferStreamIndex{0}, NumComputeStreams{0}, NumTransferStreams{0},
         LastSyncComputeStreams{0}, LastSyncTransferStreams{0}, Flags(Flags),
-        URFlags(URFlags), HasOwnership{BackendOwns} {
+        URFlags(URFlags), Priority(Priority), HasOwnership{BackendOwns} {
     urContextRetain(Context);
     urDeviceRetain(Device);
   }

@@ -0,0 +1,47 @@
+// REQUIRES: gpu, cuda
+
+// RUN: %{build} %cuda_options -o %t.out
+// RUN: %{run} %t.out
+//
+// Check that queue priority is passed to CUDA runtime
+#define SYCL_EXT_ONEAPI_BACKEND_CUDA_EXPERIMENTAL 1
+#include <sycl/ext/oneapi/experimental/backend/cuda.hpp>
+#include <sycl/sycl.hpp>
+
+#include <cuda.h>
+
+#include <assert.h>
+
+int get_real_priority(sycl::context &C, sycl::device &D,
+                      sycl::property_list Props) {
+  sycl::queue Q(C, D, Props);
+  CUstream QNative = sycl::get_native<sycl::backend::ext_oneapi_cuda>(Q);
+  int P;
+  CUresult Result = cuStreamGetPriority(QNative, &P);
+  assert(Result == CUDA_SUCCESS && "cuStreamGetPriority call failed");
+  return P;
+}
+
+int main(int Argc, const char *Argv[]) {
+  sycl::device D;
+  sycl::context C{D};
+
+  int PrioDefault = get_real_priority(C, D, sycl::property_list{});
+  int PrioNormal = get_real_priority(
+      C, D, {sycl::ext::oneapi::property::queue::priority_normal{}});
+  int PrioHigh = get_real_priority(
+      C, D, {sycl::ext::oneapi::property::queue::priority_high{}});
+  int PrioLow = get_real_priority(
+      C, D, {sycl::ext::oneapi::property::queue::priority_low{}});
+  // Lower value means higher priority
+  assert(PrioDefault == PrioNormal &&
+         "priority_normal is not the same as default");
+  assert(PrioHigh <= PrioNormal &&
+         "priority_high is lower than priority_normal");
+  assert(PrioLow >= PrioNormal &&
+         "priority_low is higher than priority_normal");
+  assert(PrioLow > PrioHigh && "priority_low is the same as priority_high");
+
+  std::cout << "The test passed." << std::endl;
+  return 0;
+}