From ea89d42d4af42cbb38959d70ea691741c67220eb Mon Sep 17 00:00:00 2001
From: Vladimir Lazarev <vladimir.lazarev@intel.com>
Date: Thu, 2 Apr 2020 16:44:19 +0300
Subject: [PATCH] [SYCL] Enable LIT testing with CUDA BE

Signed-off-by: Vladimir Lazarev <vladimir.lazarev@intel.com>
---
 sycl/test/backend/cuda/primary_context.cpp    | 116 ---------------
 sycl/test/basic_tests/boolean.cpp             |   2 -
 .../basic_tests/buffer/buffer_full_copy.cpp   |   3 -
 sycl/test/group-algorithm/all_of.cpp          |   3 +-
 sycl/test/group-algorithm/any_of.cpp          |   3 +-
 sycl/test/group-algorithm/broadcast.cpp       |   3 +-
 sycl/test/group-algorithm/exclusive_scan.cpp  |   3 +-
 sycl/test/group-algorithm/inclusive_scan.cpp  |   3 +-
 sycl/test/group-algorithm/leader.cpp          |   2 +-
 sycl/test/group-algorithm/none_of.cpp         |   3 +-
 sycl/test/group-algorithm/reduce.cpp          |   3 +-
 sycl/test/hier_par/hier_par_basic.cpp         |   3 -
 sycl/test/kernel_from_file/hw.cpp             |   3 -
 sycl/test/lit.cfg.py                          |   1 +
 sycl/test/multi_ptr/multi_ptr.cpp             |   2 +-
 sycl/test/program_manager/env_vars.cpp        |   3 +-
 .../regression/private_array_init_test.cpp    |   2 +-
 sycl/test/regression/static-buffer-dtor.cpp   |   1 -
 sycl/test/scheduler/DataMovement.cpp          |   6 +-
 sycl/test/usm/allocator_vector.cpp            |   1 +
 sycl/test/usm/allocator_vector_fail.cpp       |   1 +
 sycl/test/usm/allocatorll.cpp                 |   1 +
 sycl/test/usm/badmalloc.cpp                   |   4 +-
 sycl/test/usm/depends_on.cpp                  |   1 +
 sycl/test/usm/dmemll.cpp                      |   1 +
 sycl/test/usm/hmemll.cpp                      |   1 +
 sycl/test/usm/math.cpp                        |   2 -
 sycl/test/usm/memadvise.cpp                   |   1 +
 sycl/test/usm/memcpy.cpp                      |   3 +-
 sycl/test/usm/memset.cpp                      |   3 +-
 sycl/test/usm/mixed.cpp                       |   1 +
 sycl/test/usm/mixed2.cpp                      |   3 +-
 sycl/test/usm/mixed2template.cpp              |   3 +-
 sycl/test/usm/mixed_queue.cpp                 |   3 +-
 sycl/test/usm/pfor_flatten.cpp                |   4 +-
 sycl/test/usm/queue_wait.cpp                  |   3 +-
 sycl/test/usm/smemll.cpp                      |   1 +
 sycl/tools/CMakeLists.txt                     |  25 ++++
 sycl/tools/get_device_count_by_type.cpp       |   6 +-
 sycl/unittests/pi/cuda/CMakeLists.txt         |   1 +
 .../pi/cuda/test_primary_context.cpp          | 134 ++++++++++++++++++
 41 files changed, 212 insertions(+), 156 deletions(-)
 delete mode 100644 sycl/test/backend/cuda/primary_context.cpp
 create mode 100644 sycl/unittests/pi/cuda/test_primary_context.cpp
diff --git a/sycl/test/backend/cuda/primary_context.cpp b/sycl/test/backend/cuda/primary_context.cpp
deleted file mode 100644
index d02b4bca35f60..0000000000000
--- a/sycl/test/backend/cuda/primary_context.cpp
+++ /dev/null
@@ -1,116 +0,0 @@
-// REQUIRES: cuda
-// RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple %s -I%opencl_include_dir -I%cuda_toolkit_include -o %t.out -lcuda -lsycl
-// RUN: env SYCL_DEVICE_TYPE=GPU %t.out
-// NOTE: OpenCL is required for the runtime, even when using the CUDA BE.
-
-//==---------- primary_context.cpp - SYCL cuda primary context test --------==//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-
-#include <CL/sycl.hpp>
-#include <CL/sycl/detail/pi_cuda.hpp>
-#include <cuda.h>
-#include <iostream>
-
-using namespace cl::sycl;
-
-void check(bool condition, const char *conditionString, const char *filename,
-           const long line) noexcept {
-  if (!condition) {
-    std::cerr << "CHECK failed in " << filename << "#" << line << " "
-              << conditionString << "\n";
-    std::abort();
-  }
-}
-
-#define CHECK(CONDITION) check(CONDITION, #CONDITION, __FILE__, __LINE__)
-
-bool isCudaDevice(const device &dev) {
-  const platform platform = dev.get_info<info::device::platform>();
-  const std::string platformVersion =
-      platform.get_info<info::platform::version>();
-  // If using PI_CUDA, don't accept a non-CUDA device
-  return platformVersion.find("CUDA") != std::string::npos;
-}
-
-class cuda_device_selector : public device_selector {
-public:
-  int operator()(const device &dev) const {
-    return isCudaDevice(dev) ? 1 : -1;
-  }
-};
-
-class other_cuda_device_selector : public device_selector {
-public:
-  other_cuda_device_selector(const device &dev) : excludeDevice{dev} {}
-
-  int operator()(const device &dev) const {
-    if (!isCudaDevice(dev)) {
-      return -1;
-    }
-    if (dev.get() == excludeDevice.get()) {
-      // Return only this device if it is the only available
-      return 0;
-    }
-    return 1;
-  }
-
-private:
-  const device &excludeDevice;
-};
-
-int main() {
-  try {
-    context c;
-  } catch (device_error &e) {
-    std::cout << "Failed to create device for context" << std::endl;
-  }
-
-  device DeviceA = cuda_device_selector().select_device();
-  device DeviceB = other_cuda_device_selector(DeviceA).select_device();
-
-  CHECK(isCudaDevice(DeviceA));
-
-  {
-    std::cout << "create single context" << std::endl;
-    context Context(DeviceA, async_handler{}, /*UsePrimaryContext=*/true);
-
-    CUdevice CudaDevice = reinterpret_cast<pi_device>(DeviceA.get())->get();
-    CUcontext CudaContext = reinterpret_cast<pi_context>(Context.get())->get();
-
-    CUcontext PrimaryCudaContext;
-    cuDevicePrimaryCtxRetain(&PrimaryCudaContext, CudaDevice);
-
-    CHECK(CudaContext == PrimaryCudaContext);
-
-    cuDevicePrimaryCtxRelease(CudaDevice);
-  }
-  {
-    std::cout << "create multiple contexts for one device" << std::endl;
-    context ContextA(DeviceA, async_handler{}, /*UsePrimaryContext=*/true);
-    context ContextB(DeviceA, async_handler{}, /*UsePrimaryContext=*/true);
-
-    CUcontext CudaContextA =
-        reinterpret_cast<pi_context>(ContextA.get())->get();
-    CUcontext CudaContextB =
-        reinterpret_cast<pi_context>(ContextB.get())->get();
-
-    CHECK(CudaContextA == CudaContextB);
-  }
-  if (isCudaDevice(DeviceB) && DeviceA.get() != DeviceB.get()) {
-    std::cout << "create multiple contexts for multiple devices" << std::endl;
-    context ContextA(DeviceA, async_handler{}, /*UsePrimaryContext=*/true);
-    context ContextB(DeviceB, async_handler{}, /*UsePrimaryContext=*/true);
-
-    CUcontext CudaContextA =
-        reinterpret_cast<pi_context>(ContextA.get())->get();
-    CUcontext CudaContextB =
-        reinterpret_cast<pi_context>(ContextB.get())->get();
-
-    CHECK(CudaContextA != CudaContextB);
-  }
-}
diff --git a/sycl/test/basic_tests/boolean.cpp b/sycl/test/basic_tests/boolean.cpp
index 041cf492786d4..cac65ddaa80bd 100644
--- a/sycl/test/basic_tests/boolean.cpp
+++ b/sycl/test/basic_tests/boolean.cpp
@@ -3,8 +3,6 @@
 // RUN: %CPU_RUN_PLACEHOLDER %t.out
 // RUN: %GPU_RUN_PLACEHOLDER %t.out
 // RUN: %ACC_RUN_PLACEHOLDER %t.out
-// XFAIL: cuda
-// TODO: investigate incorrect results on cuda backend
 #include <CL/sycl.hpp>
 
 #include <cassert>
diff --git a/sycl/test/basic_tests/buffer/buffer_full_copy.cpp b/sycl/test/basic_tests/buffer/buffer_full_copy.cpp
index f9c2047510e31..d4d64a7882a32 100644
--- a/sycl/test/basic_tests/buffer/buffer_full_copy.cpp
+++ b/sycl/test/basic_tests/buffer/buffer_full_copy.cpp
@@ -6,9 +6,6 @@
 // RUN: %GPU_RUN_PLACEHOLDER %t2.out
 // RUN: %ACC_RUN_PLACEHOLDER %t2.out
 
-// TODO: cuda_piEnqueueMemBufferCopy not implemented
-// XFAIL: cuda
-
 //==------------- buffer_full_copy.cpp - SYCL buffer basic test ------------==//
 //
 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
diff --git a/sycl/test/group-algorithm/all_of.cpp b/sycl/test/group-algorithm/all_of.cpp
index a8b4fc4bfff2b..5b0eabd8a8e1b 100644
--- a/sycl/test/group-algorithm/all_of.cpp
+++ b/sycl/test/group-algorithm/all_of.cpp
@@ -1,8 +1,9 @@
-// RUN: %clangxx -fsycl %s -o %t.out
+// RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple %s -o %t.out
 // RUN: env SYCL_DEVICE_TYPE=HOST %t.out
 // RUN: %CPU_RUN_PLACEHOLDER %t.out
 // RUN: %GPU_RUN_PLACEHOLDER %t.out
 // RUN: %ACC_RUN_PLACEHOLDER %t.out
+// UNSUPPORTED: cuda
 
 #include <CL/sycl.hpp>
 #include <algorithm>
diff --git a/sycl/test/group-algorithm/any_of.cpp b/sycl/test/group-algorithm/any_of.cpp
index 4e5391b5b01be..2c117d74121ea 100644
--- a/sycl/test/group-algorithm/any_of.cpp
+++ b/sycl/test/group-algorithm/any_of.cpp
@@ -1,8 +1,9 @@
-// RUN: %clangxx -fsycl %s -o %t.out
+// RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple %s -o %t.out
 // RUN: env SYCL_DEVICE_TYPE=HOST %t.out
 // RUN: %CPU_RUN_PLACEHOLDER %t.out
 // RUN: %GPU_RUN_PLACEHOLDER %t.out
 // RUN: %ACC_RUN_PLACEHOLDER %t.out
+// UNSUPPORTED: cuda
 
 #include <CL/sycl.hpp>
 #include <algorithm>
diff --git a/sycl/test/group-algorithm/broadcast.cpp b/sycl/test/group-algorithm/broadcast.cpp
index 9fcce3b938673..8315eee1c23f0 100644
--- a/sycl/test/group-algorithm/broadcast.cpp
+++ b/sycl/test/group-algorithm/broadcast.cpp
@@ -1,8 +1,9 @@
-// RUN: %clangxx -fsycl %s -o %t.out
+// RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple %s -o %t.out
 // RUN: env SYCL_DEVICE_TYPE=HOST %t.out
 // RUN: %CPU_RUN_PLACEHOLDER %t.out
 // RUN: %GPU_RUN_PLACEHOLDER %t.out
 // RUN: %ACC_RUN_PLACEHOLDER %t.out
+// UNSUPPORTED: cuda
 
 #include <CL/sycl.hpp>
 #include <algorithm>
diff --git a/sycl/test/group-algorithm/exclusive_scan.cpp b/sycl/test/group-algorithm/exclusive_scan.cpp
index fad4777a7cec1..69236c2858000 100644
--- a/sycl/test/group-algorithm/exclusive_scan.cpp
+++ b/sycl/test/group-algorithm/exclusive_scan.cpp
@@ -1,8 +1,9 @@
-// RUN: %clangxx -fsycl %s -o %t.out
+// RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple %s -o %t.out
 // RUN: env SYCL_DEVICE_TYPE=HOST %t.out
 // RUN: %CPU_RUN_PLACEHOLDER %t.out
 // RUN: %GPU_RUN_PLACEHOLDER %t.out
 // RUN: %ACC_RUN_PLACEHOLDER %t.out
+// UNSUPPORTED: cuda
 
 #include <CL/sycl.hpp>
 #include <algorithm>
diff --git a/sycl/test/group-algorithm/inclusive_scan.cpp b/sycl/test/group-algorithm/inclusive_scan.cpp
index 54d79f72e5395..8f4e4f701adea 100644
--- a/sycl/test/group-algorithm/inclusive_scan.cpp
+++ b/sycl/test/group-algorithm/inclusive_scan.cpp
@@ -1,8 +1,9 @@
-// RUN: %clangxx -fsycl %s -o %t.out
+// RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple %s -o %t.out
 // RUN: env SYCL_DEVICE_TYPE=HOST %t.out
 // RUN: %CPU_RUN_PLACEHOLDER %t.out
 // RUN: %GPU_RUN_PLACEHOLDER %t.out
 // RUN: %ACC_RUN_PLACEHOLDER %t.out
+// UNSUPPORTED: cuda
 
 #include <CL/sycl.hpp>
 #include <algorithm>
diff --git a/sycl/test/group-algorithm/leader.cpp b/sycl/test/group-algorithm/leader.cpp
index 3e0bad4706cfc..86132ce8ed54c 100644
--- a/sycl/test/group-algorithm/leader.cpp
+++ b/sycl/test/group-algorithm/leader.cpp
@@ -1,4 +1,4 @@
-// RUN: %clangxx -fsycl %s -o %t.out
+// RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple %s -o %t.out
 // RUN: env SYCL_DEVICE_TYPE=HOST %t.out
 // RUN: %CPU_RUN_PLACEHOLDER %t.out
 // RUN: %GPU_RUN_PLACEHOLDER %t.out
diff --git a/sycl/test/group-algorithm/none_of.cpp b/sycl/test/group-algorithm/none_of.cpp
index d0ef19b8ed3ea..10d94b1019ac4 100644
--- a/sycl/test/group-algorithm/none_of.cpp
+++ b/sycl/test/group-algorithm/none_of.cpp
@@ -1,8 +1,9 @@
-// RUN: %clangxx -fsycl %s -o %t.out
+// RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple %s -o %t.out
 // RUN: env SYCL_DEVICE_TYPE=HOST %t.out
 // RUN: %CPU_RUN_PLACEHOLDER %t.out
 // RUN: %GPU_RUN_PLACEHOLDER %t.out
 // RUN: %ACC_RUN_PLACEHOLDER %t.out
+// UNSUPPORTED: cuda
 
 #include <CL/sycl.hpp>
 #include <algorithm>
diff --git a/sycl/test/group-algorithm/reduce.cpp b/sycl/test/group-algorithm/reduce.cpp
index 988c40f245ff7..9f6dd05fba6d2 100644
--- a/sycl/test/group-algorithm/reduce.cpp
+++ b/sycl/test/group-algorithm/reduce.cpp
@@ -1,8 +1,9 @@
-// RUN: %clangxx -fsycl %s -o %t.out
+// RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple %s -o %t.out
 // RUN: env SYCL_DEVICE_TYPE=HOST %t.out
 // RUN: %CPU_RUN_PLACEHOLDER %t.out
 // RUN: %GPU_RUN_PLACEHOLDER %t.out
 // RUN: %ACC_RUN_PLACEHOLDER %t.out
+// UNSUPPORTED: cuda
 
 #include <CL/sycl.hpp>
 #include <algorithm>
diff --git a/sycl/test/hier_par/hier_par_basic.cpp b/sycl/test/hier_par/hier_par_basic.cpp
index 6caf3169f555f..d1a94ea1a7112 100644
--- a/sycl/test/hier_par/hier_par_basic.cpp
+++ b/sycl/test/hier_par/hier_par_basic.cpp
@@ -12,9 +12,6 @@
 // RUN: %GPU_RUN_PLACEHOLDER %t.out
 // RUN: %ACC_RUN_PLACEHOLDER %t.out
 
-// TODO: ptxas fatal   : Unresolved extern function '__spirv_ControlBarrier'
-// XFAIL: cuda
-
 // This test checks hierarchical parallelism invocation APIs, but without any
 // data or code with side-effects between the work group and work item scopes.
 
diff --git a/sycl/test/kernel_from_file/hw.cpp b/sycl/test/kernel_from_file/hw.cpp
index a9f2b6031d26d..3e53d6260e8cb 100644
--- a/sycl/test/kernel_from_file/hw.cpp
+++ b/sycl/test/kernel_from_file/hw.cpp
@@ -6,9 +6,6 @@
 
 // TODO: InvalidTargetTriple: Expects spir-unknown-unknown or spir64-unknown-unknown. Actual target triple is x86_64-unknown-linux-gnu
 
-// XFAIL: cuda
-// Currently unsupported on cuda as this test specifically tests a SPV path.
-
 #include <CL/sycl.hpp>
 #include <iostream>
 
diff --git a/sycl/test/lit.cfg.py b/sycl/test/lit.cfg.py
index ca2947308e75b..252c38a797c4d 100644
--- a/sycl/test/lit.cfg.py
+++ b/sycl/test/lit.cfg.py
@@ -155,6 +155,7 @@ def getDeviceCount(device_type):
     config.available_features.add('gpu')
     if cuda:
        config.available_features.add('cuda')
+       gpu_run_substitute += " SYCL_BE=PI_CUDA "
 
     if platform.system() == "Linux":
         gpu_run_on_linux_substitute = "env SYCL_DEVICE_TYPE=GPU "
diff --git a/sycl/test/multi_ptr/multi_ptr.cpp b/sycl/test/multi_ptr/multi_ptr.cpp
index a6e20a808ac69..c2e44f461e1b7 100644
--- a/sycl/test/multi_ptr/multi_ptr.cpp
+++ b/sycl/test/multi_ptr/multi_ptr.cpp
@@ -3,7 +3,7 @@
 // RUN: %CPU_RUN_PLACEHOLDER %t.out
 // RUN: %GPU_RUN_PLACEHOLDER %t.out
 // RUN: %ACC_RUN_PLACEHOLDER %t.out
-// RUN: %clangxx -DRESTRICT_WRITE_ACCESS_TO_CONSTANT_PTR -fsycl %s -o %t1.out
+// RUN: %clangxx -DRESTRICT_WRITE_ACCESS_TO_CONSTANT_PTR -fsycl -fsycl-targets=%sycl_triple %s -o %t1.out
 // RUN: env SYCL_DEVICE_TYPE=HOST %t1.out
 // RUN: %CPU_RUN_PLACEHOLDER %t1.out
 // RUN: %GPU_RUN_PLACEHOLDER %t1.out
diff --git a/sycl/test/program_manager/env_vars.cpp b/sycl/test/program_manager/env_vars.cpp
index e747eab855e12..dcc4bd4a38e3c 100644
--- a/sycl/test/program_manager/env_vars.cpp
+++ b/sycl/test/program_manager/env_vars.cpp
@@ -1,4 +1,4 @@
-// RUN: %clangxx -O0 -fsycl %s -o %t.out -lsycl
+// RUN: %clangxx -O0 -fsycl -fsycl-targets=%sycl_triple %s -o %t.out -lsycl
 //
 // Deprecated SYCL_PROGRAM_BUILD_OPTIONS should work as an alias to
 // SYCL_PROGRAM_COMPILE_OPTIONS:
@@ -16,7 +16,6 @@
 // RUN: %CPU_RUN_PLACEHOLDER SYCL_PROGRAM_COMPILE_OPTIONS="-enable-link-options -cl-denorms-are-zero" SHOULD_CRASH=1 %t.out
 // RUN: %CPU_RUN_PLACEHOLDER SYCL_PROGRAM_LINK_OPTIONS="-g" SHOULD_CRASH=1 %t.out
 
-
 #include <CL/sycl.hpp>
 #include <cassert>
 #include <memory>
diff --git a/sycl/test/regression/private_array_init_test.cpp b/sycl/test/regression/private_array_init_test.cpp
index d85a610794b03..bf10d8e217e55 100644
--- a/sycl/test/regression/private_array_init_test.cpp
+++ b/sycl/test/regression/private_array_init_test.cpp
@@ -1,4 +1,4 @@
-// RUN: %clangxx -fsycl %s -o %t.out -lOpenCL
+// RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple %s -o %t.out -lOpenCL
 
 // RUN: %CPU_RUN_PLACEHOLDER %t.out
 // RUN: %GPU_RUN_PLACEHOLDER %t.out
diff --git a/sycl/test/regression/static-buffer-dtor.cpp b/sycl/test/regression/static-buffer-dtor.cpp
index ab0809034d733..3d8bd98fa3f18 100644
--- a/sycl/test/regression/static-buffer-dtor.cpp
+++ b/sycl/test/regression/static-buffer-dtor.cpp
@@ -16,7 +16,6 @@
 
 // TODO: terminate called after throwing an instance of 'cl::sycl::runtime_error'
 // TODO: what():  OpenCL API failed. OpenCL API returns: -999 (Unknown OpenCL error code) -999 (Unknown OpenCL error code)
-// XFAIL: cuda
 
 #include <CL/sycl.hpp>
 
diff --git a/sycl/test/scheduler/DataMovement.cpp b/sycl/test/scheduler/DataMovement.cpp
index f1812bf4dbf29..48282d34f0662 100644
--- a/sycl/test/scheduler/DataMovement.cpp
+++ b/sycl/test/scheduler/DataMovement.cpp
@@ -1,5 +1,9 @@
 // RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple -I %sycl_source_dir %s -o %t.out
-// RUN: %t.out
+// RUN: %CPU_RUN_PLACEHOLDER %t.out
+// RUN: %GPU_RUN_PLACEHOLDER %t.out
+// RUN: %ACC_RUN_PLACEHOLDER %t.out
+// Incorrect event callback processing for host device.
+// XFAIL: cuda
 //
 //==-------------------------- DataMovement.cpp ----------------------------==//
 //
diff --git a/sycl/test/usm/allocator_vector.cpp b/sycl/test/usm/allocator_vector.cpp
index 9cc82d8fac1f5..3ae88da999aa9 100644
--- a/sycl/test/usm/allocator_vector.cpp
+++ b/sycl/test/usm/allocator_vector.cpp
@@ -2,6 +2,7 @@
 // RUN: env SYCL_DEVICE_TYPE=HOST %t1.out
 // RUN: %CPU_RUN_PLACEHOLDER %t1.out
 // RUN: %GPU_RUN_PLACEHOLDER %t1.out
+// UNSUPPORTED: cuda
 
 //==---- allocator_vector.cpp - Allocator Container test -------------------==//
 //
diff --git a/sycl/test/usm/allocator_vector_fail.cpp b/sycl/test/usm/allocator_vector_fail.cpp
index bb033ef753071..7bb8025f2003a 100644
--- a/sycl/test/usm/allocator_vector_fail.cpp
+++ b/sycl/test/usm/allocator_vector_fail.cpp
@@ -2,6 +2,7 @@
 // RUN: env SYCL_DEVICE_TYPE=HOST %t1.out
 // RUN: %CPU_RUN_PLACEHOLDER %t1.out
 // RUN: %GPU_RUN_PLACEHOLDER %t1.out
+// UNSUPPORTED: cuda
 
 //==-- allocator_vector_fail.cpp - Device Memory Allocator fail test -------==//
 //
diff --git a/sycl/test/usm/allocatorll.cpp b/sycl/test/usm/allocatorll.cpp
index 1b7796540686f..8fbfbe605f7df 100644
--- a/sycl/test/usm/allocatorll.cpp
+++ b/sycl/test/usm/allocatorll.cpp
@@ -2,6 +2,7 @@
 // RUN: env SYCL_DEVICE_TYPE=HOST %t1.out
 // RUN: %CPU_RUN_PLACEHOLDER %t1.out
 // RUN: %GPU_RUN_PLACEHOLDER %t1.out
+// UNSUPPORTED: cuda
 
 //==---- allocatorll.cpp - Device Memory Linked List Allocator test --------==//
 //
diff --git a/sycl/test/usm/badmalloc.cpp b/sycl/test/usm/badmalloc.cpp
index fc91b1260d465..b574dd67eea3b 100644
--- a/sycl/test/usm/badmalloc.cpp
+++ b/sycl/test/usm/badmalloc.cpp
@@ -1,9 +1,9 @@
-// RUN: %clangxx -fsycl %s -o %t1.out
+// RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple %s -o %t1.out
 // RUN: env SYCL_DEVICE_TYPE=HOST %t1.out
 // RUN: %CPU_RUN_PLACEHOLDER %t1.out
 // RUN: %GPU_RUN_PLACEHOLDER %t1.out
 
-// UNSUPPORTED: windows
+// UNSUPPORTED: windows,cuda
 
 //==----------------- badmalloc.cpp - Bad Mallocs test ---------------------==//
 //
diff --git a/sycl/test/usm/depends_on.cpp b/sycl/test/usm/depends_on.cpp
index 3943621853836..ea7749809f6f9 100644
--- a/sycl/test/usm/depends_on.cpp
+++ b/sycl/test/usm/depends_on.cpp
@@ -2,6 +2,7 @@
 // RUN: env SYCL_DEVICE_TYPE=HOST %t1.out
 // RUN: %CPU_RUN_PLACEHOLDER %t1.out
 // RUN: %GPU_RUN_PLACEHOLDER %t1.out
+// UNSUPPORTED: cuda
 
 //==----------------- depends_on.cpp - depends_on test ---------------------==//
 //
diff --git a/sycl/test/usm/dmemll.cpp b/sycl/test/usm/dmemll.cpp
index e5c32b2f20262..de987904c69e1 100644
--- a/sycl/test/usm/dmemll.cpp
+++ b/sycl/test/usm/dmemll.cpp
@@ -2,6 +2,7 @@
 // RUN: env SYCL_DEVICE_TYPE=HOST %t1.out
 // RUN: %CPU_RUN_PLACEHOLDER %t1.out
 // RUN: %GPU_RUN_PLACEHOLDER %t1.out
+// UNSUPPORTED: cuda
 
 //==------------------- dmemll.cpp - Device Memory Linked List test --------==//
 //
diff --git a/sycl/test/usm/hmemll.cpp b/sycl/test/usm/hmemll.cpp
index 38b578ce948c2..ce9898338d886 100644
--- a/sycl/test/usm/hmemll.cpp
+++ b/sycl/test/usm/hmemll.cpp
@@ -2,6 +2,7 @@
 // RUN: env SYCL_DEVICE_TYPE=HOST %t1.out
 // RUN: %CPU_RUN_PLACEHOLDER %t1.out
 // RUN: %GPU_RUN_PLACEHOLDER %t1.out
+// UNSUPPORTED: cuda
 
 //==------------------- hmemll.cpp - Host Memory Linked List test ----------==//
 //
diff --git a/sycl/test/usm/math.cpp b/sycl/test/usm/math.cpp
index 4155767e309f7..c2ae25363fc2f 100644
--- a/sycl/test/usm/math.cpp
+++ b/sycl/test/usm/math.cpp
@@ -3,8 +3,6 @@
 // RUN: %CPU_RUN_PLACEHOLDER %t.out
 
 // REQUIRES: cpu
-// TODO: ptxas fatal   : Unresolved extern function '_Z20__spirv_ocl_lgamma_rfPi'
-// XFAIL: cuda
 
 #include <CL/sycl.hpp>
 
diff --git a/sycl/test/usm/memadvise.cpp b/sycl/test/usm/memadvise.cpp
index 9b584c045e2e5..213b897f7d3ad 100644
--- a/sycl/test/usm/memadvise.cpp
+++ b/sycl/test/usm/memadvise.cpp
@@ -2,6 +2,7 @@
 // RUN: env SYCL_DEVICE_TYPE=HOST %t1.out
 // RUN: %CPU_RUN_PLACEHOLDER %t1.out
 // RUN: %GPU_RUN_PLACEHOLDER %t1.out
+// UNSUPPORTED: cuda
 
 //==---------------- memadvise.cpp - Shared Memory Linked List test --------==//
 //
diff --git a/sycl/test/usm/memcpy.cpp b/sycl/test/usm/memcpy.cpp
index 3545cdf5218fd..a6ac08bf21da8 100644
--- a/sycl/test/usm/memcpy.cpp
+++ b/sycl/test/usm/memcpy.cpp
@@ -5,9 +5,10 @@
 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 //
 //===----------------------------------------------------------------------===//
-// RUN: %clangxx -fsycl %s -o %t1.out
+// RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple  %s -o %t1.out
 // RUN: %CPU_RUN_PLACEHOLDER %t1.out
 // RUN: %GPU_RUN_PLACEHOLDER %t1.out
+// UNSUPPORTED: cuda
 
 #include <CL/sycl.hpp>
 
diff --git a/sycl/test/usm/memset.cpp b/sycl/test/usm/memset.cpp
index 6fb12eb1fcc4d..9cc3aa8eb883e 100644
--- a/sycl/test/usm/memset.cpp
+++ b/sycl/test/usm/memset.cpp
@@ -1,6 +1,7 @@
-// RUN: %clangxx -fsycl %s -o %t1.out
+// RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple  %s -o %t1.out
 // RUN: %CPU_RUN_PLACEHOLDER %t1.out
 // RUN: %GPU_RUN_PLACEHOLDER %t1.out
+// UNSUPPORTED: cuda
 
 //==---- memset.cpp - USM memset test --------------------------------------==//
 //
diff --git a/sycl/test/usm/mixed.cpp b/sycl/test/usm/mixed.cpp
index 5d45182d2a35e..449db850b3791 100644
--- a/sycl/test/usm/mixed.cpp
+++ b/sycl/test/usm/mixed.cpp
@@ -2,6 +2,7 @@
 // RUN: env SYCL_DEVICE_TYPE=HOST %t1.out
 // RUN: %CPU_RUN_PLACEHOLDER %t1.out
 // RUN: %GPU_RUN_PLACEHOLDER %t1.out
+// UNSUPPORTED: cuda
 
 //==------------------- mixed.cpp - Mixed Memory test ---------------------==//
 //
diff --git a/sycl/test/usm/mixed2.cpp b/sycl/test/usm/mixed2.cpp
index c074e2207b578..db0421f7f63fa 100644
--- a/sycl/test/usm/mixed2.cpp
+++ b/sycl/test/usm/mixed2.cpp
@@ -1,7 +1,8 @@
-// RUN: %clangxx -fsycl %s -o %t1.out
+// RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple  %s -o %t1.out
 // RUN: env SYCL_DEVICE_TYPE=HOST %t1.out
 // RUN: %CPU_RUN_PLACEHOLDER %t1.out
 // RUN: %GPU_RUN_PLACEHOLDER %t1.out
+// UNSUPPORTED: cuda
 
 //==------------------- mixed2.cpp - Mixed Memory test ---------------------==//
 //
diff --git a/sycl/test/usm/mixed2template.cpp b/sycl/test/usm/mixed2template.cpp
index 4261187092d72..4bc544b0505a8 100644
--- a/sycl/test/usm/mixed2template.cpp
+++ b/sycl/test/usm/mixed2template.cpp
@@ -1,7 +1,8 @@
-// RUN: %clangxx -fsycl %s -o %t1.out
+// RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple  %s -o %t1.out
 // RUN: env SYCL_DEVICE_TYPE=HOST %t1.out
 // RUN: %CPU_RUN_PLACEHOLDER %t1.out
 // RUN: %GPU_RUN_PLACEHOLDER %t1.out
+// UNSUPPORTED: cuda
 
 //==---------- mixed2template.cpp - Mixed Memory with Templatestest --------==//
 //
diff --git a/sycl/test/usm/mixed_queue.cpp b/sycl/test/usm/mixed_queue.cpp
index 0585e982179e1..10ea73d702856 100644
--- a/sycl/test/usm/mixed_queue.cpp
+++ b/sycl/test/usm/mixed_queue.cpp
@@ -1,7 +1,8 @@
-// RUN: %clangxx -fsycl %s -o %t1.out
+// RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple  %s -o %t1.out
 // RUN: env SYCL_DEVICE_TYPE=HOST %t1.out
 // RUN: %CPU_RUN_PLACEHOLDER %t1.out
 // RUN: %GPU_RUN_PLACEHOLDER %t1.out
+// UNSUPPORTED: cuda
 
 //==-------------- mixed_queue.cpp - Mixed Memory test ---------------------==//
 //
diff --git a/sycl/test/usm/pfor_flatten.cpp b/sycl/test/usm/pfor_flatten.cpp
index 68496c7b94886..86089ad5a76c8 100644
--- a/sycl/test/usm/pfor_flatten.cpp
+++ b/sycl/test/usm/pfor_flatten.cpp
@@ -1,8 +1,8 @@
-// RUN: %clangxx -fsycl -fsycl-unnamed-lambda %s -o %t1.out
+// RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple  -fsycl-unnamed-lambda %s -o %t1.out
 // RUN: env SYCL_DEVICE_TYPE=HOST %t1.out
 // RUN: %CPU_RUN_PLACEHOLDER %t1.out
 // RUN: %GPU_RUN_PLACEHOLDER %t1.out
-// XFAIL: cuda
+// UNSUPPORTED: cuda
 
 //==--------------- pfor_flatten.cpp - Kernel Launch Flattening test -------==//
 //
diff --git a/sycl/test/usm/queue_wait.cpp b/sycl/test/usm/queue_wait.cpp
index 76bdaaf4c7b92..5d187515c63ab 100644
--- a/sycl/test/usm/queue_wait.cpp
+++ b/sycl/test/usm/queue_wait.cpp
@@ -1,7 +1,8 @@
-// RUN: %clangxx -fsycl %s -o %t.out
+// RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple  %s -o %t.out
 // RUN: env SYCL_DEVICE_TYPE=HOST %t.out
 // RUN: %CPU_RUN_PLACEHOLDER %t.out
 // RUN: %GPU_RUN_PLACEHOLDER %t.out
+// UNSUPPORTED: cuda
 
 #include <CL/sycl.hpp>
 
diff --git a/sycl/test/usm/smemll.cpp b/sycl/test/usm/smemll.cpp
index 4fb79cb8429d8..7d602fdce9306 100644
--- a/sycl/test/usm/smemll.cpp
+++ b/sycl/test/usm/smemll.cpp
@@ -2,6 +2,7 @@
 // RUN: env SYCL_DEVICE_TYPE=HOST %t1.out
 // RUN: %CPU_RUN_PLACEHOLDER %t1.out
 // RUN: %GPU_RUN_PLACEHOLDER %t1.out
+// UNSUPPORTED: cuda
 
 //==------------------- smemll.cpp - Shared Memory Linked List test --------==//
 //
diff --git a/sycl/tools/CMakeLists.txt b/sycl/tools/CMakeLists.txt
index dd921f969c66b..467801433c25c 100644
--- a/sycl/tools/CMakeLists.txt
+++ b/sycl/tools/CMakeLists.txt
@@ -4,6 +4,31 @@ set(CMAKE_CXX_EXTENSIONS OFF)
 
 add_executable(get_device_count_by_type get_device_count_by_type.cpp)
 add_dependencies(get_device_count_by_type ocl-headers ocl-icd)
+
+if( SYCL_BUILD_PI_CUDA )
+  find_package(CUDA 10.0 REQUIRED)
+
+  add_library(cudadrv SHARED IMPORTED)
+
+  set_target_properties(
+    cudadrv PROPERTIES
+      IMPORTED_LOCATION             ${CUDA_CUDA_LIBRARY}
+      INTERFACE_INCLUDE_DIRECTORIES ${CUDA_INCLUDE_DIRS}
+  )
+
+  target_compile_definitions(get_device_count_by_type
+      PUBLIC $<$<BOOL:${SYCL_BUILD_PI_CUDA}>:USE_PI_CUDA>
+  )
+
+  target_include_directories( get_device_count_by_type
+    PUBLIC
+      ${CUDA_INCLUDE_DIRS}
+  )
+  target_link_libraries(get_device_count_by_type
+    PUBLIC OpenCL-Headers cudadrv
+)
+endif()
+
 target_link_libraries(get_device_count_by_type
     PRIVATE OpenCL::Headers
     PRIVATE ${OpenCL_LIBRARIES}
diff --git a/sycl/tools/get_device_count_by_type.cpp b/sycl/tools/get_device_count_by_type.cpp
index 5611685889fac..ca66310958dc2 100644
--- a/sycl/tools/get_device_count_by_type.cpp
+++ b/sycl/tools/get_device_count_by_type.cpp
@@ -10,7 +10,7 @@
 #include <CL/cl_ext.h>
 
 #ifdef USE_PI_CUDA
-#include <cuda_driver.h>
+#include <cuda.h>
 #endif  // USE_PI_CUDA
 
 #include <iostream>
@@ -38,12 +38,12 @@ int main(int argc, char* argv[]) {
     cl_uint deviceCount = 0;
 
 #ifdef USE_PI_CUDA
-    if (backend == "CUDA") {
+    if (backend == "PI_CUDA") {
       std::string msg{""};
 
       int runtime_version = 0;
 
-      cudaError_t err = cuDriverGetVersion(&runtime_version);
+      auto err = cuDriverGetVersion(&runtime_version);
       if (runtime_version < 9020 || err != CUDA_SUCCESS) {
         std::cout << deviceCount << " :Unsupported CUDA Runtime " << std::endl;
       }
diff --git a/sycl/unittests/pi/cuda/CMakeLists.txt b/sycl/unittests/pi/cuda/CMakeLists.txt
index 0d68616bc5d5d..e0084af403cc3 100644
--- a/sycl/unittests/pi/cuda/CMakeLists.txt
+++ b/sycl/unittests/pi/cuda/CMakeLists.txt
@@ -5,6 +5,7 @@ add_sycl_unittest(PiCudaTests
   test_device.cpp
   test_kernels.cpp
   test_mem_obj.cpp
+  test_primary_context.cpp
   test_queue.cpp
   test_events.cpp
 )
diff --git a/sycl/unittests/pi/cuda/test_primary_context.cpp b/sycl/unittests/pi/cuda/test_primary_context.cpp
new file mode 100644
index 0000000000000..199765debeeed
--- /dev/null
+++ b/sycl/unittests/pi/cuda/test_primary_context.cpp
@@ -0,0 +1,134 @@
+//==---------- pi_primary_context.cpp - PI unit tests ----------------------==//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "gtest/gtest.h"
+
+#include <cuda.h>
+
+#include <CL/sycl.hpp>
+#include <CL/sycl/backend/cuda.hpp>
+#include <CL/sycl/detail/pi.hpp>
+#include <detail/plugin.hpp>
+#include <pi_cuda.hpp>
+
+#include <iostream>
+
+using namespace cl::sycl;
+
+void check(bool condition, const char *conditionString, const char *filename,
+           const long line) noexcept {
+  if (!condition) {
+    std::cerr << "CHECK failed in " << filename << "#" << line << " "
+              << conditionString << "\n";
+    std::abort();
+  }
+}
+
+#define CHECK(CONDITION) check(CONDITION, #CONDITION, __FILE__, __LINE__)
+
+bool isCudaDevice(const device &dev) {
+  const platform platform = dev.get_info<info::device::platform>();
+  const std::string platformVersion =
+      platform.get_info<info::platform::version>();
+  // If using PI_CUDA, don't accept a non-CUDA device
+  return platformVersion.find("CUDA") != std::string::npos;
+}
+
+class cuda_device_selector : public device_selector {
+public:
+  int operator()(const device &dev) const { return isCudaDevice(dev) ? 1 : -1; }
+};
+
+class other_cuda_device_selector : public device_selector {
+public:
+  other_cuda_device_selector(const device &dev) : excludeDevice{dev} {}
+
+  int operator()(const device &dev) const {
+    if (!isCudaDevice(dev)) {
+      return -1;
+    }
+    if (dev.get() == excludeDevice.get()) {
+      // Return only this device if it is the only available
+      return 0;
+    }
+    return 1;
+  }
+
+private:
+  const device &excludeDevice;
+};
+
+using namespace cl::sycl;
+
+struct DISABLED_CudaPrimaryContextTests : public ::testing::Test {
+
+protected:
+  std::vector<detail::plugin> Plugins;
+
+  pi_platform platform_;
+  device deviceA_;
+  device deviceB_;
+  context context_;
+
+  void SetUp() override {
+
+    try {
+      context context_;
+    } catch (device_error &e) {
+      std::cout << "Failed to create device for context" << std::endl;
+    }
+
+    deviceA_ = cuda_device_selector().select_device();
+    deviceB_ = other_cuda_device_selector(deviceA_).select_device();
+
+    ASSERT_TRUE(isCudaDevice(deviceA_));
+  }
+
+  void TearDown() override {}
+};
+
+TEST_F(DISABLED_CudaPrimaryContextTests, piSingleContext) {
+  std::cout << "create single context" << std::endl;
+  context Context(deviceA_, async_handler{}, /*UsePrimaryContext=*/true);
+
+  CUdevice CudaDevice = reinterpret_cast<pi_device>(deviceA_.get())->get();
+  CUcontext CudaContext = reinterpret_cast<pi_context>(Context.get())->get();
+
+  CUcontext PrimaryCudaContext;
+  cuDevicePrimaryCtxRetain(&PrimaryCudaContext, CudaDevice);
+
+  ASSERT_EQ(CudaContext, PrimaryCudaContext);
+
+  cuDevicePrimaryCtxRelease(CudaDevice);
+}
+
+TEST_F(DISABLED_CudaPrimaryContextTests, piMultiContextSingleDevice) {
+  std::cout << "create multiple contexts for one device" << std::endl;
+  context ContextA(deviceA_, async_handler{}, /*UsePrimaryContext=*/true);
+  context ContextB(deviceA_, async_handler{}, /*UsePrimaryContext=*/true);
+
+  CUcontext CudaContextA = reinterpret_cast<pi_context>(ContextA.get())->get();
+  CUcontext CudaContextB = reinterpret_cast<pi_context>(ContextB.get())->get();
+
+  ASSERT_EQ(CudaContextA, CudaContextB);
+}
+
+TEST_F(DISABLED_CudaPrimaryContextTests, piMultiContextMultiDevice) {
+  if (isCudaDevice(deviceB_) && deviceA_.get() != deviceB_.get()) {
+    std::cout << "create multiple contexts for multiple devices" << std::endl;
+    context ContextA(deviceA_, async_handler{}, /*UsePrimaryContext=*/true);
+    context ContextB(deviceB_, async_handler{}, /*UsePrimaryContext=*/true);
+
+    CUcontext CudaContextA =
+        reinterpret_cast<pi_context>(ContextA.get())->get();
+    CUcontext CudaContextB =
+        reinterpret_cast<pi_context>(ContextB.get())->get();
+
+    ASSERT_NE(CudaContextA, CudaContextB);
+  }
+}