From c938a3da5b081821f99dd56ed85845c883956fd9 Mon Sep 17 00:00:00 2001
From: Vyacheslav N Klochkov <vyacheslav.n.klochkov@intel.com>
Date: Thu, 25 Feb 2021 20:02:34 -0800
Subject: [PATCH 1/4] [SYCL] Add test cases for
 multiplies,bit_or,bit_xor,bit_and subgroup algorithms

These new test cases verify https://github.com/intel/llvm/pull/3267

Signed-off-by: Vyacheslav N Klochkov <vyacheslav.n.klochkov@intel.com>
---
 SYCL/SubGroup/reduce.hpp         | 16 +++++++-
 SYCL/SubGroup/reduce_bit_ops.cpp | 64 +++++++++++++++++++++++++++++++
 SYCL/SubGroup/scan.hpp           | 16 +++++++-
 SYCL/SubGroup/scan_bit_ops.cpp   | 65 ++++++++++++++++++++++++++++++++
 4 files changed, 157 insertions(+), 4 deletions(-)
 create mode 100644 SYCL/SubGroup/reduce_bit_ops.cpp
 create mode 100644 SYCL/SubGroup/scan_bit_ops.cpp
diff --git a/SYCL/SubGroup/reduce.hpp b/SYCL/SubGroup/reduce.hpp
index f606dcf5e9..adb77041c0 100644
--- a/SYCL/SubGroup/reduce.hpp
+++ b/SYCL/SubGroup/reduce.hpp
@@ -88,7 +88,7 @@ void check(queue &Queue, size_t G = 256, size_t L = 64) {
   check_op<sycl_subgr<SpecializationKernelName, class KernelName_bPPlfvdGShi>,
            T>(Queue, T(0), ONEAPI::maximum<T>(), true, G, L);
 
-#if __cplusplus >= 201402L
+  // Transparent operator functors.
   check_op<sycl_subgr<SpecializationKernelName,
                       class KernelName_fkOyLRYirfMnvBcnbRFy>,
            T>(Queue, T(L), ONEAPI::plus<>(), false, G, L);
@@ -107,5 +107,17 @@ void check(queue &Queue, size_t G = 256, size_t L = 64) {
   check_op<
       sycl_subgr<SpecializationKernelName, class KernelName_BaCGaWDMFeMFqvotbk>,
       T>(Queue, T(0), ONEAPI::maximum<>(), true, G, L);
-#endif
+
+  // Use small sub-groups to avoid overflow effects for int multiply operations
+  // and avoid rounding issues for FP multiply.
+  L = 4;
+  check_op<sycl_subgr<SpecializationKernelName, class KernelName_MulF>, T>(
+      Queue, T(G), ONEAPI::multiplies<T>(), false, G, L);
+  check_op<sycl_subgr<SpecializationKernelName, class KernelName_MulT>, T>(
+      Queue, T(1), ONEAPI::multiplies<T>(), true, G, L);
+
+  check_op<sycl_subgr<SpecializationKernelName, class KernelName_MulFV>, T>(
+      Queue, T(G), ONEAPI::multiplies<>(), false, G, L);
+  check_op<sycl_subgr<SpecializationKernelName, class KernelName_MulTV>, T>(
+      Queue, T(1), ONEAPI::multiplies<>(), true, G, L);
 }
diff --git a/SYCL/SubGroup/reduce_bit_ops.cpp b/SYCL/SubGroup/reduce_bit_ops.cpp
new file mode 100644
index 0000000000..04ef923683
--- /dev/null
+++ b/SYCL/SubGroup/reduce_bit_ops.cpp
@@ -0,0 +1,64 @@
+// UNSUPPORTED: cpu
+// #2252 Disable until all variants of built-ins are available in OpenCL CPU
+// runtime for every supported ISA
+//
+// RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple %s -o %t.out
+// RUN: %HOST_RUN_PLACEHOLDER %t.out
+// RUN: %CPU_RUN_PLACEHOLDER %t.out
+// RUN: %GPU_RUN_PLACEHOLDER %t.out
+// RUN: %ACC_RUN_PLACEHOLDER %t.out
+
+// This test verifies correct handling of reduce() algorithm used with
+// integer bitwise OR, XOR, AND operations.
+
+#include "reduce.hpp"
+
+template <typename SpecializationKernelName, typename T>
+void check_bit_ops(queue &Queue, size_t G = 256, size_t L = 4) {
+  check_op<sycl_subgr<SpecializationKernelName, class KernelName_ORF>, T>(
+      Queue, T(G), ONEAPI::bit_or<T>(), false, G, L);
+  check_op<sycl_subgr<SpecializationKernelName, class KernelName_ORT>, T>(
+      Queue, T(0), ONEAPI::bit_or<T>(), true, G, L);
+
+  check_op<sycl_subgr<SpecializationKernelName, class KernelName_XORF>, T>(
+      Queue, T(G), ONEAPI::bit_xor<T>(), false, G, L);
+  check_op<sycl_subgr<SpecializationKernelName, class KernelName_XORT>, T>(
+      Queue, T(0), ONEAPI::bit_xor<T>(), true, G, L);
+
+  check_op<sycl_subgr<SpecializationKernelName, class KernelName_ANDF>, T>(
+      Queue, T(G), ONEAPI::bit_and<T>(), false, G, L);
+  check_op<sycl_subgr<SpecializationKernelName, class KernelName_ANDT>, T>(
+      Queue, ~T(0), ONEAPI::bit_and<T>(), true, G, L);
+
+  // Transparent operator functors
+  check_op<sycl_subgr<SpecializationKernelName, class KernelName_ORFV>, T>(
+      Queue, T(G), ONEAPI::bit_or<T>(), false, G, L);
+  check_op<sycl_subgr<SpecializationKernelName, class KernelName_ORTV>, T>(
+      Queue, T(0), ONEAPI::bit_or<T>(), true, G, L);
+
+  check_op<sycl_subgr<SpecializationKernelName, class KernelName_XORFV>, T>(
+      Queue, T(G), ONEAPI::bit_xor<T>(), false, G, L);
+  check_op<sycl_subgr<SpecializationKernelName, class KernelName_XORTV>, T>(
+      Queue, T(0), ONEAPI::bit_xor<T>(), true, G, L);
+
+  check_op<sycl_subgr<SpecializationKernelName, class KernelName_ANDFV>, T>(
+      Queue, T(G), ONEAPI::bit_and<T>(), false, G, L);
+  check_op<sycl_subgr<SpecializationKernelName, class KernelName_ANDTV>, T>(
+      Queue, ~T(0), ONEAPI::bit_and<T>(), true, G, L);
+}
+
+int main() {
+  queue Queue;
+  if (!core_sg_supported(Queue.get_device())) {
+    std::cout << "Skipping test\n";
+    return 0;
+  }
+  check_bit_ops<class A, int>(Queue);
+  check_bit_ops<class B, unsigned int>(Queue);
+  check_bit_ops<class C, unsigned>(Queue);
+  check_bit_ops<class D, long>(Queue);
+  check_bit_ops<class E, unsigned long>(Queue);
+  check_bit_ops<class F, long long>(Queue);
+  check_bit_ops<class G, unsigned long long>(Queue);
+  return 0;
+}
diff --git a/SYCL/SubGroup/scan.hpp b/SYCL/SubGroup/scan.hpp
index c9d630dea2..ee944079db 100644
--- a/SYCL/SubGroup/scan.hpp
+++ b/SYCL/SubGroup/scan.hpp
@@ -115,7 +115,7 @@ void check(queue &Queue, size_t G = 256, size_t L = 64) {
         Queue, std::numeric_limits<T>::min(), ONEAPI::maximum<T>(), true, G, L);
   }
 
-#if __cplusplus >= 201402L
+  // Transparent operator functors.
   check_op<sycl_subgr<SpecializationKernelName, class KernelName_TPWS>, T>(
       Queue, T(L), ONEAPI::plus<>(), false, G, L);
   check_op<sycl_subgr<SpecializationKernelName, class KernelName_hWZv>, T>(
@@ -150,5 +150,17 @@ void check(queue &Queue, size_t G = 256, size_t L = 64) {
         T>(Queue, std::numeric_limits<T>::min(), ONEAPI::maximum<>(), true, G,
            L);
   }
-#endif
+
+  // Use small sub-groups to avoid overflow effects for int multiply operations
+  // and avoid rounding issues for FP multiply.
+  L = 4;
+  check_op<sycl_subgr<SpecializationKernelName, class KernelName_MulF>, T>(
+      Queue, T(L), ONEAPI::multiplies<T>(), false, G, L);
+  check_op<sycl_subgr<SpecializationKernelName, class KernelName_MulT>, T>(
+      Queue, T(1), ONEAPI::multiplies<T>(), true, G, L);
+
+  check_op<sycl_subgr<SpecializationKernelName, class KernelName_MulFV>, T>(
+      Queue, T(L), ONEAPI::multiplies<>(), false, G, L);
+  check_op<sycl_subgr<SpecializationKernelName, class KernelName_MulTV>, T>(
+      Queue, T(1), ONEAPI::multiplies<>(), true, G, L);
 }
diff --git a/SYCL/SubGroup/scan_bit_ops.cpp b/SYCL/SubGroup/scan_bit_ops.cpp
new file mode 100644
index 0000000000..317edc0cd3
--- /dev/null
+++ b/SYCL/SubGroup/scan_bit_ops.cpp
@@ -0,0 +1,65 @@
+// UNSUPPORTED: cpu
+// #2252 Disable until all variants of built-ins are available in OpenCL CPU
+// runtime for every supported ISA
+//
+// RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple %s -o %t.out
+// RUN: %HOST_RUN_PLACEHOLDER %t.out
+// RUN: %CPU_RUN_PLACEHOLDER %t.out
+// RUN: %GPU_RUN_PLACEHOLDER %t.out
+// RUN: %ACC_RUN_PLACEHOLDER %t.out
+
+// This test verifies correct handling of exclusive_scan and inclusive_scan
+// sub-group algorithm used with integer bitwise OR, XOR, AND operations.
+
+#include "scan.hpp"
+
+template <typename SpecializationKernelName, typename T>
+void check_bit_ops(queue &Queue, size_t G = 256, size_t L = 4) {
+  check_op<sycl_subgr<SpecializationKernelName, class KernelName_ORF>, T>(
+      Queue, T(L), ONEAPI::bit_or<T>(), false, G, L);
+  check_op<sycl_subgr<SpecializationKernelName, class KernelName_ORT>, T>(
+      Queue, T(0), ONEAPI::bit_or<T>(), true, G, L);
+
+  check_op<sycl_subgr<SpecializationKernelName, class KernelName_XORF>, T>(
+      Queue, T(L), ONEAPI::bit_xor<T>(), false, G, L);
+  check_op<sycl_subgr<SpecializationKernelName, class KernelName_XORT>, T>(
+      Queue, T(0), ONEAPI::bit_xor<T>(), true, G, L);
+
+  check_op<sycl_subgr<SpecializationKernelName, class KernelName_ANDF>, T>(
+      Queue, T(L), ONEAPI::bit_and<T>(), false, G, L);
+  check_op<sycl_subgr<SpecializationKernelName, class KernelName_ANDT>, T>(
+      Queue, ~T(0), ONEAPI::bit_and<T>(), true, G, L);
+
+  // Transparent operator functors.
+  check_op<sycl_subgr<SpecializationKernelName, class KernelName_ORFV>, T>(
+      Queue, T(L), ONEAPI::bit_or<>(), false, G, L);
+  check_op<sycl_subgr<SpecializationKernelName, class KernelName_ORTV>, T>(
+      Queue, T(0), ONEAPI::bit_or<>(), true, G, L);
+
+  check_op<sycl_subgr<SpecializationKernelName, class KernelName_XORFV>, T>(
+      Queue, T(L), ONEAPI::bit_xor<>(), false, G, L);
+  check_op<sycl_subgr<SpecializationKernelName, class KernelName_XORTV>, T>(
+      Queue, T(0), ONEAPI::bit_xor<>(), true, G, L);
+
+  check_op<sycl_subgr<SpecializationKernelName, class KernelName_ANDFV>, T>(
+      Queue, T(L), ONEAPI::bit_and<>(), false, G, L);
+  check_op<sycl_subgr<SpecializationKernelName, class KernelName_ANDTV>, T>(
+      Queue, ~T(0), ONEAPI::bit_and<>(), true, G, L);
+}
+
+int main() {
+  queue Queue;
+  if (!core_sg_supported(Queue.get_device())) {
+    std::cout << "Skipping test\n";
+    return 0;
+  }
+  check_bit_ops<class A, int>(Queue);
+  check_bit_ops<class B, unsigned int>(Queue);
+  check_bit_ops<class C, unsigned>(Queue);
+  check_bit_ops<class D, long>(Queue);
+  check_bit_ops<class E, unsigned long>(Queue);
+  check_bit_ops<class F, long long>(Queue);
+  check_bit_ops<class G, unsigned long long>(Queue);
+  std::cout << "Test passed." << std::endl;
+  return 0;
+}

From c3f7aeb251a3a24b9071bb8a65aae14fceb90770 Mon Sep 17 00:00:00 2001
From: Vyacheslav N Klochkov <vyacheslav.n.klochkov@intel.com>
Date: Mon, 1 Mar 2021 13:00:42 -0800
Subject: [PATCH 2/4] Additional fixes for CUDA: move spir-v 1.3 test cases to
 separate files

Signed-off-by: Vyacheslav N Klochkov <vyacheslav.n.klochkov@intel.com>
---
 SYCL/SubGroup/reduce.hpp              | 41 +++++++++++++++--
 SYCL/SubGroup/reduce_bit_ops.cpp      | 64 --------------------------
 SYCL/SubGroup/reduce_spirv13.cpp      | 39 ++++++++++++++++
 SYCL/SubGroup/reduce_spirv13_fp16.cpp | 20 +++++++++
 SYCL/SubGroup/reduce_spirv13_fp64.cpp | 27 +++++++++++
 SYCL/SubGroup/scan.hpp                | 40 +++++++++++++++--
 SYCL/SubGroup/scan_bit_ops.cpp        | 65 ---------------------------
 SYCL/SubGroup/scan_spirv13.cpp        | 39 ++++++++++++++++
 SYCL/SubGroup/scan_spirv13_fp16.cpp   | 20 +++++++++
 SYCL/SubGroup/scan_spirv13_fp64.cpp   | 27 +++++++++++
 10 files changed, 247 insertions(+), 135 deletions(-)
 delete mode 100644 SYCL/SubGroup/reduce_bit_ops.cpp
 create mode 100644 SYCL/SubGroup/reduce_spirv13.cpp
 create mode 100644 SYCL/SubGroup/reduce_spirv13_fp16.cpp
 create mode 100644 SYCL/SubGroup/reduce_spirv13_fp64.cpp
 delete mode 100644 SYCL/SubGroup/scan_bit_ops.cpp
 create mode 100644 SYCL/SubGroup/scan_spirv13.cpp
 create mode 100644 SYCL/SubGroup/scan_spirv13_fp16.cpp
 create mode 100644 SYCL/SubGroup/scan_spirv13_fp64.cpp

diff --git a/SYCL/SubGroup/reduce.hpp b/SYCL/SubGroup/reduce.hpp
index adb77041c0..4d181fe140 100644
--- a/SYCL/SubGroup/reduce.hpp
+++ b/SYCL/SubGroup/reduce.hpp
@@ -107,17 +107,52 @@ void check(queue &Queue, size_t G = 256, size_t L = 64) {
   check_op<
       sycl_subgr<SpecializationKernelName, class KernelName_BaCGaWDMFeMFqvotbk>,
       T>(Queue, T(0), ONEAPI::maximum<>(), true, G, L);
+}
 
-  // Use small sub-groups to avoid overflow effects for int multiply operations
-  // and avoid rounding issues for FP multiply.
-  L = 4;
+template <typename SpecializationKernelName, typename T>
+void check_mul(queue &Queue, size_t G = 256, size_t L = 4) {
   check_op<sycl_subgr<SpecializationKernelName, class KernelName_MulF>, T>(
       Queue, T(G), ONEAPI::multiplies<T>(), false, G, L);
   check_op<sycl_subgr<SpecializationKernelName, class KernelName_MulT>, T>(
       Queue, T(1), ONEAPI::multiplies<T>(), true, G, L);
 
+  // Transparent operator functors.
   check_op<sycl_subgr<SpecializationKernelName, class KernelName_MulFV>, T>(
       Queue, T(G), ONEAPI::multiplies<>(), false, G, L);
   check_op<sycl_subgr<SpecializationKernelName, class KernelName_MulTV>, T>(
       Queue, T(1), ONEAPI::multiplies<>(), true, G, L);
 }
+
+template <typename SpecializationKernelName, typename T>
+void check_bit_ops(queue &Queue, size_t G = 256, size_t L = 4) {
+  check_op<sycl_subgr<SpecializationKernelName, class KernelName_ORF>, T>(
+      Queue, T(G), ONEAPI::bit_or<T>(), false, G, L);
+  check_op<sycl_subgr<SpecializationKernelName, class KernelName_ORT>, T>(
+      Queue, T(0), ONEAPI::bit_or<T>(), true, G, L);
+
+  check_op<sycl_subgr<SpecializationKernelName, class KernelName_XORF>, T>(
+      Queue, T(G), ONEAPI::bit_xor<T>(), false, G, L);
+  check_op<sycl_subgr<SpecializationKernelName, class KernelName_XORT>, T>(
+      Queue, T(0), ONEAPI::bit_xor<T>(), true, G, L);
+
+  check_op<sycl_subgr<SpecializationKernelName, class KernelName_ANDF>, T>(
+      Queue, T(G), ONEAPI::bit_and<T>(), false, G, L);
+  check_op<sycl_subgr<SpecializationKernelName, class KernelName_ANDT>, T>(
+      Queue, ~T(0), ONEAPI::bit_and<T>(), true, G, L);
+
+  // Transparent operator functors (bit_or<>, bit_xor<>, bit_and<>).
+  check_op<sycl_subgr<SpecializationKernelName, class KernelName_ORFV>, T>(
+      Queue, T(G), ONEAPI::bit_or<>(), false, G, L);
+  check_op<sycl_subgr<SpecializationKernelName, class KernelName_ORTV>, T>(
+      Queue, T(0), ONEAPI::bit_or<>(), true, G, L);
+
+  check_op<sycl_subgr<SpecializationKernelName, class KernelName_XORFV>, T>(
+      Queue, T(G), ONEAPI::bit_xor<>(), false, G, L);
+  check_op<sycl_subgr<SpecializationKernelName, class KernelName_XORTV>, T>(
+      Queue, T(0), ONEAPI::bit_xor<>(), true, G, L);
+
+  check_op<sycl_subgr<SpecializationKernelName, class KernelName_ANDFV>, T>(
+      Queue, T(G), ONEAPI::bit_and<>(), false, G, L);
+  check_op<sycl_subgr<SpecializationKernelName, class KernelName_ANDTV>, T>(
+      Queue, ~T(0), ONEAPI::bit_and<>(), true, G, L);
+}
diff --git a/SYCL/SubGroup/reduce_bit_ops.cpp b/SYCL/SubGroup/reduce_bit_ops.cpp
deleted file mode 100644
index 04ef923683..0000000000
--- a/SYCL/SubGroup/reduce_bit_ops.cpp
+++ /dev/null
@@ -1,64 +0,0 @@
-// UNSUPPORTED: cpu
-// #2252 Disable until all variants of built-ins are available in OpenCL CPU
-// runtime for every supported ISA
-//
-// RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple %s -o %t.out
-// RUN: %HOST_RUN_PLACEHOLDER %t.out
-// RUN: %CPU_RUN_PLACEHOLDER %t.out
-// RUN: %GPU_RUN_PLACEHOLDER %t.out
-// RUN: %ACC_RUN_PLACEHOLDER %t.out
-
-// This test verifies correct handling of reduce() algorithm used with
-// integer bitwise OR, XOR, AND operations.
-
-#include "reduce.hpp"
-
-template <typename SpecializationKernelName, typename T>
-void check_bit_ops(queue &Queue, size_t G = 256, size_t L = 4) {
-  check_op<sycl_subgr<SpecializationKernelName, class KernelName_ORF>, T>(
-      Queue, T(G), ONEAPI::bit_or<T>(), false, G, L);
-  check_op<sycl_subgr<SpecializationKernelName, class KernelName_ORT>, T>(
-      Queue, T(0), ONEAPI::bit_or<T>(), true, G, L);
-
-  check_op<sycl_subgr<SpecializationKernelName, class KernelName_XORF>, T>(
-      Queue, T(G), ONEAPI::bit_xor<T>(), false, G, L);
-  check_op<sycl_subgr<SpecializationKernelName, class KernelName_XORT>, T>(
-      Queue, T(0), ONEAPI::bit_xor<T>(), true, G, L);
-
-  check_op<sycl_subgr<SpecializationKernelName, class KernelName_ANDF>, T>(
-      Queue, T(G), ONEAPI::bit_and<T>(), false, G, L);
-  check_op<sycl_subgr<SpecializationKernelName, class KernelName_ANDT>, T>(
-      Queue, ~T(0), ONEAPI::bit_and<T>(), true, G, L);
-
-  // Transparent operator functors
-  check_op<sycl_subgr<SpecializationKernelName, class KernelName_ORFV>, T>(
-      Queue, T(G), ONEAPI::bit_or<T>(), false, G, L);
-  check_op<sycl_subgr<SpecializationKernelName, class KernelName_ORTV>, T>(
-      Queue, T(0), ONEAPI::bit_or<T>(), true, G, L);
-
-  check_op<sycl_subgr<SpecializationKernelName, class KernelName_XORFV>, T>(
-      Queue, T(G), ONEAPI::bit_xor<T>(), false, G, L);
-  check_op<sycl_subgr<SpecializationKernelName, class KernelName_XORTV>, T>(
-      Queue, T(0), ONEAPI::bit_xor<T>(), true, G, L);
-
-  check_op<sycl_subgr<SpecializationKernelName, class KernelName_ANDFV>, T>(
-      Queue, T(G), ONEAPI::bit_and<T>(), false, G, L);
-  check_op<sycl_subgr<SpecializationKernelName, class KernelName_ANDTV>, T>(
-      Queue, ~T(0), ONEAPI::bit_and<T>(), true, G, L);
-}
-
-int main() {
-  queue Queue;
-  if (!core_sg_supported(Queue.get_device())) {
-    std::cout << "Skipping test\n";
-    return 0;
-  }
-  check_bit_ops<class A, int>(Queue);
-  check_bit_ops<class B, unsigned int>(Queue);
-  check_bit_ops<class C, unsigned>(Queue);
-  check_bit_ops<class D, long>(Queue);
-  check_bit_ops<class E, unsigned long>(Queue);
-  check_bit_ops<class F, long long>(Queue);
-  check_bit_ops<class G, unsigned long long>(Queue);
-  return 0;
-}
diff --git a/SYCL/SubGroup/reduce_spirv13.cpp b/SYCL/SubGroup/reduce_spirv13.cpp
new file mode 100644
index 0000000000..82ff043ccf
--- /dev/null
+++ b/SYCL/SubGroup/reduce_spirv13.cpp
@@ -0,0 +1,39 @@
+// UNSUPPORTED: cpu
+// #2252 Disable until all variants of built-ins are available in OpenCL CPU
+// runtime for every supported ISA
+
+// UNSUPPORTED: cuda
+
+// RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple %s -o %t.out
+// RUN: %HOST_RUN_PLACEHOLDER %t.out
+// RUN: %CPU_RUN_PLACEHOLDER %t.out
+// RUN: %GPU_RUN_PLACEHOLDER %t.out
+// RUN: %ACC_RUN_PLACEHOLDER %t.out
+
+// This test verifies the correct work of SPIR-V 1.3 reduce algorithm
+// used with the operation MUL, bitwise OR, XOR, AND.
+
+#include "reduce.hpp"
+
+int main() {
+  queue Queue;
+  if (!core_sg_supported(Queue.get_device())) {
+    std::cout << "Skipping test\n";
+    return 0;
+  }
+
+  check_mul<class MulA, int>(Queue);
+  check_mul<class MulB, unsigned int>(Queue);
+  check_mul<class MulC, long>(Queue);
+  check_mul<class MulD, unsigned long>(Queue);
+  check_mul<class MulE, float>(Queue);
+
+  check_bit_ops<class A, int>(Queue);
+  check_bit_ops<class B, unsigned int>(Queue);
+  check_bit_ops<class C, unsigned>(Queue);
+  check_bit_ops<class D, long>(Queue);
+  check_bit_ops<class E, unsigned long>(Queue);
+  check_bit_ops<class F, long long>(Queue);
+  check_bit_ops<class G, unsigned long long>(Queue);
+  return 0;
+}
diff --git a/SYCL/SubGroup/reduce_spirv13_fp16.cpp b/SYCL/SubGroup/reduce_spirv13_fp16.cpp
new file mode 100644
index 0000000000..6bce6cad09
--- /dev/null
+++ b/SYCL/SubGroup/reduce_spirv13_fp16.cpp
@@ -0,0 +1,20 @@
+// UNSUPPORTED: cuda
+
+// RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple %s -o %t.out
+// RUN: %GPU_RUN_PLACEHOLDER %t.out
+
+// This test verifies the correct work of SPIR-V 1.3 reduce algorithm
+// used with MUL operation.
+
+#include "reduce.hpp"
+
+int main() {
+  queue Queue;
+  if (!core_sg_supported(Queue.get_device())) {
+    std::cout << "Skipping test\n";
+    return 0;
+  }
+  check_mul<class MulHalf, cl::sycl::half>(Queue);
+  std::cout << "Test passed." << std::endl;
+  return 0;
+}
diff --git a/SYCL/SubGroup/reduce_spirv13_fp64.cpp b/SYCL/SubGroup/reduce_spirv13_fp64.cpp
new file mode 100644
index 0000000000..9c4591cefe
--- /dev/null
+++ b/SYCL/SubGroup/reduce_spirv13_fp64.cpp
@@ -0,0 +1,27 @@
+// UNSUPPORTED: cpu
+// #2252 Disable until all variants of built-ins are available in OpenCL CPU
+// runtime for every supported ISA
+
+// UNSUPPORTED: cuda
+
+// RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple %s -o %t.out
+// RUN: %HOST_RUN_PLACEHOLDER %t.out
+// RUN: %CPU_RUN_PLACEHOLDER %t.out
+// RUN: %GPU_RUN_PLACEHOLDER %t.out
+// RUN: %ACC_RUN_PLACEHOLDER %t.out
+
+// This test verifies the correct work of SPIR-V 1.3 reduce algorithm
+// used with MUL operation.
+
+#include "reduce.hpp"
+
+int main() {
+  queue Queue;
+  if (!core_sg_supported(Queue.get_device())) {
+    std::cout << "Skipping test\n";
+    return 0;
+  }
+  check_mul<class MulDouble, double>(Queue);
+  std::cout << "Test passed." << std::endl;
+  return 0;
+}
diff --git a/SYCL/SubGroup/scan.hpp b/SYCL/SubGroup/scan.hpp
index ee944079db..3597eb2ba5 100644
--- a/SYCL/SubGroup/scan.hpp
+++ b/SYCL/SubGroup/scan.hpp
@@ -150,10 +150,10 @@ void check(queue &Queue, size_t G = 256, size_t L = 64) {
         T>(Queue, std::numeric_limits<T>::min(), ONEAPI::maximum<>(), true, G,
            L);
   }
+}
 
-  // Use small sub-groups to avoid overflow effects for int multiply operations
-  // and avoid rounding issues for FP multiply.
-  L = 4;
+template <typename SpecializationKernelName, typename T>
+void check_mul(queue &Queue, size_t G = 256, size_t L = 4) {
   check_op<sycl_subgr<SpecializationKernelName, class KernelName_MulF>, T>(
       Queue, T(L), ONEAPI::multiplies<T>(), false, G, L);
   check_op<sycl_subgr<SpecializationKernelName, class KernelName_MulT>, T>(
@@ -164,3 +164,37 @@ void check(queue &Queue, size_t G = 256, size_t L = 64) {
   check_op<sycl_subgr<SpecializationKernelName, class KernelName_MulTV>, T>(
       Queue, T(1), ONEAPI::multiplies<>(), true, G, L);
 }
+
+template <typename SpecializationKernelName, typename T>
+void check_bit_ops(queue &Queue, size_t G = 256, size_t L = 4) {
+  check_op<sycl_subgr<SpecializationKernelName, class KernelName_ORF>, T>(
+      Queue, T(L), ONEAPI::bit_or<T>(), false, G, L);
+  check_op<sycl_subgr<SpecializationKernelName, class KernelName_ORT>, T>(
+      Queue, T(0), ONEAPI::bit_or<T>(), true, G, L);
+
+  check_op<sycl_subgr<SpecializationKernelName, class KernelName_XORF>, T>(
+      Queue, T(L), ONEAPI::bit_xor<T>(), false, G, L);
+  check_op<sycl_subgr<SpecializationKernelName, class KernelName_XORT>, T>(
+      Queue, T(0), ONEAPI::bit_xor<T>(), true, G, L);
+
+  check_op<sycl_subgr<SpecializationKernelName, class KernelName_ANDF>, T>(
+      Queue, T(L), ONEAPI::bit_and<T>(), false, G, L);
+  check_op<sycl_subgr<SpecializationKernelName, class KernelName_ANDT>, T>(
+      Queue, ~T(0), ONEAPI::bit_and<T>(), true, G, L);
+
+  // Transparent operator functors.
+  check_op<sycl_subgr<SpecializationKernelName, class KernelName_ORFV>, T>(
+      Queue, T(L), ONEAPI::bit_or<>(), false, G, L);
+  check_op<sycl_subgr<SpecializationKernelName, class KernelName_ORTV>, T>(
+      Queue, T(0), ONEAPI::bit_or<>(), true, G, L);
+
+  check_op<sycl_subgr<SpecializationKernelName, class KernelName_XORFV>, T>(
+      Queue, T(L), ONEAPI::bit_xor<>(), false, G, L);
+  check_op<sycl_subgr<SpecializationKernelName, class KernelName_XORTV>, T>(
+      Queue, T(0), ONEAPI::bit_xor<>(), true, G, L);
+
+  check_op<sycl_subgr<SpecializationKernelName, class KernelName_ANDFV>, T>(
+      Queue, T(L), ONEAPI::bit_and<>(), false, G, L);
+  check_op<sycl_subgr<SpecializationKernelName, class KernelName_ANDTV>, T>(
+      Queue, ~T(0), ONEAPI::bit_and<>(), true, G, L);
+}
diff --git a/SYCL/SubGroup/scan_bit_ops.cpp b/SYCL/SubGroup/scan_bit_ops.cpp
deleted file mode 100644
index 317edc0cd3..0000000000
--- a/SYCL/SubGroup/scan_bit_ops.cpp
+++ /dev/null
@@ -1,65 +0,0 @@
-// UNSUPPORTED: cpu
-// #2252 Disable until all variants of built-ins are available in OpenCL CPU
-// runtime for every supported ISA
-//
-// RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple %s -o %t.out
-// RUN: %HOST_RUN_PLACEHOLDER %t.out
-// RUN: %CPU_RUN_PLACEHOLDER %t.out
-// RUN: %GPU_RUN_PLACEHOLDER %t.out
-// RUN: %ACC_RUN_PLACEHOLDER %t.out
-
-// This test verifies correct handling of exclusive_scan and inclusive_scan
-// sub-group algorithm used with integer bitwise OR, XOR, AND operations.
-
-#include "scan.hpp"
-
-template <typename SpecializationKernelName, typename T>
-void check_bit_ops(queue &Queue, size_t G = 256, size_t L = 4) {
-  check_op<sycl_subgr<SpecializationKernelName, class KernelName_ORF>, T>(
-      Queue, T(L), ONEAPI::bit_or<T>(), false, G, L);
-  check_op<sycl_subgr<SpecializationKernelName, class KernelName_ORT>, T>(
-      Queue, T(0), ONEAPI::bit_or<T>(), true, G, L);
-
-  check_op<sycl_subgr<SpecializationKernelName, class KernelName_XORF>, T>(
-      Queue, T(L), ONEAPI::bit_xor<T>(), false, G, L);
-  check_op<sycl_subgr<SpecializationKernelName, class KernelName_XORT>, T>(
-      Queue, T(0), ONEAPI::bit_xor<T>(), true, G, L);
-
-  check_op<sycl_subgr<SpecializationKernelName, class KernelName_ANDF>, T>(
-      Queue, T(L), ONEAPI::bit_and<T>(), false, G, L);
-  check_op<sycl_subgr<SpecializationKernelName, class KernelName_ANDT>, T>(
-      Queue, ~T(0), ONEAPI::bit_and<T>(), true, G, L);
-
-  // Transparent operator functors.
-  check_op<sycl_subgr<SpecializationKernelName, class KernelName_ORFV>, T>(
-      Queue, T(L), ONEAPI::bit_or<>(), false, G, L);
-  check_op<sycl_subgr<SpecializationKernelName, class KernelName_ORTV>, T>(
-      Queue, T(0), ONEAPI::bit_or<>(), true, G, L);
-
-  check_op<sycl_subgr<SpecializationKernelName, class KernelName_XORFV>, T>(
-      Queue, T(L), ONEAPI::bit_xor<>(), false, G, L);
-  check_op<sycl_subgr<SpecializationKernelName, class KernelName_XORTV>, T>(
-      Queue, T(0), ONEAPI::bit_xor<>(), true, G, L);
-
-  check_op<sycl_subgr<SpecializationKernelName, class KernelName_ANDFV>, T>(
-      Queue, T(L), ONEAPI::bit_and<>(), false, G, L);
-  check_op<sycl_subgr<SpecializationKernelName, class KernelName_ANDTV>, T>(
-      Queue, ~T(0), ONEAPI::bit_and<>(), true, G, L);
-}
-
-int main() {
-  queue Queue;
-  if (!core_sg_supported(Queue.get_device())) {
-    std::cout << "Skipping test\n";
-    return 0;
-  }
-  check_bit_ops<class A, int>(Queue);
-  check_bit_ops<class B, unsigned int>(Queue);
-  check_bit_ops<class C, unsigned>(Queue);
-  check_bit_ops<class D, long>(Queue);
-  check_bit_ops<class E, unsigned long>(Queue);
-  check_bit_ops<class F, long long>(Queue);
-  check_bit_ops<class G, unsigned long long>(Queue);
-  std::cout << "Test passed." << std::endl;
-  return 0;
-}
diff --git a/SYCL/SubGroup/scan_spirv13.cpp b/SYCL/SubGroup/scan_spirv13.cpp
new file mode 100644
index 0000000000..36e484754e
--- /dev/null
+++ b/SYCL/SubGroup/scan_spirv13.cpp
@@ -0,0 +1,39 @@
+// UNSUPPORTED: cpu
+// #2252 Disable until all variants of built-ins are available in OpenCL CPU
+// runtime for every supported ISA
+
+// UNSUPPORTED: cuda
+
+// RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple %s -o %t.out
+// RUN: %HOST_RUN_PLACEHOLDER %t.out
+// RUN: %CPU_RUN_PLACEHOLDER %t.out
+// RUN: %GPU_RUN_PLACEHOLDER %t.out
+// RUN: %ACC_RUN_PLACEHOLDER %t.out
+
+// This test verifies the correct work of SPIR-V 1.3 exclusive_scan() and
+// inclusive_scan() algorithms used with the operation MUL, bitwise OR, XOR, AND.
+
+#include "scan.hpp"
+
+int main() {
+  queue Queue;
+  if (!core_sg_supported(Queue.get_device())) {
+    std::cout << "Skipping test\n";
+    return 0;
+  }
+  check_mul<class MulA, int>(Queue);
+  check_mul<class MulB, unsigned int>(Queue);
+  check_mul<class MulC, long>(Queue);
+  check_mul<class MulD, unsigned long>(Queue);
+  check_mul<class MulE, float>(Queue);
+
+  check_bit_ops<class A, int>(Queue);
+  check_bit_ops<class B, unsigned int>(Queue);
+  check_bit_ops<class C, unsigned>(Queue);
+  check_bit_ops<class D, long>(Queue);
+  check_bit_ops<class E, unsigned long>(Queue);
+  check_bit_ops<class F, long long>(Queue);
+  check_bit_ops<class G, unsigned long long>(Queue);
+  std::cout << "Test passed." << std::endl;
+  return 0;
+}
diff --git a/SYCL/SubGroup/scan_spirv13_fp16.cpp b/SYCL/SubGroup/scan_spirv13_fp16.cpp
new file mode 100644
index 0000000000..0532d06b8a
--- /dev/null
+++ b/SYCL/SubGroup/scan_spirv13_fp16.cpp
@@ -0,0 +1,20 @@
+// UNSUPPORTED: cuda
+
+// RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple %s -o %t.out
+// RUN: %GPU_RUN_PLACEHOLDER %t.out
+
+// This test verifies the correct work of SPIR-V 1.3 exclusive_scan() and
+// inclusive_scan() algorithms used with the MUL operation.
+
+#include "scan.hpp"
+
+int main() {
+  queue Queue;
+  if (!core_sg_supported(Queue.get_device())) {
+    std::cout << "Skipping test\n";
+    return 0;
+  }
+  check_mul<class MulHalf, cl::sycl::half>(Queue);
+  std::cout << "Test passed." << std::endl;
+  return 0;
+}
diff --git a/SYCL/SubGroup/scan_spirv13_fp64.cpp b/SYCL/SubGroup/scan_spirv13_fp64.cpp
new file mode 100644
index 0000000000..4d4102127a
--- /dev/null
+++ b/SYCL/SubGroup/scan_spirv13_fp64.cpp
@@ -0,0 +1,27 @@
+// UNSUPPORTED: cpu
+// #2252 Disable until all variants of built-ins are available in OpenCL CPU
+// runtime for every supported ISA
+
+// UNSUPPORTED: cuda
+
+// RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple %s -o %t.out
+// RUN: %HOST_RUN_PLACEHOLDER %t.out
+// RUN: %CPU_RUN_PLACEHOLDER %t.out
+// RUN: %GPU_RUN_PLACEHOLDER %t.out
+// RUN: %ACC_RUN_PLACEHOLDER %t.out
+
+// This test verifies the correct work of SPIR-V 1.3 exclusive_scan() and
+// inclusive_scan() algorithms used with the MUL operation.
+
+#include "scan.hpp"
+
+int main() {
+  queue Queue;
+  if (!core_sg_supported(Queue.get_device())) {
+    std::cout << "Skipping test\n";
+    return 0;
+  }
+  check_mul<class MulDouble, double>(Queue);
+  std::cout << "Test passed." << std::endl;
+  return 0;
+}

From ac90159911f8c71b75f4f2e18482789ef7f7ca7e Mon Sep 17 00:00:00 2001
From: Vyacheslav N Klochkov <vyacheslav.n.klochkov@intel.com>
Date: Mon, 1 Mar 2021 13:29:57 -0800
Subject: [PATCH 3/4] Fix the check for sub-groups availability. It returned
 false even when device had support for sub-groups

Signed-off-by: Vyacheslav N Klochkov <vyacheslav.n.klochkov@intel.com>
---
 SYCL/SubGroup/helper.hpp | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/SYCL/SubGroup/helper.hpp b/SYCL/SubGroup/helper.hpp
index 9f4d29ad5e..964fb742bc 100644
--- a/SYCL/SubGroup/helper.hpp
+++ b/SYCL/SubGroup/helper.hpp
@@ -154,7 +154,7 @@ void exit_if_not_equal_vec(vec<T, N> val, vec<T, N> ref, const char *name) {
 }
 
 bool core_sg_supported(const device &Device) {
-  return (Device.has_extension("cl_khr_subgroups") ||
-          Device.get_info<info::device::version>().find(" 2.1") !=
-              string_class::npos);
+  if (Device.has_extension("cl_khr_subgroups"))
+    return true;
+  return Device.get_info<info::device::version>() >= "2.1";
 }

From 925ce426bb754423531f70b26482e9c98005857a Mon Sep 17 00:00:00 2001
From: Vyacheslav N Klochkov <vyacheslav.n.klochkov@intel.com>
Date: Tue, 2 Mar 2021 09:10:08 -0800
Subject: [PATCH 4/4] Add checks for cl_khr_fp16/fp64

Signed-off-by: Vyacheslav N Klochkov <vyacheslav.n.klochkov@intel.com>
---
 SYCL/SubGroup/reduce_fp16.cpp         | 12 ++++--------
 SYCL/SubGroup/reduce_fp64.cpp         | 12 ++++--------
 SYCL/SubGroup/reduce_spirv13_fp16.cpp |  3 ++-
 SYCL/SubGroup/reduce_spirv13_fp64.cpp |  3 ++-
 SYCL/SubGroup/scan_fp16.cpp           | 12 ++++--------
 SYCL/SubGroup/scan_fp64.cpp           | 12 ++++--------
 SYCL/SubGroup/scan_spirv13_fp16.cpp   |  3 ++-
 SYCL/SubGroup/scan_spirv13_fp64.cpp   |  3 ++-
 8 files changed, 24 insertions(+), 36 deletions(-)

diff --git a/SYCL/SubGroup/reduce_fp16.cpp b/SYCL/SubGroup/reduce_fp16.cpp
index 1d6e249eb1..323f3e63b2 100644
--- a/SYCL/SubGroup/reduce_fp16.cpp
+++ b/SYCL/SubGroup/reduce_fp16.cpp
@@ -1,18 +1,14 @@
 // RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple %s -o %t.out
 // RUN: %GPU_RUN_PLACEHOLDER %t.out
-///==---------- reduce_fp16.cpp - SYCL sub_group reduce test ----*- C++ -*--==//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
+
+// This test verifies the correct work of the sub-group algorithm reduce().
 
 #include "reduce.hpp"
 
 int main() {
   queue Queue;
-  if (!core_sg_supported(Queue.get_device())) {
+  if (!core_sg_supported(Queue.get_device()) ||
+      !Queue.get_device().has_extension("cl_khr_fp16")) {
     std::cout << "Skipping test\n";
     return 0;
   }
diff --git a/SYCL/SubGroup/reduce_fp64.cpp b/SYCL/SubGroup/reduce_fp64.cpp
index 2e4699d35d..78f7994466 100644
--- a/SYCL/SubGroup/reduce_fp64.cpp
+++ b/SYCL/SubGroup/reduce_fp64.cpp
@@ -7,19 +7,15 @@
 // RUN: %CPU_RUN_PLACEHOLDER %t.out
 // RUN: %GPU_RUN_PLACEHOLDER %t.out
 // RUN: %ACC_RUN_PLACEHOLDER %t.out
-///==---------- reduce_fp64.cpp - SYCL sub_group reduce test ----*- C++ -*--==//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
+
+// This test verifies that the sub-group algorithm reduce() works correctly.
 
 #include "reduce.hpp"
 
 int main() {
   queue Queue;
-  if (!core_sg_supported(Queue.get_device())) {
+  if (!core_sg_supported(Queue.get_device()) ||
+      !Queue.get_device().has_extension("cl_khr_fp64")) {
     std::cout << "Skipping test\n";
     return 0;
   }
diff --git a/SYCL/SubGroup/reduce_spirv13_fp16.cpp b/SYCL/SubGroup/reduce_spirv13_fp16.cpp
index 6bce6cad09..e60826e99f 100644
--- a/SYCL/SubGroup/reduce_spirv13_fp16.cpp
+++ b/SYCL/SubGroup/reduce_spirv13_fp16.cpp
@@ -10,7 +10,8 @@
 
 int main() {
   queue Queue;
-  if (!core_sg_supported(Queue.get_device())) {
+  if (!core_sg_supported(Queue.get_device()) ||
+      !Queue.get_device().has_extension("cl_khr_fp16")) {
     std::cout << "Skipping test\n";
     return 0;
   }
diff --git a/SYCL/SubGroup/reduce_spirv13_fp64.cpp b/SYCL/SubGroup/reduce_spirv13_fp64.cpp
index 9c4591cefe..79ea2e1939 100644
--- a/SYCL/SubGroup/reduce_spirv13_fp64.cpp
+++ b/SYCL/SubGroup/reduce_spirv13_fp64.cpp
@@ -17,7 +17,8 @@
 
 int main() {
   queue Queue;
-  if (!core_sg_supported(Queue.get_device())) {
+  if (!core_sg_supported(Queue.get_device()) ||
+      !Queue.get_device().has_extension("cl_khr_fp64")) {
     std::cout << "Skipping test\n";
     return 0;
   }
diff --git a/SYCL/SubGroup/scan_fp16.cpp b/SYCL/SubGroup/scan_fp16.cpp
index 47bd49b6a0..dc73279f50 100644
--- a/SYCL/SubGroup/scan_fp16.cpp
+++ b/SYCL/SubGroup/scan_fp16.cpp
@@ -1,19 +1,15 @@
 // RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple %s -o %t.out
 // RUN: %GPU_RUN_PLACEHOLDER %t.out
 
-//==---------- scan_fp16.cpp - SYCL sub_group scan test --------*- C++ -*---==//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
+// This test verifies that the sub-group algorithms exclusive_scan() and
+// inclusive_scan() work correctly.
 
 #include "scan.hpp"
 
 int main() {
   queue Queue;
-  if (!core_sg_supported(Queue.get_device())) {
+  if (!core_sg_supported(Queue.get_device()) ||
+      !Queue.get_device().has_extension("cl_khr_fp16")) {
     std::cout << "Skipping test\n";
     return 0;
   }
diff --git a/SYCL/SubGroup/scan_fp64.cpp b/SYCL/SubGroup/scan_fp64.cpp
index 07409b3e82..14b1383f69 100644
--- a/SYCL/SubGroup/scan_fp64.cpp
+++ b/SYCL/SubGroup/scan_fp64.cpp
@@ -8,19 +8,15 @@
 // RUN: %GPU_RUN_PLACEHOLDER %t.out
 // RUN: %ACC_RUN_PLACEHOLDER %t.out
 
-//==---------- scan_fp64.cpp - SYCL sub_group scan test --------*- C++ -*---==//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
+// This test verifies that the sub-group algorithms exclusive_scan() and
+// inclusive_scan() work correctly.
 
 #include "scan.hpp"
 
 int main() {
   queue Queue;
-  if (!core_sg_supported(Queue.get_device())) {
+  if (!core_sg_supported(Queue.get_device()) ||
+      !Queue.get_device().has_extension("cl_khr_fp64")) {
     std::cout << "Skipping test\n";
     return 0;
   }
diff --git a/SYCL/SubGroup/scan_spirv13_fp16.cpp b/SYCL/SubGroup/scan_spirv13_fp16.cpp
index 0532d06b8a..62265ab8c0 100644
--- a/SYCL/SubGroup/scan_spirv13_fp16.cpp
+++ b/SYCL/SubGroup/scan_spirv13_fp16.cpp
@@ -10,7 +10,8 @@
 
 int main() {
   queue Queue;
-  if (!core_sg_supported(Queue.get_device())) {
+  if (!core_sg_supported(Queue.get_device()) ||
+      !Queue.get_device().has_extension("cl_khr_fp16")) {
     std::cout << "Skipping test\n";
     return 0;
   }
diff --git a/SYCL/SubGroup/scan_spirv13_fp64.cpp b/SYCL/SubGroup/scan_spirv13_fp64.cpp
index 4d4102127a..c1bcbed831 100644
--- a/SYCL/SubGroup/scan_spirv13_fp64.cpp
+++ b/SYCL/SubGroup/scan_spirv13_fp64.cpp
@@ -17,7 +17,8 @@
 
 int main() {
   queue Queue;
-  if (!core_sg_supported(Queue.get_device())) {
+  if (!core_sg_supported(Queue.get_device()) ||
+      !Queue.get_device().has_extension("cl_khr_fp64")) {
     std::cout << "Skipping test\n";
     return 0;
   }