diff --git a/SYCL/Basic/aspects.cpp b/SYCL/Basic/aspects.cpp
new file mode 100644
index 0000000000..bc5b04f4ff
--- /dev/null
+++ b/SYCL/Basic/aspects.cpp
@@ -0,0 +1,106 @@
+// RUN: %clangxx -fsycl %s -o %t.out
+// RUN: env SYCL_DEVICE_FILTER=%sycl_be %t.out
+//
+// Hip is missing some of the parameters tested here so it fails with ROCm for
+// NVIDIA
+// XFAIL: rocm_nvidia
+
+//==--------------- aspects.cpp - SYCL device test ------------------------==//
+//
+// Returns the various aspects of a device  and platform.
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include <CL/sycl.hpp>
+#include <iostream>
+
+using namespace cl::sycl;
+
+// platform::has() calls device::has() for each device on the platform.
+
+int main() {
+  bool failed = false;
+  int pltIdx = 0;
+  for (const auto &plt : platform::get_platforms()) {
+    pltIdx++;
+    if (plt.has(aspect::host)) {
+      std::cout << "Platform #" << pltIdx
+                << " type: Host supports:" << std::endl;
+    } else if (plt.has(aspect::cpu)) {
+      std::cout << "Platform #" << pltIdx
+                << " type: CPU supports:" << std::endl;
+    } else if (plt.has(aspect::gpu)) {
+      std::cout << "Platform #" << pltIdx
+                << " type: GPU supports:" << std::endl;
+    } else if (plt.has(aspect::accelerator)) {
+      std::cout << "Platform #" << pltIdx
+                << " type: Accelerator supports:" << std::endl;
+    } else if (plt.has(aspect::custom)) {
+      std::cout << "Platform #" << pltIdx
+                << " type: Custom supports:" << std::endl;
+    } else {
+      failed = true;
+      std::cout << "Failed: platform #" << pltIdx << " type: unknown"
+                << std::endl;
+      return 1;
+    }
+
+    if (plt.has(aspect::fp16)) {
+      std::cout << "  fp16" << std::endl;
+    }
+    if (plt.has(aspect::fp64)) {
+      std::cout << "  fp64" << std::endl;
+    }
+    if (plt.has(aspect::int64_base_atomics)) {
+      std::cout << "  base atomic operations" << std::endl;
+    }
+    if (plt.has(aspect::int64_extended_atomics)) {
+      std::cout << "  extended atomic operations" << std::endl;
+    }
+    if (plt.has(aspect::atomic64)) {
+      std::cout << "  atomic64" << std::endl;
+    }
+    if (plt.has(aspect::image)) {
+      std::cout << "  images" << std::endl;
+    }
+    if (plt.has(aspect::online_compiler)) {
+      std::cout << "  online compiler" << std::endl;
+    }
+    if (plt.has(aspect::online_linker)) {
+      std::cout << "  online linker" << std::endl;
+    }
+    if (plt.has(aspect::queue_profiling)) {
+      std::cout << "  queue profiling" << std::endl;
+    }
+    if (plt.has(aspect::usm_device_allocations)) {
+      std::cout << "  USM allocations" << std::endl;
+    }
+    if (plt.has(aspect::usm_host_allocations)) {
+      std::cout << "  USM host allocations" << std::endl;
+    }
+    if (plt.has(aspect::usm_atomic_host_allocations)) {
+      std::cout << "  USM atomic host allocations" << std::endl;
+    }
+    if (plt.has(aspect::usm_shared_allocations)) {
+      std::cout << "  USM shared allocations" << std::endl;
+    }
+    if (plt.has(aspect::usm_atomic_shared_allocations)) {
+      std::cout << "  USM atomic shared allocations" << std::endl;
+    }
+    if (plt.has(aspect::usm_restricted_shared_allocations)) {
+      std::cout << "  USM restricted shared allocations" << std::endl;
+    }
+    if (plt.has(aspect::usm_system_allocator)) {
+      std::cout << "  USM system allocator" << std::endl;
+    }
+    if (plt.has(aspect::usm_system_allocations)) {
+      std::cout << "  USM system allocations" << std::endl;
+    }
+  }
+  std::cout << "Passed." << std::endl;
+  return 0;
+}
diff --git a/SYCL/Basic/diagnostics/device-check.cpp b/SYCL/Basic/diagnostics/device-check.cpp
new file mode 100644
index 0000000000..931881cc28
--- /dev/null
+++ b/SYCL/Basic/diagnostics/device-check.cpp
@@ -0,0 +1,51 @@
+// RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple %s -o %t.out
+// RUN: env SYCL_DEVICE_FILTER=%sycl_be SYCL_DEVICE_TYPE=cpu %t.out
+// RUN: env SYCL_DEVICE_FILTER=%sycl_be SYCL_DEVICE_TYPE=gpu %t.out
+// RUN: env SYCL_DEVICE_FILTER=%sycl_be SYCL_DEVICE_TYPE=acc %t.out
+// RUN: env SYCL_DEVICE_FILTER=%sycl_be SYCL_DEVICE_TYPE=host %t.out
+// RUN: env SYCL_DEVICE_FILTER=%sycl_be SYCL_DEVICE_TYPE=CPU %t.out
+// RUN: env SYCL_DEVICE_FILTER=%sycl_be SYCL_DEVICE_TYPE=GPU %t.out
+// RUN: env SYCL_DEVICE_FILTER=%sycl_be SYCL_DEVICE_TYPE=ACC %t.out
+// RUN: env SYCL_DEVICE_FILTER=%sycl_be SYCL_DEVICE_TYPE=HOST %t.out
+// RUN: env SYCL_DEVICE_FILTER=%sycl_be SYCL_DEVICE_TYPE=Cpu %t.out
+// RUN: env SYCL_DEVICE_FILTER=%sycl_be SYCL_DEVICE_TYPE=Gpu %t.out
+// RUN: env SYCL_DEVICE_FILTER=%sycl_be SYCL_DEVICE_TYPE=Acc %t.out
+// RUN: env SYCL_DEVICE_FILTER=%sycl_be SYCL_DEVICE_TYPE=Host %t.out
+// RUN: env SYCL_DEVICE_FILTER=%sycl_be SYCL_DEVICE_TYPE=XPU %t.out
+
+//==------------------- device-check.cpp --------------------------==//
+// This is a diagnostic test which ensures that
+// device types are case-insensitive.
+// It also checks for SYCL_DEVICE being set incorrectly.
+//==---------------------------------------------------------------==//
+
+#include <CL/sycl.hpp>
+#include <iostream>
+
+using namespace cl::sycl;
+
+int main() {
+  try {
+    queue q = queue();
+    auto device = q.get_device();
+    auto deviceName = device.get_info<cl::sycl::info::device::name>();
+    std::cout << " Device Name: " << deviceName << std::endl;
+  }
+
+  catch (runtime_error &E) {
+    if (std::string(E.what()).find("SYCL_DEVICE_TYPE is not recognized.  Must "
+                                   "be GPU, CPU, ACC or HOST.") ==
+            std::string::npos &&
+        std::string(E.what()).find("No device of requested type available.") ==
+            std::string::npos) {
+      std::cout << "Test failed: received error is incorrect." << std::endl;
+      return 1;
+    } else {
+      std::cout << "Test passed: caught the expected error." << std::endl;
+      return 0;
+    }
+  }
+
+  std::cout << "Test passed: results are correct." << std::endl;
+  return 0;
+}
diff --git a/SYCL/Basic/image/srgba-read.cpp b/SYCL/Basic/image/srgba-read.cpp
new file mode 100644
index 0000000000..dd377af36d
--- /dev/null
+++ b/SYCL/Basic/image/srgba-read.cpp
@@ -0,0 +1,136 @@
+// RUN: %clangxx -fsycl  -fsycl-targets=%sycl_triple %s -o %t.out
+// RUN: %CPU_RUN_PLACEHOLDER %t.out %CPU_CHECK_PLACEHOLDER
+// RUN: %GPU_RUN_PLACEHOLDER %t.out %GPU_CHECK_PLACEHOLDER
+
+// XFAIL: level_zero
+// UNSUPPORTED: cuda
+// UNSUPPORTED: rocm_nvidia
+// UNSUPPORTED: rocm_amd
+
+#include <CL/sycl.hpp>
+
+using namespace cl::sycl;
+
+using accessorPixelT = sycl::float4;
+using dataPixelT = uint32_t;
+
+// will output a pixel as {r,g,b,a}.  provide override if a different pixelT is
+// defined.
+void outputPixel(sycl::float4 somePixel) {
+  std::cout << "{" << somePixel[0] << "," << somePixel[1] << "," << somePixel[2]
+            << "," << somePixel[3] << "} ";
+}
+
+constexpr long width = 4;
+constexpr long height = 3;
+
+void test_rd(image_channel_order ChanOrder, image_channel_type ChanType) {
+
+  int numTests = 4; // drives the size of the testResults buffer, and the number
+                    // of report iterations. Kludge.
+
+  // this should yield a read of approximate 0.5 for each channel
+  // when read directly with a normal non-linearized image (e.g.
+  // image_channel_order::rgba). For sRGB
+  // (image_channel_order::ext_oneapi_srgba), this is the value with maximum
+  // conversion. So we should read values of approximately 0.2
+  dataPixelT basicPixel{127 << 24 | 127 << 16 | 127 << 8 | 127};
+
+  queue Q;
+  const sycl::range<2> ImgRange_2D(width, height);
+
+  // IMPORTANT: const data is *required* for sRGBA images.
+  // OpenCL support is limited for 2D/3D images that are read only.
+  const std::vector<dataPixelT> ImgData(ImgRange_2D.size(), basicPixel);
+  try { // closure
+
+    image<2> image_2D(ImgData.data(), ChanOrder, ChanType, ImgRange_2D);
+    // use a buffer to report back test results.
+    buffer<accessorPixelT, 1> testResults((range<1>(numTests)));
+
+    Q.submit([&](handler &cgh) {
+      auto image_acc =
+          image_2D.get_access<accessorPixelT, access::mode::read>(cgh);
+      auto test_acc = testResults.get_access<access::mode::write>(cgh);
+
+      cgh.single_task<class im2D_rw>([=]() {
+        int i = 0; // the index for writing into the testResult buffer.
+
+        // verify our four pixels were set up correctly.
+        // 0-3 read four pixels. no sampler
+        test_acc[i++] = image_acc.read(sycl::int2{0, 0});
+        test_acc[i++] = image_acc.read(sycl::int2{1, 0});
+        test_acc[i++] = image_acc.read(sycl::int2{0, 1});
+        test_acc[i++] = image_acc.read(sycl::int2{2, 2});
+      });
+    });
+    Q.wait_and_throw();
+
+    // REPORT RESULTS
+    auto test_acc = testResults.get_access<access::mode::read>();
+    for (int i = 0, idx = 0; i < numTests; i++, idx++) {
+      if (i == 0) {
+        idx = 0;
+        std::cout << "read four pixels, no sampler" << std::endl;
+      }
+
+      accessorPixelT testPixel = test_acc[i];
+      std::cout << i << /* " -- " << idx << */ ": ";
+      outputPixel(testPixel);
+      std::cout << std::endl;
+    }
+  } catch (sycl::exception e) {
+    std::cout << "exception caught: " << e.what() << std::endl;
+  } // ~image / ~buffer
+}
+
+int main() {
+
+#ifdef SYCL_EXT_ONEAPI_SRGB
+  std::cout << "SYCL_EXT_ONEAPI_SRGB defined" << std::endl;
+#endif
+
+  queue Q;
+  device D = Q.get_device();
+
+  // test aspect
+  if (D.has(aspect::ext_oneapi_srgb))
+    std::cout << "aspect::ext_oneapi_srgb detected" << std::endl;
+
+  if (D.has(aspect::image)) {
+    // RGBA -- (normal, non-linearized)
+    std::cout << "rgba -------" << std::endl;
+    test_rd(image_channel_order::rgba, image_channel_type::unorm_int8);
+
+    // sRGBA -- (linearized reads)
+    std::cout << "srgba -------" << std::endl;
+    test_rd(image_channel_order::ext_oneapi_srgba,
+            image_channel_type::unorm_int8);
+  } else {
+    std::cout << "device does not support image operations" << std::endl;
+  }
+
+  return 0;
+}
+
+// clang-format off
+// CHECK: SYCL_EXT_ONEAPI_SRGB defined
+// CHECK: aspect::ext_oneapi_srgb detected
+
+// CHECK: rgba -------
+// CHECK-NEXT: read four pixels, no sampler
+//   these next four reads should all be close to 0.5
+// CHECK-NEXT: 0: {0.498039,0.498039,0.498039,0.498039} 
+// CHECK-NEXT: 1: {0.498039,0.498039,0.498039,0.498039} 
+// CHECK-NEXT: 2: {0.498039,0.498039,0.498039,0.498039} 
+// CHECK-NEXT: 3: {0.498039,0.498039,0.498039,0.498039} 
+// CHECK: srgba -------
+// CHECK-NEXT: read four pixels, no sampler
+//   these next four reads should have R, G, B values close to 0.2 
+//   presently the values differ slightly between OpenCL GPU and CPU
+// (e.g. GPU: 0.21231, CPU: 0.211795 )
+// CHECK-NEXT: 0: {0.21 
+// CHECK-NEXT: 1: {0.21 
+// CHECK-NEXT: 2: {0.21 
+// CHECK-NEXT: 3: {0.21
+// clang-format on
diff --git a/SYCL/Basic/intel-ext-device.cpp b/SYCL/Basic/intel-ext-device.cpp
new file mode 100644
index 0000000000..e64cb21eee
--- /dev/null
+++ b/SYCL/Basic/intel-ext-device.cpp
@@ -0,0 +1,114 @@
+// RUN: %clangxx -fsycl %s -o %t.out
+// RUN: env SYCL_DEVICE_FILTER=level_zero:gpu %t.out
+// RUN: env SYCL_DEVICE_FILTER=opencl:gpu %t.out
+//
+// REQUIRES: gpu
+// UNSUPPORTED: cuda
+// UNSUPPORTED: rocm_nvidia
+// UNSUPPORTED: rocm_amd
+
+//==--------- intel-ext-device.cpp - SYCL device test ------------==//
+//
+// Returns the low-level device details.  These are Intel-specific extensions
+// that are only supported on Level Zero.
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include <CL/sycl.hpp>
+
+#include <iostream>
+
+using namespace cl::sycl;
+
+#ifdef _WIN32
+#define setenv(name, value, overwrite) _putenv_s(name, value)
+#endif
+
+int main(int argc, char **argv) {
+  // Must be enabled at the beginning of the application
+  // to obtain the PCI address
+  setenv("SYCL_ENABLE_PCI", "1", 0);
+
+  int pltCount = 1;
+  for (const auto &plt : platform::get_platforms()) {
+    if (!plt.has(aspect::host)) {
+      int devCount = 1;
+      int totalEUs = 0;
+      int numSlices = 0;
+      int numSubslices = 0;
+      int numEUsPerSubslice = 0;
+      for (const auto &dev : plt.get_devices()) {
+        std::cout << "Platform #" << pltCount++ << ":" << std::endl;
+        if (dev.has(aspect::gpu)) {
+          auto name = dev.get_info<info::device::name>();
+          std::cout << "Device #" << devCount++ << ": "
+                    << dev.get_info<info::device::name>() << ":" << std::endl;
+
+          std::cout << "Backend: ";
+          if (plt.get_backend() == backend::level_zero) {
+            std::cout << "Level Zero" << std::endl;
+          } else if (plt.get_backend() == backend::opencl) {
+            std::cout << "OpenCL" << std::endl;
+          } else if (plt.get_backend() == backend::cuda) {
+            std::cout << "CUDA" << std::endl;
+          } else {
+            std::cout << "Unknown" << std::endl;
+          }
+
+          // Use Feature Test macro to see if extensions are supported.
+          if (SYCL_EXT_INTEL_DEVICE_INFO >= 1) {
+
+            if (dev.has(aspect::ext_intel_pci_address)) {
+              std::cout << "PCI address = "
+                        << dev.get_info<info::device::ext_intel_pci_address>()
+                        << std::endl;
+            }
+            if (dev.has(aspect::ext_intel_gpu_eu_count)) {
+              totalEUs = dev.get_info<info::device::ext_intel_gpu_eu_count>();
+              std::cout << "Number of EUs = " << totalEUs << std::endl;
+            }
+            if (dev.has(aspect::ext_intel_gpu_eu_simd_width)) {
+              int w = dev.get_info<info::device::ext_intel_gpu_eu_simd_width>();
+              std::cout << "EU SIMD width = " << w << std::endl;
+            }
+            if (dev.has(aspect::ext_intel_gpu_slices)) {
+              numSlices = dev.get_info<info::device::ext_intel_gpu_slices>();
+              std::cout << "Number of slices = " << numSlices << std::endl;
+            }
+            if (dev.has(aspect::ext_intel_gpu_subslices_per_slice)) {
+              numSubslices = dev.get_info<
+                  info::device::ext_intel_gpu_subslices_per_slice>();
+              std::cout << "Number of subslices per slice = " << numSubslices
+                        << std::endl;
+            }
+            if (dev.has(aspect::ext_intel_gpu_eu_count_per_subslice)) {
+              numEUsPerSubslice = dev.get_info<
+                  info::device::ext_intel_gpu_eu_count_per_subslice>();
+              std::cout << "Number of EUs per subslice = " << numEUsPerSubslice
+                        << std::endl;
+            }
+            if (dev.has(aspect::ext_intel_max_mem_bandwidth)) {
+              // not supported yet
+              long m =
+                  dev.get_info<info::device::ext_intel_max_mem_bandwidth>();
+              std::cout << "Maximum memory bandwidth = " << m << std::endl;
+            }
+            // This is the only data we can verify.
+            if (totalEUs != numSlices * numSubslices * numEUsPerSubslice) {
+              std::cout << "Error: EU Count is incorrect!" << std::endl;
+              std::cout << "Failed!" << std::endl;
+              return 1;
+            }
+          } // SYCL_EXT_INTEL_DEVICE_INFO
+        }
+        std::cout << std::endl;
+      }
+    }
+  }
+  std::cout << "Passed!" << std::endl;
+  return 0;
+}
diff --git a/SYCL/Basic/interop/construction_ocl.cpp b/SYCL/Basic/interop/construction_ocl.cpp
new file mode 100644
index 0000000000..b1f1cb65ea
--- /dev/null
+++ b/SYCL/Basic/interop/construction_ocl.cpp
@@ -0,0 +1,160 @@
+// REQUIRES: opencl, opencl_icd
+// RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple %opencl_lib %s -o %t.ocl.out
+// RUN: env SYCL_DEVICE_FILTER="opencl" %t.ocl.out
+
+#include <CL/cl.h>
+#include <CL/sycl/backend/opencl.hpp>
+
+#include <sycl/sycl.hpp>
+
+constexpr auto BE = sycl::backend::opencl;
+
+int main() {
+  sycl::device Dev{sycl::default_selector{}};
+  auto NativeDev = Dev.get_native<BE>();
+
+  sycl::device NewDev = sycl::make_device<BE>(NativeDev);
+  assert(NewDev.get_info<sycl::info::device::name>() ==
+         Dev.get_info<sycl::info::device::name>());
+
+  sycl::platform Plt = Dev.get_platform();
+  auto NativePlt = Plt.get_native<BE>();
+
+  sycl::platform NewPlt = sycl::make_platform<BE>(NativePlt);
+  assert(NewPlt == Plt);
+
+  sycl::context Ctx{Dev};
+  auto NativeCtx = Ctx.get_native<BE>();
+
+  sycl::context NewCtx = sycl::make_context<BE>(NativeCtx);
+  assert(NewCtx.get_native<BE>() == NativeCtx);
+
+  sycl::queue Q{Ctx, Dev};
+  auto NativeQ = Q.get_native<BE>();
+
+  sycl::queue NewQ = sycl::make_queue<BE>(NativeQ, Ctx);
+  assert(NativeQ == NewQ.get_native<BE>());
+
+  sycl::event Evt = Q.single_task<class Tst>([] {});
+  auto NativeEvt = Evt.get_native<BE>();
+
+  sycl::event NewEvt = sycl::make_event<BE>(NativeEvt, Ctx);
+  assert(NativeEvt == NewEvt.get_native<BE>());
+
+  cl_mem NativeBuf =
+      clCreateBuffer(NativeCtx, CL_MEM_READ_WRITE, 128, nullptr, nullptr);
+  auto NewBuf = sycl::make_buffer<BE, char>(NativeBuf, Ctx);
+  assert(NewBuf.get_range()[0] == 128);
+
+  const char *ProgSrc = "kernel void _() {}";
+  cl_int Err;
+
+  // Program in state NONE
+  {
+    cl_program OclProg =
+        clCreateProgramWithSource(NativeCtx, 1, &ProgSrc, nullptr, &Err);
+    assert(Err == CL_SUCCESS && "Program creation failed");
+
+    auto KB =
+        sycl::make_kernel_bundle<BE, sycl::bundle_state::input>(OclProg, Ctx);
+    auto KernelIDs = KB.get_kernel_ids();
+    assert(KernelIDs.empty());
+
+    cl_program OclProg2 =
+        clCreateProgramWithSource(NativeCtx, 1, &ProgSrc, nullptr, &Err);
+    assert(Err == CL_SUCCESS && "Program creation failed");
+
+    auto KB2 =
+        sycl::make_kernel_bundle<BE, sycl::bundle_state::object>(OclProg2, Ctx);
+    auto KernelIDs2 = KB2.get_kernel_ids();
+    assert(KernelIDs2.empty());
+
+    cl_program OclProg3 =
+        clCreateProgramWithSource(NativeCtx, 1, &ProgSrc, nullptr, &Err);
+    assert(Err == CL_SUCCESS && "Program creation failed");
+
+    auto KB3 = sycl::make_kernel_bundle<BE, sycl::bundle_state::executable>(
+        OclProg3, Ctx);
+    auto KernelIDs3 = KB3.get_kernel_ids();
+    assert(KernelIDs3.empty());
+  }
+
+  // Compiled program
+  {
+    cl_program OclProg =
+        clCreateProgramWithSource(NativeCtx, 1, &ProgSrc, nullptr, &Err);
+    assert(Err == CL_SUCCESS && "Program creation failed");
+
+    Err = clCompileProgram(OclProg, 1, &NativeDev, "", 0, nullptr, nullptr,
+                           nullptr, nullptr);
+    assert(Err == CL_SUCCESS && "Program compile failed");
+
+    auto KB =
+        sycl::make_kernel_bundle<BE, sycl::bundle_state::object>(OclProg, Ctx);
+    auto KernelIDs = KB.get_kernel_ids();
+    assert(KernelIDs.empty());
+
+    bool StateMismatch = false;
+    try {
+      auto KB2 =
+          sycl::make_kernel_bundle<BE, sycl::bundle_state::input>(OclProg, Ctx);
+    } catch (sycl::runtime_error Ex) {
+      StateMismatch = true;
+    }
+    assert(StateMismatch);
+
+    cl_program OclProg3 =
+        clCreateProgramWithSource(NativeCtx, 1, &ProgSrc, nullptr, &Err);
+    assert(Err == CL_SUCCESS && "Program creation failed");
+
+    Err = clCompileProgram(OclProg3, 1, &NativeDev, "", 0, nullptr, nullptr,
+                           nullptr, nullptr);
+    assert(Err == CL_SUCCESS && "Program compile failed");
+
+    auto KB3 = sycl::make_kernel_bundle<BE, sycl::bundle_state::executable>(
+        OclProg3, Ctx);
+    auto KernelIDs3 = KB3.get_kernel_ids();
+    assert(KernelIDs3.empty());
+  }
+
+  // Linked program
+  {
+    cl_program OclProg =
+        clCreateProgramWithSource(NativeCtx, 1, &ProgSrc, nullptr, &Err);
+    assert(Err == CL_SUCCESS && "Program creation failed");
+
+    Err = clBuildProgram(OclProg, 1, &NativeDev, "", nullptr, nullptr);
+    assert(Err == CL_SUCCESS && "Program build failed");
+
+    auto KB = sycl::make_kernel_bundle<BE, sycl::bundle_state::executable>(
+        OclProg, Ctx);
+    auto KernelIDs = KB.get_kernel_ids();
+    assert(KernelIDs.empty());
+
+    cl_kernel NativeKer = clCreateKernel(OclProg, "_", &Err);
+    assert(Err == CL_SUCCESS && "Kernel creation failed");
+
+    auto Kernel = sycl::make_kernel<BE>(NativeKer, Ctx);
+    assert(Kernel.get_info<sycl::info::kernel::num_args>() == 0);
+
+    bool StateMismatch = false;
+    try {
+      auto KB2 =
+          sycl::make_kernel_bundle<BE, sycl::bundle_state::input>(OclProg, Ctx);
+    } catch (sycl::runtime_error Ex) {
+      StateMismatch = true;
+    }
+    assert(StateMismatch);
+
+    StateMismatch = false;
+    try {
+      auto KB3 = sycl::make_kernel_bundle<BE, sycl::bundle_state::object>(
+          OclProg, Ctx);
+    } catch (sycl::runtime_error Ex) {
+      StateMismatch = true;
+    }
+    assert(StateMismatch);
+  }
+
+  return 0;
+}
diff --git a/SYCL/Basic/interop/construction_ze.cpp b/SYCL/Basic/interop/construction_ze.cpp
new file mode 100644
index 0000000000..55e2933586
--- /dev/null
+++ b/SYCL/Basic/interop/construction_ze.cpp
@@ -0,0 +1,28 @@
+// REQUIRES: level_zero, level_zero_dev_kit
+// RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple %level_zero_options %s -o %t.ze.out
+// RUN: env SYCL_DEVICE_FILTER="level_zero" %t.ze.out
+
+#include <level_zero/ze_api.h>
+
+#include <CL/sycl/backend/level_zero.hpp>
+#include <sycl/sycl.hpp>
+
+constexpr auto BE = sycl::backend::level_zero;
+
+int main() {
+  sycl::device Dev{sycl::default_selector{}};
+
+  sycl::queue Q{Dev};
+
+  if (0) {
+    Q.submit([](sycl::handler &CGH) { CGH.single_task<class T>([] {}); });
+  }
+
+  sycl::platform Plt = Dev.get_platform();
+  auto NativePlt = Plt.get_native<BE>();
+
+  sycl::platform NewPlt = sycl::make_platform<BE>(NativePlt);
+  assert(NewPlt == Plt);
+
+  return 0;
+}
diff --git a/SYCL/Basic/interop/get_native_ocl.cpp b/SYCL/Basic/interop/get_native_ocl.cpp
new file mode 100644
index 0000000000..da6c85dc46
--- /dev/null
+++ b/SYCL/Basic/interop/get_native_ocl.cpp
@@ -0,0 +1,36 @@
+// REQUIRES: opencl, opencl_dev_kit
+// RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple %opencl_options %s -o %t.ocl.out
+// RUN: %t.ocl.out
+
+#include <CL/cl.h>
+
+#include <CL/sycl/backend/opencl.hpp>
+#include <sycl/sycl.hpp>
+
+constexpr auto BE = sycl::backend::opencl;
+
+class TestKernel;
+
+int main() {
+  sycl::queue Q;
+
+  if (0) {
+    Q.submit([](sycl::handler &CGH) { CGH.single_task<TestKernel>([] {}); });
+  }
+
+  sycl::kernel_id KernelID = sycl::get_kernel_id<TestKernel>();
+
+  sycl::kernel_bundle KernelBundle =
+      sycl::get_kernel_bundle<sycl::bundle_state::executable>(Q.get_context());
+
+  sycl::kernel Kernel = KernelBundle.get_kernel(KernelID);
+
+  cl_kernel Handle = Kernel.get_native<BE>();
+
+  size_t Size = 0;
+  cl_int Err =
+      clGetKernelInfo(Handle, CL_KERNEL_FUNCTION_NAME, 0, nullptr, &Size);
+  assert(Err == CL_SUCCESS);
+
+  return 0;
+}
diff --git a/SYCL/Basic/interop/get_native_ze.cpp b/SYCL/Basic/interop/get_native_ze.cpp
new file mode 100644
index 0000000000..a8911ddd9c
--- /dev/null
+++ b/SYCL/Basic/interop/get_native_ze.cpp
@@ -0,0 +1,35 @@
+// REQUIRES: level_zero, level_zero_dev_kit
+// RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple %level_zero_options %s -o %t.ze.out
+// RUN: %t.ze.out
+
+#include <level_zero/ze_api.h>
+
+#include <CL/sycl/backend/level_zero.hpp>
+#include <sycl/sycl.hpp>
+
+constexpr auto BE = sycl::backend::level_zero;
+
+class TestKernel;
+
+int main() {
+  sycl::queue Q;
+
+  if (0) {
+    Q.submit([](sycl::handler &CGH) { CGH.single_task<TestKernel>([] {}); });
+  }
+
+  sycl::kernel_id KernelID = sycl::get_kernel_id<TestKernel>();
+
+  sycl::kernel_bundle KernelBundle =
+      sycl::get_kernel_bundle<sycl::bundle_state::executable>(Q.get_context());
+
+  sycl::kernel Kernel = KernelBundle.get_kernel(KernelID);
+
+  ze_kernel_handle_t Handle = Kernel.get_native<BE>();
+
+  ze_kernel_properties_t KernelProperties;
+  ze_result_t Err = zeKernelGetProperties(Handle, &KernelProperties);
+  assert(Err == ZE_RESULT_SUCCESS);
+
+  return 0;
+}
diff --git a/SYCL/Basic/interop/traits.cpp b/SYCL/Basic/interop/traits.cpp
new file mode 100644
index 0000000000..92c824c696
--- /dev/null
+++ b/SYCL/Basic/interop/traits.cpp
@@ -0,0 +1,69 @@
+// RUN: %clangxx -fsycl -DUSE_OPENCL %s
+// RUN: %clangxx -fsycl -DUSE_L0 %s
+// RUN: %clangxx -fsycl -DUSE_CUDA %s
+
+#ifdef USE_OPENCL
+#include <CL/cl.h>
+
+#include <CL/sycl/backend/opencl.hpp>
+
+constexpr auto Backend = sycl::backend::opencl;
+#endif
+
+#ifdef USE_L0
+#include <level_zero/ze_api.h>
+
+#include <CL/sycl/backend/level_zero.hpp>
+
+constexpr auto Backend = sycl::backend::level_zero;
+#endif
+
+#ifdef USE_CUDA
+#include <CL/sycl/backend/cuda.hpp>
+
+constexpr auto Backend = sycl::backend::cuda;
+#endif
+
+#include <sycl/sycl.hpp>
+
+int main() {
+#ifdef USE_OPENCL
+  static_assert(
+      std::is_same_v<sycl::backend_traits<Backend>::input_type<sycl::device>,
+                     sycl::interop<Backend, sycl::device>::type>);
+  static_assert(
+      std::is_same_v<sycl::backend_traits<Backend>::input_type<sycl::context>,
+                     sycl::interop<Backend, sycl::context>::type>);
+  static_assert(
+      std::is_same_v<sycl::backend_traits<Backend>::input_type<sycl::queue>,
+                     sycl::interop<Backend, sycl::queue>::type>);
+  static_assert(std::is_same_v<
+                sycl::backend_traits<Backend>::input_type<sycl::buffer<int, 2>>,
+                sycl::interop<Backend, sycl::buffer<int, 2>>::type>);
+  static_assert(
+      std::is_same_v<sycl::backend_traits<Backend>::input_type<sycl::event>,
+                     sycl::interop<Backend, sycl::event>::type>);
+#endif
+
+// CUDA does not have a native type for platforms
+#ifndef USE_CUDA
+  static_assert(
+      std::is_same_v<sycl::backend_traits<Backend>::input_type<sycl::platform>,
+                     sycl::interop<Backend, sycl::platform>::type>);
+  static_assert(
+      std::is_same_v<sycl::backend_traits<Backend>::return_type<sycl::platform>,
+                     sycl::interop<Backend, sycl::platform>::type>);
+#endif
+
+  static_assert(
+      std::is_same_v<sycl::backend_traits<Backend>::return_type<sycl::device>,
+                     sycl::interop<Backend, sycl::device>::type>);
+  static_assert(
+      std::is_same_v<sycl::backend_traits<Backend>::return_type<sycl::context>,
+                     sycl::interop<Backend, sycl::context>::type>);
+  static_assert(
+      std::is_same_v<sycl::backend_traits<Backend>::return_type<sycl::queue>,
+                     sycl::interop<Backend, sycl::queue>::type>);
+
+  return 0;
+}
diff --git a/SYCL/Basic/platform.cpp b/SYCL/Basic/platform.cpp
new file mode 100644
index 0000000000..f6ee302c19
--- /dev/null
+++ b/SYCL/Basic/platform.cpp
@@ -0,0 +1,80 @@
+// RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple %s -o %t.out
+// RUN: env SYCL_DEVICE_FILTER=host,%sycl_be %t.out
+//==--------------- platform.cpp - SYCL platform test ----------------------==//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+#include <CL/sycl.hpp>
+#include <cassert>
+#include <iostream>
+#include <typeinfo>
+
+using namespace cl::sycl;
+
+int main() {
+  int i = 1;
+  vector_class<platform> openclPlatforms;
+  for (const auto &plt : platform::get_platforms()) {
+    std::cout << "Platform " << i++
+              << " is available: " << ((plt.is_host()) ? "host: " : "OpenCL: ")
+              << std::hex
+              << ((plt.is_host() ||
+                   plt.get_backend() != cl::sycl::backend::opencl)
+                      ? nullptr
+                      : plt.get())
+              << std::endl;
+  }
+
+  auto platforms = platform::get_platforms();
+  platform &platformA = platforms[0];
+  platform &platformB = (platforms.size() > 1 ? platforms[1] : platforms[0]);
+  {
+    std::cout << "move constructor" << std::endl;
+    platform Platform(platformA);
+    size_t hash = hash_class<platform>()(Platform);
+    platform MovedPlatform(std::move(Platform));
+    assert(hash == hash_class<platform>()(MovedPlatform));
+    assert(platformA.is_host() == MovedPlatform.is_host());
+    if (!platformA.is_host() &&
+        platformA.get_backend() == cl::sycl::backend::opencl) {
+      assert(MovedPlatform.get() != nullptr);
+    }
+  }
+  {
+    std::cout << "move assignment operator" << std::endl;
+    platform Platform(platformA);
+    size_t hash = hash_class<platform>()(Platform);
+    platform WillMovedPlatform(platformB);
+    WillMovedPlatform = std::move(Platform);
+    assert(hash == hash_class<platform>()(WillMovedPlatform));
+    assert(platformA.is_host() == WillMovedPlatform.is_host());
+    if (!platformA.is_host() &&
+        platformA.get_backend() == cl::sycl::backend::opencl) {
+      assert(WillMovedPlatform.get() != nullptr);
+    }
+  }
+  {
+    std::cout << "copy constructor" << std::endl;
+    platform Platform(platformA);
+    size_t hash = hash_class<platform>()(Platform);
+    platform PlatformCopy(Platform);
+    assert(hash == hash_class<platform>()(Platform));
+    assert(hash == hash_class<platform>()(PlatformCopy));
+    assert(Platform == PlatformCopy);
+    assert(Platform.is_host() == PlatformCopy.is_host());
+  }
+  {
+    std::cout << "copy assignment operator" << std::endl;
+    platform Platform(platformA);
+    size_t hash = hash_class<platform>()(Platform);
+    platform WillPlatformCopy(platformB);
+    WillPlatformCopy = Platform;
+    assert(hash == hash_class<platform>()(Platform));
+    assert(hash == hash_class<platform>()(WillPlatformCopy));
+    assert(Platform == WillPlatformCopy);
+    assert(Platform.is_host() == WillPlatformCopy.is_host());
+  }
+}
diff --git a/SYCL/Basic/span.cpp b/SYCL/Basic/span.cpp
new file mode 100644
index 0000000000..f7c4091244
--- /dev/null
+++ b/SYCL/Basic/span.cpp
@@ -0,0 +1,112 @@
+// RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple %s -o %t.out
+// RUN: %CPU_RUN_PLACEHOLDER %t.out
+// RUN: %GPU_RUN_PLACEHOLDER %t.out
+// RUN: %ACC_RUN_PLACEHOLDER %t.out
+//
+// Fails to release USM pointer on ROCm for NVIDIA
+// XFAIL: rocm_nvidia
+
+#include <numeric>
+#include <sycl/sycl.hpp>
+
+using namespace sycl;
+
+void testSpanCapture() {
+  // This test creates spans that are backed by USM.
+  // ensures they can be captured by device lambda
+  // and that read and write operations function correctly
+  // across capture.
+  queue Q;
+
+  constexpr long numReadTests = 2;
+  const range<1> NumberOfReadTestsRange(numReadTests);
+  buffer<int, 1> SpanRead(NumberOfReadTestsRange);
+
+  // span from a vector
+  // We will create a vector, backed by a USM allocator. And a span from that.
+  using vec_alloc = usm_allocator<int, usm::alloc::shared>;
+  // Create allocator for device associated with q
+  vec_alloc myAlloc(Q);
+  // Create std vector with the allocator
+  std::vector<int, vec_alloc> vecUSM(4, myAlloc);
+  std::iota(vecUSM.begin(), vecUSM.end(), 1);
+  sycl::span<int> vecUSM_span{vecUSM};
+  vecUSM_span[0] += 100; // 101  modify first value using span affordance.
+
+  // span from USM memory
+  auto *usm_data = malloc_shared<int>(4, Q);
+  sycl::span<int> usm_span(usm_data, 4);
+  std::iota(usm_span.begin(), usm_span.end(), 1);
+  usm_span[0] += 100; // 101 modify first value using span affordance.
+
+  event E = Q.submit([&](handler &cgh) {
+    auto can_read_from_span_acc = SpanRead.get_access<access::mode::write>(cgh);
+    cgh.single_task<class hi>([=] {
+      // read from the spans.
+      can_read_from_span_acc[0] = vecUSM_span[0];
+      can_read_from_span_acc[1] = usm_span[0];
+
+      // write to the spans
+      vecUSM_span[1] += 1000;
+      usm_span[1] += 1000;
+    });
+  });
+  E.wait();
+
+  // check out the read operations, should have gotten 101 from each
+  auto can_read_from_span_acc = SpanRead.get_access<access::mode::read>();
+  for (int i = 0; i < numReadTests; i++) {
+    assert(can_read_from_span_acc[i] == 101 &&
+           "read check should have gotten 100");
+  }
+
+  // were the spans successfully modified via write?
+  assert(vecUSM_span[1] == 1002 &&
+         "vecUSM_span write check should have gotten 1001");
+  assert(usm_span[1] == 1002 && "usm_span write check should have gotten 1001");
+
+  free(usm_data, Q);
+}
+
+void set_all_span_values(sycl::span<int> container, int v) {
+  for (auto &e : container)
+    e = v;
+}
+
+void testSpanOnDevice() {
+  // this test creates a simple span on device,
+  // passes it to a function that operates on it
+  // and ensures it worked correctly
+  queue Q;
+  constexpr long numReadTests = 4;
+  const range<1> NumberOfReadTestsRange(numReadTests);
+  buffer<int, 1> SpanRead(NumberOfReadTestsRange);
+
+  event E = Q.submit([&](handler &cgh) {
+    auto can_read_from_span_acc = SpanRead.get_access<access::mode::write>(cgh);
+    cgh.single_task<class ha>([=] {
+      // create a span on device, pass it to function that modifies it
+      // read values back out.
+      int a[]{1, 2, 3, 4};
+      sycl::span<int> a_span{a};
+      set_all_span_values(a_span, 10);
+      for (int i = 0; i < numReadTests; i++)
+        can_read_from_span_acc[i] = a_span[i];
+    });
+  });
+  E.wait();
+
+  // check out the read operations, should have gotten 10 from each
+  auto can_read_from_span_acc = SpanRead.get_access<access::mode::read>();
+  for (int i = 0; i < numReadTests; i++) {
+    assert(can_read_from_span_acc[i] == 10 &&
+           "read check should have gotten 10");
+  }
+}
+
+int main() {
+  testSpanCapture();
+  testSpanOnDevice();
+
+  return 0;
+}
diff --git a/SYCL/GroupAlgorithm/SYCL2020/all_of.cpp b/SYCL/GroupAlgorithm/SYCL2020/all_of.cpp
new file mode 100644
index 0000000000..80723b5f3f
--- /dev/null
+++ b/SYCL/GroupAlgorithm/SYCL2020/all_of.cpp
@@ -0,0 +1,66 @@
+// RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple %s -I . -o %t.out
+// RUN: %CPU_RUN_PLACEHOLDER %t.out
+// RUN: %GPU_RUN_PLACEHOLDER %t.out
+// RUN: %ACC_RUN_PLACEHOLDER %t.out
+//
+// Missing __spirv_GroupAll on AMD:
+// XFAIL: rocm_amd
+
+#include "support.h"
+#include <CL/sycl.hpp>
+#include <algorithm>
+#include <cassert>
+#include <numeric>
+using namespace sycl;
+
+template <class Predicate> class all_of_kernel;
+
+struct IsEven {
+  bool operator()(int i) const { return (i % 2) == 0; }
+};
+
+template <typename InputContainer, typename OutputContainer, class Predicate>
+void test(queue q, InputContainer input, OutputContainer output,
+          Predicate pred) {
+  typedef class all_of_kernel<Predicate> kernel_name;
+  size_t N = input.size();
+  size_t G = 64;
+  {
+    buffer<int> in_buf(input.data(), input.size());
+    buffer<bool> out_buf(output.data(), output.size());
+
+    q.submit([&](handler &cgh) {
+      accessor in{in_buf, cgh, sycl::read_only};
+      accessor out{out_buf, cgh, sycl::write_only, sycl::no_init};
+      cgh.parallel_for<kernel_name>(nd_range<1>(G, G), [=](nd_item<1> it) {
+        group<1> g = it.get_group();
+        int lid = it.get_local_id(0);
+        out[0] = all_of_group(g, pred(in[lid]));
+        out[1] = all_of_group(g, in[lid], pred);
+        out[2] = joint_all_of(g, in.get_pointer(), in.get_pointer() + N, pred);
+      });
+    });
+  }
+  bool expected = std::all_of(input.begin(), input.end(), pred);
+  assert(output[0] == expected);
+  assert(output[1] == expected);
+  assert(output[2] == expected);
+}
+
+int main() {
+  queue q;
+  if (!isSupportedDevice(q.get_device())) {
+    std::cout << "Skipping test\n";
+    return 0;
+  }
+
+  constexpr int N = 128;
+  std::array<int, N> input;
+  std::array<bool, 3> output;
+  std::iota(input.begin(), input.end(), 0);
+  std::fill(output.begin(), output.end(), false);
+
+  test(q, input, output, IsEven());
+
+  std::cout << "Test passed." << std::endl;
+}
diff --git a/SYCL/GroupAlgorithm/SYCL2020/any_of.cpp b/SYCL/GroupAlgorithm/SYCL2020/any_of.cpp
new file mode 100644
index 0000000000..9f681ec20c
--- /dev/null
+++ b/SYCL/GroupAlgorithm/SYCL2020/any_of.cpp
@@ -0,0 +1,76 @@
+// RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple %s -I . -o %t.out
+// RUN: %CPU_RUN_PLACEHOLDER %t.out
+// RUN: %GPU_RUN_PLACEHOLDER %t.out
+// RUN: %ACC_RUN_PLACEHOLDER %t.out
+//
+// Missing __spirv_GroupAny on AMD:
+// XFAIL: rocm_amd
+
+#include "support.h"
+#include <CL/sycl.hpp>
+#include <algorithm>
+#include <cassert>
+#include <numeric>
+using namespace sycl;
+
+template <class Predicate> class any_of_kernel;
+
+struct GeZero {
+  bool operator()(int i) const { return i >= 0; }
+};
+struct IsEven {
+  bool operator()(int i) const { return (i % 2) == 0; }
+};
+struct LtZero {
+  bool operator()(int i) const { return i < 0; }
+};
+
+template <typename InputContainer, typename OutputContainer, class Predicate>
+void test(queue q, InputContainer input, OutputContainer output,
+          Predicate pred) {
+  typedef typename InputContainer::value_type InputT;
+  typedef typename OutputContainer::value_type OutputT;
+  typedef class any_of_kernel<Predicate> kernel_name;
+  size_t N = input.size();
+  size_t G = 64;
+  {
+    buffer<InputT> in_buf(input.data(), input.size());
+    buffer<OutputT> out_buf(output.data(), output.size());
+
+    q.submit([&](handler &cgh) {
+      accessor in{in_buf, cgh, sycl::read_only};
+      accessor out{out_buf, cgh, sycl::write_only, sycl::no_init};
+      cgh.parallel_for<kernel_name>(nd_range<1>(G, G), [=](nd_item<1> it) {
+        group<1> g = it.get_group();
+        int lid = it.get_local_id(0);
+        out[0] = any_of_group(g, pred(in[lid]));
+        out[1] = any_of_group(g, in[lid], pred);
+        out[2] = joint_any_of(g, in.get_pointer(), in.get_pointer() + N, pred);
+      });
+    });
+  }
+  bool expected = std::any_of(input.begin(), input.end(), pred);
+  assert(output[0] == expected);
+  assert(output[1] == expected);
+  assert(output[2] == expected);
+}
+
+int main() {
+  queue q;
+  if (!isSupportedDevice(q.get_device())) {
+    std::cout << "Skipping test\n";
+    return 0;
+  }
+
+  constexpr int N = 128;
+  std::array<int, N> input;
+  std::array<bool, 3> output;
+  std::iota(input.begin(), input.end(), 0);
+  std::fill(output.begin(), output.end(), false);
+
+  test(q, input, output, GeZero());
+  test(q, input, output, IsEven());
+  test(q, input, output, LtZero());
+
+  std::cout << "Test passed." << std::endl;
+}
diff --git a/SYCL/GroupAlgorithm/SYCL2020/exclusive_scan.cpp b/SYCL/GroupAlgorithm/SYCL2020/exclusive_scan.cpp
new file mode 100644
index 0000000000..d827d71ce6
--- /dev/null
+++ b/SYCL/GroupAlgorithm/SYCL2020/exclusive_scan.cpp
@@ -0,0 +1,168 @@
+// RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple %s -I . -o %t.out
+// RUN: %CPU_RUN_PLACEHOLDER %t.out
+// RUN: %GPU_RUN_PLACEHOLDER %t.out
+// RUN: %ACC_RUN_PLACEHOLDER %t.out
+//
+// Missing __spirv_GroupIAdd, __spirv_GroupBroadcast, __spirv_GroupSMin and
+// __spirv_GroupSMax on AMD:
+// XFAIL: rocm_amd
+
+// TODO: enable compile+runtime checks for operations defined in SPIR-V 1.3.
+// That requires either adding a switch to clang (-spirv-max-version=1.3) or
+// raising the spirv version from 1.1. to 1.3 for spirv translator
+// unconditionally. Using operators specific for spirv 1.3 and higher with
+// -spirv-max-version=1.1 being set by default causes assert/check fails
+// in spirv translator.
+// RUNx: %clangxx -fsycl -fsycl-targets=%sycl_triple -DSPIRV_1_3 %s -I . -o \
+   %t13.out
+
+#include "support.h"
+#include <CL/sycl.hpp>
+#include <algorithm>
+#include <cassert>
+#include <limits>
+#include <numeric>
+#include <vector>
+using namespace sycl;
+
+template <class SpecializationKernelName, int TestNumber>
+class exclusive_scan_kernel;
+
+// std::exclusive_scan isn't implemented yet, so use serial implementation
+// instead
+namespace emu {
+template <typename InputIterator, typename OutputIterator,
+          class BinaryOperation, typename T>
+OutputIterator exclusive_scan(InputIterator first, InputIterator last,
+                              OutputIterator result, T init,
+                              BinaryOperation binary_op) {
+  T partial = init;
+  for (InputIterator it = first; it != last; ++it) {
+    *(result++) = partial;
+    partial = binary_op(partial, *it);
+  }
+  return result;
+}
+} // namespace emu
+
+template <typename SpecializationKernelName, typename InputContainer,
+          typename OutputContainer, class BinaryOperation>
+void test(queue q, InputContainer input, OutputContainer output,
+          BinaryOperation binary_op,
+          typename OutputContainer::value_type identity) {
+  typedef typename InputContainer::value_type InputT;
+  typedef typename OutputContainer::value_type OutputT;
+  typedef class exclusive_scan_kernel<SpecializationKernelName, 0> kernel_name0;
+  typedef class exclusive_scan_kernel<SpecializationKernelName, 1> kernel_name1;
+  typedef class exclusive_scan_kernel<SpecializationKernelName, 2> kernel_name2;
+  typedef class exclusive_scan_kernel<SpecializationKernelName, 3> kernel_name3;
+  OutputT init = 42;
+  size_t N = input.size();
+  size_t G = 64;
+  std::vector<OutputT> expected(N);
+  {
+    buffer<InputT> in_buf(input.data(), input.size());
+    buffer<OutputT> out_buf(output.data(), output.size());
+    q.submit([&](handler &cgh) {
+      accessor in{in_buf, cgh, sycl::read_only};
+      accessor out{out_buf, cgh, sycl::write_only, sycl::no_init};
+      cgh.parallel_for<kernel_name0>(nd_range<1>(G, G), [=](nd_item<1> it) {
+        group<1> g = it.get_group();
+        int lid = it.get_local_id(0);
+        out[lid] = exclusive_scan_over_group(g, in[lid], binary_op);
+      });
+    });
+  }
+  emu::exclusive_scan(input.begin(), input.begin() + G, expected.begin(),
+                      identity, binary_op);
+  assert(std::equal(output.begin(), output.begin() + G, expected.begin()));
+
+  {
+    buffer<InputT> in_buf(input.data(), input.size());
+    buffer<OutputT> out_buf(output.data(), output.size());
+    q.submit([&](handler &cgh) {
+      accessor in{in_buf, cgh, sycl::read_only};
+      accessor out{out_buf, cgh, sycl::write_only, sycl::no_init};
+      cgh.parallel_for<kernel_name1>(nd_range<1>(G, G), [=](nd_item<1> it) {
+        group<1> g = it.get_group();
+        int lid = it.get_local_id(0);
+        out[lid] = exclusive_scan_over_group(g, in[lid], init, binary_op);
+      });
+    });
+  }
+  emu::exclusive_scan(input.begin(), input.begin() + G, expected.begin(), init,
+                      binary_op);
+  assert(std::equal(output.begin(), output.begin() + G, expected.begin()));
+
+  {
+    buffer<InputT> in_buf(input.data(), input.size());
+    buffer<OutputT> out_buf(output.data(), output.size());
+    q.submit([&](handler &cgh) {
+      accessor in{in_buf, cgh, sycl::read_only};
+      accessor out{out_buf, cgh, sycl::write_only, sycl::no_init};
+      cgh.parallel_for<kernel_name2>(nd_range<1>(G, G), [=](nd_item<1> it) {
+        group<1> g = it.get_group();
+        joint_exclusive_scan(g, in.get_pointer(), in.get_pointer() + N,
+                             out.get_pointer(), binary_op);
+      });
+    });
+  }
+  emu::exclusive_scan(input.begin(), input.begin() + N, expected.begin(),
+                      identity, binary_op);
+  assert(std::equal(output.begin(), output.begin() + N, expected.begin()));
+
+  {
+    buffer<InputT> in_buf(input.data(), input.size());
+    buffer<OutputT> out_buf(output.data(), output.size());
+    q.submit([&](handler &cgh) {
+      accessor in{in_buf, cgh, sycl::read_only};
+      accessor out{out_buf, cgh, sycl::write_only, sycl::no_init};
+      cgh.parallel_for<kernel_name3>(nd_range<1>(G, G), [=](nd_item<1> it) {
+        group<1> g = it.get_group();
+        joint_exclusive_scan(g, in.get_pointer(), in.get_pointer() + N,
+                             out.get_pointer(), init, binary_op);
+      });
+    });
+  }
+  emu::exclusive_scan(input.begin(), input.begin() + N, expected.begin(), init,
+                      binary_op);
+  assert(std::equal(output.begin(), output.begin() + N, expected.begin()));
+}
+
+int main() {
+  queue q;
+  if (!isSupportedDevice(q.get_device())) {
+    std::cout << "Skipping test\n";
+    return 0;
+  }
+
+  constexpr int N = 128;
+  std::array<int, N> input;
+  std::array<int, N> output;
+  std::iota(input.begin(), input.end(), 0);
+  std::fill(output.begin(), output.end(), 0);
+
+  test<class KernelNamePlusV>(q, input, output, sycl::plus<>(), 0);
+  test<class KernelNameMinimumV>(q, input, output, sycl::minimum<>(),
+                                 std::numeric_limits<int>::max());
+  test<class KernelNameMaximumV>(q, input, output, sycl::maximum<>(),
+                                 std::numeric_limits<int>::lowest());
+
+  test<class KernelNamePlusI>(q, input, output, sycl::plus<int>(), 0);
+  test<class KernelNameMinimumI>(q, input, output, sycl::minimum<int>(),
+                                 std::numeric_limits<int>::max());
+  test<class KernelNameMaximumI>(q, input, output, sycl::maximum<int>(),
+                                 std::numeric_limits<int>::lowest());
+
+#ifdef SPIRV_1_3
+  test<class KernelName_VzAPutpBRRJrQPB>(q, input, output,
+                                         sycl::multiplies<int>(), 1);
+  test<class KernelName_UXdGbr>(q, input, output, sycl::bit_or<int>(), 0);
+  test<class KernelName_saYaodNyJknrPW>(q, input, output, sycl::bit_xor<int>(),
+                                        0);
+  test<class KernelName_GPcuAlvAOjrDyP>(q, input, output, sycl::bit_and<int>(),
+                                        ~0);
+#endif // SPIRV_1_3
+
+  std::cout << "Test passed." << std::endl;
+}
diff --git a/SYCL/GroupAlgorithm/SYCL2020/group_broadcast.cpp b/SYCL/GroupAlgorithm/SYCL2020/group_broadcast.cpp
new file mode 100644
index 0000000000..dafec4e8eb
--- /dev/null
+++ b/SYCL/GroupAlgorithm/SYCL2020/group_broadcast.cpp
@@ -0,0 +1,101 @@
+// RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple %s -o %t.out
+// RUN: %CPU_RUN_PLACEHOLDER %t.out
+// RUN: %GPU_RUN_PLACEHOLDER %t.out
+// RUN: %ACC_RUN_PLACEHOLDER %t.out
+//
+// Missing __spirv_GroupBroadcast on AMD:
+// XFAIL: rocm_amd
+
+#include "support.h"
+#include <CL/sycl.hpp>
+#include <algorithm>
+#include <cassert>
+#include <complex>
+#include <numeric>
+using namespace sycl;
+
+template <typename kernel_name, typename InputContainer,
+          typename OutputContainer>
+void test(queue q, InputContainer input, OutputContainer output) {
+  typedef typename InputContainer::value_type InputT;
+  typedef typename OutputContainer::value_type OutputT;
+  size_t N = input.size();
+  size_t G = 4;
+  range<2> R(G, G);
+  {
+    buffer<InputT> in_buf(input.data(), input.size());
+    buffer<OutputT> out_buf(output.data(), output.size());
+
+    q.submit([&](handler &cgh) {
+      accessor in{in_buf, cgh, sycl::read_only};
+      accessor out{out_buf, cgh, sycl::write_only, sycl::no_init};
+      cgh.parallel_for<kernel_name>(nd_range<2>(R, R), [=](nd_item<2> it) {
+        group<2> g = it.get_group();
+        int lid = it.get_local_linear_id();
+        out[0] = group_broadcast(g, in[lid]);
+        out[1] = group_broadcast(g, in[lid], group<2>::id_type(1, 2));
+        out[2] =
+            group_broadcast(g, in[lid], group<2>::linear_id_type(2 * G + 1));
+      });
+    });
+  }
+  assert(output[0] == input[0]);
+  assert(output[1] == input[1 * G + 2]);
+  assert(output[2] == input[2 * G + 1]);
+}
+
+int main() {
+  queue q;
+  if (!isSupportedDevice(q.get_device())) {
+    std::cout << "Skipping test\n";
+    return 0;
+  }
+
+  constexpr int N = 16;
+
+  // Test built-in scalar type
+  {
+    std::array<int, N> input;
+    std::array<int, 3> output;
+    std::iota(input.begin(), input.end(), 1);
+    std::fill(output.begin(), output.end(), false);
+    test<class KernelName_EFL>(q, input, output);
+  }
+
+  // Test pointer type
+  // {
+  //   std::array<int *, N> input;
+  //   std::array<int *, 3> output;
+  //   for (int i = 0; i < N; ++i) {
+  //     input[i] = static_cast<int *>(0x0) + i;
+  //   }
+  //   std::fill(output.begin(), output.end(), static_cast<int *>(0x0));
+  //   test<class KernelName_NrqELzFQToOSPsRNMi>(q, input, output);
+  // }
+
+  // Test user-defined type
+  // - Use complex as a proxy for this
+  // - Test float and double to test 64-bit and 128-bit types
+  // {
+  //   std::array<std::complex<float>, N> input;
+  //   std::array<std::complex<float>, 3> output;
+  //   for (int i = 0; i < N; ++i) {
+  //     input[i] =
+  //         std::complex<float>(0, 1) + (float)i * std::complex<float>(2, 2);
+  //   }
+  //   std::fill(output.begin(), output.end(), std::complex<float>(0, 0));
+  //   test<class KernelName_rCblcml>(q, input, output);
+  // }
+  // {
+  //   std::array<std::complex<double>, N> input;
+  //   std::array<std::complex<double>, 3> output;
+  //   for (int i = 0; i < N; ++i) {
+  //     input[i] =
+  //         std::complex<double>(0, 1) + (double)i * std::complex<double>(2,
+  //         2);
+  //   }
+  //   std::fill(output.begin(), output.end(), std::complex<float>(0, 0));
+  //   test<class KernelName_NCWhjnQ>(q, input, output);
+  // }
+  std::cout << "Test passed." << std::endl;
+}
diff --git a/SYCL/GroupAlgorithm/SYCL2020/helpers.hpp b/SYCL/GroupAlgorithm/SYCL2020/helpers.hpp
new file mode 100644
index 0000000000..e4b74966a6
--- /dev/null
+++ b/SYCL/GroupAlgorithm/SYCL2020/helpers.hpp
@@ -0,0 +1,168 @@
+//==---------- helpers.hpp -*- C++ -*--------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+#include <CL/sycl.hpp>
+#include <bitset>
+#include <cmath>
+#include <complex>
+#include <iostream>
+
+using namespace cl::sycl;
+
+// ---- utils
+template <typename T1, int N> struct utils {
+  static T1 add_vec(const vec<T1, N> &v);
+  static bool cmp_vec(const vec<T1, N> &v, const vec<T1, N> &r);
+  static std::string stringify_vec(const vec<T1, N> &v);
+};
+template <typename T2> struct utils<T2, 1> {
+  static T2 add_vec(const vec<T2, 1> &v) { return v.s0(); }
+  static bool cmp_vec(const vec<T2, 1> &v, const vec<T2, 1> &r) {
+    return v.s0() == r.s0();
+  }
+  static std::string stringify_vec(const vec<T2, 1> &v) {
+    return std::to_string((T2)v.s0());
+  }
+};
+template <typename T2> struct utils<T2, 2> {
+  static T2 add_vec(const vec<T2, 2> &v) { return v.s0() + v.s1(); }
+  static bool cmp_vec(const vec<T2, 2> &v, const vec<T2, 2> &r) {
+    return v.s0() == r.s0() && v.s1() == r.s1();
+  }
+  static std::string stringify_vec(const vec<T2, 2> &v) {
+    return std::string("(") + std::to_string((T2)v.s0()) + ", " +
+           std::to_string((T2)v.s1()) + " )";
+  }
+};
+template <typename T2> struct utils<T2, 3> {
+  static T2 add_vec(const vec<T2, 3> &v) { return v.s0() + v.s1() + v.s2(); }
+  static bool cmp_vec(const vec<T2, 3> &v, const vec<T2, 3> &r) {
+    return v.s0() == r.s0() && v.s1() == r.s1() && v.s2() == r.s2();
+  }
+  static std::string stringify_vec(const vec<T2, 2> &v) {
+    return std::string("(") + std::to_string((T2)v.s0()) + ", " +
+           std::to_string((T2)v.s1()) + ", " + std::to_string((T2)v.s3()) +
+           " )";
+  }
+};
+template <typename T2> struct utils<T2, 4> {
+  static T2 add_vec(const vec<T2, 4> &v) {
+    return v.s0() + v.s1() + v.s2() + v.s3();
+  }
+  static bool cmp_vec(const vec<T2, 4> &v, const vec<T2, 4> &r) {
+    return v.s0() == r.s0() && v.s1() == r.s1() && v.s2() == r.s2() &&
+           v.s3() == r.s3();
+  }
+  static std::string stringify_vec(const vec<T2, 4> &v) {
+    return std::string("(") + std::to_string((T2)v.s0()) + ", " +
+           std::to_string((T2)v.s1()) + std::to_string((T2)v.s2()) + ", " +
+           std::to_string((T2)v.s3()) + " )";
+  }
+};
+template <typename T2> struct utils<T2, 8> {
+  static T2 add_vec(const vec<T2, 8> &v) {
+    return v.s0() + v.s1() + v.s2() + v.s3() + v.s4() + v.s5() + v.s6() +
+           v.s7();
+  }
+  static bool cmp_vec(const vec<T2, 8> &v, const vec<T2, 8> &r) {
+    return v.s0() == r.s0() && v.s1() == r.s1() && v.s2() == r.s2() &&
+           v.s3() == r.s3() && v.s4() == r.s4() && v.s5() == r.s5() &&
+           v.s6() == r.s6() && v.s7() == r.s7();
+  }
+  static std::string stringify_vec(const vec<T2, 8> &v) {
+    return std::string("(") + std::to_string((T2)v.s0()) + ", " +
+           std::to_string((T2)v.s1()) + std::to_string((T2)v.s2()) + ", " +
+           std::to_string((T2)v.s3()) + std::to_string((T2)v.s4()) + ", " +
+           std::to_string((T2)v.s5()) + std::to_string((T2)v.s6()) + ", " +
+           std::to_string((T2)v.s7()) + " )";
+  }
+};
+
+template <typename T2> struct utils<T2, 16> {
+  static T2 add_vec(const vec<T2, 16> &v) {
+    return v.s0() + v.s1() + v.s2() + v.s3() + v.s4() + v.s5() + v.s6() +
+           v.s7() + v.s8() + v.s9() + v.sA() + v.sB() + v.sC() + v.sD() +
+           v.sE() + v.sF();
+  }
+  static bool cmp_vec(const vec<T2, 16> &v, const vec<T2, 16> &r) {
+    return v.s0() == r.s0() && v.s1() == r.s1() && v.s2() == r.s2() &&
+           v.s3() == r.s3() && v.s4() == r.s4() && v.s5() == r.s5() &&
+           v.s6() == r.s6() && v.s7() == r.s7() && v.s8() == r.s8() &&
+           v.s9() == r.s9() && v.sA() == r.sA() && v.sB() == r.sB() &&
+           v.sC() == r.sC() && v.sD() == r.sD() && v.sE() == r.sE() &&
+           v.sF() == r.sF();
+  }
+  static std::string stringify_vec(const vec<T2, 16> &v) {
+    return std::string("(") + std::to_string((T2)v.s0()) + ", " +
+           std::to_string((T2)v.s1()) + std::to_string((T2)v.s2()) + ", " +
+           std::to_string((T2)v.s3()) + std::to_string((T2)v.s4()) + ", " +
+           std::to_string((T2)v.s5()) + std::to_string((T2)v.s6()) + ", " +
+           std::to_string((T2)v.s7()) + std::to_string((T2)v.s8()) + ", " +
+           std::to_string((T2)v.s9()) + std::to_string((T2)v.sA()) + ", " +
+           std::to_string((T2)v.sB()) + std::to_string((T2)v.sC()) + ", " +
+           std::to_string((T2)v.sE()) + std::to_string((T2)v.sD()) + ", " +
+           std::to_string((T2)v.sF()) + " )";
+  }
+};
+
+// ---- exit_if_not_equal
+template <typename T> void exit_if_not_equal(T val, T ref, const char *name) {
+  if (std::is_floating_point<T>::value) {
+    auto cmp_val = std::bitset<CHAR_BIT * sizeof(T)>(val);
+    auto cmp_ref = std::bitset<CHAR_BIT * sizeof(T)>(ref);
+    if (cmp_val != cmp_ref) {
+      std::cout << "Unexpected result for " << name << ": " << val << "("
+                << cmp_val << ") expected value: " << ref << "(" << cmp_ref
+                << ")" << std::endl;
+      exit(1);
+    }
+  } else {
+    if ((val - ref) != 0) {
+      std::cout << "Unexpected result for " << name << ": " << (long)val
+                << " expected value: " << (long)ref << std::endl;
+      exit(1);
+    }
+  }
+}
+
+// template <typename T>
+// void exit_if_not_equal(std::complex<T> val, std::complex<T> ref,
+//                        const char *name) {
+//   std::string Name{name};
+//   exit_if_not_equal(val.real(), ref.real(), (Name + ".real()").c_str());
+//   exit_if_not_equal(val.imag(), ref.imag(), (Name + ".imag()").c_str());
+// }
+
+template <typename T> void exit_if_not_equal(T *val, T *ref, const char *name) {
+  if ((val - ref) != 0) {
+    std::cout << "Unexpected result for " << name << ": " << val
+              << " expected value: " << ref << std::endl;
+    exit(1);
+  }
+}
+
+template <> void exit_if_not_equal(half val, half ref, const char *name) {
+  int16_t cmp_val = reinterpret_cast<int16_t &>(val);
+  int16_t cmp_ref = reinterpret_cast<int16_t &>(ref);
+  if (std::abs(cmp_val - cmp_ref) > 1) {
+    std::cout << "Unexpected result for " << name << ": " << (float)val
+              << " expected value: " << (float)ref << std::endl;
+    exit(1);
+  }
+}
+
+template <typename T, int N>
+void exit_if_not_equal_vec(vec<T, N> val, vec<T, N> ref, const char *name) {
+  if (!utils<T, N>::cmp_vec(ref, val)) {
+    std::cout << "Unexpected result for " << name << ": "
+              << utils<T, N>::stringify_vec(val)
+              << " expected value: " << utils<T, N>::stringify_vec(ref)
+              << std::endl;
+
+    exit(1);
+  }
+}
\ No newline at end of file
diff --git a/SYCL/GroupAlgorithm/SYCL2020/inclusive_scan.cpp b/SYCL/GroupAlgorithm/SYCL2020/inclusive_scan.cpp
new file mode 100644
index 0000000000..f2b9f7ce53
--- /dev/null
+++ b/SYCL/GroupAlgorithm/SYCL2020/inclusive_scan.cpp
@@ -0,0 +1,169 @@
+// RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple %s -I . -o %t.out
+// RUN: %CPU_RUN_PLACEHOLDER %t.out
+// RUN: %GPU_RUN_PLACEHOLDER %t.out
+// RUN: %ACC_RUN_PLACEHOLDER %t.out
+//
+// Missing __spirv_GroupIAdd, __spirv_GroupBroadcast, __spirv_GroupSMin and
+// __spirv_GroupSMax on AMD:
+// XFAIL: rocm_amd
+
+// TODO: enable compile+runtime checks for operations defined in SPIR-V 1.3.
+// That requires either adding a switch to clang (-spirv-max-version=1.3) or
+// raising the spirv version from 1.1. to 1.3 for spirv translator
+// unconditionally. Using operators specific for spirv 1.3 and higher with
+// -spirv-max-version=1.1 being set by default causes assert/check fails
+// in spirv translator.
+// RUNx: %clangxx -fsycl -fsycl-targets=%sycl_triple -DSPIRV_1_3 %s -I . -o \
+   %t13.out
+
+#include "support.h"
+#include <CL/sycl.hpp>
+#include <algorithm>
+#include <cassert>
+#include <limits>
+#include <numeric>
+#include <vector>
+using namespace sycl;
+
+template <class SpecializationKernelName, int TestNumber>
+class inclusive_scan_kernel;
+
+// std::inclusive_scan isn't implemented yet, so use serial implementation
+// instead
+namespace emu {
+template <typename InputIterator, typename OutputIterator,
+          class BinaryOperation, typename T>
+OutputIterator inclusive_scan(InputIterator first, InputIterator last,
+                              OutputIterator result, BinaryOperation binary_op,
+                              T init) {
+  T partial = init;
+  for (InputIterator it = first; it != last; ++it) {
+    partial = binary_op(partial, *it);
+    *(result++) = partial;
+  }
+  return result;
+}
+} // namespace emu
+
+template <typename SpecializationKernelName, typename InputContainer,
+          typename OutputContainer, class BinaryOperation>
+void test(queue q, InputContainer input, OutputContainer output,
+          BinaryOperation binary_op,
+          typename OutputContainer::value_type identity) {
+  typedef typename InputContainer::value_type InputT;
+  typedef typename OutputContainer::value_type OutputT;
+  typedef class inclusive_scan_kernel<SpecializationKernelName, 0> kernel_name0;
+  typedef class inclusive_scan_kernel<SpecializationKernelName, 1> kernel_name1;
+  typedef class inclusive_scan_kernel<SpecializationKernelName, 2> kernel_name2;
+  typedef class inclusive_scan_kernel<SpecializationKernelName, 3> kernel_name3;
+  OutputT init = 42;
+  size_t N = input.size();
+  size_t G = 64;
+  std::vector<OutputT> expected(N);
+  {
+    buffer<InputT> in_buf(input.data(), input.size());
+    buffer<OutputT> out_buf(output.data(), output.size());
+    q.submit([&](handler &cgh) {
+      accessor in{in_buf, cgh, sycl::read_only};
+      accessor out{out_buf, cgh, sycl::write_only, sycl::no_init};
+      cgh.parallel_for<kernel_name0>(nd_range<1>(G, G), [=](nd_item<1> it) {
+        group<1> g = it.get_group();
+        int lid = it.get_local_id(0);
+        out[lid] = inclusive_scan_over_group(g, in[lid], binary_op);
+      });
+    });
+  }
+  emu::inclusive_scan(input.begin(), input.begin() + G, expected.begin(),
+                      binary_op, identity);
+  assert(std::equal(output.begin(), output.begin() + G, expected.begin()));
+
+  {
+    buffer<InputT> in_buf(input.data(), input.size());
+    buffer<OutputT> out_buf(output.data(), output.size());
+    q.submit([&](handler &cgh) {
+      accessor in{in_buf, cgh, sycl::read_only};
+      accessor out{out_buf, cgh, sycl::write_only, sycl::no_init};
+      cgh.parallel_for<kernel_name1>(nd_range<1>(G, G), [=](nd_item<1> it) {
+        group<1> g = it.get_group();
+        int lid = it.get_local_id(0);
+        out[lid] = inclusive_scan_over_group(g, in[lid], binary_op, init);
+      });
+    });
+  }
+  emu::inclusive_scan(input.begin(), input.begin() + G, expected.begin(),
+                      binary_op, init);
+  assert(std::equal(output.begin(), output.begin() + G, expected.begin()));
+
+  {
+    buffer<InputT> in_buf(input.data(), input.size());
+    buffer<OutputT> out_buf(output.data(), output.size());
+    q.submit([&](handler &cgh) {
+      accessor in{in_buf, cgh, sycl::read_only};
+      accessor out{out_buf, cgh, sycl::write_only, sycl::no_init};
+      cgh.parallel_for<kernel_name2>(nd_range<1>(G, G), [=](nd_item<1> it) {
+        group<1> g = it.get_group();
+        joint_inclusive_scan(g, in.get_pointer(), in.get_pointer() + N,
+                             out.get_pointer(), binary_op);
+      });
+    });
+  }
+  emu::inclusive_scan(input.begin(), input.begin() + N, expected.begin(),
+                      binary_op, identity);
+  assert(std::equal(output.begin(), output.begin() + N, expected.begin()));
+
+  {
+    buffer<InputT> in_buf(input.data(), input.size());
+    buffer<OutputT> out_buf(output.data(), output.size());
+    q.submit([&](handler &cgh) {
+      accessor in{in_buf, cgh, sycl::read_only};
+      accessor out{out_buf, cgh, sycl::write_only, sycl::no_init};
+      cgh.parallel_for<kernel_name3>(nd_range<1>(G, G), [=](nd_item<1> it) {
+        group<1> g = it.get_group();
+        joint_inclusive_scan(g, in.get_pointer(), in.get_pointer() + N,
+                             out.get_pointer(), binary_op, init);
+      });
+    });
+  }
+  emu::inclusive_scan(input.begin(), input.begin() + N, expected.begin(),
+                      binary_op, init);
+  assert(std::equal(output.begin(), output.begin() + N, expected.begin()));
+}
+
+int main() {
+  queue q;
+  if (!isSupportedDevice(q.get_device())) {
+    std::cout << "Skipping test\n";
+    return 0;
+  }
+
+  constexpr int N = 128;
+  std::array<int, N> input;
+  std::array<int, N> output;
+  std::iota(input.begin(), input.end(), 0);
+  std::fill(output.begin(), output.end(), 0);
+
+  test<class KernelNamePlusV>(q, input, output, sycl::plus<>(), 0);
+  test<class KernelNameMinimumV>(q, input, output, sycl::minimum<>(),
+                                 std::numeric_limits<int>::max());
+  test<class KernelNameMaximumV>(q, input, output, sycl::maximum<>(),
+                                 std::numeric_limits<int>::lowest());
+
+  test<class KernelNamePlusI>(q, input, output, sycl::plus<int>(), 0);
+  test<class KernelNameMinimumI>(q, input, output, sycl::minimum<int>(),
+                                 std::numeric_limits<int>::max());
+  test<class KernelNameMaximumI>(q, input, output, sycl::maximum<int>(),
+                                 std::numeric_limits<int>::lowest());
+
+#ifdef SPIRV_1_3
+  test<class KernelName_zMyjxUrBgeUGoxmDwhvJ>(q, input, output,
+                                              sycl::multiplies<int>(), 1);
+  test<class KernelName_SljjtroxNRaAXoVnT>(q, input, output,
+                                           sycl::bit_or<int>(), 0);
+  test<class KernelName_yXIZfjwjxQGiPeQAnc>(q, input, output,
+                                            sycl::bit_xor<int>(), 0);
+  test<class KernelName_xGnAnMYHvqekCk>(q, input, output, sycl::bit_and<int>(),
+                                        ~0);
+#endif // SPIRV_1_3
+
+  std::cout << "Test passed." << std::endl;
+}
diff --git a/SYCL/GroupAlgorithm/SYCL2020/none_of.cpp b/SYCL/GroupAlgorithm/SYCL2020/none_of.cpp
new file mode 100644
index 0000000000..b2b0eb0d7d
--- /dev/null
+++ b/SYCL/GroupAlgorithm/SYCL2020/none_of.cpp
@@ -0,0 +1,74 @@
+// RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple %s -I . -o %t.out
+// RUN: %CPU_RUN_PLACEHOLDER %t.out
+// RUN: %GPU_RUN_PLACEHOLDER %t.out
+// RUN: %ACC_RUN_PLACEHOLDER %t.out
+//
+// Missing __spirv_GroupAll and __spirv_GroupAny on AMD:
+// XFAIL: rocm_amd
+
+#include "support.h"
+#include <CL/sycl.hpp>
+#include <algorithm>
+#include <cassert>
+#include <numeric>
+using namespace sycl;
+
+template <class Predicate> class none_of_kernel;
+
+struct GeZero {
+  bool operator()(int i) const { return i >= 0; }
+};
+struct IsEven {
+  bool operator()(int i) const { return (i % 2) == 0; }
+};
+struct LtZero {
+  bool operator()(int i) const { return i < 0; }
+};
+
+template <typename InputContainer, typename OutputContainer, class Predicate>
+void test(queue q, InputContainer input, OutputContainer output,
+          Predicate pred) {
+  typedef class none_of_kernel<Predicate> kernel_name;
+  size_t N = input.size();
+  size_t G = 64;
+  {
+    buffer<int> in_buf(input.data(), input.size());
+    buffer<bool> out_buf(output.data(), output.size());
+
+    q.submit([&](handler &cgh) {
+      accessor in{in_buf, cgh, sycl::read_only};
+      accessor out{out_buf, cgh, sycl::write_only, sycl::no_init};
+      cgh.parallel_for<kernel_name>(nd_range<1>(G, G), [=](nd_item<1> it) {
+        group<1> g = it.get_group();
+        int lid = it.get_local_id(0);
+        out[0] = none_of_group(g, pred(in[lid]));
+        out[1] = none_of_group(g, in[lid], pred);
+        out[2] = joint_none_of(g, in.get_pointer(), in.get_pointer() + N, pred);
+      });
+    });
+  }
+  bool expected = std::none_of(input.begin(), input.end(), pred);
+  assert(output[0] == expected);
+  assert(output[1] == expected);
+  assert(output[2] == expected);
+}
+
+int main() {
+  queue q;
+  if (!isSupportedDevice(q.get_device())) {
+    std::cout << "Skipping test\n";
+    return 0;
+  }
+
+  constexpr int N = 128;
+  std::array<int, N> input;
+  std::array<bool, 3> output;
+  std::iota(input.begin(), input.end(), 0);
+  std::fill(output.begin(), output.end(), false);
+
+  test(q, input, output, GeZero());
+  test(q, input, output, IsEven());
+  test(q, input, output, LtZero());
+
+  std::cout << "Test passed." << std::endl;
+}
diff --git a/SYCL/GroupAlgorithm/SYCL2020/permute_select.cpp b/SYCL/GroupAlgorithm/SYCL2020/permute_select.cpp
new file mode 100644
index 0000000000..a863942294
--- /dev/null
+++ b/SYCL/GroupAlgorithm/SYCL2020/permute_select.cpp
@@ -0,0 +1,43 @@
+// RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple %s -o %t.out
+// RUN: %CPU_RUN_PLACEHOLDER %t.out
+// RUN: %GPU_RUN_PLACEHOLDER %t.out
+// RUN: %ACC_RUN_PLACEHOLDER %t.out
+//
+// Missing __spirv_SubgroupId, __spirv_SubgroupMaxSize, __spirv_SubgroupShuffle*
+// on AMD:
+// XFAIL: rocm_amd
+//
+//==------------ permute_select.cpp -*- C++ -*-----------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "permute_select.hpp"
+
+int main() {
+  queue Queue;
+  if (Queue.get_device().is_host()) {
+    std::cout << "Skipping test\n";
+    return 0;
+  }
+  check<short>(Queue);
+  check<unsigned short>(Queue);
+  check<int>(Queue);
+  check<int, 2>(Queue);
+  check<int, 4>(Queue);
+  check<int, 8>(Queue);
+  check<int, 16>(Queue);
+  check<unsigned int>(Queue);
+  check<unsigned int, 2>(Queue);
+  check<unsigned int, 4>(Queue);
+  check<unsigned int, 8>(Queue);
+  check<unsigned int, 16>(Queue);
+  check<long>(Queue);
+  check<unsigned long>(Queue);
+  check<float>(Queue);
+  std::cout << "Test passed." << std::endl;
+  return 0;
+}
diff --git a/SYCL/GroupAlgorithm/SYCL2020/permute_select.hpp b/SYCL/GroupAlgorithm/SYCL2020/permute_select.hpp
new file mode 100644
index 0000000000..cfabdd9e22
--- /dev/null
+++ b/SYCL/GroupAlgorithm/SYCL2020/permute_select.hpp
@@ -0,0 +1,140 @@
+//==----- permute_select.hpp -*- C++ -*------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "helpers.hpp"
+#include <CL/sycl.hpp>
+template <typename T, int N> class sycl_subgr;
+
+using namespace cl::sycl;
+
+// TODO remove this workaround when clang will support correct generation of
+// half typename in integration header
+struct wa_half;
+
+template <typename T, int N>
+void check(queue &Queue, size_t G = 256, size_t L = 64) {
+  try {
+    nd_range<1> NdRange(G, L);
+    buffer<vec<T, N>> buf_select(G);
+    buffer<vec<T, N>> buf_xor(G);
+    buffer<size_t> sgsizebuf(1);
+    Queue.submit([&](handler &cgh) {
+      accessor acc_select{buf_select, cgh, sycl::read_write};
+      accessor acc_xor{buf_xor, cgh, sycl::read_write};
+      accessor sgsizeacc{sgsizebuf, cgh, sycl::read_write};
+      cgh.parallel_for<sycl_subgr<T, N>>(NdRange, [=](nd_item<1> NdItem) {
+        ext::oneapi::sub_group SG = NdItem.get_sub_group();
+        uint32_t wggid = NdItem.get_global_id(0);
+        uint32_t sgid = SG.get_group_id().get(0);
+        vec<T, N> vwggid(wggid), vsgid(sgid);
+        if (wggid == 0)
+          sgsizeacc[0] = SG.get_max_local_range()[0];
+
+        /*GID of middle element in every subgroup*/
+        acc_select[NdItem.get_global_id()] =
+            select_from_group(SG, vwggid, SG.get_max_local_range()[0] / 2);
+        /* Save GID with SGLID = ( SGLID XOR SGID ) % SGMaxSize */
+        acc_xor[NdItem.get_global_id()] = permute_group_by_xor(
+            SG, vwggid, sgid % SG.get_max_local_range()[0]);
+      });
+    });
+    host_accessor acc_select{buf_select, sycl::read_write};
+    host_accessor acc_xor{buf_xor, sycl::read_write};
+    host_accessor sgsizeacc{sgsizebuf, sycl::read_write};
+
+    size_t sg_size = sgsizeacc[0];
+    int SGid = 0;
+    int SGLid = 0;
+    int SGBeginGid = 0;
+    for (int j = 0; j < G; j++) {
+      if (j % L % sg_size == 0) {
+        SGid++;
+        SGLid = 0;
+        SGBeginGid = j;
+      }
+      if (j % L == 0) {
+        SGid = 0;
+        SGLid = 0;
+        SGBeginGid = j;
+      }
+      /*GID of middle element in every subgroup*/
+      exit_if_not_equal_vec<T, N>(
+          acc_select[j], vec<T, N>(j / L * L + SGid * sg_size + sg_size / 2),
+          "select_from_group");
+      /* Value GID with SGLID = ( SGLID XOR SGID ) % SGMaxSize */
+      exit_if_not_equal_vec(acc_xor[j],
+                            vec<T, N>(SGBeginGid + (SGLid ^ (SGid % sg_size))),
+                            "permute_group_by_xor");
+      SGLid++;
+    }
+  } catch (exception e) {
+    std::cout << "SYCL exception caught: " << e.what();
+    exit(1);
+  }
+}
+
+template <typename T> void check(queue &Queue, size_t G = 256, size_t L = 64) {
+  try {
+    nd_range<1> NdRange(G, L);
+    buffer<T> buf_select(G);
+    buffer<T> buf_xor(G);
+    buffer<size_t> sgsizebuf(1);
+    Queue.submit([&](handler &cgh) {
+      accessor acc_select{buf_select, cgh, sycl::read_write};
+      accessor acc_xor{buf_xor, cgh, sycl::read_write};
+      accessor sgsizeacc{sgsizebuf, cgh, sycl::read_write};
+      cgh.parallel_for<sycl_subgr<T, 0>>(NdRange, [=](nd_item<1> NdItem) {
+        ext::oneapi::sub_group SG = NdItem.get_sub_group();
+        uint32_t wggid = NdItem.get_global_id(0);
+        uint32_t sgid = SG.get_group_id().get(0);
+        if (wggid == 0)
+          sgsizeacc[0] = SG.get_max_local_range()[0];
+
+        /*GID of middle element in every subgroup*/
+        acc_select[NdItem.get_global_id()] =
+            select_from_group(SG, wggid, SG.get_max_local_range()[0] / 2);
+        /* Save GID with SGLID = ( SGLID XOR SGID ) % SGMaxSize */
+        acc_xor[NdItem.get_global_id()] =
+            permute_group_by_xor(SG, wggid, sgid % SG.get_max_local_range()[0]);
+      });
+    });
+    host_accessor acc_select{buf_select, sycl::read_write};
+    host_accessor acc_xor{buf_xor, sycl::read_write};
+    host_accessor sgsizeacc{sgsizebuf, sycl::read_write};
+
+    size_t sg_size = sgsizeacc[0];
+    int SGid = 0;
+    int SGLid = 0;
+    int SGBeginGid = 0;
+    for (int j = 0; j < G; j++) {
+      if (j % L % sg_size == 0) {
+        SGid++;
+        SGLid = 0;
+        SGBeginGid = j;
+      }
+      if (j % L == 0) {
+        SGid = 0;
+        SGLid = 0;
+        SGBeginGid = j;
+      }
+
+      /*GID of middle element in every subgroup*/
+      exit_if_not_equal<T>(acc_select[j],
+                           j / L * L + SGid * sg_size + sg_size / 2,
+                           "select_from_group");
+
+      /* Value GID with SGLID = ( SGLID XOR SGID ) % SGMaxSize */
+      exit_if_not_equal<T>(acc_xor[j], SGBeginGid + (SGLid ^ (SGid % sg_size)),
+                           "permute_group_by_xor");
+      SGLid++;
+    }
+  } catch (exception e) {
+    std::cout << "SYCL exception caught: " << e.what();
+    exit(1);
+  }
+}
diff --git a/SYCL/GroupAlgorithm/SYCL2020/reduce.cpp b/SYCL/GroupAlgorithm/SYCL2020/reduce.cpp
new file mode 100644
index 0000000000..2c968ce04f
--- /dev/null
+++ b/SYCL/GroupAlgorithm/SYCL2020/reduce.cpp
@@ -0,0 +1,103 @@
+// RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple %s -I . -o %t.out
+// RUN: %CPU_RUN_PLACEHOLDER %t.out
+// RUN: %GPU_RUN_PLACEHOLDER %t.out
+// RUN: %ACC_RUN_PLACEHOLDER %t.out
+//
+// Missinsg __spirv_GroupIAdd, __spirv_GroupSMin and __spirv_GroupSMax on AMD:
+// XFAIL: rocm_amd
+
+// TODO: enable compile+runtime checks for operations defined in SPIR-V 1.3.
+// That requires either adding a switch to clang (-spirv-max-version=1.3) or
+// raising the spirv version from 1.1. to 1.3 for spirv translator
+// unconditionally. Using operators specific for spirv 1.3 and higher with
+// -spirv-max-version=1.1 being set by default causes assert/check fails
+// in spirv translator.
+// RUNx: %clangxx -fsycl -fsycl-targets=%sycl_triple -DSPIRV_1_3 %s -I . -o \
+   %t13.out
+
+#include "support.h"
+#include <CL/sycl.hpp>
+#include <algorithm>
+#include <cassert>
+#include <limits>
+#include <numeric>
+using namespace sycl;
+
+template <typename SpecializationKernelName, typename InputContainer,
+          typename OutputContainer, class BinaryOperation>
+void test(queue q, InputContainer input, OutputContainer output,
+          BinaryOperation binary_op,
+          typename OutputContainer::value_type identity) {
+  typedef typename InputContainer::value_type InputT;
+  typedef typename OutputContainer::value_type OutputT;
+  OutputT init = 42;
+  size_t N = input.size();
+  size_t G = 64;
+  {
+    buffer<InputT> in_buf(input.data(), input.size());
+    buffer<OutputT> out_buf(output.data(), output.size());
+
+    q.submit([&](handler &cgh) {
+      accessor in{in_buf, cgh, sycl::read_only};
+      accessor out{out_buf, cgh, sycl::write_only, sycl::no_init};
+      cgh.parallel_for<SpecializationKernelName>(
+          nd_range<1>(G, G), [=](nd_item<1> it) {
+            group<1> g = it.get_group();
+            int lid = it.get_local_id(0);
+            out[0] = reduce_over_group(g, in[lid], binary_op);
+            out[1] = reduce_over_group(g, in[lid], init, binary_op);
+            out[2] = joint_reduce(g, in.get_pointer(), in.get_pointer() + N,
+                                  binary_op);
+            out[3] = joint_reduce(g, in.get_pointer(), in.get_pointer() + N,
+                                  init, binary_op);
+          });
+    });
+  }
+  // std::reduce is not implemented yet, so use std::accumulate instead
+  assert(output[0] == std::accumulate(input.begin(), input.begin() + G,
+                                      identity, binary_op));
+  assert(output[1] ==
+         std::accumulate(input.begin(), input.begin() + G, init, binary_op));
+  assert(output[2] ==
+         std::accumulate(input.begin(), input.end(), identity, binary_op));
+  assert(output[3] ==
+         std::accumulate(input.begin(), input.end(), init, binary_op));
+}
+
+int main() {
+  queue q;
+  if (!isSupportedDevice(q.get_device())) {
+    std::cout << "Skipping test\n";
+    return 0;
+  }
+
+  constexpr int N = 128;
+  std::array<int, N> input;
+  std::array<int, 4> output;
+  std::iota(input.begin(), input.end(), 0);
+  std::fill(output.begin(), output.end(), 0);
+
+  test<class KernelNamePlusV>(q, input, output, sycl::plus<>(), 0);
+  test<class KernelNameMinimumV>(q, input, output, sycl::minimum<>(),
+                                 std::numeric_limits<int>::max());
+  test<class KernelNameMaximumV>(q, input, output, sycl::maximum<>(),
+                                 std::numeric_limits<int>::lowest());
+
+  test<class KernelNamePlusI>(q, input, output, sycl::plus<int>(), 0);
+  test<class KernelNameMinimumI>(q, input, output, sycl::minimum<int>(),
+                                 std::numeric_limits<int>::max());
+  test<class KernelNameMaximumI>(q, input, output, sycl::maximum<int>(),
+                                 std::numeric_limits<int>::lowest());
+
+#ifdef SPIRV_1_3
+  test<class KernelName_WonwuUVPUPOTKRKIBtT>(q, input, output,
+                                             sycl::multiplies<int>(), 1);
+  test<class KernelName_qYBaJDZTMGkdIwD>(q, input, output, sycl::bit_or<int>(),
+                                         0);
+  test<class KernelName_eLSFt>(q, input, output, sycl::bit_xor<int>(), 0);
+  test<class KernelName_uFhJnxSVhNAiFPTG>(q, input, output,
+                                          sycl::bit_and<int>(), ~0);
+#endif // SPIRV_1_3
+
+  std::cout << "Test passed." << std::endl;
+}
diff --git a/SYCL/GroupAlgorithm/SYCL2020/shift_left_right.cpp b/SYCL/GroupAlgorithm/SYCL2020/shift_left_right.cpp
new file mode 100644
index 0000000000..790eb5030d
--- /dev/null
+++ b/SYCL/GroupAlgorithm/SYCL2020/shift_left_right.cpp
@@ -0,0 +1,43 @@
+// RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple %s -o %t.out
+// RUN: %CPU_RUN_PLACEHOLDER %t.out
+// RUN: %GPU_RUN_PLACEHOLDER %t.out
+// RUN: %ACC_RUN_PLACEHOLDER %t.out
+//
+// Missing __spirv_SubgroupId, __spirv_SubgroupMaxSize, __spirv_SubgroupShuffle*
+// on AMD:
+// XFAIL: rocm_amd
+//
+//==------------ shift_left_right.cpp -*- C++ -*----------------------------==//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "shift_left_right.hpp"
+
+int main() {
+  queue Queue;
+  if (Queue.get_device().is_host()) {
+    std::cout << "Skipping test\n";
+    return 0;
+  }
+  check<short>(Queue);
+  check<unsigned short>(Queue);
+  check<int>(Queue);
+  check<int, 2>(Queue);
+  check<int, 4>(Queue);
+  check<int, 8>(Queue);
+  check<int, 16>(Queue);
+  check<unsigned int>(Queue);
+  check<unsigned int, 2>(Queue);
+  check<unsigned int, 4>(Queue);
+  check<unsigned int, 8>(Queue);
+  check<unsigned int, 16>(Queue);
+  check<long>(Queue);
+  check<unsigned long>(Queue);
+  check<float>(Queue);
+  std::cout << "Test passed." << std::endl;
+  return 0;
+}
diff --git a/SYCL/GroupAlgorithm/SYCL2020/shift_left_right.hpp b/SYCL/GroupAlgorithm/SYCL2020/shift_left_right.hpp
new file mode 100644
index 0000000000..bad1136fc8
--- /dev/null
+++ b/SYCL/GroupAlgorithm/SYCL2020/shift_left_right.hpp
@@ -0,0 +1,143 @@
+//==------- shift_left_right.hpp -*- C++ -*---------------------------------==//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "helpers.hpp"
+#include <CL/sycl.hpp>
+template <typename T, int N> class sycl_subgr;
+
+using namespace cl::sycl;
+
+// TODO remove this workaround when clang will support correct generation of
+// half typename in integration header
+struct wa_half;
+
+// ---- check
+template <typename T, int N>
+void check(queue &Queue, size_t G = 256, size_t L = 64) {
+  try {
+    nd_range<1> NdRange(G, L);
+    buffer<vec<T, N>> buf_right(G);
+    buffer<vec<T, N>> buf_left(G);
+    buffer<size_t> sgsizebuf(1);
+    Queue.submit([&](handler &cgh) {
+      accessor acc_right{buf_right, cgh, sycl::read_write};
+      accessor acc_left{buf_left, cgh, sycl::read_write};
+      accessor sgsizeacc{sgsizebuf, cgh, sycl::read_write};
+
+      cgh.parallel_for<sycl_subgr<T, N>>(NdRange, [=](nd_item<1> NdItem) {
+        ext::oneapi::sub_group SG = NdItem.get_sub_group();
+        uint32_t wggid = NdItem.get_global_id(0);
+        uint32_t sgid = SG.get_group_id().get(0);
+        vec<T, N> vwggid(wggid), vsgid(sgid);
+        if (wggid == 0)
+          sgsizeacc[0] = SG.get_max_local_range()[0];
+
+        /* Save GID-SGID */
+        acc_right[NdItem.get_global_id()] = shift_group_right(SG, vwggid, sgid);
+        /* Save GID+SGID */
+        acc_left[NdItem.get_global_id()] = shift_group_left(SG, vwggid, sgid);
+      });
+    });
+    host_accessor acc_right{buf_right, sycl::read_write};
+    host_accessor acc_left{buf_left, sycl::read_write};
+    host_accessor sgsizeacc{sgsizebuf, sycl::read_write};
+
+    size_t sg_size = sgsizeacc[0];
+    int SGid = 0;
+    int SGLid = 0;
+    int SGBeginGid = 0;
+    for (int j = 0; j < G; j++) {
+      if (j % L % sg_size == 0) {
+        SGid++;
+        SGLid = 0;
+        SGBeginGid = j;
+      }
+      if (j % L == 0) {
+        SGid = 0;
+        SGLid = 0;
+        SGBeginGid = j;
+      }
+
+      /* Value GID+SGID for all element except last SGID in SG*/
+      if (j % L % sg_size + SGid < sg_size && j % L + SGid < L) {
+        exit_if_not_equal_vec(acc_left[j], vec<T, N>(j + SGid % sg_size),
+                              "shift_group_left");
+      }
+      /* Value GID-SGID for all element except first SGID in SG*/
+      if (j % L % sg_size >= SGid) {
+        exit_if_not_equal_vec(acc_right[j], vec<T, N>(j - SGid % sg_size),
+                              "shift_group_right");
+      }
+
+      SGLid++;
+    }
+  } catch (exception e) {
+    std::cout << "SYCL exception caught: " << e.what();
+    exit(1);
+  }
+}
+
+template <typename T> void check(queue &Queue, size_t G = 256, size_t L = 64) {
+  try {
+    nd_range<1> NdRange(G, L);
+    buffer<T> buf_right(G);
+    buffer<T> buf_left(G);
+    buffer<size_t> sgsizebuf(1);
+    Queue.submit([&](handler &cgh) {
+      accessor acc_right{buf_right, cgh, sycl::read_write};
+      accessor acc_left{buf_left, cgh, sycl::read_write};
+      accessor sgsizeacc{sgsizebuf, cgh, sycl::read_write};
+      cgh.parallel_for<sycl_subgr<T, 0>>(NdRange, [=](nd_item<1> NdItem) {
+        ext::oneapi::sub_group SG = NdItem.get_sub_group();
+        uint32_t wggid = NdItem.get_global_id(0);
+        uint32_t sgid = SG.get_group_id().get(0);
+        if (wggid == 0)
+          sgsizeacc[0] = SG.get_max_local_range()[0];
+
+        /* Save GID-SGID */
+        acc_right[NdItem.get_global_id()] = shift_group_right(SG, wggid, sgid);
+        /* Save GID+SGID */
+        acc_left[NdItem.get_global_id()] = shift_group_left(SG, wggid, sgid);
+      });
+    });
+    host_accessor acc_right{buf_right, sycl::read_write};
+    host_accessor acc_left{buf_left, sycl::read_write};
+    host_accessor sgsizeacc{sgsizebuf, sycl::read_write};
+
+    size_t sg_size = sgsizeacc[0];
+    int SGid = 0;
+    int SGLid = 0;
+    int SGBeginGid = 0;
+    for (int j = 0; j < G; j++) {
+      if (j % L % sg_size == 0) {
+        SGid++;
+        SGLid = 0;
+        SGBeginGid = j;
+      }
+      if (j % L == 0) {
+        SGid = 0;
+        SGLid = 0;
+        SGBeginGid = j;
+      }
+
+      /* Value GID+SGID for all element except last SGID in SG*/
+      if (j % L % sg_size + SGid < sg_size && j % L + SGid < L) {
+        exit_if_not_equal<T>(acc_left[j], j + SGid, "shift_group_left");
+      }
+      /* Value GID-SGID for all element except first SGID in SG*/
+      if (j % L % sg_size >= SGid) {
+        exit_if_not_equal<T>(acc_right[j], j - SGid, "shift_group_right");
+      }
+
+      SGLid++;
+    }
+  } catch (exception e) {
+    std::cout << "SYCL exception caught: " << e.what();
+    exit(1);
+  }
+}
diff --git a/SYCL/GroupAlgorithm/SYCL2020/support.h b/SYCL/GroupAlgorithm/SYCL2020/support.h
new file mode 100644
index 0000000000..fa5118ea27
--- /dev/null
+++ b/SYCL/GroupAlgorithm/SYCL2020/support.h
@@ -0,0 +1,23 @@
+#include <CL/sycl.hpp>
+using namespace sycl;
+
+bool isSupportedDevice(device D) {
+  std::string PlatformName = D.get_platform().get_info<info::platform::name>();
+  if (PlatformName.find("CUDA") != std::string::npos)
+    return true;
+
+  if (PlatformName.find("Level-Zero") != std::string::npos)
+    return true;
+
+  if (PlatformName.find("OpenCL") != std::string::npos) {
+    std::string Version = D.get_info<info::device::version>();
+    size_t Offset = Version.find("OpenCL");
+    if (Offset == std::string::npos)
+      return false;
+    Version = Version.substr(Offset + 7, 3);
+    if (Version >= std::string("2.0"))
+      return true;
+  }
+
+  return false;
+}
diff --git a/SYCL/GroupAlgorithm/back_to_back_collectives.cpp b/SYCL/GroupAlgorithm/back_to_back_collectives.cpp
new file mode 100644
index 0000000000..5c5d14fd33
--- /dev/null
+++ b/SYCL/GroupAlgorithm/back_to_back_collectives.cpp
@@ -0,0 +1,73 @@
+// RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple %s -o %t.out
+// RUN: %HOST_RUN_PLACEHOLDER %t.out
+// RUN: %CPU_RUN_PLACEHOLDER %t.out
+// RUN: %GPU_RUN_PLACEHOLDER %t.out
+// RUN: %ACC_RUN_PLACEHOLDER %t.out
+//
+// Missing __spirv_GroupIAdd on AMD:
+// XFAIL: rocm_amd
+
+#include <CL/sycl.hpp>
+#include <numeric>
+#include <vector>
+using namespace cl::sycl;
+using namespace cl::sycl::ext::oneapi;
+
+class back_to_back;
+
+int main() {
+  queue q;
+  if (q.get_device().is_host()) {
+    std::cout << "Skipping test\n";
+    return 0;
+  }
+
+  // Use max work-group size to maximize chance of race
+  program prog(q.get_context());
+  prog.build_with_kernel_type<back_to_back>();
+  kernel k = prog.get_kernel<back_to_back>();
+  device d = q.get_device();
+  int N = k.get_info<info::kernel_device_specific::work_group_size>(d);
+
+  std::vector<int> Input(N), Sum(N), EScan(N), IScan(N);
+  std::iota(Input.begin(), Input.end(), 0);
+  std::fill(Sum.begin(), Sum.end(), 0);
+  std::fill(EScan.begin(), EScan.end(), 0);
+  std::fill(IScan.begin(), IScan.end(), 0);
+
+  {
+    buffer<int> InputBuf(Input.data(), N);
+    buffer<int> SumBuf(Sum.data(), N);
+    buffer<int> EScanBuf(EScan.data(), N);
+    buffer<int> IScanBuf(IScan.data(), N);
+    q.submit([&](handler &h) {
+      auto Input = InputBuf.get_access<access::mode::read>(h);
+      auto Sum = SumBuf.get_access<access::mode::write>(h);
+      auto EScan = EScanBuf.get_access<access::mode::write>(h);
+      auto IScan = IScanBuf.get_access<access::mode::write>(h);
+      h.parallel_for<back_to_back>(nd_range<1>(N, N), [=](nd_item<1> it) {
+        size_t i = it.get_global_id(0);
+        auto g = it.get_group();
+        // Loop to increase number of back-to-back calls
+        for (int r = 0; r < 10; ++r) {
+          Sum[i] = reduce(g, Input[i], sycl::plus<>());
+          EScan[i] = exclusive_scan(g, Input[i], sycl::plus<>());
+          IScan[i] = inclusive_scan(g, Input[i], sycl::plus<>());
+        }
+      });
+    });
+  }
+
+  int sum = 0;
+  bool passed = true;
+  for (int i = 0; i < N; ++i) {
+    passed &= (sum == EScan[i]);
+    sum += i;
+    passed &= (sum == IScan[i]);
+  }
+  for (int i = 0; i < N; ++i) {
+    passed &= (sum == Sum[i]);
+  }
+  std::cout << "Test passed." << std::endl;
+  return 0;
+}
diff --git a/SYCL/Plugin/level_zero_batch_event_status.cpp b/SYCL/Plugin/level_zero_batch_event_status.cpp
new file mode 100644
index 0000000000..d98140ac4b
--- /dev/null
+++ b/SYCL/Plugin/level_zero_batch_event_status.cpp
@@ -0,0 +1,108 @@
+// REQUIRES: gpu, level_zero
+
+// RUN: %clangxx -fsycl -fsycl-unnamed-lambda -fsycl-targets=%sycl_triple  %s -o %t.out
+
+// Set batching to 4 explicitly
+// RUN: env SYCL_PI_LEVEL_ZERO_BATCH_SIZE=4 SYCL_PI_TRACE=2 ZE_DEBUG=1 %GPU_RUN_PLACEHOLDER %t.out 2>&1 | FileCheck %s
+
+// level_zero_batch_test.cpp
+//
+// This tests the level zero plugin's kernel batching code.  It specifically
+// tests that the current batch is submitted when an Event execution status
+// request is made.  This test uses explicit SYCL_PI_LEVEL_ZERO_BATCH_SIZE=4
+// to make sure that the batching is submitted when the piEventGetInfo is
+// done, rather than some other dynamic batching criteria.
+//
+// CHECK: ---> piEnqueueKernelLaunch
+// CHECK: ZE ---> zeCommandListAppendLaunchKernel
+// Shouldn't have closed until we see a piEventGetInfo
+// CHECK-NOT:  ZE ---> zeCommandListClose
+// CHECK-NOT:  ZE ---> zeCommandQueueExecuteCommandLists
+// CHECK: ---> piEventGetInfo
+// Shouldn't see another piGetEventInfo until after closing command list
+// CHECK-NOT: ---> piEventGetInfo
+// Look for close and Execute after piEventGetInfo
+// CHECK:  ZE ---> zeCommandListClose
+// CHECK:  ZE ---> zeCommandQueueExecuteCommandLists
+// CHECK: ---> piEventGetInfo
+// CHECK-NOT: piEventsWait
+// CHECK: ---> piEnqueueKernelLaunch
+// CHECK: ZE ---> zeCommandListAppendLaunchKernel
+// CHECK: ---> piEventsWait
+// Look for close and Execute after piEventsWait
+// CHECK:  ZE ---> zeCommandListClose
+// CHECK:  ZE ---> zeCommandQueueExecuteCommandLists
+// CHECK: ---> piEventGetInfo
+// No close and execute here, should already have happened.
+// CHECK-NOT:  ZE ---> zeCommandListClose
+// CHECK-NOT:  ZE ---> zeCommandQueueExecuteCommandLists
+// CHECK-NOT: Test Fail
+// CHECK: Test Pass
+
+#include <cassert>
+#include <chrono>
+#include <iostream>
+#include <sycl/sycl.hpp>
+#include <thread>
+
+int main(void) {
+  sycl::default_selector ds{};
+  sycl::queue q{ds};
+  sycl::vector_class<sycl::event> events(10);
+
+  sycl::event ev1 = q.submit([&](sycl::handler &cgh) {
+    cgh.depends_on(events);
+    cgh.single_task([=] {});
+  });
+
+  bool ev1_completed = false;
+  int try_count = 0;
+  while (true) {
+    auto ev1_status =
+        ev1.get_info<sycl::info::event::command_execution_status>();
+    if (ev1_status == sycl::info::event_command_status::complete) {
+      std::cout << "Ev1 has completed" << std::endl;
+      ev1_completed = true;
+      break;
+    }
+
+    std::cout << "Ev1 has not yet completed: ";
+    switch (ev1_status) {
+    case sycl::info::event_command_status::submitted:
+      std::cout << "submitted";
+      break;
+    case sycl::info::event_command_status::running:
+      std::cout << "running";
+      break;
+    default:
+      std::cout << "unrecognized";
+      break;
+    }
+    std::cout << std::endl;
+
+    std::chrono::milliseconds timespan(300);
+    std::this_thread::sleep_for(timespan);
+
+    try_count += 1;
+    if (try_count > 10) {
+      ev1.wait();
+    }
+  }
+  assert(ev1_completed);
+
+  sycl::event ev2 = q.submit([&](sycl::handler &cgh) {
+    cgh.depends_on(events);
+    cgh.single_task([=] {});
+  });
+  q.wait();
+
+  auto ev2_status = ev2.get_info<sycl::info::event::command_execution_status>();
+  if (ev2_status != sycl::info::event_command_status::complete) {
+    std::cout << "Test Fail" << std::endl;
+    exit(1);
+  }
+
+  std::cout << "Ev2 has completed" << std::endl;
+  std::cout << "Test Pass" << std::endl;
+  return 0;
+}
diff --git a/SYCL/Plugin/level_zero_batch_test.cpp b/SYCL/Plugin/level_zero_batch_test.cpp
new file mode 100644
index 0000000000..f834e2bfab
--- /dev/null
+++ b/SYCL/Plugin/level_zero_batch_test.cpp
@@ -0,0 +1,362 @@
+// REQUIRES: gpu, level_zero
+
+// RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple  %s -o %t.out
+
+// Set batching to 4 explicitly
+// RUN: env SYCL_PI_LEVEL_ZERO_BATCH_SIZE=4 SYCL_PI_TRACE=2 ZE_DEBUG=1 %GPU_RUN_PLACEHOLDER %t.out 2>&1 | FileCheck --check-prefixes=CKALL,CKB4 %s
+
+// Set batching to 1 explicitly
+// RUN: env SYCL_PI_LEVEL_ZERO_BATCH_SIZE=1 SYCL_PI_TRACE=2 ZE_DEBUG=1 %GPU_RUN_PLACEHOLDER %t.out 2>&1 | FileCheck --check-prefixes=CKALL,CKB1 %s
+
+// Set batching to 3 explicitly
+// RUN: env SYCL_PI_LEVEL_ZERO_BATCH_SIZE=3 SYCL_PI_TRACE=2 ZE_DEBUG=1 %GPU_RUN_PLACEHOLDER %t.out 2>&1 | FileCheck --check-prefixes=CKALL,CKB3 %s
+
+// Set batching to 5 explicitly
+// RUN: env SYCL_PI_LEVEL_ZERO_BATCH_SIZE=5 SYCL_PI_TRACE=2 ZE_DEBUG=1 %GPU_RUN_PLACEHOLDER %t.out 2>&1 | FileCheck --check-prefixes=CKALL,CKB5 %s
+
+// Set batching to 7 explicitly
+// RUN: env SYCL_PI_LEVEL_ZERO_BATCH_SIZE=7 SYCL_PI_TRACE=2 ZE_DEBUG=1 %GPU_RUN_PLACEHOLDER %t.out 2>&1 | FileCheck --check-prefixes=CKALL,CKB7 %s
+
+// Set batching to 8 explicitly
+// RUN: env SYCL_PI_LEVEL_ZERO_BATCH_SIZE=8 SYCL_PI_TRACE=2 ZE_DEBUG=1 %GPU_RUN_PLACEHOLDER %t.out 2>&1 | FileCheck --check-prefixes=CKALL,CKB8 %s
+
+// Set batching to 9 explicitly
+// RUN: env SYCL_PI_LEVEL_ZERO_BATCH_SIZE=9 SYCL_PI_TRACE=2 ZE_DEBUG=1 %GPU_RUN_PLACEHOLDER %t.out 2>&1 | FileCheck --check-prefixes=CKALL,CKB9 %s
+
+// level_zero_batch_test.cpp
+//
+// This tests the level zero plugin's kernel batching code.  The default
+// batching is 4, and exact batch size can be controlled with environment
+// variable SYCL_PI_LEVEL_ZEOR+BATCH_SIZE=N.
+// This test enqueues 8 kernels and then does a wait. And it does this 3 times.
+// Expected output is that for batching =1 you will see zeCommandListClose,
+// and zeCommandQueueExecuteCommandLists after every piEnqueueKernelLaunch.
+// For batching=3 you will see that after 3rd and 6th enqueues, and then after
+// piEventsWait. For 5, after 5th piEnqueue, and then after piEventsWait.  For
+// 4 you will see these after 4th and 8th Enqueue, and for 8, only after the
+// 8th enqueue.  And lastly for 9, you will see the Close and Execute calls
+// only after the piEventsWait.
+// Since the test does this 3 times, this pattern will repeat 2 more times,
+// and then the test will print Test Passed 8 times, once for each kernel
+// validation check.
+// Pattern starts first set of kernel executions.
+// CKALL: ---> piEnqueueKernelLaunch(
+// CKALL: ZE ---> zeCommandListAppendLaunchKernel(
+// CKB1:  ZE ---> zeCommandListClose(
+// CKB1:  ZE ---> zeCommandQueueExecuteCommandLists(
+// CKALL: ---> piEnqueueKernelLaunch(
+// CKALL: ZE ---> zeCommandListAppendLaunchKernel(
+// CKB1:  ZE ---> zeCommandListClose(
+// CKB1:  ZE ---> zeCommandQueueExecuteCommandLists(
+// CKALL: ---> piEnqueueKernelLaunch(
+// CKALL: ZE ---> zeCommandListAppendLaunchKernel(
+// CKB1:  ZE ---> zeCommandListClose(
+// CKB1:  ZE ---> zeCommandQueueExecuteCommandLists(
+// CKB3:  ZE ---> zeCommandListClose(
+// CKB3:  ZE ---> zeCommandQueueExecuteCommandLists(
+// CKALL: ---> piEnqueueKernelLaunch(
+// CKALL: ZE ---> zeCommandListAppendLaunchKernel(
+// CKB1:  ZE ---> zeCommandListClose(
+// CKB1:  ZE ---> zeCommandQueueExecuteCommandLists(
+// CKB4:  ZE ---> zeCommandListClose(
+// CKB4:  ZE ---> zeCommandQueueExecuteCommandLists(
+// CKALL: ---> piEnqueueKernelLaunch(
+// CKALL: ZE ---> zeCommandListAppendLaunchKernel(
+// CKB1:  ZE ---> zeCommandListClose(
+// CKB1:  ZE ---> zeCommandQueueExecuteCommandLists(
+// CKB5:  ZE ---> zeCommandListClose(
+// CKB5:  ZE ---> zeCommandQueueExecuteCommandLists(
+// CKALL: ---> piEnqueueKernelLaunch(
+// CKALL: ZE ---> zeCommandListAppendLaunchKernel(
+// CKB1:  ZE ---> zeCommandListClose(
+// CKB1:  ZE ---> zeCommandQueueExecuteCommandLists(
+// CKB3:  ZE ---> zeCommandListClose(
+// CKB3:  ZE ---> zeCommandQueueExecuteCommandLists(
+// CKALL: ---> piEnqueueKernelLaunch(
+// CKALL: ZE ---> zeCommandListAppendLaunchKernel(
+// CKB1:  ZE ---> zeCommandListClose(
+// CKB1:  ZE ---> zeCommandQueueExecuteCommandLists(
+// CKB7:  ZE ---> zeCommandListClose(
+// CKB7:  ZE ---> zeCommandQueueExecuteCommandLists(
+// CKALL: ---> piEnqueueKernelLaunch(
+// CKALL: ZE ---> zeCommandListAppendLaunchKernel(
+// CKB1:  ZE ---> zeCommandListClose(
+// CKB1:  ZE ---> zeCommandQueueExecuteCommandLists(
+// CKB4:  ZE ---> zeCommandListClose(
+// CKB4:  ZE ---> zeCommandQueueExecuteCommandLists(
+// CKB8:  ZE ---> zeCommandListClose(
+// CKB8:  ZE ---> zeCommandQueueExecuteCommandLists(
+// CKALL: ---> piEventsWait(
+// CKB3:  ZE ---> zeCommandListClose(
+// CKB3:  ZE ---> zeCommandQueueExecuteCommandLists(
+// CKB5:  ZE ---> zeCommandListClose(
+// CKB5:  ZE ---> zeCommandQueueExecuteCommandLists(
+// CKB7:  ZE ---> zeCommandListClose(
+// CKB7:  ZE ---> zeCommandQueueExecuteCommandLists(
+// CKB9:  ZE ---> zeCommandListClose(
+// CKB9:  ZE ---> zeCommandQueueExecuteCommandLists(
+// Pattern starts 2nd set of kernel executions
+// CKALL: ---> piEnqueueKernelLaunch(
+// CKALL: ZE ---> zeCommandListAppendLaunchKernel(
+// CKB1:  ZE ---> zeCommandListClose(
+// CKB1:  ZE ---> zeCommandQueueExecuteCommandLists(
+// CKALL: ---> piEnqueueKernelLaunch(
+// CKALL: ZE ---> zeCommandListAppendLaunchKernel(
+// CKB1:  ZE ---> zeCommandListClose(
+// CKB1:  ZE ---> zeCommandQueueExecuteCommandLists(
+// CKALL: ---> piEnqueueKernelLaunch(
+// CKALL: ZE ---> zeCommandListAppendLaunchKernel(
+// CKB1:  ZE ---> zeCommandListClose(
+// CKB1:  ZE ---> zeCommandQueueExecuteCommandLists(
+// CKB3:  ZE ---> zeCommandListClose(
+// CKB3:  ZE ---> zeCommandQueueExecuteCommandLists(
+// CKALL: ---> piEnqueueKernelLaunch(
+// CKALL: ZE ---> zeCommandListAppendLaunchKernel(
+// CKB1:  ZE ---> zeCommandListClose(
+// CKB1:  ZE ---> zeCommandQueueExecuteCommandLists(
+// CKB4:  ZE ---> zeCommandListClose(
+// CKB4:  ZE ---> zeCommandQueueExecuteCommandLists(
+// CKALL: ---> piEnqueueKernelLaunch(
+// CKALL: ZE ---> zeCommandListAppendLaunchKernel(
+// CKB1:  ZE ---> zeCommandListClose(
+// CKB1:  ZE ---> zeCommandQueueExecuteCommandLists(
+// CKB5:  ZE ---> zeCommandListClose(
+// CKB5:  ZE ---> zeCommandQueueExecuteCommandLists(
+// CKALL: ---> piEnqueueKernelLaunch(
+// CKALL: ZE ---> zeCommandListAppendLaunchKernel(
+// CKB1:  ZE ---> zeCommandListClose(
+// CKB1:  ZE ---> zeCommandQueueExecuteCommandLists(
+// CKB3:  ZE ---> zeCommandListClose(
+// CKB3:  ZE ---> zeCommandQueueExecuteCommandLists(
+// CKALL: ---> piEnqueueKernelLaunch(
+// CKALL: ZE ---> zeCommandListAppendLaunchKernel(
+// CKB1:  ZE ---> zeCommandListClose(
+// CKB1:  ZE ---> zeCommandQueueExecuteCommandLists(
+// CKB7:  ZE ---> zeCommandListClose(
+// CKB7:  ZE ---> zeCommandQueueExecuteCommandLists(
+// CKALL: ---> piEnqueueKernelLaunch(
+// CKALL: ZE ---> zeCommandListAppendLaunchKernel(
+// CKB1:  ZE ---> zeCommandListClose(
+// CKB1:  ZE ---> zeCommandQueueExecuteCommandLists(
+// CKB4:  ZE ---> zeCommandListClose(
+// CKB4:  ZE ---> zeCommandQueueExecuteCommandLists(
+// CKB8:  ZE ---> zeCommandListClose(
+// CKB8:  ZE ---> zeCommandQueueExecuteCommandLists(
+// CKALL: ---> piEventsWait(
+// CKB3:  ZE ---> zeCommandListClose(
+// CKB3:  ZE ---> zeCommandQueueExecuteCommandLists(
+// CKB5:  ZE ---> zeCommandListClose(
+// CKB5:  ZE ---> zeCommandQueueExecuteCommandLists(
+// CKB7:  ZE ---> zeCommandListClose(
+// CKB7:  ZE ---> zeCommandQueueExecuteCommandLists(
+// CKB9:  ZE ---> zeCommandListClose(
+// CKB9:  ZE ---> zeCommandQueueExecuteCommandLists(
+// Pattern starts 3rd set of kernel executions
+// CKALL: ---> piEnqueueKernelLaunch(
+// CKALL: ZE ---> zeCommandListAppendLaunchKernel(
+// CKB1:  ZE ---> zeCommandListClose(
+// CKB1:  ZE ---> zeCommandQueueExecuteCommandLists(
+// CKALL: ---> piEnqueueKernelLaunch(
+// CKALL: ZE ---> zeCommandListAppendLaunchKernel(
+// CKB1:  ZE ---> zeCommandListClose(
+// CKB1:  ZE ---> zeCommandQueueExecuteCommandLists(
+// CKALL: ---> piEnqueueKernelLaunch(
+// CKALL: ZE ---> zeCommandListAppendLaunchKernel(
+// CKB1:  ZE ---> zeCommandListClose(
+// CKB1:  ZE ---> zeCommandQueueExecuteCommandLists(
+// CKB3:  ZE ---> zeCommandListClose(
+// CKB3:  ZE ---> zeCommandQueueExecuteCommandLists(
+// CKALL: ---> piEnqueueKernelLaunch(
+// CKALL: ZE ---> zeCommandListAppendLaunchKernel(
+// CKB1:  ZE ---> zeCommandListClose(
+// CKB1:  ZE ---> zeCommandQueueExecuteCommandLists(
+// CKB4:  ZE ---> zeCommandListClose(
+// CKB4:  ZE ---> zeCommandQueueExecuteCommandLists(
+// CKALL: ---> piEnqueueKernelLaunch(
+// CKALL: ZE ---> zeCommandListAppendLaunchKernel(
+// CKB1:  ZE ---> zeCommandListClose(
+// CKB1:  ZE ---> zeCommandQueueExecuteCommandLists(
+// CKB5:  ZE ---> zeCommandListClose(
+// CKB5:  ZE ---> zeCommandQueueExecuteCommandLists(
+// CKALL: ---> piEnqueueKernelLaunch(
+// CKALL: ZE ---> zeCommandListAppendLaunchKernel(
+// CKB1:  ZE ---> zeCommandListClose(
+// CKB1:  ZE ---> zeCommandQueueExecuteCommandLists(
+// CKB3:  ZE ---> zeCommandListClose(
+// CKB3:  ZE ---> zeCommandQueueExecuteCommandLists(
+// CKALL: ---> piEnqueueKernelLaunch(
+// CKALL: ZE ---> zeCommandListAppendLaunchKernel(
+// CKB1:  ZE ---> zeCommandListClose(
+// CKB1:  ZE ---> zeCommandQueueExecuteCommandLists(
+// CKB7:  ZE ---> zeCommandListClose(
+// CKB7:  ZE ---> zeCommandQueueExecuteCommandLists(
+// CKALL: ---> piEnqueueKernelLaunch(
+// CKALL: ZE ---> zeCommandListAppendLaunchKernel(
+// CKB1:  ZE ---> zeCommandListClose(
+// CKB1:  ZE ---> zeCommandQueueExecuteCommandLists(
+// CKB4:  ZE ---> zeCommandListClose(
+// CKB4:  ZE ---> zeCommandQueueExecuteCommandLists(
+// CKB8:  ZE ---> zeCommandListClose(
+// CKB8:  ZE ---> zeCommandQueueExecuteCommandLists(
+// CKALL: ---> piEventsWait(
+// CKB3:  ZE ---> zeCommandListClose(
+// CKB3:  ZE ---> zeCommandQueueExecuteCommandLists(
+// CKB5:  ZE ---> zeCommandListClose(
+// CKB5:  ZE ---> zeCommandQueueExecuteCommandLists(
+// CKB7:  ZE ---> zeCommandListClose(
+// CKB7:  ZE ---> zeCommandQueueExecuteCommandLists(
+// CKB9:  ZE ---> zeCommandListClose(
+// CKB9:  ZE ---> zeCommandQueueExecuteCommandLists(
+// Now just check for 8 Test Pass kernel validations.
+// CKALL: Test Pass
+// CKALL: Test Pass
+// CKALL: Test Pass
+// CKALL: Test Pass
+// CKALL: Test Pass
+// CKALL: Test Pass
+// CKALL: Test Pass
+// CKALL: Test Pass
+
+#include "CL/sycl.hpp"
+#include <chrono>
+#include <cmath>
+#include <iostream>
+
+namespace sycl = cl::sycl;
+
+void validate(uint32_t *result, uint32_t *expect, size_t n) {
+  int error = 0;
+  for (int i = 0; i < n; i++) {
+    if (result[i] != expect[i]) {
+      error++;
+      if (error < 10) {
+        printf("Error: %d, expect: %d\n", result[i], expect[i]);
+      }
+    }
+  }
+  error > 0 ? printf("Error: %d\n", error) : printf("Test Pass\n");
+}
+
+int main(int argc, char *argv[]) {
+  size_t M = 65536;
+  size_t N = 512 / 4;
+  size_t AL = M * N * sizeof(uint32_t);
+
+  sycl::queue q(sycl::default_selector{});
+  auto ctx = q.get_context();
+  auto dev = q.get_device();
+
+  uint32_t *Y1 = static_cast<uint32_t *>(sycl::malloc_shared(AL, dev, ctx));
+  uint32_t *Z1 = static_cast<uint32_t *>(sycl::malloc_shared(AL, dev, ctx));
+  uint32_t *Z2 = static_cast<uint32_t *>(sycl::malloc_shared(AL, dev, ctx));
+  uint32_t *Z3 = static_cast<uint32_t *>(sycl::malloc_shared(AL, dev, ctx));
+  uint32_t *Z4 = static_cast<uint32_t *>(sycl::malloc_shared(AL, dev, ctx));
+  uint32_t *Z5 = static_cast<uint32_t *>(sycl::malloc_shared(AL, dev, ctx));
+  uint32_t *Z6 = static_cast<uint32_t *>(sycl::malloc_shared(AL, dev, ctx));
+  uint32_t *Z7 = static_cast<uint32_t *>(sycl::malloc_shared(AL, dev, ctx));
+  uint32_t *Z8 = static_cast<uint32_t *>(sycl::malloc_shared(AL, dev, ctx));
+
+  for (size_t i = 0; i < M * N; i++) {
+    Y1[i] = i % 255;
+  }
+
+  memset(Z1, '\0', AL);
+  memset(Z2, '\0', AL);
+  memset(Z3, '\0', AL);
+  memset(Z4, '\0', AL);
+  memset(Z5, '\0', AL);
+  memset(Z6, '\0', AL);
+  memset(Z7, '\0', AL);
+  memset(Z8, '\0', AL);
+
+  {
+    for (size_t j = 0; j < 3; j++) {
+      q.submit([&](sycl::handler &h) {
+        h.parallel_for<class u32_copy1>(sycl::range<2>{M, N},
+                                        [=](sycl::id<2> it) {
+                                          const int m = it[0];
+                                          const int n = it[1];
+                                          Z1[m * N + n] = Y1[m * N + n];
+                                        });
+      });
+      q.submit([&](sycl::handler &h) {
+        h.parallel_for<class u32_copy2>(sycl::range<2>{M, N},
+                                        [=](sycl::id<2> it) {
+                                          const int m = it[0];
+                                          const int n = it[1];
+                                          Z2[m * N + n] = Y1[m * N + n];
+                                        });
+      });
+      q.submit([&](sycl::handler &h) {
+        h.parallel_for<class u32_copy3>(sycl::range<2>{M, N},
+                                        [=](sycl::id<2> it) {
+                                          const int m = it[0];
+                                          const int n = it[1];
+                                          Z3[m * N + n] = Y1[m * N + n];
+                                        });
+      });
+      q.submit([&](sycl::handler &h) {
+        h.parallel_for<class u32_copy4>(sycl::range<2>{M, N},
+                                        [=](sycl::id<2> it) {
+                                          const int m = it[0];
+                                          const int n = it[1];
+                                          Z4[m * N + n] = Y1[m * N + n];
+                                        });
+      });
+      q.submit([&](sycl::handler &h) {
+        h.parallel_for<class u32_copy5>(sycl::range<2>{M, N},
+                                        [=](sycl::id<2> it) {
+                                          const int m = it[0];
+                                          const int n = it[1];
+                                          Z5[m * N + n] = Y1[m * N + n];
+                                        });
+      });
+      q.submit([&](sycl::handler &h) {
+        h.parallel_for<class u32_copy6>(sycl::range<2>{M, N},
+                                        [=](sycl::id<2> it) {
+                                          const int m = it[0];
+                                          const int n = it[1];
+                                          Z6[m * N + n] = Y1[m * N + n];
+                                        });
+      });
+      q.submit([&](sycl::handler &h) {
+        h.parallel_for<class u32_copy7>(sycl::range<2>{M, N},
+                                        [=](sycl::id<2> it) {
+                                          const int m = it[0];
+                                          const int n = it[1];
+                                          Z7[m * N + n] = Y1[m * N + n];
+                                        });
+      });
+      q.submit([&](sycl::handler &h) {
+        h.parallel_for<class u32_copy8>(sycl::range<2>{M, N},
+                                        [=](sycl::id<2> it) {
+                                          const int m = it[0];
+                                          const int n = it[1];
+                                          Z8[m * N + n] = Y1[m * N + n];
+                                        });
+      });
+
+      q.wait();
+    }
+  }
+  validate(Y1, Z1, M * N);
+  validate(Y1, Z2, M * N);
+  validate(Y1, Z3, M * N);
+  validate(Y1, Z4, M * N);
+  validate(Y1, Z5, M * N);
+  validate(Y1, Z6, M * N);
+  validate(Y1, Z7, M * N);
+  validate(Y1, Z8, M * N);
+
+  sycl::free(Y1, ctx);
+  sycl::free(Z1, ctx);
+  sycl::free(Z2, ctx);
+  sycl::free(Z3, ctx);
+  sycl::free(Z4, ctx);
+  sycl::free(Z5, ctx);
+  sycl::free(Z6, ctx);
+  sycl::free(Z7, ctx);
+  sycl::free(Z8, ctx);
+
+  return 0;
+}
diff --git a/SYCL/Plugin/level_zero_dynamic_batch_test.cpp b/SYCL/Plugin/level_zero_dynamic_batch_test.cpp
new file mode 100644
index 0000000000..bc07034e05
--- /dev/null
+++ b/SYCL/Plugin/level_zero_dynamic_batch_test.cpp
@@ -0,0 +1,220 @@
+// REQUIRES: gpu, level_zero
+
+// RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple  %s -o %t.out
+
+// Check that dynamic batching raises/lowers batch size
+// RUN: env SYCL_PI_TRACE=2 ZE_DEBUG=1 %GPU_RUN_PLACEHOLDER %t.out 2>&1 | FileCheck --check-prefixes=CKALL,CKDYN %s
+
+// level_zero_dynamic_batch_test.cpp
+//
+// This tests the level zero plugin's kernel dynamic batch size adjustment
+// code.
+// It starts out by enqueing 40 kernels before it does a wait, and it does
+// this 5 times.  That should cause the dynamic batch size adjustment to
+// raise the batch size up 3 times.
+//
+// Then the test starts enqueueing only 4 kernels before doing a wait, and
+// it does that 20 times.  That should cause the batch size to
+// be lowered to be less than 4.
+//
+// CKDYN: Raising QueueBatchSize to 5
+// CKDYN: Raising QueueBatchSize to 6
+// CKDYN: Raising QueueBatchSize to 7
+// CKDYN-NOT: Raising QueueBatchSize
+// CKALL: Test Pass
+// CKALL: Test Pass
+// CKALL: Test Pass
+// CKALL: Test Pass
+// CKALL: Test Pass
+// CKALL: Test Pass
+// CKALL: Test Pass
+// CKDYN: Lowering QueueBatchSize to 2
+// CKDYN-NOT: Lowering QueueBatchSize
+// CKALL: Test Pass
+// CKALL: Test Pass
+// CKALL: Test Pass
+// CKALL: Test Pass
+
+#include "CL/sycl.hpp"
+#include <chrono>
+#include <cmath>
+#include <iostream>
+
+namespace sycl = cl::sycl;
+
+void validate(uint32_t *result, uint32_t *expect, size_t n) {
+  int error = 0;
+  for (int i = 0; i < n; i++) {
+    if (result[i] != expect[i]) {
+      error++;
+      if (error < 10) {
+        printf("Error: %d, expect: %d\n", result[i], expect[i]);
+      }
+    }
+  }
+  error > 0 ? printf("Error: %d\n", error) : printf("Test Pass\n");
+}
+
+int main(int argc, char *argv[]) {
+  size_t M = 65536;
+  size_t N = 512 / 4;
+  size_t AL = M * N * sizeof(uint32_t);
+
+  sycl::queue q(sycl::default_selector{});
+  auto ctx = q.get_context();
+  auto dev = q.get_device();
+
+  uint32_t *Y1 = static_cast<uint32_t *>(sycl::malloc_shared(AL, dev, ctx));
+  uint32_t *Z1 = static_cast<uint32_t *>(sycl::malloc_shared(AL, dev, ctx));
+  uint32_t *Z2 = static_cast<uint32_t *>(sycl::malloc_shared(AL, dev, ctx));
+  uint32_t *Z3 = static_cast<uint32_t *>(sycl::malloc_shared(AL, dev, ctx));
+  uint32_t *Z4 = static_cast<uint32_t *>(sycl::malloc_shared(AL, dev, ctx));
+  uint32_t *Z5 = static_cast<uint32_t *>(sycl::malloc_shared(AL, dev, ctx));
+  uint32_t *Z6 = static_cast<uint32_t *>(sycl::malloc_shared(AL, dev, ctx));
+  uint32_t *Z7 = static_cast<uint32_t *>(sycl::malloc_shared(AL, dev, ctx));
+  uint32_t *Z8 = static_cast<uint32_t *>(sycl::malloc_shared(AL, dev, ctx));
+
+  for (size_t i = 0; i < M * N; i++) {
+    Y1[i] = i % 255;
+  }
+
+  memset(Z1, '\0', AL);
+  memset(Z2, '\0', AL);
+  memset(Z3, '\0', AL);
+  memset(Z4, '\0', AL);
+  memset(Z5, '\0', AL);
+  memset(Z6, '\0', AL);
+  memset(Z7, '\0', AL);
+  memset(Z8, '\0', AL);
+
+  for (size_t i = 0; i < 5; i++) {
+    for (size_t j = 0; j < 5; j++) {
+      q.submit([&](sycl::handler &h) {
+        h.parallel_for<class u32_copy1>(sycl::range<2>{M, N},
+                                        [=](sycl::id<2> it) {
+                                          const int m = it[0];
+                                          const int n = it[1];
+                                          Z1[m * N + n] = Y1[m * N + n];
+                                        });
+      });
+      q.submit([&](sycl::handler &h) {
+        h.parallel_for<class u32_copy2>(sycl::range<2>{M, N},
+                                        [=](sycl::id<2> it) {
+                                          const int m = it[0];
+                                          const int n = it[1];
+                                          Z2[m * N + n] = Y1[m * N + n];
+                                        });
+      });
+      q.submit([&](sycl::handler &h) {
+        h.parallel_for<class u32_copy3>(sycl::range<2>{M, N},
+                                        [=](sycl::id<2> it) {
+                                          const int m = it[0];
+                                          const int n = it[1];
+                                          Z3[m * N + n] = Y1[m * N + n];
+                                        });
+      });
+      q.submit([&](sycl::handler &h) {
+        h.parallel_for<class u32_copy4>(sycl::range<2>{M, N},
+                                        [=](sycl::id<2> it) {
+                                          const int m = it[0];
+                                          const int n = it[1];
+                                          Z4[m * N + n] = Y1[m * N + n];
+                                        });
+      });
+      q.submit([&](sycl::handler &h) {
+        h.parallel_for<class u32_copy5>(sycl::range<2>{M, N},
+                                        [=](sycl::id<2> it) {
+                                          const int m = it[0];
+                                          const int n = it[1];
+                                          Z5[m * N + n] = Y1[m * N + n];
+                                        });
+      });
+      q.submit([&](sycl::handler &h) {
+        h.parallel_for<class u32_copy6>(sycl::range<2>{M, N},
+                                        [=](sycl::id<2> it) {
+                                          const int m = it[0];
+                                          const int n = it[1];
+                                          Z6[m * N + n] = Y1[m * N + n];
+                                        });
+      });
+      q.submit([&](sycl::handler &h) {
+        h.parallel_for<class u32_copy7>(sycl::range<2>{M, N},
+                                        [=](sycl::id<2> it) {
+                                          const int m = it[0];
+                                          const int n = it[1];
+                                          Z7[m * N + n] = Y1[m * N + n];
+                                        });
+      });
+      q.submit([&](sycl::handler &h) {
+        h.parallel_for<class u32_copy8>(sycl::range<2>{M, N},
+                                        [=](sycl::id<2> it) {
+                                          const int m = it[0];
+                                          const int n = it[1];
+                                          Z8[m * N + n] = Y1[m * N + n];
+                                        });
+      });
+    }
+    q.wait();
+  }
+
+  validate(Y1, Z1, M * N);
+  validate(Y1, Z2, M * N);
+  validate(Y1, Z3, M * N);
+  validate(Y1, Z4, M * N);
+  validate(Y1, Z5, M * N);
+  validate(Y1, Z6, M * N);
+  validate(Y1, Z7, M * N);
+  validate(Y1, Z8, M * N);
+
+  for (size_t i = 0; i < 20; i++) {
+    q.submit([&](sycl::handler &h) {
+      h.parallel_for<class u32_copy9>(sycl::range<2>{M, N},
+                                      [=](sycl::id<2> it) {
+                                        const int m = it[0];
+                                        const int n = it[1];
+                                        Z1[m * N + n] = Y1[m * N + n];
+                                      });
+    });
+    q.submit([&](sycl::handler &h) {
+      h.parallel_for<class u32_copy10>(sycl::range<2>{M, N},
+                                       [=](sycl::id<2> it) {
+                                         const int m = it[0];
+                                         const int n = it[1];
+                                         Z2[m * N + n] = Y1[m * N + n];
+                                       });
+    });
+    q.submit([&](sycl::handler &h) {
+      h.parallel_for<class u32_copy11>(sycl::range<2>{M, N},
+                                       [=](sycl::id<2> it) {
+                                         const int m = it[0];
+                                         const int n = it[1];
+                                         Z3[m * N + n] = Y1[m * N + n];
+                                       });
+    });
+    q.submit([&](sycl::handler &h) {
+      h.parallel_for<class u32_copy12>(sycl::range<2>{M, N},
+                                       [=](sycl::id<2> it) {
+                                         const int m = it[0];
+                                         const int n = it[1];
+                                         Z4[m * N + n] = Y1[m * N + n];
+                                       });
+    });
+    q.wait();
+  }
+  validate(Y1, Z1, M * N);
+  validate(Y1, Z2, M * N);
+  validate(Y1, Z3, M * N);
+  validate(Y1, Z4, M * N);
+
+  sycl::free(Y1, ctx);
+  sycl::free(Z1, ctx);
+  sycl::free(Z2, ctx);
+  sycl::free(Z3, ctx);
+  sycl::free(Z4, ctx);
+  sycl::free(Z5, ctx);
+  sycl::free(Z6, ctx);
+  sycl::free(Z7, ctx);
+  sycl::free(Z8, ctx);
+
+  return 0;
+}
diff --git a/SYCL/Regression/device_num.cpp b/SYCL/Regression/device_num.cpp
new file mode 100644
index 0000000000..6fbfc093b8
--- /dev/null
+++ b/SYCL/Regression/device_num.cpp
@@ -0,0 +1,96 @@
+// RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple %s -o %t.out
+// RUN: env SYCL_DEVICE_FILTER=0 %t.out
+// RUN: env SYCL_DEVICE_FILTER=1 %t.out
+// RUN: env SYCL_DEVICE_FILTER=2 %t.out
+// RUN: env SYCL_DEVICE_FILTER=3 %t.out
+// RUN: env SYCL_DEVICE_FILTER=4 %t.out
+
+// The test is using all available BEs but CUDA machine in CI does not have
+// functional OpenCL RT
+// UNSUPPORTED: cuda
+
+#include <CL/sycl.hpp>
+#include <iostream>
+
+using namespace cl::sycl;
+using namespace std;
+
+void printDeviceType(const device &d) {
+  string name = d.get_platform().get_info<info::platform::name>();
+  auto DeviceType = d.get_info<info::device::device_type>();
+  std::string DeviceTypeName;
+
+  switch (DeviceType) {
+  case info::device_type::cpu:
+    DeviceTypeName = "CPU ";
+    break;
+  case info::device_type::gpu:
+    DeviceTypeName = "GPU ";
+    break;
+  case info::device_type::host:
+    DeviceTypeName = "HOST ";
+    break;
+  case info::device_type::accelerator:
+    DeviceTypeName = "ACCELERATOR ";
+    break;
+  default:
+    DeviceTypeName = "UNKNOWN ";
+    break;
+  }
+  std::cout << DeviceTypeName << name << std::endl;
+}
+
+int main() {
+  const char *envVal = std::getenv("SYCL_DEVICE_FILTER");
+  int deviceNum;
+  std::cout << "SYCL_DEVICE_FILTER=" << envVal << std::endl;
+  deviceNum = std::atoi(envVal);
+
+  auto devices = device::get_devices();
+  if (devices.size() > deviceNum) {
+    device targetDevice = devices[deviceNum];
+    std::cout << "Target Device: ";
+    printDeviceType(targetDevice);
+
+    {
+      default_selector ds;
+      device d = ds.select_device();
+      std::cout << "default_selector selected ";
+      printDeviceType(d);
+      assert(targetDevice == d &&
+             "The selected device is not the target device specified.");
+    }
+
+    if (targetDevice.is_gpu()) {
+      gpu_selector gs;
+      device d = gs.select_device();
+      std::cout << "gpu_selector selected ";
+      printDeviceType(d);
+      assert(targetDevice == d &&
+             "The selected device is not the target device specified.");
+    } else if (targetDevice.is_cpu()) {
+      cpu_selector cs;
+      device d = cs.select_device();
+      std::cout << "cpu_selector selected ";
+      printDeviceType(d);
+      assert(targetDevice == d &&
+             "The selected device is not the target device specified.");
+    } else if (targetDevice.is_accelerator()) {
+      accelerator_selector as;
+      device d = as.select_device();
+      std::cout << "accelerator_selector selected ";
+      printDeviceType(d);
+      assert(targetDevice == d &&
+             "The selected device is not the target device specified.");
+    }
+    // HOST device is always available regardless of SYCL_DEVICE_FILTER
+    {
+      host_selector hs;
+      device d = hs.select_device();
+      std::cout << "host_selector selected ";
+      printDeviceType(d);
+      assert(d.is_host() && "The selected device is not a host device.");
+    }
+  }
+  return 0;
+}
diff --git a/SYCL/SpecConstants/2020/host_apis.cpp b/SYCL/SpecConstants/2020/host_apis.cpp
new file mode 100644
index 0000000000..c373291938
--- /dev/null
+++ b/SYCL/SpecConstants/2020/host_apis.cpp
@@ -0,0 +1,105 @@
+// RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple %s -o %t.out
+// RUN: %t.out
+
+// UNSUPPORTED: cuda
+// UNSUPPORTED: rocm_nvidia
+// UNSUPPORTED: rocm_amd
+
+#include <sycl/sycl.hpp>
+
+#include <cmath>
+
+class Kernel1Name;
+class Kernel2Name;
+
+struct TestStruct {
+  int a;
+  int b;
+};
+
+struct TestStruct2 {
+  bool a;
+  long b;
+};
+
+const static sycl::specialization_id<int> SpecConst1{42};
+const static sycl::specialization_id<int> SpecConst2{42};
+const static sycl::specialization_id<TestStruct> SpecConst3{TestStruct{42, 42}};
+const static sycl::specialization_id<short> SpecConst4{42};
+const static sycl::specialization_id<TestStruct2> SpecConst5{
+    TestStruct2{true, 0}};
+
+int main() {
+  sycl::queue Q;
+
+  // No support for host device so far
+  if (Q.is_host())
+    return 0;
+
+  // The code is needed to just have device images in the executable
+  if (0) {
+    Q.submit([](sycl::handler &CGH) { CGH.single_task<Kernel1Name>([] {}); });
+    Q.submit([](sycl::handler &CGH) { CGH.single_task<Kernel2Name>([] {}); });
+  }
+
+  const sycl::context Ctx = Q.get_context();
+  const sycl::device Dev = Q.get_device();
+
+  sycl::kernel_bundle KernelBundle =
+      sycl::get_kernel_bundle<sycl::bundle_state::input>(Ctx, {Dev});
+
+  assert(KernelBundle.contains_specialization_constants() == true);
+  assert(KernelBundle.has_specialization_constant<SpecConst1>() == false);
+  assert(KernelBundle.has_specialization_constant<SpecConst2>() == true);
+
+  // Test that unused spec constants are saved.
+  assert(KernelBundle.get_specialization_constant<SpecConst1>() == 42);
+  KernelBundle.set_specialization_constant<SpecConst1>(1);
+  assert(KernelBundle.get_specialization_constant<SpecConst1>() == 1);
+
+  KernelBundle.set_specialization_constant<SpecConst2>(1);
+  {
+    auto ExecBundle = sycl::build(KernelBundle);
+    assert(ExecBundle.get_specialization_constant<SpecConst1>() == 1);
+    assert(ExecBundle.get_specialization_constant<SpecConst2>() == 1);
+    sycl::buffer<int, 1> Buf{sycl::range{1}};
+    Q.submit([&](sycl::handler &CGH) {
+      CGH.use_kernel_bundle(ExecBundle);
+      auto Acc = Buf.get_access<sycl::access::mode::read_write>(CGH);
+      CGH.single_task<class Kernel3Name>([=](sycl::kernel_handler KH) {
+        Acc[0] = KH.get_specialization_constant<SpecConst2>();
+      });
+    });
+    auto Acc = Buf.get_access<sycl::access::mode::read>();
+    assert(Acc[0] == 1);
+  }
+
+  {
+    sycl::buffer<TestStruct, 1> Buf{sycl::range{1}};
+    Q.submit([&](sycl::handler &CGH) {
+      auto Acc = Buf.get_access<sycl::access::mode::read_write>(CGH);
+      CGH.set_specialization_constant<SpecConst3>(TestStruct{1, 2});
+      const auto SC = CGH.get_specialization_constant<SpecConst4>();
+      assert(SC == 42);
+      CGH.single_task<class Kernel4Name>([=](sycl::kernel_handler KH) {
+        Acc[0] = KH.get_specialization_constant<SpecConst3>();
+      });
+    });
+    auto Acc = Buf.get_access<sycl::access::mode::read>();
+    assert(Acc[0].a == 1 && Acc[0].b == 2);
+  }
+
+  sycl::kernel_bundle KernelBundle2 =
+      sycl::get_kernel_bundle<sycl::bundle_state::input>(Ctx, {Dev});
+  KernelBundle2.set_specialization_constant<SpecConst5>(TestStruct2{false, 1});
+
+  sycl::kernel_bundle KernelBundle3 =
+      sycl::join<sycl::bundle_state::input>({KernelBundle2, KernelBundle});
+
+  assert(KernelBundle3.get_specialization_constant<SpecConst5>().a == false);
+  assert(KernelBundle3.get_specialization_constant<SpecConst5>().b == 1);
+  assert(KernelBundle3.get_specialization_constant<SpecConst1>() == 1);
+  assert(KernelBundle3.get_specialization_constant<SpecConst2>() == 1);
+
+  return 0;
+}
diff --git a/SYCL/SpecConstants/2020/kernel_lambda_with_kernel_handler_arg.cpp b/SYCL/SpecConstants/2020/kernel_lambda_with_kernel_handler_arg.cpp
new file mode 100644
index 0000000000..dd7a27b8db
--- /dev/null
+++ b/SYCL/SpecConstants/2020/kernel_lambda_with_kernel_handler_arg.cpp
@@ -0,0 +1,111 @@
+// RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple %s -o %t.out
+// RUN: %CPU_RUN_PLACEHOLDER %t.out
+//
+// Hits an assert in the Lower Work Group Scope Code pass on AMD:
+// XFAIL: rocm_amd
+
+// This test checks all possible scenarios of running single_task, parallel_for
+// and parallel_for_work_group to verify that this code compiles and runs
+// correctly with user's lambda with and without sycl::kernel_handler argument
+
+#include <CL/sycl.hpp>
+
+int main() {
+  sycl::queue q;
+
+  // single_task w/o kernel_handler arg
+  q.submit([&](sycl::handler &cgh) {
+    cgh.single_task<class KernelSingleTaskWOKernelHandler>([=]() {});
+  });
+
+  // single_task with kernel_handler arg
+  q.submit([&](sycl::handler &cgh) {
+    cgh.single_task<class KernelSingleTaskWithKernelHandler>(
+        [=](sycl::kernel_handler kh) {});
+  });
+
+  // parallel_for with id and w/o kernel_handler arg
+  q.submit([&](sycl::handler &cgh) {
+    cgh.parallel_for<class KernelParallelForIdWithoutKernelHandler>(
+        sycl::range<1>(1), [](sycl::id<1> i) {});
+  });
+
+  // parallel_for with id and kernel_handler args
+  q.submit([&](sycl::handler &cgh) {
+    cgh.parallel_for<class KernelParallelForIdWithKernelHandler>(
+        sycl::range<1>(1), [](sycl::id<1> i, sycl::kernel_handler kh) {});
+  });
+
+  // parallel_for with item and w/o kernel_handler arg
+  q.submit([&](sycl::handler &cgh) {
+    cgh.parallel_for<class KernelParallelForItemWithoutKernelHandler>(
+        sycl::range<3>(3, 3, 3), [](sycl::item<3> it) {});
+  });
+
+  // parallel_for with item and kernel_handler args
+  q.submit([&](sycl::handler &cgh) {
+    cgh.parallel_for<class KernelParallelForItemWithKernelHandler>(
+        sycl::range<3>(3, 3, 3),
+        [](sycl::item<3> it, sycl::kernel_handler kh) {});
+  });
+
+  // parallel_for with nd_item and w/o kernel_handler arg
+  q.submit([&](sycl::handler &cgh) {
+    cgh.parallel_for<class KernelParallelForNDItemWithoutKernelHandler>(
+        sycl::nd_range<3>(sycl::range<3>(4, 4, 4), sycl::range<3>(2, 2, 2)),
+        [=](sycl::nd_item<3> item) {});
+  });
+
+  // parallel_for with nd_item and kernel_handler args
+  q.submit([&](sycl::handler &cgh) {
+    cgh.parallel_for<class KernelParallelForNDItemWithKernelHandler>(
+        sycl::nd_range<3>(sycl::range<3>(4, 4, 4), sycl::range<3>(2, 2, 2)),
+        [=](sycl::nd_item<3> item, sycl::kernel_handler kh) {});
+  });
+
+  // parallel_for with generic lambda w/o kernel_handler arg
+  q.submit([&](sycl::handler &cgh) {
+    cgh.parallel_for<class KernelParallelForGenericLambdaWithoutKernelHandler>(
+        sycl::range<3>(3, 3, 3), [](auto it) {});
+  });
+
+  // parallel_for with generic lambda with kernel_handler arg
+  q.submit([&](sycl::handler &cgh) {
+    cgh.parallel_for<class KernelParallelForGenericLambdaWithKernelHandler>(
+        sycl::range<3>(3, 3, 3), [](auto it, sycl::kernel_handler kh) {});
+  });
+
+  // parallel_for with integral type arg and w/o kernel_handler arg
+  q.submit([&](sycl::handler &cgh) {
+    cgh.parallel_for<class KernelParallelForIntWithoutKernelHandler>(
+        sycl::range<1>(1), [](int index) {});
+  });
+
+  // parallel_for with integral type and kernel_handler args
+  q.submit([&](sycl::handler &cgh) {
+    cgh.parallel_for<class KernelParallelForIntWithKernelHandler>(
+        sycl::range<1>(1), [](int index, sycl::kernel_handler kh) {});
+  });
+
+  // parallel_for_work_group without kernel_handler arg
+  q.submit([&](sycl::handler &cgh) {
+    cgh.parallel_for_work_group<
+        class KernelParallelForWorkGroupWithKernelHandler>(
+        sycl::range<3>(2, 2, 2), sycl::range<3>(2, 2, 2),
+        [=](sycl::group<3> myGroup) {
+          myGroup.parallel_for_work_item([&](sycl::h_item<3> myItem) {});
+          myGroup.parallel_for_work_item([&](sycl::h_item<3> myItem) {});
+        });
+  });
+
+  // parallel_for_work_group with kernel_handler arg
+  q.submit([&](sycl::handler &cgh) {
+    cgh.parallel_for_work_group<
+        class KernelParallelForWorkGroupWithoutKernelHandler>(
+        sycl::range<3>(2, 2, 2), sycl::range<3>(2, 2, 2),
+        [=](sycl::group<3> myGroup, sycl::kernel_handler kh) {
+          myGroup.parallel_for_work_item([&](sycl::h_item<3> myItem) {});
+          myGroup.parallel_for_work_item([&](sycl::h_item<3> myItem) {});
+        });
+  });
+}
diff --git a/SYCL/SpecConstants/2020/non_native/Inputs/common.cpp b/SYCL/SpecConstants/2020/non_native/Inputs/common.cpp
new file mode 100644
index 0000000000..94bf445f11
--- /dev/null
+++ b/SYCL/SpecConstants/2020/non_native/Inputs/common.cpp
@@ -0,0 +1,54 @@
+#include <sycl/sycl.hpp>
+
+#include <cmath>
+
+class Kernel1Name;
+class Kernel2Name;
+
+struct TestStruct {
+  int a;
+  int b;
+};
+
+const static sycl::specialization_id<int> SpecConst1{42};
+const static sycl::specialization_id<int> SpecConst2{42};
+const static sycl::specialization_id<TestStruct> SpecConst3{TestStruct{42, 42}};
+const static sycl::specialization_id<short> SpecConst4{42};
+
+int main() {
+  sycl::queue Q;
+
+  // No support for host device so far
+  if (Q.is_host())
+    return 0;
+
+  {
+    sycl::buffer<int, 1> Buf{sycl::range{1}};
+    Q.submit([&](sycl::handler &CGH) {
+      CGH.set_specialization_constant<SpecConst2>(1);
+      auto Acc = Buf.get_access<sycl::access::mode::read_write>(CGH);
+      CGH.single_task<class Kernel1Name>([=](sycl::kernel_handler KH) {
+        Acc[0] = KH.get_specialization_constant<SpecConst2>();
+      });
+    });
+    auto Acc = Buf.get_access<sycl::access::mode::read>();
+    assert(Acc[0] == 1);
+  }
+
+  {
+    sycl::buffer<TestStruct, 1> Buf{sycl::range{1}};
+    Q.submit([&](sycl::handler &CGH) {
+      auto Acc = Buf.get_access<sycl::access::mode::read_write>(CGH);
+      CGH.set_specialization_constant<SpecConst3>(TestStruct{1, 2});
+      const auto SC = CGH.get_specialization_constant<SpecConst4>();
+      assert(SC == 42);
+      CGH.single_task<class Kernel2Name>([=](sycl::kernel_handler KH) {
+        Acc[0] = KH.get_specialization_constant<SpecConst3>();
+      });
+    });
+    auto Acc = Buf.get_access<sycl::access::mode::read>();
+    assert(Acc[0].a == 1 && Acc[0].b == 2);
+  }
+
+  return 0;
+}
diff --git a/SYCL/SpecConstants/2020/non_native/accelerator.cpp b/SYCL/SpecConstants/2020/non_native/accelerator.cpp
new file mode 100644
index 0000000000..c11958d10d
--- /dev/null
+++ b/SYCL/SpecConstants/2020/non_native/accelerator.cpp
@@ -0,0 +1,7 @@
+// REQUIRES: aoc, accelerator
+
+// RUN: %clangxx -fsycl -fsycl-targets=spir64_fpga-unknown-unknown-sycldevice %S/Inputs/common.cpp -o %t.out
+// RUN: %ACC_RUN_PLACEHOLDER %t.out
+
+// This test checks correctness of SYCL2020 non-native specialization constants
+// on accelerator device
diff --git a/SYCL/SpecConstants/2020/non_native/aot_w_kernel_handler_wo_spec_consts.cpp b/SYCL/SpecConstants/2020/non_native/aot_w_kernel_handler_wo_spec_consts.cpp
new file mode 100644
index 0000000000..f0a71e5110
--- /dev/null
+++ b/SYCL/SpecConstants/2020/non_native/aot_w_kernel_handler_wo_spec_consts.cpp
@@ -0,0 +1,37 @@
+// RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple %s -o %t.out
+// RUN: %CPU_RUN_PLACEHOLDER %t.out
+//
+// Hits an assert in the Lower Work Group Scope Code pass on AMD:
+// XFAIL: rocm_amd
+
+// This test checks correctness of compiling and running of application with
+// kernel lambdas containing kernel_handler arguments and w/o usage of
+// specialization constants in AOT mode
+
+#include <CL/sycl.hpp>
+
+int main() {
+  sycl::queue q;
+
+  q.submit([&](sycl::handler &cgh) {
+    cgh.single_task<class KernelSingleTaskWithKernelHandler>(
+        [=](sycl::kernel_handler kh) {});
+  });
+
+  q.submit([&](sycl::handler &cgh) {
+    cgh.parallel_for<class KernelParallelForNDItemWithKernelHandler>(
+        sycl::nd_range<3>(sycl::range<3>(4, 4, 4), sycl::range<3>(2, 2, 2)),
+        [=](sycl::nd_item<3> item, sycl::kernel_handler kh) {});
+  });
+
+  // parallel_for_work_group with kernel_handler arg
+  q.submit([&](sycl::handler &cgh) {
+    cgh.parallel_for_work_group<
+        class KernelParallelForWorkGroupWithoutKernelHandler>(
+        sycl::range<3>(2, 2, 2), sycl::range<3>(2, 2, 2),
+        [=](sycl::group<3> myGroup, sycl::kernel_handler kh) {
+          myGroup.parallel_for_work_item([&](sycl::h_item<3> myItem) {});
+          myGroup.parallel_for_work_item([&](sycl::h_item<3> myItem) {});
+        });
+  });
+}
diff --git a/SYCL/SpecConstants/2020/non_native/cpu.cpp b/SYCL/SpecConstants/2020/non_native/cpu.cpp
new file mode 100644
index 0000000000..a2c91f1b58
--- /dev/null
+++ b/SYCL/SpecConstants/2020/non_native/cpu.cpp
@@ -0,0 +1,7 @@
+// REQUIRES: opencl-aot, cpu
+
+// RUN: %clangxx -fsycl -fsycl-targets=spir64_x86_64-unknown-unknown-sycldevice %S/Inputs/common.cpp -o %t.out
+// RUN: %CPU_RUN_PLACEHOLDER %t.out
+
+// This test checks correctness of SYCL2020 non-native specialization constants
+// on CPU device
diff --git a/SYCL/SpecConstants/2020/non_native/cuda.cpp b/SYCL/SpecConstants/2020/non_native/cuda.cpp
new file mode 100644
index 0000000000..01d0d37e04
--- /dev/null
+++ b/SYCL/SpecConstants/2020/non_native/cuda.cpp
@@ -0,0 +1,10 @@
+// REQUIRES: cuda
+
+// RUN: %clangxx -fsycl -fsycl-targets=nvptx64-unknown-unknown-sycldevice %S/Inputs/common.cpp -o %t.out
+// RUN: env SYCL_DEVICE_FILTER=cuda %t.out
+
+// TODO: enable this test then compile-time error in sycl-post-link is fixed
+// UNSUPPORTED: cuda
+
+// This test checks correctness of SYCL2020 non-native specialization constants
+// on CUDA device
diff --git a/SYCL/SpecConstants/2020/non_native/gpu.cpp b/SYCL/SpecConstants/2020/non_native/gpu.cpp
new file mode 100644
index 0000000000..879fc95e94
--- /dev/null
+++ b/SYCL/SpecConstants/2020/non_native/gpu.cpp
@@ -0,0 +1,13 @@
+// REQUIRES: ocloc, gpu, TEMPORARY_DISABLED
+// UNSUPPORTED: cuda
+// CUDA is not compatible with SPIR.
+//
+// UNSUPPORTED: rocm_nvidia
+// UNSUPPORTED: rocm_amd
+// ROCm is not compatible with SPIR.
+
+// RUN: %clangxx -fsycl -fsycl-targets=spir64_gen-unknown-unknown-sycldevice -Xsycl-target-backend=spir64_gen-unknown-unknown-sycldevice "-device *" %S/Inputs/common.cpp -o %t.out
+// RUN: %GPU_RUN_PLACEHOLDER %t.out
+
+// This test checks correctness of SYCL2020 non-native specialization constants
+// on GPU device