From 475cb79bb86ec5accd80333d99f65411ce92c822 Mon Sep 17 00:00:00 2001
From: Vladimir Lazarev <vladimir.lazarev@intel.com>
Date: Thu, 21 May 2020 11:39:19 +0300
Subject: [PATCH] [SYCL] dpc++ tests

---
 SYCL/Basic/CMakeLists.txt                     |  16 +
 SYCL/Basic/README.md                          |  90 +++
 SYCL/Basic/aot/Inputs/aot.cpp                 |  76 ++
 SYCL/Basic/aot/accelerator.cpp                |  13 +
 SYCL/Basic/aot/cpu.cpp                        |  12 +
 SYCL/Basic/aot/gpu.cpp                        |  14 +
 SYCL/Basic/aot/spec_const_aot.cpp             |  66 ++
 SYCL/Basic/aot/with-llvm-bc.cpp               |  17 +
 SYCL/Basic/bit_cast/bit_cast.cpp              |  84 +++
 SYCL/Basic/built-ins/nan.cpp                  |  72 ++
 SYCL/Basic/built-ins/printf.cpp               | 134 ++++
 SYCL/Basic/built-ins/scalar_common.cpp        |  34 +
 SYCL/Basic/built-ins/scalar_geometric.cpp     | 131 ++++
 SYCL/Basic/built-ins/scalar_integer.cpp       | 571 ++++++++++++++
 SYCL/Basic/built-ins/scalar_math.cpp          | 401 ++++++++++
 SYCL/Basic/built-ins/scalar_math_2.cpp        | 244 ++++++
 SYCL/Basic/built-ins/scalar_relational.cpp    | 422 +++++++++++
 SYCL/Basic/built-ins/vector_common.cpp        |  57 ++
 SYCL/Basic/built-ins/vector_geometric.cpp     | 171 +++++
 SYCL/Basic/built-ins/vector_integer.cpp       | 701 ++++++++++++++++++
 SYCL/Basic/built-ins/vector_math.cpp          | 210 ++++++
 SYCL/Basic/built-ins/vector_relational.cpp    | 608 +++++++++++++++
 SYCL/Basic/config/allowlist.cpp               |  90 +++
 SYCL/Basic/config/config.cpp                  |  26 +
 .../Inputs/split-per-source-second-file.cpp   |  21 +
 .../Inputs/split-per-source.h                 |   7 +
 .../device-code-split/aot-accelerator.cpp     |   5 +
 SYCL/Basic/device-code-split/aot-cpu.cpp      |   4 +
 SYCL/Basic/device-code-split/aot-gpu.cpp      |  11 +
 .../device-code-split/split-per-kernel.cpp    |  68 ++
 .../split-per-source-main.cpp                 |  54 ++
 SYCL/Basic/devicelib/assert-windows.cpp       |  75 ++
 SYCL/Basic/devicelib/assert.cpp               | 215 ++++++
 .../devicelib/c99_complex_math_fp64_test.cpp  | 256 +++++++
 .../Basic/devicelib/c99_complex_math_test.cpp | 258 +++++++
 SYCL/Basic/devicelib/cmath_fp64_test.cpp      | 118 +++
 SYCL/Basic/devicelib/cmath_test.cpp           | 115 +++
 SYCL/Basic/devicelib/math_fp64_test.cpp       | 115 +++
 .../devicelib/math_fp64_windows_test.cpp      | 132 ++++
 SYCL/Basic/devicelib/math_override_test.cpp   |  49 ++
 SYCL/Basic/devicelib/math_test.cpp            | 113 +++
 SYCL/Basic/devicelib/math_utils.hpp           |  29 +
 SYCL/Basic/devicelib/math_windows_test.cpp    | 121 +++
 .../devicelib/std_complex_math_fp64_test.cpp  | 206 +++++
 .../Basic/devicelib/std_complex_math_test.cpp | 204 +++++
 .../Basic/enqueue_barrier/enqueue_barrier.cpp |  78 ++
 .../feature-tests/inline-asm/asm_16_empty.cpp |  40 +
 .../inline-asm/asm_16_matrix_mult.cpp         |  44 ++
 .../inline-asm/asm_16_no_input_int.cpp        |  44 ++
 .../inline-asm/asm_16_no_opts.cpp             |  45 ++
 .../feature-tests/inline-asm/asm_8_empty.cpp  |  40 +
 .../inline-asm/asm_8_no_input_int.cpp         |  44 ++
 .../inline-asm/asm_arbitrary_ops_order.cpp    |  59 ++
 .../inline-asm/asm_decl_in_scope.cpp          |  67 ++
 .../inline-asm/asm_float_add.cpp              |  59 ++
 .../inline-asm/asm_float_imm_arg.cpp          |  56 ++
 .../inline-asm/asm_float_neg.cpp              |  57 ++
 .../feature-tests/inline-asm/asm_imm_arg.cpp  |  55 ++
 .../feature-tests/inline-asm/asm_mul.cpp      |  57 ++
 .../inline-asm/asm_multiple_instructions.cpp  |  59 ++
 .../inline-asm/asm_no_operands.cpp            |  34 +
 .../inline-asm/asm_no_output.cpp              |  47 ++
 .../feature-tests/inline-asm/asm_plus_mod.cpp |  58 ++
 .../inline-asm/include/asmhelper.h            | 128 ++++
 .../inline-asm/letter_example.cpp             |  66 ++
 .../inline-asm/malloc_shared_32.cpp           |  92 +++
 .../inline-asm/malloc_shared_in_out_dif.cpp   |  69 ++
 .../inline-asm/malloc_shared_no_input.cpp     |  61 ++
 SYCL/Basic/fpga_tests/Inputs/fpga_device.cpp  |  24 +
 SYCL/Basic/fpga_tests/Inputs/fpga_host.cpp    |  23 +
 SYCL/Basic/fpga_tests/fpga_aocx.cpp           |  24 +
 SYCL/Basic/fpga_tests/fpga_aocx_win.cpp       |  24 +
 SYCL/Basic/fpga_tests/fpga_io_pipes.cpp       | 134 ++++
 SYCL/Basic/fpga_tests/fpga_pipes.cpp          | 326 ++++++++
 .../Basic/fpga_tests/fpga_pipes_legacy_ns.cpp |  63 ++
 SYCL/Basic/fpga_tests/fpga_queue.cpp          | 168 +++++
 .../global_fpga_device_selector.cpp           |  18 +
 SYCL/Basic/fpga_tests/io_pipe_def.h           |  12 +
 SYCL/Basic/fpga_tests/pipes_info.cpp          |  36 +
 SYCL/Basic/functor/kernel_functor.cpp         | 180 +++++
 SYCL/Basic/group-algorithm/all_of.cpp         |  77 ++
 SYCL/Basic/group-algorithm/any_of.cpp         |  79 ++
 SYCL/Basic/group-algorithm/broadcast.cpp      |  65 ++
 SYCL/Basic/group-algorithm/exclusive_scan.cpp | 147 ++++
 SYCL/Basic/group-algorithm/inclusive_scan.cpp | 147 ++++
 SYCL/Basic/group-algorithm/leader.cpp         |  50 ++
 SYCL/Basic/group-algorithm/none_of.cpp        |  77 ++
 SYCL/Basic/group-algorithm/reduce.cpp         |  85 +++
 SYCL/Basic/helpers.hpp                        |  76 ++
 .../host-task-dependency.cpp                  | 200 +++++
 .../host-task-two-queues.cpp                  |  82 ++
 SYCL/Basic/lit.cfg.py                         | 210 ++++++
 SYCL/Basic/lit.site.cfg.py.in                 |  29 +
 SYCL/Basic/spec_const/spec_const_hw.cpp       | 121 +++
 SYCL/Basic/spec_const/spec_const_redefine.cpp | 112 +++
 .../struct_param/non-standard-layout.cpp      |  45 ++
 .../struct_param/struct_kernel_param.cpp      | 137 ++++
 SYCL/Basic/sub_group/attributes.cpp           | 125 ++++
 SYCL/Basic/sub_group/barrier.cpp              |  90 +++
 SYCL/Basic/sub_group/broadcast.cpp            |  87 +++
 SYCL/Basic/sub_group/common.cpp               |  93 +++
 SYCL/Basic/sub_group/common_ocl.cpp           | 111 +++
 SYCL/Basic/sub_group/helper.hpp               | 157 ++++
 SYCL/Basic/sub_group/info.cpp                 |  93 +++
 SYCL/Basic/sub_group/load_store.cpp           | 205 +++++
 SYCL/Basic/sub_group/reduce.cpp               | 125 ++++
 SYCL/Basic/sub_group/scan.cpp                 | 160 ++++
 SYCL/Basic/sub_group/sg.cl                    |  25 +
 SYCL/Basic/sub_group/shuffle.cpp              | 265 +++++++
 SYCL/Basic/sub_group/vote.cpp                 |  89 +++
 SYCL/Basic/usm/allocator_vector.cpp           | 130 ++++
 SYCL/Basic/usm/allocator_vector_fail.cpp      |  48 ++
 SYCL/Basic/usm/allocatorll.cpp                |  88 +++
 SYCL/Basic/usm/badmalloc.cpp                  |  78 ++
 SYCL/Basic/usm/depends_on.cpp                 |  86 +++
 SYCL/Basic/usm/dmemll.cpp                     |  93 +++
 SYCL/Basic/usm/dmemllaligned.cpp              |  90 +++
 SYCL/Basic/usm/findplatforms.hpp              |  45 ++
 SYCL/Basic/usm/hmemll.cpp                     |  86 +++
 SYCL/Basic/usm/hmemllaligned.cpp              |  82 ++
 SYCL/Basic/usm/math.cpp                       | 134 ++++
 SYCL/Basic/usm/memadvise.cpp                  |  87 +++
 SYCL/Basic/usm/memcpy.cpp                     |  63 ++
 SYCL/Basic/usm/memset.cpp                     |  59 ++
 SYCL/Basic/usm/mixed.cpp                      |  79 ++
 SYCL/Basic/usm/mixed2.cpp                     |  79 ++
 SYCL/Basic/usm/mixed2template.cpp             |  92 +++
 SYCL/Basic/usm/mixed_queue.cpp                | 108 +++
 SYCL/Basic/usm/multictxt.cpp                  |  66 ++
 SYCL/Basic/usm/pfor_flatten.cpp               |  71 ++
 SYCL/Basic/usm/pointer_query.cpp              | 123 +++
 SYCL/Basic/usm/prefetch.cpp                   |  69 ++
 SYCL/Basic/usm/queue_wait.cpp                 |  48 ++
 SYCL/Basic/usm/smemll.cpp                     |  86 +++
 SYCL/Basic/usm/smemllaligned.cpp              |  83 +++
 SYCL/CMakeLists.txt                           |   5 +
 SYCL/README.md                                |   7 +
 cmake/caches/clang_fsycl.cmake                |   4 +
 cmake/caches/clang_fsycl_cuda.cmake           |   4 +
 cmake/caches/dpcpp.cmake                      |   5 +
 140 files changed, 14289 insertions(+)
 create mode 100644 SYCL/Basic/CMakeLists.txt
 create mode 100644 SYCL/Basic/README.md
 create mode 100644 SYCL/Basic/aot/Inputs/aot.cpp
 create mode 100644 SYCL/Basic/aot/accelerator.cpp
 create mode 100644 SYCL/Basic/aot/cpu.cpp
 create mode 100644 SYCL/Basic/aot/gpu.cpp
 create mode 100644 SYCL/Basic/aot/spec_const_aot.cpp
 create mode 100644 SYCL/Basic/aot/with-llvm-bc.cpp
 create mode 100644 SYCL/Basic/bit_cast/bit_cast.cpp
 create mode 100644 SYCL/Basic/built-ins/nan.cpp
 create mode 100644 SYCL/Basic/built-ins/printf.cpp
 create mode 100644 SYCL/Basic/built-ins/scalar_common.cpp
 create mode 100644 SYCL/Basic/built-ins/scalar_geometric.cpp
 create mode 100644 SYCL/Basic/built-ins/scalar_integer.cpp
 create mode 100644 SYCL/Basic/built-ins/scalar_math.cpp
 create mode 100644 SYCL/Basic/built-ins/scalar_math_2.cpp
 create mode 100644 SYCL/Basic/built-ins/scalar_relational.cpp
 create mode 100644 SYCL/Basic/built-ins/vector_common.cpp
 create mode 100644 SYCL/Basic/built-ins/vector_geometric.cpp
 create mode 100644 SYCL/Basic/built-ins/vector_integer.cpp
 create mode 100644 SYCL/Basic/built-ins/vector_math.cpp
 create mode 100644 SYCL/Basic/built-ins/vector_relational.cpp
 create mode 100644 SYCL/Basic/config/allowlist.cpp
 create mode 100644 SYCL/Basic/config/config.cpp
 create mode 100644 SYCL/Basic/device-code-split/Inputs/split-per-source-second-file.cpp
 create mode 100644 SYCL/Basic/device-code-split/Inputs/split-per-source.h
 create mode 100644 SYCL/Basic/device-code-split/aot-accelerator.cpp
 create mode 100644 SYCL/Basic/device-code-split/aot-cpu.cpp
 create mode 100644 SYCL/Basic/device-code-split/aot-gpu.cpp
 create mode 100644 SYCL/Basic/device-code-split/split-per-kernel.cpp
 create mode 100644 SYCL/Basic/device-code-split/split-per-source-main.cpp
 create mode 100644 SYCL/Basic/devicelib/assert-windows.cpp
 create mode 100644 SYCL/Basic/devicelib/assert.cpp
 create mode 100644 SYCL/Basic/devicelib/c99_complex_math_fp64_test.cpp
 create mode 100644 SYCL/Basic/devicelib/c99_complex_math_test.cpp
 create mode 100644 SYCL/Basic/devicelib/cmath_fp64_test.cpp
 create mode 100644 SYCL/Basic/devicelib/cmath_test.cpp
 create mode 100644 SYCL/Basic/devicelib/math_fp64_test.cpp
 create mode 100644 SYCL/Basic/devicelib/math_fp64_windows_test.cpp
 create mode 100644 SYCL/Basic/devicelib/math_override_test.cpp
 create mode 100644 SYCL/Basic/devicelib/math_test.cpp
 create mode 100644 SYCL/Basic/devicelib/math_utils.hpp
 create mode 100644 SYCL/Basic/devicelib/math_windows_test.cpp
 create mode 100644 SYCL/Basic/devicelib/std_complex_math_fp64_test.cpp
 create mode 100644 SYCL/Basic/devicelib/std_complex_math_test.cpp
 create mode 100644 SYCL/Basic/enqueue_barrier/enqueue_barrier.cpp
 create mode 100644 SYCL/Basic/feature-tests/inline-asm/asm_16_empty.cpp
 create mode 100644 SYCL/Basic/feature-tests/inline-asm/asm_16_matrix_mult.cpp
 create mode 100644 SYCL/Basic/feature-tests/inline-asm/asm_16_no_input_int.cpp
 create mode 100644 SYCL/Basic/feature-tests/inline-asm/asm_16_no_opts.cpp
 create mode 100644 SYCL/Basic/feature-tests/inline-asm/asm_8_empty.cpp
 create mode 100644 SYCL/Basic/feature-tests/inline-asm/asm_8_no_input_int.cpp
 create mode 100644 SYCL/Basic/feature-tests/inline-asm/asm_arbitrary_ops_order.cpp
 create mode 100644 SYCL/Basic/feature-tests/inline-asm/asm_decl_in_scope.cpp
 create mode 100644 SYCL/Basic/feature-tests/inline-asm/asm_float_add.cpp
 create mode 100644 SYCL/Basic/feature-tests/inline-asm/asm_float_imm_arg.cpp
 create mode 100644 SYCL/Basic/feature-tests/inline-asm/asm_float_neg.cpp
 create mode 100644 SYCL/Basic/feature-tests/inline-asm/asm_imm_arg.cpp
 create mode 100644 SYCL/Basic/feature-tests/inline-asm/asm_mul.cpp
 create mode 100644 SYCL/Basic/feature-tests/inline-asm/asm_multiple_instructions.cpp
 create mode 100644 SYCL/Basic/feature-tests/inline-asm/asm_no_operands.cpp
 create mode 100644 SYCL/Basic/feature-tests/inline-asm/asm_no_output.cpp
 create mode 100644 SYCL/Basic/feature-tests/inline-asm/asm_plus_mod.cpp
 create mode 100644 SYCL/Basic/feature-tests/inline-asm/include/asmhelper.h
 create mode 100644 SYCL/Basic/feature-tests/inline-asm/letter_example.cpp
 create mode 100644 SYCL/Basic/feature-tests/inline-asm/malloc_shared_32.cpp
 create mode 100644 SYCL/Basic/feature-tests/inline-asm/malloc_shared_in_out_dif.cpp
 create mode 100644 SYCL/Basic/feature-tests/inline-asm/malloc_shared_no_input.cpp
 create mode 100644 SYCL/Basic/fpga_tests/Inputs/fpga_device.cpp
 create mode 100644 SYCL/Basic/fpga_tests/Inputs/fpga_host.cpp
 create mode 100644 SYCL/Basic/fpga_tests/fpga_aocx.cpp
 create mode 100644 SYCL/Basic/fpga_tests/fpga_aocx_win.cpp
 create mode 100644 SYCL/Basic/fpga_tests/fpga_io_pipes.cpp
 create mode 100644 SYCL/Basic/fpga_tests/fpga_pipes.cpp
 create mode 100644 SYCL/Basic/fpga_tests/fpga_pipes_legacy_ns.cpp
 create mode 100644 SYCL/Basic/fpga_tests/fpga_queue.cpp
 create mode 100644 SYCL/Basic/fpga_tests/global_fpga_device_selector.cpp
 create mode 100644 SYCL/Basic/fpga_tests/io_pipe_def.h
 create mode 100644 SYCL/Basic/fpga_tests/pipes_info.cpp
 create mode 100644 SYCL/Basic/functor/kernel_functor.cpp
 create mode 100644 SYCL/Basic/group-algorithm/all_of.cpp
 create mode 100644 SYCL/Basic/group-algorithm/any_of.cpp
 create mode 100644 SYCL/Basic/group-algorithm/broadcast.cpp
 create mode 100644 SYCL/Basic/group-algorithm/exclusive_scan.cpp
 create mode 100644 SYCL/Basic/group-algorithm/inclusive_scan.cpp
 create mode 100644 SYCL/Basic/group-algorithm/leader.cpp
 create mode 100644 SYCL/Basic/group-algorithm/none_of.cpp
 create mode 100644 SYCL/Basic/group-algorithm/reduce.cpp
 create mode 100644 SYCL/Basic/helpers.hpp
 create mode 100644 SYCL/Basic/host-interop-task/host-task-dependency.cpp
 create mode 100644 SYCL/Basic/host-interop-task/host-task-two-queues.cpp
 create mode 100644 SYCL/Basic/lit.cfg.py
 create mode 100644 SYCL/Basic/lit.site.cfg.py.in
 create mode 100644 SYCL/Basic/spec_const/spec_const_hw.cpp
 create mode 100644 SYCL/Basic/spec_const/spec_const_redefine.cpp
 create mode 100644 SYCL/Basic/struct_param/non-standard-layout.cpp
 create mode 100644 SYCL/Basic/struct_param/struct_kernel_param.cpp
 create mode 100644 SYCL/Basic/sub_group/attributes.cpp
 create mode 100644 SYCL/Basic/sub_group/barrier.cpp
 create mode 100644 SYCL/Basic/sub_group/broadcast.cpp
 create mode 100644 SYCL/Basic/sub_group/common.cpp
 create mode 100644 SYCL/Basic/sub_group/common_ocl.cpp
 create mode 100644 SYCL/Basic/sub_group/helper.hpp
 create mode 100644 SYCL/Basic/sub_group/info.cpp
 create mode 100644 SYCL/Basic/sub_group/load_store.cpp
 create mode 100644 SYCL/Basic/sub_group/reduce.cpp
 create mode 100644 SYCL/Basic/sub_group/scan.cpp
 create mode 100644 SYCL/Basic/sub_group/sg.cl
 create mode 100644 SYCL/Basic/sub_group/shuffle.cpp
 create mode 100644 SYCL/Basic/sub_group/vote.cpp
 create mode 100644 SYCL/Basic/usm/allocator_vector.cpp
 create mode 100644 SYCL/Basic/usm/allocator_vector_fail.cpp
 create mode 100644 SYCL/Basic/usm/allocatorll.cpp
 create mode 100644 SYCL/Basic/usm/badmalloc.cpp
 create mode 100644 SYCL/Basic/usm/depends_on.cpp
 create mode 100644 SYCL/Basic/usm/dmemll.cpp
 create mode 100644 SYCL/Basic/usm/dmemllaligned.cpp
 create mode 100644 SYCL/Basic/usm/findplatforms.hpp
 create mode 100644 SYCL/Basic/usm/hmemll.cpp
 create mode 100644 SYCL/Basic/usm/hmemllaligned.cpp
 create mode 100644 SYCL/Basic/usm/math.cpp
 create mode 100644 SYCL/Basic/usm/memadvise.cpp
 create mode 100644 SYCL/Basic/usm/memcpy.cpp
 create mode 100644 SYCL/Basic/usm/memset.cpp
 create mode 100644 SYCL/Basic/usm/mixed.cpp
 create mode 100644 SYCL/Basic/usm/mixed2.cpp
 create mode 100644 SYCL/Basic/usm/mixed2template.cpp
 create mode 100644 SYCL/Basic/usm/mixed_queue.cpp
 create mode 100644 SYCL/Basic/usm/multictxt.cpp
 create mode 100644 SYCL/Basic/usm/pfor_flatten.cpp
 create mode 100644 SYCL/Basic/usm/pointer_query.cpp
 create mode 100644 SYCL/Basic/usm/prefetch.cpp
 create mode 100644 SYCL/Basic/usm/queue_wait.cpp
 create mode 100644 SYCL/Basic/usm/smemll.cpp
 create mode 100644 SYCL/Basic/usm/smemllaligned.cpp
 create mode 100644 SYCL/CMakeLists.txt
 create mode 100644 SYCL/README.md
 create mode 100644 cmake/caches/clang_fsycl.cmake
 create mode 100644 cmake/caches/clang_fsycl_cuda.cmake
 create mode 100644 cmake/caches/dpcpp.cmake

diff --git a/SYCL/Basic/CMakeLists.txt b/SYCL/Basic/CMakeLists.txt
new file mode 100644
index 0000000000..be67381e43
--- /dev/null
+++ b/SYCL/Basic/CMakeLists.txt
@@ -0,0 +1,16 @@
+set(LLVM_TOOLS_DIR "${LLVM_BINARY_DIR}/bin/")  # bin/ directory of the LLVM build tree
+
+# Path variables derived from the SYCL/LLVM build and source trees; presumably
+# read by lit.site.cfg.py.in when it is configured below -- TODO confirm.
+set(SYCL_INCLUDE "${SYCL_INCLUDE_BUILD_DIR}")
+set(SYCL_TOOLS_SRC_DIR "${PROJECT_SOURCE_DIR}/tools/")
+set(LLVM_BUILD_BINARY_DIRS "${LLVM_BINARY_DIR}/bin/")
+set(LLVM_BUILD_LIBRARY_DIRS "${LLVM_BINARY_DIR}/lib/")
+# Append "-v" to both argument lists; the deploy list also carries -D definitions for the installed tools, libraries and headers.
+set(RT_TEST_ARGS ${RT_TEST_ARGS} "-v")
+set(DEPLOY_RT_TEST_ARGS ${DEPLOY_RT_TEST_ARGS} "-v -D SYCL_TOOLS_DIR=${CMAKE_INSTALL_PREFIX}/bin -D SYCL_LIBS_DIR=${CMAKE_INSTALL_PREFIX}/lib${LLVM_LIBDIR_SUFFIX} -D SYCL_INCLUDE=${SYCL_INCLUDE_DEPLOY_DIR}")
+# Threads is mandatory; its link flags are kept in SYCL_THREADS_LIB.
+find_package(Threads REQUIRED)
+set(SYCL_THREADS_LIB ${CMAKE_THREAD_LIBS_INIT})
+# Generate lit.site.cfg in the build directory from the template in the source directory.
+configure_file("${CMAKE_CURRENT_SOURCE_DIR}/lit.site.cfg.py.in" "${CMAKE_CURRENT_BINARY_DIR}/lit.site.cfg")
diff --git a/SYCL/Basic/README.md b/SYCL/Basic/README.md
new file mode 100644
index 0000000000..135973162c
--- /dev/null
+++ b/SYCL/Basic/README.md
@@ -0,0 +1,90 @@
+# Overview
+SYCL-related tests based on SYCL-LIT. These tests support
+execution on all supported devices and SYCL backends.
+
+# Table of contents
+ * [Execution](#execution)
+ * [Main parameters](#main-parameters)
+ * [LIT features which can be used to configure test execution](#lit-features-which-can-be-used-to-configure-test-execution)
+
+# Execution
+```
+git clone <GIT_REPO> # e.g. https://github.com/vladimirlaz/llvm-test-suite
+cd llvm-test-suite
+mkdir build
+cd build
+# configuring test execution (selecting compiler version, target BE and target device)
+cmake -G Ninja -DTEST_SUITE_SUBDIRS=SYCL -DTEST_SUITE_LIT=<PATH_TO_llvm-lit> -DSYCL_BE=<SYCL_BE> -DSYCL_TARGET_DEVICES=<TARGET_DEVICES> -C<CMAKE_CACHED_CONFIG> ..
+# Building full list of tests in subdir
+ninja check
+# or
+llvm-lit .
+# Get list of available tests
+llvm-lit . --show-tests
+# Run specific test
+llvm-lit <path_to_test>
+```
+
+Notes:
+ - it is assumed that LIT framework, FileCheck and other LIT dependencies are available in the same directory with llvm-lit.
+ - compiler variant as well as compile/link options are defined in cached cmake configurations:
+   - [dpcpp.cmake](../../cmake/caches/dpcpp.cmake)
+   - [clang_fsycl.cmake](../../cmake/caches/clang_fsycl.cmake)
+   - [clang_fsycl_cuda.cmake](../../cmake/caches/clang_fsycl_cuda.cmake)
+ - compiler is taken from environment.
+
+# Main parameters
+It is possible to change the test scope by specifying a test directory/file as the first
+argument to the lit-runner.py script.
+
+***SYCL_TARGET_DEVICES*** should point to the directory containing DPCPP compiler
+
+***SYCL_TARGET_DEVICES*** defines comma separated target device types (default value is
+ cpu,gpu,acc,host). Supported target_devices values are:
+ - **cpu**  - CPU device available in OpenCL backend only;
+ - **gpu**  - GPU device available in OpenCL, Level Zero and CUDA backends;
+ - **acc**  - FPGA emulator device available in OpenCL backend only;
+ - **host** - SYCL Host device available with all backends.
+
+***SYCL_BE*** defines the SYCL backend to be used for testing (default is PI_OPENCL).
+Supported sycl_be values:
+ - PI_OPENCL - for OpenCL backend;
+ - PI_CUDA - for CUDA backend;
+ - PI_LEVEL0 - Level Zero backend.
+
+It is assumed that all dependencies (OpenCL runtimes,
+CUDA SDK, AOT compilers, etc) are available in the system.
+
+See examples below for configuring tests targeting different devices:
+ - SYCL host:
+```
+cmake -G Ninja  -DTEST_SUITE_COLLECT_CODE_SIZE=OFF  -DTEST_SUITE_COLLECT_COMPILE_TIME=OFF -DTEST_SUITE_SUBDIRS=SYCL  -DTEST_SUITE_LIT=<PATH_TO_llvm-lit> -DSYCL_BE=PI_OPENCL -DSYCL_TARGET_DEVICES="host" -C../cmake/caches/clang_fsycl.cmake  ..
+```
+ - OpenCL GPU
+```
+cmake -G Ninja  -DTEST_SUITE_COLLECT_CODE_SIZE=OFF  -DTEST_SUITE_COLLECT_COMPILE_TIME=OFF -DTEST_SUITE_SUBDIRS=SYCL  -DTEST_SUITE_LIT=<PATH_TO_llvm-lit> -DSYCL_BE=PI_OPENCL -DSYCL_TARGET_DEVICES="gpu" -C../cmake/caches/clang_fsycl.cmake  ..
+```
+ - OpenCL CPU
+```
+cmake -G Ninja  -DTEST_SUITE_COLLECT_CODE_SIZE=OFF  -DTEST_SUITE_COLLECT_COMPILE_TIME=OFF -DTEST_SUITE_SUBDIRS=SYCL  -DTEST_SUITE_LIT=<PATH_TO_llvm-lit> -DSYCL_BE=PI_OPENCL -DSYCL_TARGET_DEVICES="cpu" -C../cmake/caches/clang_fsycl.cmake  ..
+```
+ - OpenCL FPGA emulator
+```
+cmake -G Ninja  -DTEST_SUITE_COLLECT_CODE_SIZE=OFF  -DTEST_SUITE_COLLECT_COMPILE_TIME=OFF -DTEST_SUITE_SUBDIRS=SYCL  -DTEST_SUITE_LIT=<PATH_TO_llvm-lit> -DSYCL_BE=PI_OPENCL -DSYCL_TARGET_DEVICES="acc" -C../cmake/caches/clang_fsycl.cmake  ..
+```
+ - CUDA GPU
+```
+cmake -G Ninja  -DTEST_SUITE_COLLECT_CODE_SIZE=OFF  -DTEST_SUITE_COLLECT_COMPILE_TIME=OFF -DTEST_SUITE_SUBDIRS=SYCL  -DTEST_SUITE_LIT=<PATH_TO_llvm-lit> -DSYCL_BE=PI_CUDA -DSYCL_TARGET_DEVICES="gpu" -C../cmake/caches/clang_fsycl_cuda.cmake  ..
+```
+ - Level Zero GPU
+```
+cmake -G Ninja  -DTEST_SUITE_COLLECT_CODE_SIZE=OFF  -DTEST_SUITE_COLLECT_COMPILE_TIME=OFF -DTEST_SUITE_SUBDIRS=SYCL  -DTEST_SUITE_LIT=<PATH_TO_llvm-lit> -DSYCL_BE=PI_LEVEL0 -DSYCL_TARGET_DEVICES="gpu" -C../cmake/caches/clang_fsycl.cmake  ..
+```
+
+# LIT features which can be used to configure test execution:
+ - **windows**, **linux** - host OS;
+ - **cpu**, **gpu**, **host**, **acc** - target devices;
+ - **cuda**, **opencl**, **level0** - target backend;
+ - **sycl-ls** - sycl-ls tool is available;
+ - **dump_ir**: is set to true if compiler supports dumping IR. Can be set by setting DUMP_IR_SUPPORTED in cmake. Default is false.
+
diff --git a/SYCL/Basic/aot/Inputs/aot.cpp b/SYCL/Basic/aot/Inputs/aot.cpp
new file mode 100644
index 0000000000..46f768dfa5
--- /dev/null
+++ b/SYCL/Basic/aot/Inputs/aot.cpp
@@ -0,0 +1,76 @@
+//==----- aot.cpp - Simple vector addition (AOT compilation example)  --------==//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===------------------------------------------------------------------------===//
+
+#include <CL/sycl.hpp>
+
+#include <array>
+#include <iostream>
+
+constexpr cl::sycl::access::mode sycl_read = cl::sycl::access::mode::read;
+constexpr cl::sycl::access::mode sycl_write = cl::sycl::access::mode::write;
+// Kernel name type; simple_vadd instantiates it with its element type T.
+template <typename T>
+class SimpleVadd;
+
+template <typename T, size_t N>
+void simple_vadd(const std::array<T, N> &VA, const std::array<T, N> &VB,
+                 std::array<T, N> &VC) {
+  // Report asynchronous SYCL errors on stderr instead of dropping them.
+  auto AsyncHandler = [](cl::sycl::exception_list Errors) {
+    for (cl::sycl::exception_ptr_class Err : Errors) {
+      try {
+        std::rethrow_exception(Err);
+      } catch (cl::sycl::exception &E) {
+        std::cerr << E.what();
+      } catch (...) {
+        std::cerr << "Unknown async exception was caught." << std::endl;
+      }
+    }
+  };
+  cl::sycl::queue deviceQueue(AsyncHandler);
+  // One work-item per element; the buffers wrap the caller's arrays directly.
+  cl::sycl::range<1> Extent{N};
+  cl::sycl::buffer<T, 1> InA(VA.data(), Extent);
+  cl::sycl::buffer<T, 1> InB(VB.data(), Extent);
+  cl::sycl::buffer<T, 1> Out(VC.data(), Extent);
+
+  deviceQueue.submit([&](cl::sycl::handler &cgh) {
+    auto Lhs = InA.template get_access<sycl_read>(cgh);
+    auto Rhs = InB.template get_access<sycl_read>(cgh);
+    auto Sum = Out.template get_access<sycl_write>(cgh);
+    cgh.parallel_for<class SimpleVadd<T>>(
+        Extent, [=](cl::sycl::id<1> Idx) { Sum[Idx] = Lhs[Idx] + Rhs[Idx]; });
+  });
+
+  // Block until the kernel finishes and surface any asynchronous exceptions.
+  deviceQueue.wait_and_throw();
+}
+
+int main() { // Runs simple_vadd for cl_int and cl_float; returns 1 on mismatch.
+  const size_t array_size = 4;
+  std::array<cl::sycl::cl_int, array_size> A = {{1, 2, 3, 4}},
+                                           B = {{1, 2, 3, 4}}, C;
+  std::array<cl::sycl::cl_float, array_size> D = {{1.f, 2.f, 3.f, 4.f}},
+                                             E = {{1.f, 2.f, 3.f, 4.f}}, F;
+  simple_vadd(A, B, C); // C and F are outputs, filled in by the device kernel
+  simple_vadd(D, E, F);
+  for (unsigned int i = 0; i < array_size; i++) {
+    if (C[i] != A[i] + B[i]) {
+      std::cout << "The results are incorrect (element " << i << " is " << C[i]
+                << ")!\n";
+      return 1;
+    }
+    if (F[i] != D[i] + E[i]) {
+      std::cout << "The results are incorrect (element " << i << " is " << F[i]
+                << ")!\n";
+      return 1;
+    }
+  }
+  std::cout << "The results are correct!\n";
+  return 0;
+}
diff --git a/SYCL/Basic/aot/accelerator.cpp b/SYCL/Basic/aot/accelerator.cpp
new file mode 100644
index 0000000000..8ebb75ac36
--- /dev/null
+++ b/SYCL/Basic/aot/accelerator.cpp
@@ -0,0 +1,13 @@
+//==----- accelerator.cpp - AOT compilation for fpga devices using aoc  ------==//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===------------------------------------------------------------------------===//
+
+// REQUIRES: aoc, accelerator
+
+// RUN: %clangxx -fsycl -fsycl-targets=spir64_fpga-unknown-unknown-sycldevice %S/Inputs/aot.cpp -o %t.out
+// RUN: %ACC_RUN_PLACEHOLDER %t.out
+// XFAIL: *
diff --git a/SYCL/Basic/aot/cpu.cpp b/SYCL/Basic/aot/cpu.cpp
new file mode 100644
index 0000000000..42ded976ff
--- /dev/null
+++ b/SYCL/Basic/aot/cpu.cpp
@@ -0,0 +1,12 @@
+//==----- cpu.cpp - AOT compilation for cpu devices using opencl-aot  --------==//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===------------------------------------------------------------------------===//
+
+// REQUIRES: opencl-aot, cpu
+
+// RUN: %clangxx -fsycl -fsycl-targets=spir64_x86_64-unknown-unknown-sycldevice %S/Inputs/aot.cpp -o %t.out
+// RUN: %CPU_RUN_PLACEHOLDER %t.out
diff --git a/SYCL/Basic/aot/gpu.cpp b/SYCL/Basic/aot/gpu.cpp
new file mode 100644
index 0000000000..482a14eade
--- /dev/null
+++ b/SYCL/Basic/aot/gpu.cpp
@@ -0,0 +1,14 @@
+//==----- gpu.cpp - AOT compilation for gen devices using GEN compiler  ------==//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===------------------------------------------------------------------------===//
+
+// REQUIRES: ocloc, gpu
+// UNSUPPORTED: cuda
+// CUDA is not compatible with SPIR.
+
+// RUN: %clangxx -fsycl -fsycl-targets=spir64_gen-unknown-unknown-sycldevice -Xsycl-target-backend=spir64_gen-unknown-unknown-sycldevice "-device skl" %S/Inputs/aot.cpp -o %t.out
+// RUN: %GPU_RUN_PLACEHOLDER %t.out
diff --git a/SYCL/Basic/aot/spec_const_aot.cpp b/SYCL/Basic/aot/spec_const_aot.cpp
new file mode 100644
index 0000000000..99b451fe6d
--- /dev/null
+++ b/SYCL/Basic/aot/spec_const_aot.cpp
@@ -0,0 +1,66 @@
+// REQUIRES: opencl-aot, cpu
+//
+// RUN: %clangxx -fsycl -fsycl-targets=spir64_x86_64-unknown-unknown-sycldevice %s -o %t.out
+// RUN: %CPU_RUN_PLACEHOLDER %t.out
+//
+// The test checks that the specialization constant feature works with ahead
+// of time compilation.
+
+#include <CL/sycl.hpp>
+
+#include <iostream>
+#include <vector>
+
+class MyInt32Const; // name tag of the specialization constant set in main()
+
+using namespace sycl;
+
+class Kernel; // kernel name used for build_with_kernel_type / single_task in main()
+
+int main(int argc, char **argv) { // checks the spec constant value a kernel reads
+  cl::sycl::queue q(default_selector{}, [](exception_list l) {
+    for (auto ep : l) {
+      try {
+        std::rethrow_exception(ep);
+      } catch (cl::sycl::exception &e0) {
+        std::cout << e0.what();
+      } catch (std::exception &e1) {
+        std::cout << e1.what();
+      } catch (...) {
+        std::cout << "*** catch (...)\n";
+      }
+    }
+  });
+  // Report the selected device, then create a program in the queue's context.
+  std::cout << "Running on " << q.get_device().get_info<info::device::name>() << "\n";
+  cl::sycl::program prog(q.get_context());
+  // Request the value 10 for the MyInt32Const specialization constant.
+  cl::sycl::experimental::spec_constant<int32_t, MyInt32Const> i32 =
+      prog.set_spec_constant<MyInt32Const>(10);
+  // Build the program for Kernel after the constant has been set.
+  prog.build_with_kernel_type<Kernel>();
+  // One-element host vector that receives the value read inside the kernel.
+  std::vector<int> vec(1);
+  {
+    cl::sycl::buffer<int, 1> buf(vec.data(), vec.size());
+    // The kernel stores the value returned by i32.get() into element 0.
+    q.submit([&](cl::sycl::handler &cgh) {
+      auto acc = buf.get_access<cl::sycl::access::mode::write>(cgh);
+      cgh.single_task<Kernel>(
+          prog.get_kernel<Kernel>(),
+          [=]() {
+            acc[0] = i32.get();
+          });
+    });
+  }
+  bool passed = true;
+  int val = vec[0];
+  int gold = 0; // with AOT, spec constant is set to C++ default for the type
+
+  if (val != gold) {
+    std::cout << "*** ERROR: " << val << " != " << gold << "(gold)\n";
+    passed = false;
+  }
+  std::cout << (passed ? "passed\n" : "FAILED\n");
+  return passed ? 0 : 1; // exit code 0 only when the value matched gold
+}
diff --git a/SYCL/Basic/aot/with-llvm-bc.cpp b/SYCL/Basic/aot/with-llvm-bc.cpp
new file mode 100644
index 0000000000..79af5d5836
--- /dev/null
+++ b/SYCL/Basic/aot/with-llvm-bc.cpp
@@ -0,0 +1,17 @@
+//==----- with-llvm-bc.cpp - SYCL kernel with LLVM IR bitcode as binary ----==//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+// REQUIRES: cpu, dump_ir
+
+// RUN: %clangxx -fsycl -fsycl-targets=spir64-unknown-unknown-sycldevice -c %S/Inputs/aot.cpp -o %t.o
+// RUN: %clangxx -fsycl -fsycl-link-targets=spir64-unknown-unknown-sycldevice %t.o -o %t.spv
+// RUN: llvm-spirv -r %t.spv -o %t.bc
+// RUN: %clangxx -fsycl -fsycl-add-targets=spir64:%t.bc %t.o -o %t.out
+//
+// Only CPU supports LLVM IR bitcode as a binary
+// RUN: %CPU_RUN_PLACEHOLDER %t.out
diff --git a/SYCL/Basic/bit_cast/bit_cast.cpp b/SYCL/Basic/bit_cast/bit_cast.cpp
new file mode 100644
index 0000000000..e1fe40b793
--- /dev/null
+++ b/SYCL/Basic/bit_cast/bit_cast.cpp
@@ -0,0 +1,84 @@
+// RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple %s -o %t.out
+// RUN: %HOST_RUN_PLACEHOLDER %t.out
+// RUN: %CPU_RUN_PLACEHOLDER %t.out
+// RUN: %GPU_RUN_PLACEHOLDER %t.out
+// RUN: %ACC_RUN_PLACEHOLDER %t.out
+
+#include <CL/sycl.hpp>
+
+#include <iostream>
+#include <math.h>
+#include <type_traits>
+
+constexpr auto sycl_write = cl::sycl::access::mode::write;
+// Kernel name type: one distinct kernel per (To, From) conversion pair.
+template <typename To, typename From>
+class BitCastKernel;
+// Bit-casts ValueToConvert to To inside a single_task kernel and returns it.
+template <typename To, typename From>
+To doBitCast(const From &ValueToConvert) {
+  std::vector<To> Converted(1);
+  {
+    sycl::buffer<To, 1> OutBuf(Converted.data(), 1);
+    sycl::queue Q;
+    Q.submit([&](sycl::handler &Handler) {
+      auto Out = OutBuf.template get_access<sycl_write>(Handler);
+      Handler.single_task<class BitCastKernel<To, From>>([=]() {
+        // TODO: change to sycl::bit_cast in the future
+        Out[0] = sycl::detail::bit_cast<To>(ValueToConvert);
+      });
+    });
+  } // OutBuf is destroyed here, before the host reads Converted.
+  return Converted.front();
+}
+
+template <typename To, typename From>
+int test(const From &Value) {
+  const From RoundTripped = doBitCast<From>(doBitCast<To>(Value));
+  bool Matches = false; // set by whichever comparison fits From
+  if (std::is_integral<From>::value) {
+    Matches = (Value == RoundTripped);
+  } else if (std::is_same<From, cl::sycl::half>::value || std::is_floating_point<From>::value) {
+    static const float Epsilon = 0.0000001f;
+    Matches = fabs(Value - RoundTripped) < Epsilon;
+  } else {
+    std::cerr << "Type " << typeid(From).name() << " neither integral nor floating point nor cl::sycl::half\n";
+    return 1;
+  }
+  if (Matches) {
+    std::cout << "PASS\n";
+    return 0;
+  }
+  std::cerr << "FAIL: Original value which is " << Value << " != value converted two times which is " << RoundTripped << "\n";
+  return 1;
+}
+
+int main() {
+  int Failures = 0; // test() returns 1 on failure, so this counts failures
+  // 16-bit round trips: half <-> (unsigned) short.
+  std::cout << "cl::sycl::half to unsigned short ...\n";
+  Failures += test<unsigned short>(cl::sycl::half(1.0f));
+
+  std::cout << "unsigned short to cl::sycl::half ...\n";
+  Failures += test<cl::sycl::half>(static_cast<unsigned short>(16384));
+
+  std::cout << "cl::sycl::half to short ...\n";
+  Failures += test<short>(cl::sycl::half(1.0f));
+
+  std::cout << "short to cl::sycl::half ...\n";
+  Failures += test<cl::sycl::half>(static_cast<short>(16384));
+  // 32-bit round trips: float <-> (unsigned) int.
+  std::cout << "int to float ...\n";
+  Failures += test<float>(2);
+
+  std::cout << "float to int ...\n";
+  Failures += test<int>(-2.4f);
+
+  std::cout << "unsigned int to float ...\n";
+  Failures += test<float>(6u);
+
+  std::cout << "float to unsigned int ...\n";
+  Failures += test<unsigned int>(-2.4f);
+
+  return Failures;
+}
diff --git a/SYCL/Basic/built-ins/nan.cpp b/SYCL/Basic/built-ins/nan.cpp
new file mode 100644
index 0000000000..5c0b2c3233
--- /dev/null
+++ b/SYCL/Basic/built-ins/nan.cpp
@@ -0,0 +1,72 @@
+// RUN: %clangxx -fsycl %s -o %t.out
+// RUN: %clangxx -fsycl -D HALF_IS_SUPPORTED %s -o %t_gpu.out
+// RUN: %HOST_RUN_PLACEHOLDER %t.out
+// RUN: %CPU_RUN_PLACEHOLDER %t.out
+// RUN: %GPU_RUN_PLACEHOLDER %t_gpu.out
+// RUN: %ACC_RUN_PLACEHOLDER %t.out
+// XFAIL: cuda
+#include <CL/sycl.hpp>
+
+#include <cassert>
+
+namespace s = cl::sycl;
+using namespace std;
+
+template <typename T, typename R, bool Expected = true> void test_nan_call() {
+  static_assert(Expected == std::is_same<R, decltype(s::nan(T{0}))>::value, "");
+} // compile-time only: nothing is executed when this is called
+
+template <typename, typename> struct test; // kernel name type for check_nan
+// Runs s::nan for T and vec<T, 2> on the queue's device; asserts NaN results.
+template <typename T, typename R> void check_nan(s::queue &Queue) {
+  R Data{0};
+  s::vec<R, 2> VData{0};
+  {
+    s::buffer<R, 1> ScalarBuf(&Data, s::range<1>(1));
+    s::buffer<s::vec<R, 2>, 1> VectorBuf(&VData, s::range<1>(1));
+    Queue.submit([&](s::handler &Handler) {
+      auto ScalarOut = ScalarBuf.template get_access<s::access::mode::write>(Handler);
+      auto VectorOut = VectorBuf.template get_access<s::access::mode::write>(Handler);
+      Handler.single_task<test<T, R>>([=]() {
+        ScalarOut[0] = s::nan(T{0});
+        VectorOut[0] = s::nan(s::vec<T, 2>{0});
+      });
+    });
+    Queue.wait_and_throw();
+  } // both buffers are destroyed before the host inspects Data and VData
+  assert(s::isnan(Data));
+  assert(s::all(s::isnan(VData)));
+}
+
+int main() { // return-type checks first (compile time), then device runs
+  test_nan_call<s::ushort, s::half>();
+  test_nan_call<s::uint, float>();
+  test_nan_call<s::ulong, double>();
+  test_nan_call<s::ulonglong, double>();
+  test_nan_call<s::ushort2, s::half2>();
+  test_nan_call<s::uint2, s::float2>();
+  test_nan_call<s::ulong2, s::double2>();
+  test_nan_call<s::ulonglong2, s::double2>();
+  // Asynchronous errors are only printed to stderr; the handler does not abort.
+  s::queue Queue([](cl::sycl::exception_list ExceptionList) {
+    for (cl::sycl::exception_ptr_class ExceptionPtr : ExceptionList) {
+      try {
+        std::rethrow_exception(ExceptionPtr);
+      } catch (const cl::sycl::exception &E) {
+        std::cerr << E.what() << std::endl;
+      } catch (...) {
+        std::cerr << "Unknown async exception was caught." << std::endl;
+      }
+    }
+  });
+#ifdef HALF_IS_SUPPORTED
+  if (Queue.get_device().has_extension("cl_khr_fp16")) // skipped without fp16
+    check_nan<unsigned short, s::half>(Queue);
+#endif
+  check_nan<unsigned int, float>(Queue);
+  if (Queue.get_device().has_extension("cl_khr_fp64")) { // double needs fp64
+    check_nan<unsigned long, double>(Queue);
+    check_nan<unsigned long long, double>(Queue);
+  }
+  return 0;
+}
diff --git a/SYCL/Basic/built-ins/printf.cpp b/SYCL/Basic/built-ins/printf.cpp
new file mode 100644
index 0000000000..88d4e36b02
--- /dev/null
+++ b/SYCL/Basic/built-ins/printf.cpp
@@ -0,0 +1,134 @@
+// UNSUPPORTED: cuda
+// CUDA does not support printf.
+//
+// RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple %s -o %t.out
+// RUN: %HOST_RUN_PLACEHOLDER %t.out | FileCheck %s
+// RUN: %CPU_RUN_PLACEHOLDER %t.out %CPU_CHECK_PLACEHOLDER
+// RUN: %GPU_RUN_PLACEHOLDER %t.out %GPU_CHECK_PLACEHOLDER
+// RUN: %ACC_RUN_PLACEHOLDER %t.out %ACC_CHECK_PLACEHOLDER
+// XFAIL: cpu, accelerator
+#include <CL/sycl.hpp>
+
+#include <cstdint>
+#include <iostream>
+
+using namespace cl::sycl;
+
+// According to OpenCL C spec, the format string must be in constant address
+// space
+#ifdef __SYCL_DEVICE_ONLY__
+#define CONSTANT __attribute__((opencl_constant))
+#else
+#define CONSTANT
+#endif
+
+// This is one of the possible ways to define a format string in a correct
+// address space
+static const CONSTANT char format_hello_world[] = "Hello, World!\n";
+
+// Static isn't really needed if you define it in global scope
+const CONSTANT char format_int[] = "%d\n";
+
+static const CONSTANT char format_vec[] = "%d,%d,%d,%d\n";
+
+const CONSTANT char format_hello_world_2[] = "%lu: Hello, World!\n";
+
+int main() { // Prints from two kernels; the RUN lines pipe the output to FileCheck.
+  {
+    default_selector Selector;
+    queue Queue(Selector);
+
+    Queue.submit([&](handler &CGH) {
+      CGH.single_task<class integral>([=]() {
+        // String
+        intel::experimental::printf(format_hello_world);
+        // Due to a bug in Intel CPU Runtime for OpenCL on Windows, information
+        // printed using such format strings (without %-specifiers) might
+        // appear in different order if output is redirected to a file or
+        // another app
+        // FIXME: strictly check output order once the bug is fixed
+        // CHECK: {{(Hello, World!)?}}
+
+        // Integral types
+        intel::experimental::printf(format_int, (int32_t)123);
+        intel::experimental::printf(format_int, (int32_t)-123);
+        // CHECK: 123
+        // CHECK-NEXT: -123
+
+        // Floating point types
+        {
+          // You can declare format string in non-global scope, but in this case
+          // static keyword is required
+          static const CONSTANT char format[] = "%f\n";
+          intel::experimental::printf(format, 33.4f);
+          intel::experimental::printf(format, -33.4f);
+        }
+        // CHECK-NEXT: 33.4
+        // CHECK-NEXT: -33.4
+
+        // Vectors
+        cl::sycl::vec<int, 4> v4{5, 6, 7, 8};
+#ifdef __SYCL_DEVICE_ONLY__
+        // On device side, vectors can be printed via native OpenCL types:
+        using ocl_int4 = cl::sycl::vec<int, 4>::vector_t;
+        {
+          static const CONSTANT char format[] = "%v4d\n";
+          intel::experimental::printf(format, (ocl_int4)v4);
+        }
+
+        // However, you are still able to print them by-element:
+        {
+          intel::experimental::printf(format_vec, (int32_t)v4.w(),
+                                      (int32_t)v4.z(), (int32_t)v4.y(),
+                                      (int32_t)v4.x());
+        }
+#else
+        // On host side you always have to print them by-element:
+        intel::experimental::printf(format_vec, (int32_t)v4.x(),
+                                    (int32_t)v4.y(), (int32_t)v4.z(),
+                                    (int32_t)v4.w());
+        intel::experimental::printf(format_vec, (int32_t)v4.w(),
+                                    (int32_t)v4.z(), (int32_t)v4.y(),
+                                    (int32_t)v4.x());
+#endif // __SYCL_DEVICE_ONLY__
+       // CHECK-NEXT: 5,6,7,8
+       // CHECK-NEXT: 8,7,6,5
+
+        // Pointers
+        int a = 5;
+        int *Ptr = &a;
+        // According to OpenCL spec, argument should be a void pointer
+        {
+          static const CONSTANT char format[] = "%p\n";
+          intel::experimental::printf(format, (void *)Ptr);
+        }
+        // CHECK-NEXT: {{(0x)?[0-9a-fA-F]+$}}
+      });
+    });
+    Queue.wait(); // finish the single_task before submitting the parallel_for
+
+    // printf in parallel_for
+    Queue.submit([&](handler &CGH) {
+      CGH.parallel_for<class stream_string>(range<1>(10), [=](id<1> i) {
+        // cast to uint64_t to be sure that we pass 64-bit unsigned value
+        intel::experimental::printf(format_hello_world_2, (uint64_t)i.get(0));
+      });
+    });
+    Queue.wait(); // range<1>(10): ten work-items, one line each, matched below
+    // CHECK-NEXT: {{[0-9]+}}: Hello, World!
+    // CHECK-NEXT: {{[0-9]+}}: Hello, World!
+    // CHECK-NEXT: {{[0-9]+}}: Hello, World!
+    // CHECK-NEXT: {{[0-9]+}}: Hello, World!
+    // CHECK-NEXT: {{[0-9]+}}: Hello, World!
+    // CHECK-NEXT: {{[0-9]+}}: Hello, World!
+    // CHECK-NEXT: {{[0-9]+}}: Hello, World!
+    // CHECK-NEXT: {{[0-9]+}}: Hello, World!
+    // CHECK-NEXT: {{[0-9]+}}: Hello, World!
+    // CHECK-NEXT: {{[0-9]+}}: Hello, World!
+  }
+
+// FIXME: strictly check output order once the bug mentioned above is fixed
+// CHECK: {{(Hello, World!)?}}
+
+  return 0;
+}
diff --git a/SYCL/Basic/built-ins/scalar_common.cpp b/SYCL/Basic/built-ins/scalar_common.cpp
new file mode 100644
index 0000000000..89abf11c1a
--- /dev/null
+++ b/SYCL/Basic/built-ins/scalar_common.cpp
@@ -0,0 +1,34 @@
+// RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple %s -o %t.out
+// RUN: %HOST_RUN_PLACEHOLDER %t.out
+// RUN: %CPU_RUN_PLACEHOLDER %t.out
+// RUN: %GPU_RUN_PLACEHOLDER %t.out
+// RUN: %ACC_RUN_PLACEHOLDER %t.out
+
+// TODO: ptxas fatal   : Unresolved extern function '_Z23__spirv_ocl_fmax_commonff'
+// XFAIL: cuda
+
+#include <CL/sycl.hpp>
+
+#include <cassert>
+
+namespace s = cl::sycl;
+
+int main() {
+  // max: the larger of two floats, computed in a single_task kernel
+  {
+    s::cl_float r = 0;
+    {
+      s::buffer<s::cl_float, 1> ResultBuf(&r, s::range<1>(1));
+      s::queue Q;
+      Q.submit([&](s::handler &Handler) {
+        auto Out = ResultBuf.get_access<s::access::mode::write>(Handler);
+        Handler.single_task<class maxF1F1>([=]() {
+          Out[0] = s::max(s::cl_float{0.5f}, s::cl_float{2.3f});
+        });
+      });
+    } // ResultBuf is destroyed here, before r is read on the host
+    assert(r == 2.3f);
+  }
+
+  return 0;
+}
diff --git a/SYCL/Basic/built-ins/scalar_geometric.cpp b/SYCL/Basic/built-ins/scalar_geometric.cpp
new file mode 100644
index 0000000000..c63dcbbfc6
--- /dev/null
+++ b/SYCL/Basic/built-ins/scalar_geometric.cpp
@@ -0,0 +1,131 @@
+// RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple %s -o %t.out
+// RUN: %HOST_RUN_PLACEHOLDER %t.out
+// RUN: %CPU_RUN_PLACEHOLDER %t.out
+// RUN: %GPU_RUN_PLACEHOLDER %t.out
+// RUN: %ACC_RUN_PLACEHOLDER %t.out
+
+// TODO: ptxas fatal   : Unresolved extern function '_Z12__spirv_FMulff'
+// XFAIL: cuda
+
+#include <CL/sycl.hpp>
+
+#include <cassert>
+
+namespace s = cl::sycl;
+
+int main() {
+  // dot: with scalar arguments the expected value is the product 0.5 * 1.6
+  {
+    s::cl_float r = 0;
+    {
+      s::buffer<s::cl_float, 1> ResultBuf(&r, s::range<1>(1));
+      s::queue Q;
+      Q.submit([&](s::handler &Handler) {
+        auto Out = ResultBuf.get_access<s::access::mode::write>(Handler);
+        Handler.single_task<class dotF1F1>([=]() {
+          Out[0] = s::dot(s::cl_float{0.5}, s::cl_float{1.6});
+        });
+      });
+    }
+    assert(r == 0.8f);
+  }
+
+  // distance: |1 - 3| = 2
+  {
+    s::cl_float r = 0;
+    {
+      s::buffer<s::cl_float, 1> ResultBuf(&r, s::range<1>(1));
+      s::queue Q;
+      Q.submit([&](s::handler &Handler) {
+        auto Out = ResultBuf.get_access<s::access::mode::write>(Handler);
+        Handler.single_task<class distanceF1>([=]() {
+          Out[0] = s::distance(s::cl_float{1.f}, s::cl_float{3.f});
+        });
+      });
+    }
+    assert(r == 2.f);
+  }
+
+  // length: |1| = 1
+  {
+    s::cl_float r = 0;
+    {
+      s::buffer<s::cl_float, 1> ResultBuf(&r, s::range<1>(1));
+      s::queue Q;
+      Q.submit([&](s::handler &Handler) {
+        auto Out = ResultBuf.get_access<s::access::mode::write>(Handler);
+        Handler.single_task<class lengthF1>([=]() {
+          Out[0] = s::length(s::cl_float{1.f});
+        });
+      });
+    }
+    assert(r == 1.f);
+  }
+
+  // normalize: a positive scalar normalizes to 1
+  {
+    s::cl_float r = 0;
+    {
+      s::buffer<s::cl_float, 1> ResultBuf(&r, s::range<1>(1));
+      s::queue Q;
+      Q.submit([&](s::handler &Handler) {
+        auto Out = ResultBuf.get_access<s::access::mode::write>(Handler);
+        Handler.single_task<class normalizeF1>([=]() {
+          Out[0] = s::normalize(s::cl_float{2.f});
+        });
+      });
+    }
+    assert(r == 1.f);
+  }
+
+  // fast_distance: same inputs and expected value as distance
+  {
+    s::cl_float r = 0;
+    {
+      s::buffer<s::cl_float, 1> ResultBuf(&r, s::range<1>(1));
+      s::queue Q;
+      Q.submit([&](s::handler &Handler) {
+        auto Out = ResultBuf.get_access<s::access::mode::write>(Handler);
+        Handler.single_task<class fast_distanceF1>([=]() {
+          Out[0] = s::fast_distance(s::cl_float{1.f}, s::cl_float{3.f});
+        });
+      });
+    }
+    assert(r == 2.f);
+  }
+
+  // fast_length: |2| = 2
+  {
+    s::cl_float r = 0;
+    {
+      s::buffer<s::cl_float, 1> ResultBuf(&r, s::range<1>(1));
+      s::queue Q;
+      Q.submit([&](s::handler &Handler) {
+        auto Out = ResultBuf.get_access<s::access::mode::write>(Handler);
+        Handler.single_task<class fast_lengthF1>([=]() {
+          Out[0] = s::fast_length(s::cl_float{2.f});
+        });
+      });
+    }
+    assert(r == 2.f);
+  }
+
+  // fast_normalize: same input and expected value as normalize
+  {
+    s::cl_float r = 0;
+    {
+      s::buffer<s::cl_float, 1> ResultBuf(&r, s::range<1>(1));
+      s::queue Q;
+      Q.submit([&](s::handler &Handler) {
+        auto Out = ResultBuf.get_access<s::access::mode::write>(Handler);
+        Handler.single_task<class fast_normalizeF1>([=]() {
+          Out[0] = s::fast_normalize(s::cl_float{2.f});
+        });
+      });
+    }
+
+    assert(r == 1.f);
+  }
+
+  return 0;
+}
diff --git a/SYCL/Basic/built-ins/scalar_integer.cpp b/SYCL/Basic/built-ins/scalar_integer.cpp
new file mode 100644
index 0000000000..6a53654fb4
--- /dev/null
+++ b/SYCL/Basic/built-ins/scalar_integer.cpp
@@ -0,0 +1,571 @@
+// RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple %s -o %t.out
+// RUN: %HOST_RUN_PLACEHOLDER %t.out
+// RUN: %CPU_RUN_PLACEHOLDER %t.out
+// RUN: %GPU_RUN_PLACEHOLDER %t.out
+// RUN: %ACC_RUN_PLACEHOLDER %t.out
+
+// TODO: ptxas fatal   : Unresolved extern function '_Z17__spirv_ocl_s_maxii'
+// XFAIL: cuda
+
+#include <CL/sycl.hpp>
+
+#include <array>
+#include <cassert>
+
+namespace s = cl::sycl;
+
+int main() { // each scope runs one integer built-in in a kernel and asserts r
+  // max
+  {
+    s::cl_int r{ 0 };
+    {
+      s::buffer<s::cl_int, 1> BufR(&r, s::range<1>(1));
+      s::queue myQueue;
+      myQueue.submit([&](s::handler &cgh) {
+        auto AccR = BufR.get_access<s::access::mode::write>(cgh);
+        cgh.single_task<class maxSI1SI1>([=]() {
+          AccR[0] = s::max(s::cl_int{ 5 }, s::cl_int{ 2 });
+        });
+      });
+    }
+    assert(r == 5);
+  }
+
+  // max
+  {
+    s::cl_uint r{ 0 };
+    {
+      s::buffer<s::cl_uint, 1> BufR(&r, s::range<1>(1));
+      s::queue myQueue;
+      myQueue.submit([&](s::handler &cgh) {
+        auto AccR = BufR.get_access<s::access::mode::write>(cgh);
+        cgh.single_task<class maxUI1UI1>([=]() {
+          AccR[0] = s::max(s::cl_uint{ 5 }, s::cl_uint{ 2 });
+        });
+      });
+    }
+    assert(r == 5);
+  }
+
+  // min
+  {
+    s::cl_int r{ 0 };
+    {
+      s::buffer<s::cl_int, 1> BufR(&r, s::range<1>(1));
+      s::queue myQueue;
+      myQueue.submit([&](s::handler &cgh) {
+        auto AccR = BufR.get_access<s::access::mode::write>(cgh);
+        cgh.single_task<class minSI1SI1>([=]() {
+          AccR[0] = s::min(s::cl_int{ 5 }, s::cl_int{ 2 });
+        });
+      });
+    }
+    assert(r == 2);
+  }
+
+  // min (longlong)
+  {
+    s::longlong r{ 0 };
+    {
+      s::buffer<s::longlong, 1> BufR(&r, s::range<1>(1));
+      s::queue myQueue;
+      myQueue.submit([&](s::handler &cgh) {
+        auto AccR = BufR.get_access<s::access::mode::write>(cgh);
+        cgh.single_task<class minSLL1SLL1>([=]() {
+          AccR[0] = s::min(s::longlong{ 5 }, s::longlong{ 2 });
+        });
+      });
+    }
+    assert(r == 2);
+  }
+
+  // min
+  {
+    s::cl_uint r{ 0 };
+    {
+      s::buffer<s::cl_uint, 1> BufR(&r, s::range<1>(1));
+      s::queue myQueue;
+      myQueue.submit([&](s::handler &cgh) {
+        auto AccR = BufR.get_access<s::access::mode::write>(cgh);
+        cgh.single_task<class minUI1UI1>([=]() {
+          AccR[0] = s::min(s::cl_uint{ 5 }, s::cl_uint{ 2 });
+        });
+      });
+    }
+    assert(r == 2);
+  }
+
+  // min (ulonglong)
+  {
+    s::ulonglong r{ 0 };
+    {
+      s::buffer<s::ulonglong, 1> BufR(&r, s::range<1>(1));
+      s::queue myQueue;
+      myQueue.submit([&](s::handler &cgh) {
+        auto AccR = BufR.get_access<s::access::mode::write>(cgh);
+        cgh.single_task<class minULL1ULL1>([=]() {
+          AccR[0] = s::min(s::ulonglong{ 5 }, s::ulonglong{ 2 });
+        });
+      });
+    }
+    assert(r == 2);
+  }
+
+  // abs
+  {
+    s::cl_uint r{ 0 };
+    {
+      s::buffer<s::cl_uint, 1> BufR(&r, s::range<1>(1));
+      s::queue myQueue;
+      myQueue.submit([&](s::handler &cgh) {
+        auto AccR = BufR.get_access<s::access::mode::write>(cgh);
+        cgh.single_task<class absSI1>([=]() {
+          AccR[0] = s::abs(s::cl_int{ -5 });
+        });
+      });
+    }
+    assert(r == 5);
+  }
+
+  // abs_diff
+  {
+    s::cl_uint r{ 0 };
+    {
+      s::buffer<s::cl_uint, 1> BufR(&r, s::range<1>(1));
+      s::queue myQueue;
+      myQueue.submit([&](s::handler &cgh) {
+        auto AccR = BufR.get_access<s::access::mode::write>(cgh);
+        cgh.single_task<class abs_diffSI1SI1>([=]() {
+          AccR[0] = s::abs_diff(s::cl_int{ -5 }, s::cl_int{ -1 });
+        });
+      });
+    }
+    assert(r == 4);
+  }
+
+  // abs_diff(uchar)
+  {
+    s::cl_uchar r{ 0 };
+    {
+      s::buffer<s::cl_uchar, 1> BufR(&r, s::range<1>(1));
+      s::queue myQueue;
+      myQueue.submit([&](s::handler &cgh) {
+        auto AccR = BufR.get_access<s::access::mode::write>(cgh);
+        cgh.single_task<class abs_diffUC1UC1>([=]() {
+          AccR[0] = s::abs_diff(s::uchar{ 3 }, s::uchar{ 250 });
+        });
+      });
+    }
+    assert(r == 247);
+  }
+
+  // add_sat
+  {
+    s::cl_int r{ 0 };
+    {
+      s::buffer<s::cl_int, 1> BufR(&r, s::range<1>(1));
+      s::queue myQueue;
+      myQueue.submit([&](s::handler &cgh) {
+        auto AccR = BufR.get_access<s::access::mode::write>(cgh);
+        cgh.single_task<class add_satSI1SI1>([=]() {
+          AccR[0] = s::add_sat(s::cl_int{ 0x7FFFFFFF }, s::cl_int{ 100 });
+        });
+      });
+    }
+    assert(r == 0x7FFFFFFF);
+  }
+
+  // hadd
+  {
+    s::cl_int r{ 0 };
+    {
+      s::buffer<s::cl_int, 1> BufR(&r, s::range<1>(1));
+      s::queue myQueue;
+      myQueue.submit([&](s::handler &cgh) {
+        auto AccR = BufR.get_access<s::access::mode::write>(cgh);
+        cgh.single_task<class haddSI1SI1>([=]() {
+          AccR[0] = s::hadd(s::cl_int{ 0x0000007F }, s::cl_int{ 0x00000020 });
+        });
+      });
+    }
+    assert(r == 0x0000004F); // (0x7F + 0x20) >> 1
+  }
+
+  // rhadd
+  {
+    s::cl_int r{ 0 };
+    {
+      s::buffer<s::cl_int, 1> BufR(&r, s::range<1>(1));
+      s::queue myQueue;
+      myQueue.submit([&](s::handler &cgh) {
+        auto AccR = BufR.get_access<s::access::mode::write>(cgh);
+        cgh.single_task<class rhaddSI1SI1>([=]() {
+          AccR[0] = s::rhadd(s::cl_int{ 0x0000007F }, s::cl_int{ 0x00000020 });
+        });
+      });
+    }
+    assert(r == 0x50); // (0x7F + 0x20 + 1) >> 1
+  }
+
+  // clamp
+  {
+    s::cl_int r{ 0 };
+    {
+      s::buffer<s::cl_int, 1> BufR(&r, s::range<1>(1));
+      s::queue myQueue;
+      myQueue.submit([&](s::handler &cgh) {
+        auto AccR = BufR.get_access<s::access::mode::write>(cgh);
+        cgh.single_task<class clampSI1SI1SI1>([=]() {
+          AccR[0] = s::clamp(s::cl_int{ 5 }, s::cl_int{ 10 }, s::cl_int{ 30 });
+        });
+      });
+    }
+    assert(r == 10);
+  }
+
+  // clz
+  {
+    s::cl_int r{ 0 };
+    {
+      s::buffer<s::cl_int, 1> BufR(&r, s::range<1>(1));
+      s::queue myQueue;
+      myQueue.submit([&](s::handler &cgh) {
+        auto AccR = BufR.get_access<s::access::mode::write>(cgh);
+        cgh.single_task<class clzSI1>([=]() {
+          AccR[0] = s::clz(s::cl_int{ 0x0FFFFFFF });
+        });
+      });
+    }
+    assert(r == 4);
+  }
+
+  // ctz
+  {
+    s::cl_int r{ 0 };
+    {
+      s::buffer<s::cl_int, 1> BufR(&r, s::range<1>(1));
+      s::queue myQueue;
+      myQueue.submit([&](s::handler &cgh) {
+        auto AccR = BufR.get_access<s::access::mode::write>(cgh);
+        cgh.single_task<class ctzSI1>([=]() {
+          AccR[0] = s::intel::ctz(s::cl_int{ 0x7FFFFFF0 });
+        });
+      });
+    }
+    assert(r == 4);
+  }
+
+  // mad_hi
+  {
+    s::cl_int r{ 0 };
+    {
+      s::buffer<s::cl_int, 1> BufR(&r, s::range<1>(1));
+      s::queue myQueue;
+      myQueue.submit([&](s::handler &cgh) {
+        auto AccR = BufR.get_access<s::access::mode::write>(cgh);
+        cgh.single_task<class mad_hiSI1SI1SI1>([=]() {
+          AccR[0] = s::mad_hi(s::cl_int{ 0x10000000 }, s::cl_int{ 0x00000100 },
+                              s::cl_int{ 0x00000001 });
+        }); // 2^28 * 2^8 = 2^36 -> 0x10 00000000; high word 0x10 + 1 = 0x11.
+      });
+    }
+    assert(r == 0x11);
+  }
+
+  // mad_sat
+  {
+    s::cl_int r{ 0 };
+    {
+      s::buffer<s::cl_int, 1> BufR(&r, s::range<1>(1));
+      s::queue myQueue;
+      myQueue.submit([&](s::handler &cgh) {
+        auto AccR = BufR.get_access<s::access::mode::write>(cgh);
+        cgh.single_task<class mad_satSI1SI1SI1>([=]() {
+          AccR[0] = s::mad_sat(s::cl_int{ 0x10000000 }, s::cl_int{ 0x00000100 },
+                               s::cl_int{ 0x00000001 });
+        }); // 2^28 * 2^8 + 1 = 2^36 + 1 exceeds the cl_int range, so the result
+            // saturates to 0x7FFFFFFF.
+      });
+    }
+    assert(r == 0x7FFFFFFF);
+  }
+
+  // mad_sat test two
+  {
+    char r(0);
+    char exp(120);
+    {
+      cl::sycl::buffer<char, 1> buf(&r, cl::sycl::range<1>(1));
+      cl::sycl::queue q;
+      q.submit([&](cl::sycl::handler &cgh) {
+        auto acc = buf.get_access<cl::sycl::access::mode::write>(cgh);
+        cgh.single_task<class kernel>([=]() {
+          signed char inputData_0(-17);
+          signed char inputData_1(-10);
+          signed char inputData_2(-50);
+          acc[0] = cl::sycl::mad_sat(inputData_0, inputData_1, inputData_2);
+        });
+      });
+    }
+    assert(r == exp); // -17 * -10 + -50 = 120: the exact value of i0*i1+i2.
+                              // Only fails in vector, but passes in scalar.
+
+  }
+
+  // mul_hi
+  {
+    s::cl_int r{ 0 };
+    {
+      s::buffer<s::cl_int, 1> BufR(&r, s::range<1>(1));
+      s::queue myQueue;
+      myQueue.submit([&](s::handler &cgh) {
+        auto AccR = BufR.get_access<s::access::mode::write>(cgh);
+        cgh.single_task<class mul_hiSI1SI1>([=]() {
+          AccR[0] = s::mul_hi(s::cl_int{ 0x10000000 }, s::cl_int{ 0x00000100 });
+        }); // 2^28 * 2^8 = 2^36 -> 0x10 00000000.
+      });
+    }
+    assert(r == 0x10);
+  }
+
+  // mul_hi with negative result w/ carry
+  {
+    s::cl_int r{0};
+    {
+      s::buffer<s::cl_int, 1> BufR(&r, s::range<1>(1));
+      s::queue myQueue;
+      myQueue.submit([&](s::handler &cgh) {
+        auto AccR = BufR.get_access<s::access::mode::write>(cgh);
+        cgh.single_task<class mul_hiSI1SI2>([=]() {
+          AccR[0] = s::mul_hi(s::cl_int{-0x10000000}, s::cl_int{0x00000100});
+        }); // -2^28 * 2^8 = -2^36 -> -0x10 (FFFFFFF0) 00000000.
+      });
+    }
+    assert(r == -0x10);
+  }
+
+  // mul_hi with negative result w/o carry
+  {
+    s::cl_int r{0};
+    {
+      s::buffer<s::cl_int, 1> BufR(&r, s::range<1>(1));
+      s::queue myQueue;
+      myQueue.submit([&](s::handler &cgh) {
+        auto AccR = BufR.get_access<s::access::mode::write>(cgh);
+        cgh.single_task<class mul_hiSI1SI3>([=]() {
+          AccR[0] = s::mul_hi(s::cl_int{-0x10000000}, s::cl_int{0x00000101});
+        }); // -2^28 * (2^8 + 1) = -2^36 - 2^28 -> -0x11 (FFFFFFEF) -0x10000000
+            // (F0000000).
+      });
+    }
+    assert(r == -0x11);
+  }
+
+  // rotate
+  {
+    s::cl_int r{ 0 };
+    {
+      s::buffer<s::cl_int, 1> BufR(&r, s::range<1>(1));
+      s::queue myQueue;
+      myQueue.submit([&](s::handler &cgh) {
+        auto AccR = BufR.get_access<s::access::mode::write>(cgh);
+        cgh.single_task<class rotateSI1SI1>([=]() {
+          AccR[0] = s::rotate(s::cl_int{ 0x11100000 }, s::cl_int{ 12 });
+        });
+      });
+    }
+    assert(r == 0x00000111);
+  }
+
+  // rotate (with large rotate size)
+  {
+    s::cl_char r{ 0 };
+    {
+      s::buffer<s::cl_char, 1> BufR(&r, s::range<1>(1));
+      s::queue myQueue;
+      myQueue.submit([&](s::handler &cgh) {
+        auto AccR = BufR.get_access<s::access::mode::write>(cgh);
+        cgh.single_task<class rotateSI1SI2>([=]() {
+          AccR[0] = s::rotate(static_cast<s::cl_char>((unsigned char)0xe0),
+              s::cl_char{ 50 });
+        });
+      });
+    }
+    assert((unsigned char)r == 0x83); // 0xe0 rotated left by 50 mod 8 = 2 bits
+  }
+  // sub_sat
+  {
+    auto TestSubSat = [](s::cl_int x, s::cl_int y) {
+      s::cl_int r{ 0 };
+      {
+        s::buffer<s::cl_int, 1> BufR(&r, s::range<1>(1));
+        s::queue myQueue;
+        myQueue.submit([&](s::handler &cgh) {
+          auto AccR = BufR.get_access<s::access::mode::write>(cgh);
+          cgh.single_task<class sub_satSI1SI1>([=]() {
+            AccR[0] = s::sub_sat(x, y);
+          });
+        });
+      }
+      return r;
+    };
+    // 10 - (-2^31(minimum value)) = saturates on Maximum value
+    s::cl_int r1 = TestSubSat(10, 0x80000000);
+    assert(r1 == 0x7FFFFFFF);
+    s::cl_int r2 = TestSubSat(0x7FFFFFFF, 0xFFFFFFFF); // maximum - (-1)
+    assert(r2 == 0x7FFFFFFF);
+    s::cl_int r3 = TestSubSat(0x80000000, 0x00000001); // minimum - 1
+    assert(r3 == 0x80000000);
+    s::cl_int r4 = TestSubSat(10499, 30678); // in range: no saturation
+    assert(r4 == -20179);
+  }
+
+  // upsample - 1
+  {
+    s::cl_ushort r{ 0 };
+    {
+      s::buffer<s::cl_ushort, 1> BufR(&r, s::range<1>(1));
+      s::queue myQueue;
+      myQueue.submit([&](s::handler &cgh) {
+        auto AccR = BufR.get_access<s::access::mode::write>(cgh);
+        cgh.single_task<class upsampleUC1UC1>([=]() {
+          AccR[0] = s::upsample(s::cl_uchar{ 0x10 }, s::cl_uchar{ 0x10 });
+        });
+      });
+    }
+    assert(r == 0x1010);
+  }
+
+  // upsample - 2
+  {
+    s::cl_short r{ 0 };
+    {
+      s::buffer<s::cl_short, 1> BufR(&r, s::range<1>(1));
+      s::queue myQueue;
+      myQueue.submit([&](s::handler &cgh) {
+        auto AccR = BufR.get_access<s::access::mode::write>(cgh);
+        cgh.single_task<class upsampleSC1UC1>([=]() {
+          AccR[0] = s::upsample(s::cl_char{ 0x10 }, s::cl_uchar{ 0x10 });
+        });
+      });
+    }
+    assert(r == 0x1010);
+  }
+
+  // upsample - 3
+  {
+    s::cl_uint r{ 0 };
+    {
+      s::buffer<s::cl_uint, 1> BufR(&r, s::range<1>(1));
+      s::queue myQueue;
+      myQueue.submit([&](s::handler &cgh) {
+        auto AccR = BufR.get_access<s::access::mode::write>(cgh);
+        cgh.single_task<class upsampleUS1US1>([=]() {
+          AccR[0] = s::upsample(s::cl_ushort{ 0x0010 }, s::cl_ushort{ 0x0010 });
+        });
+      });
+    }
+    assert(r == 0x00100010);
+  }
+
+  // upsample - 4
+  {
+    s::cl_int r{ 0 };
+    {
+      s::buffer<s::cl_int, 1> BufR(&r, s::range<1>(1));
+      s::queue myQueue;
+      myQueue.submit([&](s::handler &cgh) {
+        auto AccR = BufR.get_access<s::access::mode::write>(cgh);
+        cgh.single_task<class upsampleSS1US1>([=]() {
+          AccR[0] = s::upsample(s::cl_short{ 0x0010 }, s::cl_ushort{ 0x0010 });
+        });
+      });
+    }
+    assert(r == 0x00100010);
+  }
+
+  // upsample - 5
+  {
+    s::cl_ulong r{ 0 };
+    {
+      s::buffer<s::cl_ulong, 1> BufR(&r, s::range<1>(1));
+      s::queue myQueue;
+      myQueue.submit([&](s::handler &cgh) {
+        auto AccR = BufR.get_access<s::access::mode::write>(cgh);
+        cgh.single_task<class upsampleUI1UI1>([=]() {
+          AccR[0] =
+              s::upsample(s::cl_uint{ 0x00000010 }, s::cl_uint{ 0x00000010 });
+        });
+      });
+    }
+    assert(r == 0x0000001000000010);
+  }
+
+  // upsample - 6
+  {
+    s::cl_long r{ 0 };
+    {
+      s::buffer<s::cl_long, 1> BufR(&r, s::range<1>(1));
+      s::queue myQueue;
+      myQueue.submit([&](s::handler &cgh) {
+        auto AccR = BufR.get_access<s::access::mode::write>(cgh);
+        cgh.single_task<class upsampleSI1UI1>([=]() {
+          AccR[0] =
+              s::upsample(s::cl_int{ 0x00000010 }, s::cl_uint{ 0x00000010 });
+        });
+      });
+    }
+    assert(r == 0x0000001000000010);
+  }
+
+  // popcount
+  {
+    s::cl_int r{ 0 };
+    {
+      s::buffer<s::cl_int, 1> BufR(&r, s::range<1>(1));
+      s::queue myQueue;
+      myQueue.submit([&](s::handler &cgh) {
+        auto AccR = BufR.get_access<s::access::mode::write>(cgh);
+        cgh.single_task<class popcountSI1>([=]() {
+          AccR[0] = s::popcount(s::cl_int{ 0x000000FF });
+        });
+      });
+    }
+    assert(r == 8);
+  }
+
+  // mad24
+  {
+    s::cl_int r{ 0 };
+    {
+      s::buffer<s::cl_int, 1> BufR(&r, s::range<1>(1));
+      s::queue myQueue;
+      myQueue.submit([&](s::handler &cgh) {
+        auto AccR = BufR.get_access<s::access::mode::write>(cgh);
+        cgh.single_task<class mad24SI1SI1SI1>([=]() {
+          AccR[0] =
+              s::mad24(s::cl_int(0xFFFFFFFF), s::cl_int{ 20 }, s::cl_int{ 20 });
+        });
+      });
+    }
+    assert(r == 0); // expected value treats cl_int(0xFFFFFFFF) as -1: -20 + 20
+  }
+
+  // mul24
+  {
+    s::cl_int r{ 0 };
+    {
+      s::buffer<s::cl_int, 1> BufR(&r, s::range<1>(1));
+      s::queue myQueue;
+      myQueue.submit([&](s::handler &cgh) {
+        auto AccR = BufR.get_access<s::access::mode::write>(cgh);
+        cgh.single_task<class mul24SI1SI1>([=]() {
+          AccR[0] = s::mul24(s::cl_int(0xFFFFFFFF), s::cl_int{ 20 });
+        });
+      });
+    }
+    assert(r == -20); // expected value treats cl_int(0xFFFFFFFF) as -1
+  }
+
+  return 0;
+}
diff --git a/SYCL/Basic/built-ins/scalar_math.cpp b/SYCL/Basic/built-ins/scalar_math.cpp
new file mode 100644
index 0000000000..3be5be3d12
--- /dev/null
+++ b/SYCL/Basic/built-ins/scalar_math.cpp
@@ -0,0 +1,401 @@
+// RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple %s -o %t.out
+// RUN: %HOST_RUN_PLACEHOLDER %t.out
+// RUN: %CPU_RUN_PLACEHOLDER %t.out
+// RUN: %GPU_RUN_PLACEHOLDER %t.out
+// RUN: %ACC_RUN_PLACEHOLDER %t.out
+
+#include <CL/sycl.hpp>
+
+#include <array>
+#include <cassert>
+#include <cmath>
+
+namespace s = cl::sycl;
+
+int main() {
+  // acos
+  {
+    s::cl_float r{ 0 };
+    {
+      s::buffer<s::cl_float, 1> BufR(&r, s::range<1>(1));
+      s::queue myQueue;
+      myQueue.submit([&](s::handler &cgh) {
+        auto AccR = BufR.get_access<s::access::mode::write>(cgh);
+        cgh.single_task<class acosF1>([=]() {
+          AccR[0] = s::acos(s::cl_float{ 0.5 });
+        });
+      });
+    }
+    assert(r > 1.047f && r < 1.048f); // ~1.0471975511965979
+  }
+
+  // acosh
+  {
+    s::cl_float r{ 0 };
+    {
+      s::buffer<s::cl_float, 1> BufR(&r, s::range<1>(1));
+      s::queue myQueue;
+      myQueue.submit([&](s::handler &cgh) {
+        auto AccR = BufR.get_access<s::access::mode::write>(cgh);
+        cgh.single_task<class acoshF1>([=]() {
+          AccR[0] = s::acosh(s::cl_float{ 2.4 });
+        });
+      });
+    }
+    assert(r > 1.522f && r < 1.523f); // ~1.5220793674636532
+  }
+
+  // asin
+  {
+    s::cl_float r{ 0 };
+    {
+      s::buffer<s::cl_float, 1> BufR(&r, s::range<1>(1));
+      s::queue myQueue;
+      myQueue.submit([&](s::handler &cgh) {
+        auto AccR = BufR.get_access<s::access::mode::write>(cgh);
+        cgh.single_task<class asinF1>([=]() {
+          AccR[0] = s::asin(s::cl_float{ 0.5 });
+        });
+      });
+    }
+    assert(r > 0.523f && r < 0.524f); // ~0.5235987755982989
+  }
+
+  // asinh
+  {
+    s::cl_float r{ 0 };
+    {
+      s::buffer<s::cl_float, 1> BufR(&r, s::range<1>(1));
+      s::queue myQueue;
+      myQueue.submit([&](s::handler &cgh) {
+        auto AccR = BufR.get_access<s::access::mode::write>(cgh);
+        cgh.single_task<class asinhF1>([=]() {
+          AccR[0] = s::asinh(s::cl_float{ 0.5 });
+        });
+      });
+    }
+    assert(r > 0.481f && r < 0.482f); // ~0.48121182505960347
+  }
+
+  // atan
+  {
+    s::cl_float r{ 0 };
+    {
+      s::buffer<s::cl_float, 1> BufR(&r, s::range<1>(1));
+      s::queue myQueue;
+      myQueue.submit([&](s::handler &cgh) {
+        auto AccR = BufR.get_access<s::access::mode::write>(cgh);
+        cgh.single_task<class atanF1>([=]() {
+          AccR[0] = s::atan(s::cl_float{ 0.5 });
+        });
+      });
+    }
+    assert(r > 0.463f && r < 0.464f); // ~0.4636476090008061
+  }
+
+  // atanh
+  {
+    s::cl_float r{ 0 };
+    {
+      s::buffer<s::cl_float, 1> BufR(&r, s::range<1>(1));
+      s::queue myQueue;
+      myQueue.submit([&](s::handler &cgh) {
+        auto AccR = BufR.get_access<s::access::mode::write>(cgh);
+        cgh.single_task<class atanhF1>([=]() {
+          AccR[0] = s::atanh(s::cl_float{ 0.5 });
+        });
+      });
+    }
+    assert(r > 0.549f && r < 0.550f); // ~0.5493061443340549
+  }
+
+  // cbrt - device result may differ by a few ULP, so check a tolerance window
+  {
+    s::cl_float r{ 0 };
+    {
+      s::buffer<s::cl_float, 1> BufR(&r, s::range<1>(1));
+      s::queue myQueue;
+      myQueue.submit([&](s::handler &cgh) {
+        auto AccR = BufR.get_access<s::access::mode::write>(cgh);
+        cgh.single_task<class cbrtF1>([=]() {
+          AccR[0] = s::cbrt(s::cl_float{ 27.0 });
+        });
+      });
+    }
+    assert(r > 2.999f && r < 3.001f); // ~3.0
+  }
+
+  // ceil
+  {
+    s::cl_float r{ 0 };
+    {
+      s::buffer<s::cl_float, 1> BufR(&r, s::range<1>(1));
+      s::queue myQueue;
+      myQueue.submit([&](s::handler &cgh) {
+        auto AccR = BufR.get_access<s::access::mode::write>(cgh);
+        cgh.single_task<class ceilF1>([=]() {
+          AccR[0] = s::ceil(s::cl_float{ 0.5 });
+        });
+      });
+    }
+    assert(r == 1.f);
+  }
+
+  // cos
+  {
+    s::cl_float r{ 0 };
+    {
+      s::buffer<s::cl_float, 1> BufR(&r, s::range<1>(1));
+      s::queue myQueue;
+      myQueue.submit([&](s::handler &cgh) {
+        auto AccR = BufR.get_access<s::access::mode::write>(cgh);
+        cgh.single_task<class cosF1>([=]() {
+          AccR[0] = s::cos(s::cl_float{ 0.5 });
+        });
+      });
+    }
+    assert(r > 0.877f && r < 0.878f); // ~0.8775825618903728
+  }
+
+  // cosh
+  {
+    s::cl_float r{ 0 };
+    {
+      s::buffer<s::cl_float, 1> BufR(&r, s::range<1>(1));
+      s::queue myQueue;
+      myQueue.submit([&](s::handler &cgh) {
+        auto AccR = BufR.get_access<s::access::mode::write>(cgh);
+        cgh.single_task<class coshF1>([=]() {
+          AccR[0] = s::cosh(s::cl_float{ 0.5 });
+        });
+      });
+    }
+    assert(r > 1.127f && r < 1.128f); // ~1.1276259652063807
+  }
+
+  // cospi
+  {
+    s::cl_float r{ 0 };
+    {
+      s::buffer<s::cl_float, 1> BufR(&r, s::range<1>(1));
+      s::queue myQueue;
+      myQueue.submit([&](s::handler &cgh) {
+        auto AccR = BufR.get_access<s::access::mode::write>(cgh);
+        cgh.single_task<class cospiF1>([=]() {
+          AccR[0] = s::cospi(s::cl_float{ 0.1 });
+        });
+      });
+    }
+    assert(r > 0.951f && r < 0.952f); // ~0.9510565162951535
+  }
+
+  // erfc
+  {
+    s::cl_float r{ 0 };
+    {
+      s::buffer<s::cl_float, 1> BufR(&r, s::range<1>(1));
+      s::queue myQueue;
+      myQueue.submit([&](s::handler &cgh) {
+        auto AccR = BufR.get_access<s::access::mode::write>(cgh);
+        cgh.single_task<class erfcF1>([=]() {
+          AccR[0] = s::erfc(s::cl_float{ 0.5 });
+        });
+      });
+    }
+    assert(r > 0.479f && r < 0.480f); // ~0.4795001221869535
+  }
+
+  // erf
+  {
+    s::cl_float r{ 0 };
+    {
+      s::buffer<s::cl_float, 1> BufR(&r, s::range<1>(1));
+      s::queue myQueue;
+      myQueue.submit([&](s::handler &cgh) {
+        auto AccR = BufR.get_access<s::access::mode::write>(cgh);
+        cgh.single_task<class erfF1>([=]() {
+          AccR[0] = s::erf(s::cl_float{ 0.5 });
+        });
+      });
+    }
+    assert(r > 0.520f && r < 0.521f); // ~0.5204998778130465
+  }
+
+  // exp
+  {
+    s::cl_float r{ 0 };
+    {
+      s::buffer<s::cl_float, 1> BufR(&r, s::range<1>(1));
+      s::queue myQueue;
+      myQueue.submit([&](s::handler &cgh) {
+        auto AccR = BufR.get_access<s::access::mode::write>(cgh);
+        cgh.single_task<class expF1>([=]() {
+          AccR[0] = s::exp(s::cl_float{ 0.5 });
+        });
+      });
+    }
+    assert(r > 1.648f && r < 1.649f); // ~1.6487212707001282
+  }
+
+  // exp2 - device result may differ by a few ULP, so check a tolerance window
+  {
+    s::cl_float r{ 0 };
+    {
+      s::buffer<s::cl_float, 1> BufR(&r, s::range<1>(1));
+      s::queue myQueue;
+      myQueue.submit([&](s::handler &cgh) {
+        auto AccR = BufR.get_access<s::access::mode::write>(cgh);
+        cgh.single_task<class exp2F1>([=]() {
+          AccR[0] = s::exp2(s::cl_float{ 8.0 });
+        });
+      });
+    }
+    assert(r > 255.999f && r < 256.001f); // ~256.0
+  }
+
+  // exp10 - device result may differ by a few ULP, so check a tolerance window
+  {
+    s::cl_float r{ 0 };
+    {
+      s::buffer<s::cl_float, 1> BufR(&r, s::range<1>(1));
+      s::queue myQueue;
+      myQueue.submit([&](s::handler &cgh) {
+        auto AccR = BufR.get_access<s::access::mode::write>(cgh);
+        cgh.single_task<class exp10F1>([=]() {
+          AccR[0] = s::exp10(s::cl_float{ 2 });
+        });
+      });
+    }
+    assert(r > 99.999f && r < 100.001f); // ~100.0
+  }
+
+  // expm1
+  {
+    s::cl_float r{ 0 };
+    {
+      s::buffer<s::cl_float, 1> BufR(&r, s::range<1>(1));
+      s::queue myQueue;
+      myQueue.submit([&](s::handler &cgh) {
+        auto AccR = BufR.get_access<s::access::mode::write>(cgh);
+        cgh.single_task<class expm1F1>([=]() {
+          AccR[0] = s::expm1(s::cl_float{ 0.5 });
+        });
+      });
+    }
+    assert(r > 0.648f && r < 0.649f); // ~0.6487212707001282
+  }
+
+  // fabs
+  {
+    s::cl_float r{ 0 };
+    {
+      s::buffer<s::cl_float, 1> BufR(&r, s::range<1>(1));
+      s::queue myQueue;
+      myQueue.submit([&](s::handler &cgh) {
+        auto AccR = BufR.get_access<s::access::mode::write>(cgh);
+        cgh.single_task<class fabsF1>([=]() {
+          AccR[0] = s::fabs(s::cl_float{ -0.5 });
+        });
+      });
+    }
+    assert(r == 0.5f);
+  }
+
+  // floor
+  {
+    s::cl_float r{ 0 };
+    {
+      s::buffer<s::cl_float, 1> BufR(&r, s::range<1>(1));
+      s::queue myQueue;
+      myQueue.submit([&](s::handler &cgh) {
+        auto AccR = BufR.get_access<s::access::mode::write>(cgh);
+        cgh.single_task<class floorF1>([=]() {
+          AccR[0] = s::floor(s::cl_float{ 0.5 });
+        });
+      });
+    }
+    assert(r == 0.f);
+  }
+
+  // fmax
+  {
+    s::cl_float r{ 0 };
+    {
+      s::buffer<s::cl_float, 1> BufR(&r, s::range<1>(1));
+      s::queue myQueue;
+      myQueue.submit([&](s::handler &cgh) {
+        auto AccR = BufR.get_access<s::access::mode::write>(cgh);
+        cgh.single_task<class fmaxF1F1>([=]() {
+          AccR[0] = s::fmax(s::cl_float{ 0.5 }, s::cl_float{ 0.8 });
+        });
+      });
+    }
+    assert(r == 0.8f);
+  }
+
+  // fmin
+  {
+    s::cl_float r{ 0 };
+    {
+      s::buffer<s::cl_float, 1> BufR(&r, s::range<1>(1));
+      s::queue myQueue;
+      myQueue.submit([&](s::handler &cgh) {
+        auto AccR = BufR.get_access<s::access::mode::write>(cgh);
+        cgh.single_task<class fminF1F1>([=]() {
+          AccR[0] = s::fmin(s::cl_float{ 0.5 }, s::cl_float{ 0.8 });
+        });
+      });
+    }
+    assert(r == 0.5f);
+  }
+
+  // fmod
+  {
+    s::cl_float r{ 0 };
+    {
+      s::buffer<s::cl_float, 1> BufR(&r, s::range<1>(1));
+      s::queue myQueue;
+      myQueue.submit([&](s::handler &cgh) {
+        auto AccR = BufR.get_access<s::access::mode::write>(cgh);
+        cgh.single_task<class fmodF1F1>([=]() {
+          AccR[0] = s::fmod(s::cl_float{ 5.1 }, s::cl_float{ 3.0 });
+        });
+      });
+    }
+    assert(r == 2.1f);
+  }
+
+  // lgamma, positive argument (takes no pointer; result is the only output)
+  {
+    s::cl_float r{ 0 };
+    {
+      s::buffer<s::cl_float, 1> BufR(&r, s::range<1>(1));
+      s::queue myQueue;
+      myQueue.submit([&](s::handler &cgh) {
+        auto AccR = BufR.get_access<s::access::mode::write>(cgh);
+        cgh.single_task<class lgammaF1>([=]() {
+          AccR[0] = s::lgamma(s::cl_float{ 10.f });
+        });
+      });
+    }
+    assert(r > 12.8017f && r < 12.8019f); // ~12.8018
+  }
+
+  // lgamma, negative argument (takes no pointer; result is the only output)
+  {
+    s::cl_float r{ 0 };
+    {
+      s::buffer<s::cl_float, 1> BufR(&r, s::range<1>(1));
+      s::queue myQueue;
+      myQueue.submit([&](s::handler &cgh) {
+        auto AccR = BufR.get_access<s::access::mode::write>(cgh);
+        cgh.single_task<class lgammaF1_neg>([=]() {
+          AccR[0] = s::lgamma(s::cl_float{ -2.4f });
+        });
+      });
+    }
+    assert(r > 0.1024f && r < 0.1026f); // ~0.102583
+  }
+
+  return 0;
+}
diff --git a/SYCL/Basic/built-ins/scalar_math_2.cpp b/SYCL/Basic/built-ins/scalar_math_2.cpp
new file mode 100644
index 0000000000..7273842486
--- /dev/null
+++ b/SYCL/Basic/built-ins/scalar_math_2.cpp
@@ -0,0 +1,244 @@
+// RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple %s -o %t.out
+// RUN: %HOST_RUN_PLACEHOLDER %t.out
+// RUN: %CPU_RUN_PLACEHOLDER %t.out
+// RUN: %GPU_RUN_PLACEHOLDER %t.out
+// RUN: %ACC_RUN_PLACEHOLDER %t.out
+
+// TODO: ptxas fatal   : Unresolved extern function '_Z18__spirv_ocl_acospif'
+// XFAIL: cuda
+
+#include <CL/sycl.hpp>
+
+#include <array>
+#include <cassert>
+#include <cmath>
+
+namespace s = cl::sycl;
+
+int main() {
+
+  // acospi
+  {
+    s::cl_float r{0};
+    {
+      s::buffer<s::cl_float, 1> BufR(&r, s::range<1>(1));
+      s::queue myQueue;
+      myQueue.submit([&](s::handler &cgh) {
+        auto AccR = BufR.get_access<s::access::mode::write>(cgh);
+        cgh.single_task<class acospiF1>([=]() {
+          AccR[0] = s::acospi(s::cl_float{0.5});
+        });
+      });
+    }
+    assert(r > 0.333f && r < 0.334f); // ~0.33333333333333337
+  }
+
+  // asinpi
+  {
+    s::cl_float r{0};
+    {
+      s::buffer<s::cl_float, 1> BufR(&r, s::range<1>(1));
+      s::queue myQueue;
+      myQueue.submit([&](s::handler &cgh) {
+        auto AccR = BufR.get_access<s::access::mode::write>(cgh);
+        cgh.single_task<class asinpiF1>([=]() {
+          AccR[0] = s::asinpi(s::cl_float{0.5});
+        });
+      });
+    }
+    assert(r > 0.166f && r < 0.167f); // ~0.16666666666666669
+  }
+
+  // atan2
+  {
+    s::cl_float r{0};
+    {
+      s::buffer<s::cl_float, 1> BufR(&r, s::range<1>(1));
+      s::queue myQueue;
+      myQueue.submit([&](s::handler &cgh) {
+        auto AccR = BufR.get_access<s::access::mode::write>(cgh);
+        cgh.single_task<class atan2F1F1>([=]() {
+          AccR[0] = s::atan2(s::cl_float{0.5}, s::cl_float{0.5});
+        });
+      });
+    }
+    assert(r > 0.785f && r < 0.786f); // ~0.7853981633974483
+  }
+
+  // atanpi
+  {
+    s::cl_float r{0};
+    {
+      s::buffer<s::cl_float, 1> BufR(&r, s::range<1>(1));
+      s::queue myQueue;
+      myQueue.submit([&](s::handler &cgh) {
+        auto AccR = BufR.get_access<s::access::mode::write>(cgh);
+        cgh.single_task<class atanpiF1>([=]() {
+          AccR[0] = s::atanpi(s::cl_float{0.5});
+        });
+      });
+    }
+    assert(r > 0.147f && r < 0.148f); // ~0.14758361765043326
+  }
+
+  // atan2pi
+  {
+    s::cl_float r{0};
+    {
+      s::buffer<s::cl_float, 1> BufR(&r, s::range<1>(1));
+      s::queue myQueue;
+      myQueue.submit([&](s::handler &cgh) {
+        auto AccR = BufR.get_access<s::access::mode::write>(cgh);
+        cgh.single_task<class atan2piF1F1>([=]() {
+          AccR[0] = s::atan2pi(s::cl_float{0.5}, s::cl_float{0.5});
+        });
+      });
+    }
+    assert(r > 0.249f && r < 0.251f); // ~0.25
+  }
+
+  // copysign
+  {
+    s::cl_float r{0};
+    {
+      s::buffer<s::cl_float, 1> BufR(&r, s::range<1>(1));
+      s::queue myQueue;
+      myQueue.submit([&](s::handler &cgh) {
+        auto AccR = BufR.get_access<s::access::mode::write>(cgh);
+        cgh.single_task<class copysignF1F1>([=]() {
+          AccR[0] = s::copysign(s::cl_float{1}, s::cl_float{-0.5});
+        });
+      });
+    }
+    assert(r == -1.f);
+  }
+
+  // fdim
+  {
+    s::cl_float r{0};
+    {
+      s::buffer<s::cl_float, 1> BufR(&r, s::range<1>(1));
+      s::queue myQueue;
+      myQueue.submit([&](s::handler &cgh) {
+        auto AccR = BufR.get_access<s::access::mode::write>(cgh);
+        cgh.single_task<class fdimF1F1>([=]() {
+          AccR[0] = s::fdim(s::cl_float{1.6}, s::cl_float{0.6});
+        });
+      });
+    }
+    assert(r == 1.0f);
+  }
+
+  // fma
+  {
+    s::cl_float r{0};
+    {
+      s::buffer<s::cl_float, 1> BufR(&r, s::range<1>(1));
+      s::queue myQueue;
+      myQueue.submit([&](s::handler &cgh) {
+        auto AccR = BufR.get_access<s::access::mode::write>(cgh);
+        cgh.single_task<class fmaF1F1F1>([=]() {
+          AccR[0] = s::fma(s::cl_float{0.5}, s::cl_float{10.0},
+                           s::cl_float{3.0});
+        });
+      });
+    }
+    assert(r == 8.0f);
+  }
+
+  // fract with global memory
+  {
+    s::cl_float r{0};
+    s::cl_float i{999};
+    {
+      s::buffer<s::cl_float, 1> BufR(&r, s::range<1>(1));
+      s::buffer<s::cl_float, 1> BufI(&i, s::range<1>(1),
+                                     {s::property::buffer::use_host_ptr()});
+      s::queue myQueue;
+      myQueue.submit([&](s::handler &cgh) {
+        auto AccR = BufR.get_access<s::access::mode::read_write>(cgh);
+        auto AccI = BufI.get_access<s::access::mode::read_write>(cgh);
+        cgh.single_task<class fractF1GF1>([=]() {
+          s::global_ptr<s::cl_float> Iptr(AccI);
+          AccR[0] = s::fract(s::cl_float{1.5}, Iptr);
+        });
+      });
+    }
+    assert(r == 0.5f);
+    assert(i == 1.0f);
+  }
+
+  // fract with private memory
+  {
+    s::cl_float r{0};
+    s::cl_float i{999};
+    {
+      s::buffer<s::cl_float, 1> BufR(&r, s::range<1>(1));
+      s::buffer<s::cl_float, 1> BufI(&i, s::range<1>(1),
+                                     {s::property::buffer::use_host_ptr()});
+      s::queue myQueue;
+      myQueue.submit([&](s::handler &cgh) {
+        auto AccR = BufR.get_access<s::access::mode::read_write>(cgh);
+        auto AccI = BufI.get_access<s::access::mode::read_write>(cgh);
+        cgh.single_task<class fractF1PF1>([=]() {
+          s::cl_float temp(0.0);
+          s::private_ptr<s::cl_float> Iptr(&temp);
+          AccR[0] = s::fract(s::cl_float{1.5f}, Iptr);
+          AccI[0] = *Iptr;
+        });
+      });
+    }
+    assert(r == 0.5f);
+    assert(i == 1.0f);
+  }
+
+  // lgamma_r with private memory, positive argument (gamma sign is +1)
+  {
+    s::cl_float r{0};
+    s::cl_int i{999}; // sentinel: must be overwritten by the kernel
+    {
+      s::buffer<s::cl_float, 1> BufR(&r, s::range<1>(1));
+      s::buffer<s::cl_int, 1> BufI(&i, s::range<1>(1),
+                                   {s::property::buffer::use_host_ptr()});
+      s::queue myQueue;
+      myQueue.submit([&](s::handler &cgh) {
+        auto AccR = BufR.get_access<s::access::mode::read_write>(cgh);
+        auto AccI = BufI.get_access<s::access::mode::read_write>(cgh);
+        cgh.single_task<class lgamma_rF1PI1>([=]() {
+          s::cl_int temp{0}; // kernel-local storage written through Iptr
+          s::private_ptr<s::cl_int> Iptr(&temp);
+          AccR[0] = s::lgamma_r(s::cl_float{10.f}, Iptr);
+          AccI[0] = *Iptr; // copy the value lgamma_r stored back to the host
+        });
+      });
+    }
+    assert(r > 12.8017f && r < 12.8019f); // ~12.8018
+    assert(i == 1);                       // tgamma of 10 is ~362880.0
+  }
+
+  // lgamma_r with private memory, negative argument (gamma sign is -1)
+  {
+    s::cl_float r{0};
+    s::cl_int i{999}; // sentinel: must be overwritten by the kernel
+    {
+      s::buffer<s::cl_float, 1> BufR(&r, s::range<1>(1));
+      s::buffer<s::cl_int, 1> BufI(&i, s::range<1>(1),
+                                   {s::property::buffer::use_host_ptr()});
+      s::queue myQueue;
+      myQueue.submit([&](s::handler &cgh) {
+        auto AccR = BufR.get_access<s::access::mode::read_write>(cgh);
+        auto AccI = BufI.get_access<s::access::mode::read_write>(cgh);
+        cgh.single_task<class lgamma_rF1PI1_neg>([=]() {
+          s::cl_int temp{0}; // kernel-local storage written through Iptr
+          s::private_ptr<s::cl_int> Iptr(&temp);
+          AccR[0] = s::lgamma_r(s::cl_float{-2.4f}, Iptr);
+          AccI[0] = *Iptr; // copy the value lgamma_r stored back to the host
+        });
+      });
+    }
+    assert(r > 0.1024f && r < 0.1026f); // ~0.102583
+    assert(i == -1);                    // tgamma of -2.4 is ~-1.1080299470333461
+  }
+
+  return 0;
+}
diff --git a/SYCL/Basic/built-ins/scalar_relational.cpp b/SYCL/Basic/built-ins/scalar_relational.cpp
new file mode 100644
index 0000000000..a3c7b1d7df
--- /dev/null
+++ b/SYCL/Basic/built-ins/scalar_relational.cpp
@@ -0,0 +1,422 @@
+// RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple %s -o %t.out
+// RUN: %HOST_RUN_PLACEHOLDER %t.out
+// RUN: %CPU_RUN_PLACEHOLDER %t.out
+// RUN: %GPU_RUN_PLACEHOLDER %t.out
+// RUN: %ACC_RUN_PLACEHOLDER %t.out
+
+// TODO: ptxas fatal   : Unresolved extern function '_Z17__spirv_FOrdEqualff'
+// XFAIL: cuda
+
+#include <CL/sycl.hpp>
+
+#include <cassert>
+#include <cmath>
+
+namespace s = cl::sycl;
+
+int main() {
+  // isequal-float
+  {
+    s::cl_int r{ 0 };
+    {
+      s::buffer<s::cl_int, 1> BufR(&r, s::range<1>(1));
+      s::queue myQueue;
+      myQueue.submit([&](s::handler &cgh) {
+        auto AccR = BufR.get_access<s::access::mode::write>(cgh);
+        cgh.single_task<class isequalF1F1>([=]() {
+          AccR[0] = s::isequal(s::cl_float{ 10.5f }, s::cl_float{ 10.5f });
+        });
+      });
+    }
+    assert(r == 1);
+  }
+
+  // isnotequal-float
+  {
+    s::cl_int r{ 0 };
+    {
+      s::buffer<s::cl_int, 1> BufR(&r, s::range<1>(1));
+      s::queue myQueue;
+      myQueue.submit([&](s::handler &cgh) {
+        auto AccR = BufR.get_access<s::access::mode::write>(cgh);
+        cgh.single_task<class isnotequalF1F1>([=]() {
+          AccR[0] = s::isnotequal(s::cl_float{ 0.4f }, s::cl_float{ 0.5f });
+        });
+      });
+    }
+    assert(r == 1);
+  }
+
+  // isgreater-float
+  {
+    s::cl_int r{ 0 };
+    {
+      s::buffer<s::cl_int, 1> BufR(&r, s::range<1>(1));
+      s::queue myQueue;
+      myQueue.submit([&](s::handler &cgh) {
+        auto AccR = BufR.get_access<s::access::mode::write>(cgh);
+        cgh.single_task<class isgreaterF1F1>([=]() {
+          AccR[0] = s::isgreater(s::cl_float{ 0.6f }, s::cl_float{ 0.5f });
+        });
+      });
+    }
+    assert(r == 1);
+  }
+
+  // isgreaterequal-float
+  {
+    s::cl_int r{ 0 };
+    {
+      s::buffer<s::cl_int, 1> BufR(&r, s::range<1>(1));
+      s::queue myQueue;
+      myQueue.submit([&](s::handler &cgh) {
+        auto AccR = BufR.get_access<s::access::mode::write>(cgh);
+        cgh.single_task<class isgreaterequalF1F1>([=]() {
+          AccR[0] = s::isgreaterequal(s::cl_float{ 0.5f }, s::cl_float{ 0.5f });
+        });
+      });
+    }
+    assert(r == 1);
+  }
+
+  // isless-float
+  {
+    s::cl_int r{ 0 };
+    {
+      s::buffer<s::cl_int, 1> BufR(&r, s::range<1>(1));
+      s::queue myQueue;
+      myQueue.submit([&](s::handler &cgh) {
+        auto AccR = BufR.get_access<s::access::mode::write>(cgh);
+        cgh.single_task<class islessF1F1>([=]() {
+          AccR[0] = s::isless(s::cl_float{ 0.4f }, s::cl_float{ 0.5f });
+        });
+      });
+    }
+    assert(r == 1);
+  }
+
+  // islessequal-float
+  {
+    s::cl_int r{ 0 };
+    {
+      s::buffer<s::cl_int, 1> BufR(&r, s::range<1>(1));
+      s::queue myQueue;
+      myQueue.submit([&](s::handler &cgh) {
+        auto AccR = BufR.get_access<s::access::mode::write>(cgh);
+        cgh.single_task<class islessequalF1F1>([=]() {
+          AccR[0] = s::islessequal(s::cl_float{ 0.5f }, s::cl_float{ 0.5f });
+        });
+      });
+    }
+    assert(r == 1);
+  }
+
+  // islessgreater-float
+  {
+    s::cl_int r{ 1 };
+    {
+      s::buffer<s::cl_int, 1> BufR(&r, s::range<1>(1));
+      s::queue myQueue;
+      myQueue.submit([&](s::handler &cgh) {
+        auto AccR = BufR.get_access<s::access::mode::write>(cgh);
+        cgh.single_task<class islessgreaterF1F1>([=]() {
+          AccR[0] = s::islessgreater(s::cl_float{ 0.5f }, s::cl_float{ 0.5f });
+        });
+      });
+    }
+    assert(r == 0);
+  }
+
+  // isfinite-float
+  {
+    s::cl_int r{ 1 };
+    {
+      s::buffer<s::cl_int, 1> BufR(&r, s::range<1>(1));
+      s::queue myQueue;
+      myQueue.submit([&](s::handler &cgh) {
+        auto AccR = BufR.get_access<s::access::mode::write>(cgh);
+        cgh.single_task<class isfiniteF1>([=]() {
+          AccR[0] = s::isfinite(s::cl_float{ NAN });
+        });
+      });
+    }
+    assert(r == 0);
+  }
+
+  // isinf-float
+  {
+    s::cl_int r{ 0 };
+    {
+      s::buffer<s::cl_int, 1> BufR(&r, s::range<1>(1));
+      s::queue myQueue;
+      myQueue.submit([&](s::handler &cgh) {
+        auto AccR = BufR.get_access<s::access::mode::write>(cgh);
+        cgh.single_task<class isinfF1>([=]() {
+          AccR[0] = s::isinf(s::cl_float{ INFINITY });
+        });
+      });
+    }
+    assert(r == 1);
+  }
+
+  // isnan-float
+  {
+    s::cl_int r{ 0 };
+    {
+      s::buffer<s::cl_int, 1> BufR(&r, s::range<1>(1));
+      s::queue myQueue;
+      myQueue.submit([&](s::handler &cgh) {
+        auto AccR = BufR.get_access<s::access::mode::write>(cgh);
+        cgh.single_task<class isnanF1>([=]() {
+          AccR[0] = s::isnan(s::cl_float{ NAN });
+        });
+      });
+    }
+    assert(r == 1);
+  }
+
+  // isnormal-float
+  {
+    s::cl_int r{ 1 };
+    {
+      s::buffer<s::cl_int, 1> BufR(&r, s::range<1>(1));
+      s::queue myQueue;
+      myQueue.submit([&](s::handler &cgh) {
+        auto AccR = BufR.get_access<s::access::mode::write>(cgh);
+        cgh.single_task<class isnormalF1>([=]() {
+          AccR[0] = s::isnormal(s::cl_float{ INFINITY });
+        });
+      });
+    }
+    assert(r == 0);
+  }
+
+  // isnormal-double  NOTE(review): kernel uses cl_double - confirm fp64 support
+  {
+    s::cl_int r{ 1 };
+    {
+      s::buffer<s::cl_int, 1> BufR(&r, s::range<1>(1));
+      s::queue myQueue;
+      myQueue.submit([&](s::handler &cgh) {
+        auto AccR = BufR.get_access<s::access::mode::write>(cgh);
+        cgh.single_task<class isnormalD1>([=]() {
+          AccR[0] = s::isnormal(s::cl_double{ INFINITY });
+        });
+      });
+    }
+    assert(r == 0);
+  }
+
+  // isordered-float
+  {
+    s::cl_int r{ 1 };
+    {
+      s::buffer<s::cl_int, 1> BufR(&r, s::range<1>(1));
+      s::queue myQueue;
+      myQueue.submit([&](s::handler &cgh) {
+        auto AccR = BufR.get_access<s::access::mode::write>(cgh);
+        cgh.single_task<class isorderedF1F1>([=]() {
+          AccR[0] = s::isordered(s::cl_float{ 4.0f }, s::cl_float{ NAN });
+        });
+      });
+    }
+    assert(r == 0);
+  }
+
+  // isunordered-float
+  {
+    s::cl_int r{ 0 };
+    {
+      s::buffer<s::cl_int, 1> BufR(&r, s::range<1>(1));
+      s::queue myQueue;
+      myQueue.submit([&](s::handler &cgh) {
+        auto AccR = BufR.get_access<s::access::mode::write>(cgh);
+        cgh.single_task<class isunorderedF1F1>([=]() {
+          AccR[0] = s::isunordered(s::cl_float{ 4.0f }, s::cl_float{ NAN });
+        });
+      });
+    }
+    assert(r == 1);
+  }
+
+  // signbit-float
+  {
+    s::cl_int r{ 0 };
+    {
+      s::buffer<s::cl_int, 1> BufR(&r, s::range<1>(1));
+      s::queue myQueue;
+      myQueue.submit([&](s::handler &cgh) {
+        auto AccR = BufR.get_access<s::access::mode::write>(cgh);
+        cgh.single_task<class signbitF1>([=]() {
+          AccR[0] = s::signbit(s::cl_float{ -12.0f });
+        });
+      });
+    }
+    assert(r == 1);
+  }
+
+  // any-integer: positive value, most significant bit clear -> 0
+  {
+    s::cl_int r{ 0 };
+    {
+      s::buffer<s::cl_int, 1> BufR(&r, s::range<1>(1));
+      s::queue myQueue;
+      myQueue.submit([&](s::handler &cgh) {
+        auto AccR = BufR.get_access<s::access::mode::write>(cgh);
+        cgh.single_task<class anyF1positive>([=]() {
+          AccR[0] = s::any(s::cl_int{ 12 });
+        });
+      });
+    }
+    assert(r == 0);
+  }
+  // any-integer: zero, most significant bit clear -> 0
+  {
+    s::cl_int r{ 0 };
+    {
+      s::buffer<s::cl_int, 1> BufR(&r, s::range<1>(1));
+      s::queue myQueue;
+      myQueue.submit([&](s::handler &cgh) {
+        auto AccR = BufR.get_access<s::access::mode::write>(cgh);
+        cgh.single_task<class anyF1zero>([=]() {
+          AccR[0] = s::any(s::cl_int{ 0 });
+        });
+      });
+    }
+    assert(r == 0);
+  }
+
+  // any-integer: negative value, most significant bit set -> 1
+  {
+    s::cl_int r{ 0 };
+    {
+      s::buffer<s::cl_int, 1> BufR(&r, s::range<1>(1));
+      s::queue myQueue;
+      myQueue.submit([&](s::handler &cgh) {
+        auto AccR = BufR.get_access<s::access::mode::write>(cgh);
+        cgh.single_task<class anyF1negative>([=]() {
+          AccR[0] = s::any(s::cl_int{ -12 });
+        });
+      });
+    }
+    assert(r == 1);
+  }
+
+  // all-integer: positive value, most significant bit clear -> 0
+  {
+    s::cl_int r{ 0 };
+    {
+      s::buffer<s::cl_int, 1> BufR(&r, s::range<1>(1));
+      s::queue myQueue;
+      myQueue.submit([&](s::handler &cgh) {
+        auto AccR = BufR.get_access<s::access::mode::write>(cgh);
+        cgh.single_task<class allF1positive>([=]() {
+          AccR[0] = s::all(s::cl_int{ 12 });
+        });
+      });
+    }
+    assert(r == 0);
+  }
+
+  // all-integer: zero, most significant bit clear -> 0
+  {
+    s::cl_int r{ 0 };
+    {
+      s::buffer<s::cl_int, 1> BufR(&r, s::range<1>(1));
+      s::queue myQueue;
+      myQueue.submit([&](s::handler &cgh) {
+        auto AccR = BufR.get_access<s::access::mode::write>(cgh);
+        cgh.single_task<class allF1zero>([=]() {
+          AccR[0] = s::all(s::cl_int{ 0 });
+        });
+      });
+    }
+    assert(r == 0);
+  }
+
+  // all-integer: negative value, most significant bit set -> 1
+  {
+    s::cl_int r{ 0 };
+    {
+      s::buffer<s::cl_int, 1> BufR(&r, s::range<1>(1));
+      s::queue myQueue;
+      myQueue.submit([&](s::handler &cgh) {
+        auto AccR = BufR.get_access<s::access::mode::write>(cgh);
+        cgh.single_task<class allF1negative>([=]() {
+          AccR[0] = s::all(s::cl_int{ -12 });
+        });
+      });
+    }
+    assert(r == 1);
+  }
+
+  // bitselect-float
+  {
+    s::cl_float r{ 0.0f };
+    {
+      s::buffer<s::cl_float, 1> BufR(&r, s::range<1>(1));
+      s::queue myQueue;
+      myQueue.submit([&](s::handler &cgh) {
+        auto AccR = BufR.get_access<s::access::mode::write>(cgh);
+        cgh.single_task<class bitselectF1F1F1>([=]() {
+          AccR[0] = s::bitselect(s::cl_float{ 112.112 }, s::cl_float{ 34.34 },
+                                 s::cl_float{ 3.3 });
+        });
+      });
+    }
+    assert(r <= 80.5478 && r >= 80.5476); // r = 80.5477
+  }
+
+  // select-float,int: positive (non-zero) condition picks the second operand
+  {
+    s::cl_float r{ 0 };
+    {
+      s::buffer<s::cl_float, 1> BufR(&r, s::range<1>(1));
+      s::queue myQueue;
+      myQueue.submit([&](s::handler &cgh) {
+        auto AccR = BufR.get_access<s::access::mode::write>(cgh);
+        cgh.single_task<class selectF1F1I1positive>([=]() {
+          AccR[0] = s::select(s::cl_float{ 34.34 }, s::cl_float{ 123.123 },
+                              s::cl_int{ 1 });
+        });
+      });
+    }
+    assert(r <= 123.124 && r >= 123.122); // r = 123.123
+  }
+
+  // select-float,int: zero condition picks the first operand
+  {
+    s::cl_float r{ 0 };
+    {
+      s::buffer<s::cl_float, 1> BufR(&r, s::range<1>(1));
+      s::queue myQueue;
+      myQueue.submit([&](s::handler &cgh) {
+        auto AccR = BufR.get_access<s::access::mode::write>(cgh);
+        cgh.single_task<class selectF1F1I1zero>([=]() {
+          AccR[0] = s::select(s::cl_float{ 34.34 }, s::cl_float{ 123.123 },
+                              s::cl_int{ 0 });
+        });
+      });
+    }
+    assert(r <= 34.35 && r >= 34.33); // r = 34.34
+  }
+
+  // select-float,int: negative (non-zero) condition picks the second operand
+  {
+    s::cl_float r{ 0 };
+    {
+      s::buffer<s::cl_float, 1> BufR(&r, s::range<1>(1));
+      s::queue myQueue;
+      myQueue.submit([&](s::handler &cgh) {
+        auto AccR = BufR.get_access<s::access::mode::write>(cgh);
+        cgh.single_task<class selectF1F1I1negative>([=]() {
+          AccR[0] = s::select(s::cl_float{ 34.34 }, s::cl_float{ 123.123 },
+                              s::cl_int{ -1 });
+        });
+      });
+    }
+    assert(r <= 123.124 && r >= 123.122); // r = 123.123
+  }
+
+  return 0;
+}
diff --git a/SYCL/Basic/built-ins/vector_common.cpp b/SYCL/Basic/built-ins/vector_common.cpp
new file mode 100644
index 0000000000..646d7a3ef5
--- /dev/null
+++ b/SYCL/Basic/built-ins/vector_common.cpp
@@ -0,0 +1,57 @@
+// RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple %s -o %t.out
+// RUN: %HOST_RUN_PLACEHOLDER %t.out
+// RUN: %CPU_RUN_PLACEHOLDER %t.out
+// RUN: %GPU_RUN_PLACEHOLDER %t.out
+// RUN: %ACC_RUN_PLACEHOLDER %t.out
+
+// TODO: ptxas fatal   : Unresolved extern function '_Z23__spirv_ocl_fmax_commonDv2_fS_'
+// XFAIL: cuda
+
+#include <CL/sycl.hpp>
+
+#include <cassert>
+
+namespace s = cl::sycl;
+
+int main() {
+  // max (vector, vector): component-wise maximum
+  {
+    s::cl_float2 r{ 0 };
+    {
+      s::buffer<s::cl_float2, 1> BufR(&r, s::range<1>(1));
+      s::queue myQueue;
+      myQueue.submit([&](s::handler &cgh) {
+        auto AccR = BufR.get_access<s::access::mode::write>(cgh);
+        cgh.single_task<class maxF2F2>([=]() {
+          AccR[0] =
+              s::max(s::cl_float2{ 0.5f, 3.4f }, s::cl_float2{ 2.3f, 0.4f });
+        });
+      });
+    }
+    s::cl_float r1 = r.x();
+    s::cl_float r2 = r.y();
+    assert(r1 == 2.3f);
+    assert(r2 == 3.4f);
+  }
+
+  // max (vector, scalar): each component compared against the scalar
+  {
+    s::cl_float2 r{ 0 };
+    {
+      s::buffer<s::cl_float2, 1> BufR(&r, s::range<1>(1));
+      s::queue myQueue;
+      myQueue.submit([&](s::handler &cgh) {
+        auto AccR = BufR.get_access<s::access::mode::write>(cgh);
+        cgh.single_task<class maxF2F1>([=]() {
+          AccR[0] = s::max(s::cl_float2{ 0.5f, 3.4f }, s::cl_float{ 3.0f });
+        });
+      });
+    }
+    s::cl_float r1 = r.x();
+    s::cl_float r2 = r.y();
+    assert(r1 == 3.0f);
+    assert(r2 == 3.4f);
+  }
+
+  return 0;
+}
diff --git a/SYCL/Basic/built-ins/vector_geometric.cpp b/SYCL/Basic/built-ins/vector_geometric.cpp
new file mode 100644
index 0000000000..deb3048019
--- /dev/null
+++ b/SYCL/Basic/built-ins/vector_geometric.cpp
@@ -0,0 +1,171 @@
+// RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple %s -o %t.out
+// RUN: %HOST_RUN_PLACEHOLDER %t.out
+// RUN: %CPU_RUN_PLACEHOLDER %t.out
+// RUN: %GPU_RUN_PLACEHOLDER %t.out
+// RUN: %ACC_RUN_PLACEHOLDER %t.out
+
+// TODO: ptxas fatal   : Unresolved extern function '_Z11__spirv_DotDv2_fS_'
+// XFAIL: cuda
+
+#include <CL/sycl.hpp>
+
+#include <cassert>
+#include <cmath>
+
+namespace s = cl::sycl;
+
+bool isFloatEqualTo(float x, float y, float epsilon = 0.005f) {
+  return std::fabs(x - y) <= epsilon; // absolute (not relative) tolerance
+}
+
+int main() {
+  // dot(float2, float2): 1*4 + 2*6 == 16
+  {
+    s::cl_float r{ 0 };
+    {
+      s::buffer<s::cl_float, 1> BufR(&r, s::range<1>(1));
+      s::queue myQueue;
+      myQueue.submit([&](s::handler &cgh) {
+        auto AccR = BufR.get_access<s::access::mode::write>(cgh);
+        cgh.single_task<class dotF2F2>([=]() {
+          AccR[0] = s::dot(s::cl_float2{ 1.f, 2.f, }, s::cl_float2{ 4.f, 6.f });
+        });
+      });
+    } // BufR is destroyed here, before r is read back on the host
+    assert(r == 16.f);
+  }
+
+  // cross(float4, float4): both w components are 0; expects (-3, 6, -3, 0)
+  {
+    s::cl_float4 r{ 0 };
+    {
+      s::buffer<s::cl_float4, 1> BufR(&r, s::range<1>(1));
+      s::queue myQueue;
+      myQueue.submit([&](s::handler &cgh) {
+        auto AccR = BufR.get_access<s::access::mode::write>(cgh);
+        cgh.single_task<class crossF4>([=]() {
+          AccR[0] = s::cross(s::cl_float4{ 2.f, 3.f, 4.f, 0.f, },
+                             s::cl_float4{ 5.f, 6.f, 7.f, 0.f, });
+        });
+      });
+    }
+
+    s::cl_float r1 = r.x();
+    s::cl_float r2 = r.y();
+    s::cl_float r3 = r.z();
+    s::cl_float r4 = r.w();
+
+    assert(r1 == -3.f);
+    assert(r2 == 6.f);
+    assert(r3 == -3.f);
+    assert(r4 == 0.0f);
+  }
+
+  // distance(float2, float2): expects ~sqrt(8) == 2.82843, within 0.005
+  {
+    s::cl_float r{ 0 };
+    {
+      s::buffer<s::cl_float, 1> BufR(&r, s::range<1>(1));
+      s::queue myQueue;
+      myQueue.submit([&](s::handler &cgh) {
+        auto AccR = BufR.get_access<s::access::mode::write>(cgh);
+        cgh.single_task<class distanceF2>([=]() {
+          AccR[0] =
+              s::distance(s::cl_float2{ 1.f, 2.f, }, s::cl_float2{ 3.f, 4.f, });
+        });
+      });
+    }
+    assert(isFloatEqualTo(r, 2.82843f));
+  }
+
+  // length(float2{1, 2}): expects ~sqrt(5) == 2.23607, within 0.005
+  {
+    s::cl_float r{ 0 };
+    {
+      s::buffer<s::cl_float, 1> BufR(&r, s::range<1>(1));
+      s::queue myQueue;
+      myQueue.submit([&](s::handler &cgh) {
+        auto AccR = BufR.get_access<s::access::mode::write>(cgh);
+        cgh.single_task<class lengthF2>([=]() {
+          AccR[0] = s::length(s::cl_float2{ 1.f, 2.f, });
+        });
+      });
+    }
+    assert(isFloatEqualTo(r, 2.23607f));
+  }
+
+  // normalize(float2{1, 2}): expects ~(1/sqrt(5), 2/sqrt(5)), within 0.005
+  {
+    s::cl_float2 r{ 0 };
+    {
+      s::buffer<s::cl_float2, 1> BufR(&r, s::range<1>(1));
+      s::queue myQueue;
+      myQueue.submit([&](s::handler &cgh) {
+        auto AccR = BufR.get_access<s::access::mode::write>(cgh);
+        cgh.single_task<class normalizeF2>([=]() {
+          AccR[0] = s::normalize(s::cl_float2{ 1.f, 2.f, });
+        });
+      });
+    }
+    s::cl_float r1 = r.x();
+    s::cl_float r2 = r.y();
+
+    assert(isFloatEqualTo(r1, 0.447214f));
+    assert(isFloatEqualTo(r2, 0.894427f));
+  }
+
+  // fast_distance: same operands and expected value as the distance case
+  {
+    s::cl_float r{ 0 };
+    {
+      s::buffer<s::cl_float, 1> BufR(&r, s::range<1>(1));
+      s::queue myQueue;
+      myQueue.submit([&](s::handler &cgh) {
+        auto AccR = BufR.get_access<s::access::mode::write>(cgh);
+        cgh.single_task<class fast_distanceF2>([=]() {
+          AccR[0] = s::fast_distance(s::cl_float2{ 1.f, 2.f, },
+                                     s::cl_float2{ 3.f, 4.f, });
+        });
+      });
+    }
+    assert(isFloatEqualTo(r, 2.82843f));
+  }
+
+  // fast_length: same operand and expected value as the length case
+  {
+    s::cl_float r{ 0 };
+    {
+      s::buffer<s::cl_float, 1> BufR(&r, s::range<1>(1));
+      s::queue myQueue;
+      myQueue.submit([&](s::handler &cgh) {
+        auto AccR = BufR.get_access<s::access::mode::write>(cgh);
+        cgh.single_task<class fast_lengthF2>([=]() {
+          AccR[0] = s::fast_length(s::cl_float2{ 1.f, 2.f, });
+        });
+      });
+    }
+    assert(isFloatEqualTo(r, 2.23607f));
+  }
+
+  // fast_normalize: same operand as normalize, slightly different expectations
+  {
+    s::cl_float2 r{ 0 };
+    {
+      s::buffer<s::cl_float2, 1> BufR(&r, s::range<1>(1));
+      s::queue myQueue;
+      myQueue.submit([&](s::handler &cgh) {
+        auto AccR = BufR.get_access<s::access::mode::write>(cgh);
+        cgh.single_task<class fast_normalizeF2>([=]() {
+          AccR[0] = s::fast_normalize(s::cl_float2{ 1.f, 2.f, });
+        });
+      });
+    }
+    s::cl_float r1 = r.x();
+    s::cl_float r2 = r.y();
+
+    assert(isFloatEqualTo(r1, 0.447144)); // NOTE(review): double literal, no f
+    assert(isFloatEqualTo(r2, 0.894287)); // NOTE(review): double literal, no f
+  }
+
+  return 0;
+}
diff --git a/SYCL/Basic/built-ins/vector_integer.cpp b/SYCL/Basic/built-ins/vector_integer.cpp
new file mode 100644
index 0000000000..3ce8bf49d0
--- /dev/null
+++ b/SYCL/Basic/built-ins/vector_integer.cpp
@@ -0,0 +1,701 @@
+// RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple %s -o %t.out
+// RUN: %HOST_RUN_PLACEHOLDER %t.out
+// RUN: %CPU_RUN_PLACEHOLDER %t.out
+// RUN: %GPU_RUN_PLACEHOLDER %t.out
+// RUN: %ACC_RUN_PLACEHOLDER %t.out
+
+// TODO: ptxas fatal   : Unresolved extern function '_Z17__spirv_ocl_s_maxDv2_iS_'
+// XFAIL: cuda
+
+#include <CL/sycl.hpp>
+
+#include <array>
+#include <cassert>
+
+namespace s = cl::sycl;
+
+int main() {
+  // max(int2, int2)
+  {
+    s::cl_int2 r{ 0 };
+    {
+      s::buffer<s::cl_int2, 1> BufR(&r, s::range<1>(1));
+      s::queue myQueue;
+      myQueue.submit([&](s::handler &cgh) {
+        auto AccR = BufR.get_access<s::access::mode::write>(cgh);
+        cgh.single_task<class maxSI2SI2>([=]() {
+          AccR[0] = s::max(s::cl_int2{ 5, 3 }, s::cl_int2{ 2, 7 });
+        });
+      });
+    } // BufR is destroyed here, before r is read back on the host
+    s::cl_int r1 = r.x();
+    s::cl_int r2 = r.y();
+    assert(r1 == 5);
+    assert(r2 == 7);
+  }
+
+  // max(uint2, uint2)
+  {
+    s::cl_uint2 r{ 0 };
+    {
+      s::buffer<s::cl_uint2, 1> BufR(&r, s::range<1>(1));
+      s::queue myQueue;
+      myQueue.submit([&](s::handler &cgh) {
+        auto AccR = BufR.get_access<s::access::mode::write>(cgh);
+        cgh.single_task<class maxUI2UI2>([=]() {
+          AccR[0] = s::max(s::cl_uint2{ 5, 3 }, s::cl_uint2{ 2, 7 });
+        });
+      });
+    }
+    s::cl_uint r1 = r.x();
+    s::cl_uint r2 = r.y();
+    assert(r1 == 5);
+    assert(r2 == 7);
+  }
+
+  // max(int2, int): scalar second operand
+  {
+    s::cl_int2 r{ 0 };
+    {
+      s::buffer<s::cl_int2, 1> BufR(&r, s::range<1>(1));
+      s::queue myQueue;
+      myQueue.submit([&](s::handler &cgh) {
+        auto AccR = BufR.get_access<s::access::mode::write>(cgh);
+        cgh.single_task<class maxSI2SI1>([=]() {
+          AccR[0] = s::max(s::cl_int2{ 5, 3 }, s::cl_int{ 2 });
+        });
+      });
+    }
+    s::cl_int r1 = r.x();
+    s::cl_int r2 = r.y();
+    assert(r1 == 5);
+    assert(r2 == 3);
+  }
+
+  // max(longlong2, longlong): scalar second operand
+  {
+    s::longlong2 r{ 0 };
+    {
+      s::buffer<s::longlong2, 1> BufR(&r, s::range<1>(1));
+      s::queue myQueue;
+      myQueue.submit([&](s::handler &cgh) {
+        auto AccR = BufR.get_access<s::access::mode::write>(cgh);
+        cgh.single_task<class maxSLL2SLL1>([=]() {
+          AccR[0] = s::max(s::longlong2{ 5, 3 }, s::longlong{ 2 });
+        });
+      });
+    }
+    s::longlong r1 = r.x();
+    s::longlong r2 = r.y();
+    assert(r1 == 5);
+    assert(r2 == 3);
+  }
+
+  // max(uint2, uint): scalar second operand
+  {
+    s::cl_uint2 r{ 0 };
+    {
+      s::buffer<s::cl_uint2, 1> BufR(&r, s::range<1>(1));
+      s::queue myQueue;
+      myQueue.submit([&](s::handler &cgh) {
+        auto AccR = BufR.get_access<s::access::mode::write>(cgh);
+        cgh.single_task<class maxUI2UI1>([=]() {
+          AccR[0] = s::max(s::cl_uint2{ 5, 3 }, s::cl_uint{ 2 });
+        });
+      });
+    }
+    s::cl_uint r1 = r.x();
+    s::cl_uint r2 = r.y();
+    assert(r1 == 5);
+    assert(r2 == 3);
+  }
+
+  // max(ulonglong2, ulonglong): scalar second operand
+  {
+    s::ulonglong2 r{ 0 };
+    {
+      s::buffer<s::ulonglong2, 1> BufR(&r, s::range<1>(1));
+      s::queue myQueue;
+      myQueue.submit([&](s::handler &cgh) {
+        auto AccR = BufR.get_access<s::access::mode::write>(cgh);
+        cgh.single_task<class maxULL2ULL1>([=]() {
+          AccR[0] = s::max(s::ulonglong2{ 5, 3 }, s::ulonglong{ 2 });
+        });
+      });
+    }
+    s::ulonglong r1 = r.x();
+    s::ulonglong r2 = r.y();
+    assert(r1 == 5);
+    assert(r2 == 3);
+  }
+  
+  // min(int2, int2)
+  {
+    s::cl_int2 r{ 0 };
+    {
+      s::buffer<s::cl_int2, 1> BufR(&r, s::range<1>(1));
+      s::queue myQueue;
+      myQueue.submit([&](s::handler &cgh) {
+        auto AccR = BufR.get_access<s::access::mode::write>(cgh);
+        cgh.single_task<class minSI2SI2>([=]() {
+          AccR[0] = s::min(s::cl_int2{ 5, 3 }, s::cl_int2{ 2, 7 });
+        });
+      });
+    }
+    s::cl_int r1 = r.x();
+    s::cl_int r2 = r.y();
+    assert(r1 == 2);
+    assert(r2 == 3);
+  }
+
+  // min(uint2, uint2)
+  {
+    s::cl_uint2 r{ 0 };
+    {
+      s::buffer<s::cl_uint2, 1> BufR(&r, s::range<1>(1));
+      s::queue myQueue;
+      myQueue.submit([&](s::handler &cgh) {
+        auto AccR = BufR.get_access<s::access::mode::write>(cgh);
+        cgh.single_task<class minUI2UI2>([=]() {
+          AccR[0] = s::min(s::cl_uint2{ 5, 3 }, s::cl_uint2{ 2, 7 });
+        });
+      });
+    }
+    s::cl_uint r1 = r.x();
+    s::cl_uint r2 = r.y();
+    assert(r1 == 2);
+    assert(r2 == 3);
+  }
+
+  // min(int2, int): scalar second operand
+  {
+    s::cl_int2 r{ 0 };
+    {
+      s::buffer<s::cl_int2, 1> BufR(&r, s::range<1>(1));
+      s::queue myQueue;
+      myQueue.submit([&](s::handler &cgh) {
+        auto AccR = BufR.get_access<s::access::mode::write>(cgh);
+        cgh.single_task<class minSI2SI1>([=]() {
+          AccR[0] = s::min(s::cl_int2{ 5, 3 }, s::cl_int{ 2 });
+        });
+      });
+    }
+    s::cl_int r1 = r.x();
+    s::cl_int r2 = r.y();
+    assert(r1 == 2);
+    assert(r2 == 2);
+  }
+
+  // min(uint2, uint): scalar second operand
+  {
+    s::cl_uint2 r{ 0 };
+    {
+      s::buffer<s::cl_uint2, 1> BufR(&r, s::range<1>(1));
+      s::queue myQueue;
+      myQueue.submit([&](s::handler &cgh) {
+        auto AccR = BufR.get_access<s::access::mode::write>(cgh);
+        cgh.single_task<class minUI2UI1>([=]() {
+          AccR[0] = s::min(s::cl_uint2{ 5, 3 }, s::cl_uint{ 2 });
+        });
+      });
+    }
+    s::cl_uint r1 = r.x();
+    s::cl_uint r2 = r.y();
+    assert(r1 == 2);
+    assert(r2 == 2);
+  }
+
+  // abs(int2): the result is stored as uint2
+  {
+    s::cl_uint2 r{ 0 };
+    {
+      s::buffer<s::cl_uint2, 1> BufR(&r, s::range<1>(1));
+      s::queue myQueue;
+      myQueue.submit([&](s::handler &cgh) {
+        auto AccR = BufR.get_access<s::access::mode::write>(cgh);
+        cgh.single_task<class absSI2>([=]() {
+          AccR[0] = s::abs(s::cl_int2{ -5, -2 });
+        });
+      });
+    }
+    s::cl_uint r1 = r.x();
+    s::cl_uint r2 = r.y();
+    assert(r1 == 5);
+    assert(r2 == 2);
+  }
+
+  // abs(longlong2): the result is stored as ulonglong2
+  {
+    s::ulonglong2 r{ 0 };
+    {
+      s::buffer<s::ulonglong2, 1> BufR(&r, s::range<1>(1));
+      s::queue myQueue;
+      myQueue.submit([&](s::handler &cgh) {
+        auto AccR = BufR.get_access<s::access::mode::write>(cgh);
+        cgh.single_task<class absSL2>([=]() {
+          AccR[0] = s::abs(s::longlong2{ -5, -2 });
+        });
+      });
+    }
+    s::ulonglong r1 = r.x();
+    s::ulonglong r2 = r.y();
+    assert(r1 == 5);
+    assert(r2 == 2);
+  }
+
+  // abs_diff(int2, int2): the result is stored as uint2
+  {
+    s::cl_uint2 r{ 0 };
+    {
+      s::buffer<s::cl_uint2, 1> BufR(&r, s::range<1>(1));
+      s::queue myQueue;
+      myQueue.submit([&](s::handler &cgh) {
+        auto AccR = BufR.get_access<s::access::mode::write>(cgh);
+        cgh.single_task<class abs_diffSI2SI2>([=]() {
+          AccR[0] = s::abs_diff(s::cl_int2{ -5, -2 }, s::cl_int2{ -1, -1 });
+        });
+      });
+    }
+    s::cl_uint r1 = r.x();
+    s::cl_uint r2 = r.y();
+    assert(r1 == 4);
+    assert(r2 == 1);
+  }
+
+  // add_sat(int2, int2): 0x7FFFFFFF plus a positive value stays at 0x7FFFFFFF
+  {
+    s::cl_int2 r{ 0 };
+    {
+      s::buffer<s::cl_int2, 1> BufR(&r, s::range<1>(1));
+      s::queue myQueue;
+      myQueue.submit([&](s::handler &cgh) {
+        auto AccR = BufR.get_access<s::access::mode::write>(cgh);
+        cgh.single_task<class add_satSI2SI2>([=]() {
+          AccR[0] = s::add_sat(s::cl_int2{ 0x7FFFFFFF, 0x7FFFFFFF },
+                               s::cl_int2{ 100, 90 });
+        });
+      });
+    }
+    s::cl_int r1 = r.x();
+    s::cl_int r2 = r.y();
+    assert(r1 == 0x7FFFFFFF);
+    assert(r2 == 0x7FFFFFFF);
+  }
+
+  // hadd(int2, int2): (0x7F + 0x20) >> 1 == 0x4F
+  {
+    s::cl_int2 r{ 0 };
+    {
+      s::buffer<s::cl_int2, 1> BufR(&r, s::range<1>(1));
+      s::queue myQueue;
+      myQueue.submit([&](s::handler &cgh) {
+        auto AccR = BufR.get_access<s::access::mode::write>(cgh);
+        cgh.single_task<class haddSI2SI2>([=]() {
+          AccR[0] = s::hadd(s::cl_int2{ 0x0000007F, 0x0000007F },
+                            s::cl_int2{ 0x00000020, 0x00000020 });
+        });
+      });
+    }
+    s::cl_int r1 = r.x();
+    s::cl_int r2 = r.y();
+    assert(r1 == 0x0000004F);
+    assert(r2 == 0x0000004F);
+  }
+
+  // rhadd(int2, int2): (0x7F + 0x20 + 1) >> 1 == 0x50
+  {
+    s::cl_int2 r{ 0 };
+    {
+      s::buffer<s::cl_int2, 1> BufR(&r, s::range<1>(1));
+      s::queue myQueue;
+      myQueue.submit([&](s::handler &cgh) {
+        auto AccR = BufR.get_access<s::access::mode::write>(cgh);
+        cgh.single_task<class rhaddSI2SI2>([=]() {
+          AccR[0] = s::rhadd(s::cl_int2{ 0x0000007F, 0x0000007F },
+                             s::cl_int2{ 0x00000020, 0x00000020 });
+        });
+      });
+    }
+    s::cl_int r1 = r.x();
+    s::cl_int r2 = r.y();
+    assert(r1 == 0x00000050);
+    assert(r2 == 0x00000050);
+  }
+
+  // clamp - 1: vector bounds; the input 5 is below the lower bound 10
+  {
+    s::cl_int2 r{ 0 };
+    {
+      s::buffer<s::cl_int2, 1> BufR(&r, s::range<1>(1));
+      s::queue myQueue;
+      myQueue.submit([&](s::handler &cgh) {
+        auto AccR = BufR.get_access<s::access::mode::write>(cgh);
+        cgh.single_task<class clampSI2SI2SI2>([=]() {
+          AccR[0] = s::clamp(s::cl_int2{ 5, 5 }, s::cl_int2{ 10, 10 },
+                             s::cl_int2{ 30, 30 });
+        });
+      });
+    }
+    s::cl_int r1 = r.x();
+    s::cl_int r2 = r.y();
+    assert(r1 == 10);
+    assert(r2 == 10);
+  }
+
+  // clamp - 2: scalar bounds; the input 5 is below the lower bound 10
+  {
+    s::cl_int2 r{ 0 };
+    {
+      s::buffer<s::cl_int2, 1> BufR(&r, s::range<1>(1));
+      s::queue myQueue;
+      myQueue.submit([&](s::handler &cgh) {
+        auto AccR = BufR.get_access<s::access::mode::write>(cgh);
+        cgh.single_task<class clampSI2SI1SI1>([=]() {
+          AccR[0] =
+              s::clamp(s::cl_int2{ 5, 5 }, s::cl_int{ 10 }, s::cl_int{ 30 });
+        });
+      });
+    }
+    s::cl_int r1 = r.x();
+    s::cl_int r2 = r.y();
+    assert(r1 == 10);
+    assert(r2 == 10);
+  }
+
+  // clz(int2): 0x0FFFFFFF has 4 leading zero bits
+  {
+    s::cl_int2 r{ 0 };
+    {
+      s::buffer<s::cl_int2, 1> BufR(&r, s::range<1>(1));
+      s::queue myQueue;
+      myQueue.submit([&](s::handler &cgh) {
+        auto AccR = BufR.get_access<s::access::mode::write>(cgh);
+        cgh.single_task<class clzSI2>([=]() {
+          AccR[0] = s::clz(s::cl_int2{ 0x0FFFFFFF, 0x0FFFFFFF });
+        });
+      });
+    }
+    s::cl_int r1 = r.x();
+    s::cl_int r2 = r.y();
+    assert(r1 == 4);
+    assert(r2 == 4);
+  }
+
+  // ctz(int2), called from s::intel: 0x7FFFFFF0 has 4 trailing zero bits
+  {
+    s::cl_int2 r{ 0 };
+    {
+      s::buffer<s::cl_int2, 1> BufR(&r, s::range<1>(1));
+      s::queue myQueue;
+      myQueue.submit([&](s::handler &cgh) {
+        auto AccR = BufR.get_access<s::access::mode::write>(cgh);
+        cgh.single_task<class ctzSI2>([=]() {
+          AccR[0] = s::intel::ctz(s::cl_int2{ 0x7FFFFFF0, 0x7FFFFFF0 });
+        });
+      });
+    }
+    s::cl_int r1 = r.x();
+    s::cl_int r2 = r.y();
+    assert(r1 == 4);
+    assert(r2 == 4);
+  }
+
+  // mad_hi(int2, int2, int2): high 32 bits of 2^28 * 2^8, plus 1, == 0x11
+  {
+    s::cl_int2 r{ 0 };
+    {
+      s::buffer<s::cl_int2, 1> BufR(&r, s::range<1>(1));
+      s::queue myQueue;
+      myQueue.submit([&](s::handler &cgh) {
+        auto AccR = BufR.get_access<s::access::mode::write>(cgh);
+        cgh.single_task<class mad_hiSI2SI2SI2>([=]() {
+          AccR[0] = s::mad_hi(s::cl_int2{ 0x10000000, 0x10000000 },
+                              s::cl_int2{ 0x00000100, 0x00000100 },
+                              s::cl_int2{ 1, 1 });
+        });
+      });
+    }
+    s::cl_int r1 = r.x();
+    s::cl_int r2 = r.y();
+    assert(r1 == 0x11);
+    assert(r2 == 0x11);
+  }
+
+  // mad_sat(int2, int2, int2): 2^28 * 2^8 + 1 saturates to 0x7FFFFFFF
+  {
+    s::cl_int2 r{ 0 };
+    {
+      s::buffer<s::cl_int2, 1> BufR(&r, s::range<1>(1));
+      s::queue myQueue;
+      myQueue.submit([&](s::handler &cgh) {
+        auto AccR = BufR.get_access<s::access::mode::write>(cgh);
+        cgh.single_task<class mad_satSI2SI2SI2>([=]() {
+          AccR[0] = s::mad_sat(s::cl_int2{ 0x10000000, 0x10000000 },
+                               s::cl_int2{ 0x00000100, 0x00000100 },
+                               s::cl_int2{ 1, 1 });
+        });
+      });
+    }
+    s::cl_int r1 = r.x();
+    s::cl_int r2 = r.y();
+    assert(r1 == 0x7FFFFFFF);
+    assert(r2 == 0x7FFFFFFF);
+  }
+
+  // mul_hi(int2, int2): high 32 bits of 2^28 * 2^8 == 0x10
+  {
+    s::cl_int2 r{ 0 };
+    {
+      s::buffer<s::cl_int2, 1> BufR(&r, s::range<1>(1));
+      s::queue myQueue;
+      myQueue.submit([&](s::handler &cgh) {
+        auto AccR = BufR.get_access<s::access::mode::write>(cgh);
+        cgh.single_task<class mul_hiSI2SI2>([=]() {
+          AccR[0] = s::mul_hi(s::cl_int2{ 0x10000000, 0x10000000 },
+                              s::cl_int2{ 0x00000100, 0x00000100 });
+        });
+      });
+    }
+    s::cl_int r1 = r.x();
+    s::cl_int r2 = r.y();
+    assert(r1 == 0x10);
+    assert(r2 == 0x10);
+  }
+
+  // rotate(int2, int2): 0x11100000 rotated left by 12 bits == 0x00000111
+  {
+    s::cl_int2 r{ 0 };
+    {
+      s::buffer<s::cl_int2, 1> BufR(&r, s::range<1>(1));
+      s::queue myQueue;
+      myQueue.submit([&](s::handler &cgh) {
+        auto AccR = BufR.get_access<s::access::mode::write>(cgh);
+        cgh.single_task<class rotateSI2SI2>([=]() {
+          AccR[0] = s::rotate(s::cl_int2{ 0x11100000, 0x11100000 },
+                              s::cl_int2{ 12, 12 });
+        });
+      });
+    }
+    s::cl_int r1 = r.x();
+    s::cl_int r2 = r.y();
+    assert(r1 == 0x00000111);
+    assert(r2 == 0x00000111);
+  }
+
+  // sub_sat(int2, int2): one helper lambda runs three operand pairs
+  {
+    auto TestSubSat = [](s::cl_int2 x, s::cl_int2 y) {
+      s::cl_int2 r{ 0 };
+      {
+        s::buffer<s::cl_int2, 1> BufR(&r, s::range<1>(1));
+        s::queue myQueue;
+        myQueue.submit([&](s::handler &cgh) {
+          auto AccR = BufR.get_access<s::access::mode::write>(cgh);
+          cgh.single_task<class sub_satSI2SI2>([=]() {
+            AccR[0] = s::sub_sat(x, y);
+          });
+        });
+      }
+      return r;
+    };
+    s::cl_int2 r1 = TestSubSat(s::cl_int2{ 10, 10 },
+                               s::cl_int2{ 0x80000000, 0x80000000 });
+    s::cl_int r1x = r1.x();
+    s::cl_int r1y = r1.y();
+    assert(r1x == 0x7FFFFFFF);
+    assert(r1y == 0x7FFFFFFF);
+    s::cl_int2 r2 = TestSubSat(s::cl_int2{ 0x7FFFFFFF, 0x80000000 },
+                               s::cl_int2{ 0xFFFFFFFF, 0x00000001 });
+    s::cl_int r2x = r2.x();
+    s::cl_int r2y = r2.y();
+    assert(r2x == 0x7FFFFFFF);
+    assert(r2y == 0x80000000);
+    s::cl_int2 r3 = TestSubSat(s::cl_int2{ 10499, 30678 },
+                               s::cl_int2{ 30678, 10499 });
+    s::cl_int r3x = r3.x();
+    s::cl_int r3y = r3.y();
+    assert(r3x == -20179);
+    assert(r3y ==  20179);
+  }
+
+  // upsample - 1: (uchar2, uchar2) -> ushort2
+  {
+    s::cl_ushort2 r{ 0 };
+    {
+      s::buffer<s::cl_ushort2, 1> BufR(&r, s::range<1>(1));
+      s::queue myQueue;
+      myQueue.submit([&](s::handler &cgh) {
+        auto AccR = BufR.get_access<s::access::mode::write>(cgh);
+        cgh.single_task<class upsampleUC2UC2>([=]() {
+          AccR[0] = s::upsample(s::cl_uchar2{ 0x10, 0x10 },
+                                s::cl_uchar2{ 0x10, 0x10 });
+        });
+      });
+    }
+    s::cl_ushort r1 = r.x();
+    s::cl_ushort r2 = r.y();
+    assert(r1 == 0x1010);
+    assert(r2 == 0x1010);
+  }
+
+  // upsample - 2: (char2, uchar2) -> short2
+  {
+    s::cl_short2 r{ 0 };
+    {
+      s::buffer<s::cl_short2, 1> BufR(&r, s::range<1>(1));
+      s::queue myQueue;
+      myQueue.submit([&](s::handler &cgh) {
+        auto AccR = BufR.get_access<s::access::mode::write>(cgh);
+        cgh.single_task<class upsampleSC2UC2>([=]() {
+          AccR[0] = s::upsample(s::cl_char2{ 0x10, 0x10 },
+                                s::cl_uchar2{ 0x10, 0x10 });
+        });
+      });
+    }
+    s::cl_short r1 = r.x();
+    s::cl_short r2 = r.y();
+    assert(r1 == 0x1010);
+    assert(r2 == 0x1010);
+  }
+
+  // upsample - 3: (ushort2, ushort2) -> uint2
+  {
+    s::cl_uint2 r{ 0 };
+    {
+      s::buffer<s::cl_uint2, 1> BufR(&r, s::range<1>(1));
+      s::queue myQueue;
+      myQueue.submit([&](s::handler &cgh) {
+        auto AccR = BufR.get_access<s::access::mode::write>(cgh);
+        cgh.single_task<class upsampleUS2US2>([=]() {
+          AccR[0] = s::upsample(s::cl_ushort2{ 0x0010, 0x0010 },
+                                s::cl_ushort2{ 0x0010, 0x0010 });
+        });
+      });
+    }
+    s::cl_uint r1 = r.x();
+    s::cl_uint r2 = r.y();
+    assert(r1 == 0x00100010);
+    assert(r2 == 0x00100010);
+  }
+
+  // upsample - 4: (short2, ushort2) -> int2
+  {
+    s::cl_int2 r{ 0 };
+    {
+      s::buffer<s::cl_int2, 1> BufR(&r, s::range<1>(1));
+      s::queue myQueue;
+      myQueue.submit([&](s::handler &cgh) {
+        auto AccR = BufR.get_access<s::access::mode::write>(cgh);
+        cgh.single_task<class upsampleSS2US2>([=]() {
+          AccR[0] = s::upsample(s::cl_short2{ 0x0010, 0x0010 },
+                                s::cl_ushort2{ 0x0010, 0x0010 });
+        });
+      });
+    }
+    s::cl_int r1 = r.x();
+    s::cl_int r2 = r.y();
+    assert(r1 == 0x00100010);
+    assert(r2 == 0x00100010);
+  }
+
+  // upsample - 5: (uint2, uint2) -> ulong2
+  {
+    s::cl_ulong2 r{ 0 };
+    {
+      s::buffer<s::cl_ulong2, 1> BufR(&r, s::range<1>(1));
+      s::queue myQueue;
+      myQueue.submit([&](s::handler &cgh) {
+        auto AccR = BufR.get_access<s::access::mode::write>(cgh);
+        cgh.single_task<class upsampleUI2UI2>([=]() {
+          AccR[0] = s::upsample(s::cl_uint2{ 0x00000010, 0x00000010 },
+                                s::cl_uint2{ 0x00000010, 0x00000010 });
+        });
+      });
+    }
+    s::cl_ulong r1 = r.x();
+    s::cl_ulong r2 = r.y();
+    assert(r1 == 0x0000001000000010);
+    assert(r2 == 0x0000001000000010);
+  }
+
+  // upsample - 6: (int2, uint2) -> long2
+  {
+    s::cl_long2 r{ 0 };
+    {
+      s::buffer<s::cl_long2, 1> BufR(&r, s::range<1>(1));
+      s::queue myQueue;
+      myQueue.submit([&](s::handler &cgh) {
+        auto AccR = BufR.get_access<s::access::mode::write>(cgh);
+        cgh.single_task<class upsampleSI2UI2>([=]() {
+          AccR[0] = s::upsample(s::cl_int2{ 0x00000010, 0x00000010 },
+                                s::cl_uint2{ 0x00000010, 0x00000010 });
+        });
+      });
+    }
+    s::cl_long r1 = r.x();
+    s::cl_long r2 = r.y();
+    assert(r1 == 0x0000001000000010);
+    assert(r2 == 0x0000001000000010);
+  }
+
+  // popcount(int2): 0x000000FF has 8 set bits
+  {
+    s::cl_int2 r{ 0 };
+    {
+      s::buffer<s::cl_int2, 1> BufR(&r, s::range<1>(1));
+      s::queue myQueue;
+      myQueue.submit([&](s::handler &cgh) {
+        auto AccR = BufR.get_access<s::access::mode::write>(cgh);
+        cgh.single_task<class popcountSI2>([=]() {
+          AccR[0] = s::popcount(s::cl_int2{ 0x000000FF, 0x000000FF });
+        });
+      });
+    }
+    s::cl_int r1 = r.x();
+    s::cl_int r2 = r.y();
+    assert(r1 == 8);
+    assert(r2 == 8);
+  }
+
+  // mad24(int2, int2, int2): the asserts expect 0xFFFFFFFF * 20 + 20 == 0
+  {
+    s::cl_int2 r{ 0 };
+    {
+      s::buffer<s::cl_int2, 1> BufR(&r, s::range<1>(1));
+      s::queue myQueue;
+      myQueue.submit([&](s::handler &cgh) {
+        auto AccR = BufR.get_access<s::access::mode::write>(cgh);
+        cgh.single_task<class mad24SI2SI2SI2>([=]() {
+          AccR[0] = s::mad24(s::cl_int2{ 0xFFFFFFFF, 0xFFFFFFFF },
+                             s::cl_int2{ 20, 20 }, s::cl_int2{ 20, 20 });
+        });
+      });
+    }
+    s::cl_int r1 = r.x();
+    s::cl_int r2 = r.y();
+    assert(r1 == 0);
+    assert(r2 == 0);
+  }
+
+  // mul24(int2, int2); NOTE(review): kernel name carries three SI2 tags
+  {
+    s::cl_int2 r{ 0 };
+    {
+      s::buffer<s::cl_int2, 1> BufR(&r, s::range<1>(1));
+      s::queue myQueue;
+      myQueue.submit([&](s::handler &cgh) {
+        auto AccR = BufR.get_access<s::access::mode::write>(cgh);
+        cgh.single_task<class mul24SI2SI2SI2>([=]() {
+          AccR[0] = s::mul24(s::cl_int2{ 0xFFFFFFFF, 0xFFFFFFFF },
+                             s::cl_int2{ 20, 20 });
+        });
+      });
+    }
+    s::cl_int r1 = r.x();
+    s::cl_int r2 = r.y();
+    assert(r1 == -20);
+    assert(r2 == -20);
+  }
+
+  return 0;
+}
diff --git a/SYCL/Basic/built-ins/vector_math.cpp b/SYCL/Basic/built-ins/vector_math.cpp
new file mode 100644
index 0000000000..951f2c9070
--- /dev/null
+++ b/SYCL/Basic/built-ins/vector_math.cpp
@@ -0,0 +1,210 @@
+// RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple %s -o %t.out
+// RUN: %HOST_RUN_PLACEHOLDER %t.out
+// RUN: %CPU_RUN_PLACEHOLDER %t.out
+// RUN: %GPU_RUN_PLACEHOLDER %t.out
+// RUN: %ACC_RUN_PLACEHOLDER %t.out
+
+// TODO: ptxas fatal   : Unresolved extern function '_Z17__spirv_ocl_fractDv2_fPU3AS0S_'
+// XFAIL: cuda
+
+#include <CL/sycl.hpp>
+
+#include <array>
+#include <cassert>
+#include <cmath>
+
+namespace s = cl::sycl;
+
+int main() {
+  // fmin(float2, float2): each result component is the smaller of the pair
+  {
+    s::cl_float2 r{ 0 };
+    {
+      s::buffer<s::cl_float2, 1> BufR(&r, s::range<1>(1));
+      s::queue myQueue;
+      myQueue.submit([&](s::handler &cgh) {
+        auto AccR = BufR.get_access<s::access::mode::write>(cgh);
+        cgh.single_task<class fminF2F2>([=]() {
+          AccR[0] =
+              s::fmin(s::cl_float2{ 0.5f, 3.4f }, s::cl_float2{ 2.3f, 0.4f });
+        });
+      });
+    } // BufR is destroyed here, before r is read back on the host
+    s::cl_float r1 = r.x();
+    s::cl_float r2 = r.y();
+    assert(r1 == 0.5f);
+    assert(r2 == 0.4f);
+  }
+
+  // fabs(float2{-1, 2}) == (1, 2)
+  {
+    s::cl_float2 r{ 0 };
+    {
+      s::buffer<s::cl_float2, 1> BufR(&r, s::range<1>(1));
+      s::queue myQueue;
+      myQueue.submit([&](s::handler &cgh) {
+        auto AccR = BufR.get_access<s::access::mode::write>(cgh);
+        cgh.single_task<class fabsF2>([=]() {
+          AccR[0] = s::fabs(s::cl_float2{ -1.0f, 2.0f });
+        });
+      });
+    }
+    s::cl_float r1 = r.x();
+    s::cl_float r2 = r.y();
+    assert(r1 == 1.0f);
+    assert(r2 == 2.0f);
+  }
+
+  // floor(float2{1.4, 2.8}) == (1, 2)
+  {
+    s::cl_float2 r{ 0 };
+    {
+      s::buffer<s::cl_float2, 1> BufR(&r, s::range<1>(1));
+      s::queue myQueue;
+      myQueue.submit([&](s::handler &cgh) {
+        auto AccR = BufR.get_access<s::access::mode::write>(cgh);
+        cgh.single_task<class floorF2>([=]() {
+          AccR[0] = s::floor(s::cl_float2{ 1.4f, 2.8f });
+        });
+      });
+    }
+    s::cl_float r1 = r.x();
+    s::cl_float r2 = r.y();
+    assert(r1 == 1.0f);
+    assert(r2 == 2.0f);
+  }
+
+  // ceil(float2{1.4, 2.8}) == (2, 3)
+  {
+    s::cl_float2 r{ 0 };
+    {
+      s::buffer<s::cl_float2, 1> BufR(&r, s::range<1>(1));
+      s::queue myQueue;
+      myQueue.submit([&](s::handler &cgh) {
+        auto AccR = BufR.get_access<s::access::mode::write>(cgh);
+        cgh.single_task<class ceilF2>([=]() {
+          AccR[0] = s::ceil(s::cl_float2{ 1.4f, 2.8f });
+        });
+      });
+    }
+    s::cl_float r1 = r.x();
+    s::cl_float r2 = r.y();
+    assert(r1 == 2);
+    assert(r2 == 3);
+  }
+
+  // fract with global memory: second output goes through a global_ptr on AccI
+  {
+    s::cl_float2 r{ 0, 0 };
+    s::cl_float2 i{ 0, 0 };
+    {
+      s::buffer<s::cl_float2, 1> BufR(&r, s::range<1>(1));
+      s::buffer<s::cl_float2, 1> BufI(&i, s::range<1>(1));
+
+      s::queue myQueue;
+      myQueue.submit([&](s::handler &cgh) {
+        auto AccR = BufR.get_access<s::access::mode::read_write>(cgh);
+        auto AccI = BufI.get_access<s::access::mode::read_write>(cgh);
+        cgh.single_task<class fractF2GF2>([=]() {
+          s::global_ptr<s::cl_float2> Iptr(AccI);
+          AccR[0] = s::fract(s::cl_float2{ 1.5f, 2.5f }, Iptr);
+        });
+      });
+    }
+
+    s::cl_float r1 = r.x();
+    s::cl_float r2 = r.y();
+    s::cl_float i1 = i.x();
+    s::cl_float i2 = i.y();
+
+    assert(r1 == 0.5f);
+    assert(r2 == 0.5f);
+    assert(i1 == 1.0f);
+    assert(i2 == 2.0f);
+  }
+
+  // fract with private memory: second output lands in temp, then AccI
+  {
+    s::cl_float2 r{ 0, 0 };
+    s::cl_float2 i{ 0, 0 };
+    {
+      s::buffer<s::cl_float2, 1> BufR(&r, s::range<1>(1));
+      s::buffer<s::cl_float2, 1> BufI(&i, s::range<1>(1));
+      s::queue myQueue;
+      myQueue.submit([&](s::handler &cgh) {
+        auto AccR = BufR.get_access<s::access::mode::read_write>(cgh);
+        auto AccI = BufI.get_access<s::access::mode::read_write>(cgh);
+        cgh.single_task<class fractF2PF2>([=]() {
+          s::cl_float2 temp(0.0);
+          s::private_ptr<s::cl_float2> Iptr(&temp);
+          AccR[0] = s::fract(s::cl_float2{ 1.5f, 2.5f }, Iptr);
+          AccI[0] = *Iptr;
+        });
+      });
+    }
+
+    s::cl_float r1 = r.x();
+    s::cl_float r2 = r.y();
+    s::cl_float i1 = i.x();
+    s::cl_float i2 = i.y();
+
+    assert(r1 == 0.5f);
+    assert(r2 == 0.5f);
+    assert(i1 == 1.0f);
+    assert(i2 == 2.0f);
+  }
+
+  // lgamma(float2): takes no pointer argument, unlike the lgamma_r case below
+  {
+    s::cl_float2 r{ 0, 0 };
+    {
+      s::buffer<s::cl_float2, 1> BufR(&r, s::range<1>(1));
+      s::queue myQueue;
+      myQueue.submit([&](s::handler &cgh) {
+        auto AccR = BufR.get_access<s::access::mode::read_write>(cgh);
+        cgh.single_task<class lgamma_rF2>([=]() {
+          AccR[0] = s::lgamma(s::cl_float2{ 10.f, -2.4f });
+        });
+      });
+    }
+
+    s::cl_float r1 = r.x();
+    s::cl_float r2 = r.y();
+
+    assert(r1 > 12.8017f && r1 < 12.8019f); // ~12.8018
+    assert(r2 > 0.1024f && r2 < 0.1026f);   // ~0.102583
+  }
+
+  // lgamma_r with private memory: int2 output lands in temp, then AccI
+  {
+    s::cl_float2 r{ 0, 0 };
+    s::cl_int2 i{ 0, 0 };
+    {
+      s::buffer<s::cl_float2, 1> BufR(&r, s::range<1>(1));
+      s::buffer<s::cl_int2, 1> BufI(&i, s::range<1>(1));
+      s::queue myQueue;
+      myQueue.submit([&](s::handler &cgh) {
+        auto AccR = BufR.get_access<s::access::mode::read_write>(cgh);
+        auto AccI = BufI.get_access<s::access::mode::read_write>(cgh);
+        cgh.single_task<class lgamma_rF2PF2>([=]() {
+          s::cl_int2 temp(0.0); // NOTE(review): double literal for an int vector
+          s::private_ptr<s::cl_int2> Iptr(&temp);
+          AccR[0] = s::lgamma_r(s::cl_float2{ 10.f, -2.4f }, Iptr);
+          AccI[0] = *Iptr;
+        });
+      });
+    }
+
+    s::cl_float r1 = r.x();
+    s::cl_float r2 = r.y();
+    s::cl_int i1 = i.x();
+    s::cl_int i2 = i.y();
+
+    assert(r1 > 12.8017f && r1 < 12.8019f); // ~12.8018
+    assert(r2 > 0.1024f && r2 < 0.1026f);   // ~0.102583
+    assert(i1 == 1);                        // tgamma of 10 is ~362880.0
+    assert(i2 == -1); // tgamma of -2.4 is ~-1.1080299470333461
+  }
+
+  return 0;
+}
diff --git a/SYCL/Basic/built-ins/vector_relational.cpp b/SYCL/Basic/built-ins/vector_relational.cpp
new file mode 100644
index 0000000000..de87a0e67a
--- /dev/null
+++ b/SYCL/Basic/built-ins/vector_relational.cpp
@@ -0,0 +1,608 @@
+// RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple %s -o %t.out
+// RUN: %HOST_RUN_PLACEHOLDER %t.out
+// RUN: %CPU_RUN_PLACEHOLDER %t.out
+// RUN: %GPU_RUN_PLACEHOLDER %t.out
+// RUN: %ACC_RUN_PLACEHOLDER %t.out
+
+// TODO: ptxas fatal   : Ptx assembly aborted due to errors
+// XFAIL: cuda
+
+#include <CL/sycl.hpp>
+
+#include <iostream>
+#include <cassert>
+#include <cmath>
+
+namespace s = cl::sycl;
+
+int main() {
+  // isequal: a true lane yields -1, a false lane 0; NAN never compares equal.
+  {
+    s::cl_int4 r{ 0 };
+    {
+      s::buffer<s::cl_int4, 1> BufR(&r, s::range<1>(1));
+      s::queue myQueue;
+      myQueue.submit([&](s::handler &cgh) {
+        auto AccR = BufR.get_access<s::access::mode::write>(cgh);
+        cgh.single_task<class isequalF4F4>([=]() {
+          AccR[0] = s::isequal(s::cl_float4{ 0.5f, 0.6f, NAN, INFINITY },
+                               s::cl_float4{ 0.5f, 0.5f, 0.5f, 0.5f });
+        });
+      });
+    }
+    s::cl_int r1 = r.x();
+    s::cl_int r2 = r.y();
+    s::cl_int r3 = r.z();
+    s::cl_int r4 = r.w();
+
+    assert(r1 == -1);
+    assert(r2 == 0);
+    assert(r3 == 0);
+    assert(r4 == 0);
+  }
+
+  // isnotequal
+  {
+    s::cl_int4 r{ 0 };
+    {
+      s::buffer<s::cl_int4, 1> BufR(&r, s::range<1>(1));
+      s::queue myQueue;
+      myQueue.submit([&](s::handler &cgh) {
+        auto AccR = BufR.get_access<s::access::mode::write>(cgh);
+        cgh.single_task<class isnotequalF4F4>([=]() {
+          AccR[0] = s::isnotequal(s::cl_float4{ 0.5f, 0.6f, NAN, INFINITY },
+                                  s::cl_float4{ 0.5f, 0.5f, 0.5f, 0.5f });
+        });
+      });
+    }
+    s::cl_int r1 = r.x();
+    s::cl_int r2 = r.y();
+    s::cl_int r3 = r.z();
+    s::cl_int r4 = r.w();
+
+    assert(r1 == 0);
+    assert(r2 == -1);
+    assert(r3 == -1);
+    assert(r4 == -1);
+  }
+
+  // isgreater
+  {
+    s::cl_int4 r{ 0 };
+    {
+      s::buffer<s::cl_int4, 1> BufR(&r, s::range<1>(1));
+      s::queue myQueue;
+      myQueue.submit([&](s::handler &cgh) {
+        auto AccR = BufR.get_access<s::access::mode::write>(cgh);
+        cgh.single_task<class isgreaterF4F4>([=]() {
+          AccR[0] = s::isgreater(s::cl_float4{ 0.5f, 0.6f, NAN, INFINITY },
+                                 s::cl_float4{ 0.5f, 0.5f, 0.5f, 0.5f });
+        });
+      });
+    }
+    s::cl_int r1 = r.x();
+    s::cl_int r2 = r.y();
+    s::cl_int r3 = r.z();
+    s::cl_int r4 = r.w();
+
+    assert(r1 == 0);
+    assert(r2 == -1);
+    assert(r3 == 0);
+    assert(r4 == -1);
+  }
+
+  // isgreaterequal
+  {
+    s::cl_int4 r{ 0 };
+    {
+      s::buffer<s::cl_int4, 1> BufR(&r, s::range<1>(1));
+      s::queue myQueue;
+      myQueue.submit([&](s::handler &cgh) {
+        auto AccR = BufR.get_access<s::access::mode::write>(cgh);
+        cgh.single_task<class isgreaterequalF4F4>([=]() {
+          AccR[0] = s::isgreaterequal(s::cl_float4{ 0.5f, 0.6f, NAN, INFINITY },
+                                      s::cl_float4{ 0.5f, 0.5f, 0.5f, 0.5f });
+        });
+      });
+    }
+    s::cl_int r1 = r.x();
+    s::cl_int r2 = r.y();
+    s::cl_int r3 = r.z();
+    s::cl_int r4 = r.w();
+
+    assert(r1 == -1);
+    assert(r2 == -1);
+    assert(r3 == 0);
+    assert(r4 == -1);
+  }
+
+  // isless
+  {
+    s::cl_int4 r{ 0 };
+    {
+      s::buffer<s::cl_int4, 1> BufR(&r, s::range<1>(1));
+      s::queue myQueue;
+      myQueue.submit([&](s::handler &cgh) {
+        auto AccR = BufR.get_access<s::access::mode::write>(cgh);
+        cgh.single_task<class islessF4F4>([=]() {
+          AccR[0] = s::isless(s::cl_float4{ 0.5f, 0.4f, NAN, INFINITY },
+                              s::cl_float4{ 0.5f, 0.5f, 0.5f, 0.5f });
+        });
+      });
+    }
+    s::cl_int r1 = r.x();
+    s::cl_int r2 = r.y();
+    s::cl_int r3 = r.z();
+    s::cl_int r4 = r.w();
+
+    assert(r1 == 0);
+    assert(r2 == -1);
+    assert(r3 == 0);
+    assert(r4 == 0);
+  }
+
+  // islessequal
+  {
+    s::cl_int4 r{ 0 };
+    {
+      s::buffer<s::cl_int4, 1> BufR(&r, s::range<1>(1));
+      s::queue myQueue;
+      myQueue.submit([&](s::handler &cgh) {
+        auto AccR = BufR.get_access<s::access::mode::write>(cgh);
+        cgh.single_task<class islessequalF4F4>([=]() {
+          AccR[0] = s::islessequal(s::cl_float4{ 0.5f, 0.4f, NAN, INFINITY },
+                                   s::cl_float4{ 0.5f, 0.5f, 0.5f, 0.5f });
+        });
+      });
+    }
+    s::cl_int r1 = r.x();
+    s::cl_int r2 = r.y();
+    s::cl_int r3 = r.z();
+    s::cl_int r4 = r.w();
+
+    assert(r1 == -1);
+    assert(r2 == -1);
+    assert(r3 == 0);
+    assert(r4 == 0);
+  }
+
+  // islessgreater
+  {
+    s::cl_int4 r{ 0 };
+    {
+      s::buffer<s::cl_int4, 1> BufR(&r, s::range<1>(1));
+      s::queue myQueue;
+      myQueue.submit([&](s::handler &cgh) {
+        auto AccR = BufR.get_access<s::access::mode::write>(cgh);
+        cgh.single_task<class islessgreaterF4F4>([=]() {
+          AccR[0] =
+              s::islessgreater(s::cl_float4{ 0.5f, 0.4f, NAN, INFINITY },
+                               s::cl_float4{ 0.5f, 0.5f, 0.5f, INFINITY });
+        });
+      });
+    }
+    s::cl_int r1 = r.x();
+    s::cl_int r2 = r.y();
+    s::cl_int r3 = r.z();
+    s::cl_int r4 = r.w();
+
+    assert(r1 == 0);
+    assert(r2 == -1);
+    assert(r3 == 0);
+    assert(r4 == 0); // Infinity is considered as greater than any
+                     // other value except Infinity.
+  }
+
+  // isfinite
+  {
+    s::cl_int4 r{ 0 };
+    {
+      s::buffer<s::cl_int4, 1> BufR(&r, s::range<1>(1));
+      s::queue myQueue;
+      myQueue.submit([&](s::handler &cgh) {
+        auto AccR = BufR.get_access<s::access::mode::write>(cgh);
+        cgh.single_task<class isfiniteF4F4>([=]() {
+          AccR[0] = s::isfinite(s::cl_float4{ 0.5f, 0.4f, NAN, INFINITY });
+        });
+      });
+    }
+    s::cl_int r1 = r.x();
+    s::cl_int r2 = r.y();
+    s::cl_int r3 = r.z();
+    s::cl_int r4 = r.w();
+
+    assert(r1 == -1);
+    assert(r2 == -1);
+    assert(r3 == 0);
+    assert(r4 == 0);
+  }
+
+  // isinf
+  {
+    s::cl_int4 r{ 0 };
+    {
+      s::buffer<s::cl_int4, 1> BufR(&r, s::range<1>(1));
+      s::queue myQueue;
+      myQueue.submit([&](s::handler &cgh) {
+        auto AccR = BufR.get_access<s::access::mode::write>(cgh);
+        cgh.single_task<class isinfF4F4>([=]() {
+          AccR[0] = s::isinf(s::cl_float4{ 0.5f, 0.4f, NAN, INFINITY });
+        });
+      });
+    }
+    s::cl_int r1 = r.x();
+    s::cl_int r2 = r.y();
+    s::cl_int r3 = r.z();
+    s::cl_int r4 = r.w();
+
+    assert(r1 == 0);
+    assert(r2 == 0);
+    assert(r3 == 0);
+    assert(r4 == -1);
+  }
+
+  // isnan
+  {
+    s::cl_int4 r{ 0 };
+    {
+      s::buffer<s::cl_int4, 1> BufR(&r, s::range<1>(1));
+      s::queue myQueue;
+      myQueue.submit([&](s::handler &cgh) {
+        auto AccR = BufR.get_access<s::access::mode::write>(cgh);
+        cgh.single_task<class isnanF4F4>([=]() {
+          AccR[0] = s::isnan(s::cl_float4{ 0.5f, 0.4f, NAN, INFINITY });
+        });
+      });
+    }
+    s::cl_int r1 = r.x();
+    s::cl_int r2 = r.y();
+    s::cl_int r3 = r.z();
+    s::cl_int r4 = r.w();
+
+    assert(r1 == 0);
+    assert(r2 == 0);
+    assert(r3 == -1);
+    assert(r4 == 0);
+  }
+
+  // isnormal
+  {
+    s::cl_int4 r{ 0 };
+    {
+      s::buffer<s::cl_int4, 1> BufR(&r, s::range<1>(1));
+      s::queue myQueue;
+      myQueue.submit([&](s::handler &cgh) {
+        auto AccR = BufR.get_access<s::access::mode::write>(cgh);
+        cgh.single_task<class isnormalF4F4>([=]() {
+          AccR[0] = s::isnormal(s::cl_float4{ 0.5f, 0.4f, NAN, INFINITY });
+        });
+      });
+    }
+    s::cl_int r1 = r.x();
+    s::cl_int r2 = r.y();
+    s::cl_int r3 = r.z();
+    s::cl_int r4 = r.w();
+
+    assert(r1 == -1);
+    assert(r2 == -1);
+    assert(r3 == 0);
+    assert(r4 == 0);
+  }
+
+  // isordered
+  {
+    s::cl_int4 r{ 0 };
+    {
+      s::buffer<s::cl_int4, 1> BufR(&r, s::range<1>(1));
+      s::queue myQueue;
+      myQueue.submit([&](s::handler &cgh) {
+        auto AccR = BufR.get_access<s::access::mode::write>(cgh);
+        cgh.single_task<class isorderedF4F4>([=]() {
+          AccR[0] = s::isordered(s::cl_float4{ 0.5f, 0.6f, NAN, INFINITY },
+                                 s::cl_float4{ 0.5f, 0.5f, 0.5f, 0.5f });
+        });
+      });
+    }
+    s::cl_int r1 = r.x();
+    s::cl_int r2 = r.y();
+    s::cl_int r3 = r.z();
+    s::cl_int r4 = r.w();
+
+    assert(r1 == -1);
+    assert(r2 == -1);
+    assert(r3 == 0);
+    assert(r4 == -1); // infinity is ordered.
+  }
+
+  // isunordered
+  {
+    s::cl_int4 r{ 0 };
+    {
+      s::buffer<s::cl_int4, 1> BufR(&r, s::range<1>(1));
+      s::queue myQueue;
+      myQueue.submit([&](s::handler &cgh) {
+        auto AccR = BufR.get_access<s::access::mode::write>(cgh);
+        cgh.single_task<class isunorderedF4F4>([=]() {
+          AccR[0] = s::isunordered(s::cl_float4{ 0.5f, 0.6f, NAN, INFINITY },
+                                   s::cl_float4{ 0.5f, 0.5f, 0.5f, 0.5f });
+        });
+      });
+    }
+    s::cl_int r1 = r.x();
+    s::cl_int r2 = r.y();
+    s::cl_int r3 = r.z();
+    s::cl_int r4 = r.w();
+
+    assert(r1 == 0);
+    assert(r2 == 0);
+    assert(r3 == -1);
+    assert(r4 == 0);
+  }
+
+  // signbit
+  {
+    s::cl_int4 r{ 0 };
+    {
+      s::buffer<s::cl_int4, 1> BufR(&r, s::range<1>(1));
+      s::queue myQueue;
+      myQueue.submit([&](s::handler &cgh) {
+        auto AccR = BufR.get_access<s::access::mode::write>(cgh);
+        cgh.single_task<class signbitF4>([=]() {
+          AccR[0] = s::signbit(s::cl_float4{ 0.5f, -12.0f, NAN, INFINITY });
+        });
+      });
+    }
+    s::cl_int r1 = r.x();
+    s::cl_int r2 = r.y();
+    s::cl_int r3 = r.z();
+    s::cl_int r4 = r.w();
+
+    assert(r1 == 0);
+    assert(r2 == -1);
+    assert(r3 == 0);
+    assert(r4 == 0);
+  }
+
+  // any: 1 when the most significant bit of at least one component is set.
+  // Calls to the device function with vector parameters work; scalars do not.
+  {
+    s::cl_int r{ 0 };
+    {
+      s::buffer<s::cl_int, 1> BufR(&r, s::range<1>(1));
+      s::queue myQueue;
+      myQueue.submit([&](s::handler &cgh) {
+        auto AccR = BufR.get_access<s::access::mode::write>(cgh);
+        cgh.single_task<class anyI4>([=]() {
+          AccR[0] = s::any(s::cl_int4{ -12, -12, 0, 1 });
+        });
+      });
+    }
+    s::cl_int r1 = r;
+
+    assert(r1 == 1);
+  }
+
+  // any, all components negative (every MSB set): expect 1.
+  // Calls to the device function with vector parameters work; scalars do not.
+  {
+    s::cl_int r{ 0 };
+    {
+      s::buffer<s::cl_int, 1> BufR(&r, s::range<1>(1));
+      s::queue myQueue;
+      myQueue.submit([&](s::handler &cgh) {
+        auto AccR = BufR.get_access<s::access::mode::write>(cgh);
+        cgh.single_task<class anyI4negative>([=]() {
+          AccR[0] = s::any(s::cl_int4{ -12, -12, -12, -12 });
+        });
+      });
+    }
+    s::cl_int r1 = r;
+
+    assert(r1 == 1);
+  }
+
+  // any, all components zero: expect 0.
+  // Calls to the device function with vector parameters work; scalars do not.
+  {
+    s::cl_int r{ 0 };
+    {
+      s::buffer<s::cl_int, 1> BufR(&r, s::range<1>(1));
+      s::queue myQueue;
+      myQueue.submit([&](s::handler &cgh) {
+        auto AccR = BufR.get_access<s::access::mode::write>(cgh);
+        cgh.single_task<class anyI4zero>([=]() {
+          AccR[0] = s::any(s::cl_int4{ 0, 0, 0, 0 });
+        });
+      });
+    }
+    s::cl_int r1 = r;
+
+    assert(r1 == 0);
+  }
+
+  // any, all components positive (no MSB set): expect 0.
+  // Calls to the device function with vector parameters work; scalars do not.
+  {
+    s::cl_int r{ 0 };
+    {
+      s::buffer<s::cl_int, 1> BufR(&r, s::range<1>(1));
+      s::queue myQueue;
+      myQueue.submit([&](s::handler &cgh) {
+        auto AccR = BufR.get_access<s::access::mode::write>(cgh);
+        cgh.single_task<class anyI4positive>([=]() {
+          AccR[0] = s::any(s::cl_int4{ 12, 12, 12, 12 });
+        });
+      });
+    }
+    s::cl_int r1 = r;
+
+    assert(r1 == 0);
+  }
+
+  // all: 1 only when the most significant bit of every component is set.
+  // Calls to the device function with vector parameters work; scalars do not.
+  {
+    s::cl_int r{ 0 };
+    {
+      s::buffer<s::cl_int, 1> BufR(&r, s::range<1>(1));
+      s::queue myQueue;
+      myQueue.submit([&](s::handler &cgh) {
+        auto AccR = BufR.get_access<s::access::mode::write>(cgh);
+        cgh.single_task<class allI4>([=]() {
+          AccR[0] = s::all(s::cl_int4{ -12, -12, -12, -12 });
+          // Infinity (positive or negative) or Nan are not integers.
+          // Passing them creates inconsistent results between host and device
+          // execution.
+        });
+      });
+    }
+    s::cl_int r1 = r;
+
+    assert(r1 == 1);
+  }
+
+  // all, all components negative (every MSB set): expect 1.
+  // Calls to the device function with vector parameters work; scalars do not.
+  {
+    s::cl_int r{ 0 };
+    {
+      s::buffer<s::cl_int, 1> BufR(&r, s::range<1>(1));
+      s::queue myQueue;
+      myQueue.submit([&](s::handler &cgh) {
+        auto AccR = BufR.get_access<s::access::mode::write>(cgh);
+        cgh.single_task<class allI4negative>([=]() {
+          AccR[0] = s::all(s::cl_int4{ -12, -12, -12, -12 });
+        });
+      });
+    }
+    s::cl_int r1 = r;
+
+    assert(r1 == 1);
+  }
+
+  // all, all components zero: expect 0.
+  // Calls to the device function with vector parameters work; scalars do not.
+  {
+    s::cl_int r{ 0 };
+    {
+      s::buffer<s::cl_int, 1> BufR(&r, s::range<1>(1));
+      s::queue myQueue;
+      myQueue.submit([&](s::handler &cgh) {
+        auto AccR = BufR.get_access<s::access::mode::write>(cgh);
+        cgh.single_task<class allI4zero>([=]() {
+          AccR[0] = s::all(s::cl_int4{ 0, 0, 0, 0 });
+        });
+      });
+    }
+    s::cl_int r1 = r;
+
+    assert(r1 == 0);
+  }
+
+  // all, all components positive (no MSB set): expect 0.
+  // Calls to the device function with vector parameters work; scalars do not.
+  {
+    s::cl_int r{ 0 };
+    {
+      s::buffer<s::cl_int, 1> BufR(&r, s::range<1>(1));
+      s::queue myQueue;
+      myQueue.submit([&](s::handler &cgh) {
+        auto AccR = BufR.get_access<s::access::mode::write>(cgh);
+        cgh.single_task<class allI4positive>([=]() {
+          AccR[0] = s::all(s::cl_int4{ 12, 12, 12, 12 });
+        });
+      });
+    }
+    s::cl_int r1 = r;
+
+    assert(r1 == 0);
+  }
+
+  // bitselect
+  {
+    s::cl_float4 r{ 0 };
+    {
+      s::buffer<s::cl_float4, 1> BufR(&r, s::range<1>(1));
+      s::queue myQueue;
+      myQueue.submit([&](s::handler &cgh) {
+        auto AccR = BufR.get_access<s::access::mode::write>(cgh);
+        cgh.single_task<class bitselectF4F4F4>([=]() {
+          AccR[0] = s::bitselect(s::cl_float4{ 112.112, 12.12, 0, 0.0 },
+                                 s::cl_float4{ 34.34, 23.23, 1, 0.0 },
+                                 s::cl_float4{ 3.3, 6.6, 1, 0.0 });
+        }); // Using NAN/INFINITY as any float produced consistent results
+            // between host and device.
+      });
+    }
+    s::cl_float r1 = r.x();
+    s::cl_float r2 = r.y();
+    s::cl_float r3 = r.z();
+    s::cl_float r4 = r.w();
+    // std::fabs, not unqualified abs: ::abs(int) would truncate the delta to 0.
+    assert(std::fabs(r1 - 80.5477f) < 0.0001);
+    assert(std::fabs(r2 - 18.2322f) < 0.0001);
+    assert(std::fabs(r3 - 1.0f) < 0.01);
+    assert(std::fabs(r4 - 0.0f) < 0.01);
+  }
+
+  // select: picks the second operand in lanes whose mask has the MSB set.
+  {
+    s::cl_float4 r{ 0 };
+    {
+      s::buffer<s::cl_float4, 1> BufR(&r, s::range<1>(1));
+      s::queue myQueue;
+      myQueue.submit([&](s::handler &cgh) {
+        auto AccR = BufR.get_access<s::access::mode::write>(cgh);
+        cgh.single_task<class selectF4F4I4>([=]() {
+          AccR[0] =
+              s::select(s::cl_float4{ 112.112f, 34.34f, 112.112f, 34.34f },
+                        s::cl_float4{ 34.34f, 112.112f, 34.34f, 112.112f },
+                        s::cl_int4{ 0, -1, 0, 1 });
+          // Using NAN/infinity as an input, which gets
+          // selected by -1, produces a NAN/infinity as expected.
+        });
+      });
+    }
+    s::cl_float r1 = r.x();
+    s::cl_float r2 = r.y();
+    s::cl_float r3 = r.z();
+    s::cl_float r4 = r.w();
+
+    assert(r1 == 112.112f);
+    assert(r2 == 112.112f);
+    assert(r3 == 112.112f);
+    assert(r4 == 34.34f);
+  }
+  // select with an unsigned mask: only lanes 2 and 4 have the mask MSB set.
+  {
+    s::vec<int, 4> r(0);
+    {
+      s::vec<int, 4> a(1, 2, 3, 4);
+      s::vec<int, 4> b(5, 6, 7, 8);
+      s::vec<unsigned int, 4> m(1u, 0x80000000u, 42u, 0x80001000u);
+      s::buffer<s::vec<int, 4>> A(&a, s::range<1>(1));
+      s::buffer<s::vec<int, 4>> B(&b, s::range<1>(1));
+      s::buffer<s::vec<unsigned int, 4>> M(&m, s::range<1>(1));
+      s::buffer<s::vec<int, 4>> R(&r, s::range<1>(1));
+      s::queue myQueue;
+      myQueue.submit([&](s::handler &cgh) {
+        auto AccA = A.get_access<s::access::mode::read>(cgh);
+        auto AccB = B.get_access<s::access::mode::read>(cgh);
+        auto AccM = M.get_access<s::access::mode::read>(cgh);
+        auto AccR = R.get_access<s::access::mode::write>(cgh);
+        cgh.single_task<class selectI4I4U4>([=]() {
+          AccR[0] = s::select(AccA[0], AccB[0], AccM[0]);
+        });
+      });
+    }
+    if (r.x() != 1 || r.y() != 6 || r.z() != 3 || r.w() != 8) {
+      std::cerr << "selectI4I4U4 test case failed!\n";
+      std::cerr << "Expected result: 1 6 3 8\n";
+      std::cerr << "Got: " << r.x() << " " << r.y() << " " << r.z() << " "
+                << r.w() << "\n";
+      return 1;
+    }
+  }
+
+  return 0;
+}
diff --git a/SYCL/Basic/config/allowlist.cpp b/SYCL/Basic/config/allowlist.cpp
new file mode 100644
index 0000000000..a56185c25a
--- /dev/null
+++ b/SYCL/Basic/config/allowlist.cpp
@@ -0,0 +1,90 @@
+// REQUIRES: cpu
+// RUN: %clangxx -fsycl %s -o %t.out
+//
+// RUN: env PRINT_DEVICE_INFO=1 %t.out > %t1.conf
+// RUN: env TEST_DEVICE_AVAILABLE=1 env SYCL_CONFIG_FILE_NAME=%t1.conf %t.out
+//
+// RUN: env PRINT_PLATFORM_INFO=1 %t.out > %t2.conf
+// RUN: env TEST_DEVICE_AVAILABLE=1 env SYCL_CONFIG_FILE_NAME=%t2.conf %t.out
+//
+// RUN: env TEST_DEVICE_IS_NOT_AVAILABLE=1 env SYCL_DEVICE_ALLOWLIST="PlatformName:{{SUCH NAME DOESN'T EXIST}}" %t.out
+
+#include <CL/sycl.hpp>
+#include <iostream>
+#include <cstdlib>
+#include <exception>
+#include <string>
+
+using namespace cl;
+
+static void replaceSpecialCharacters(std::string &Str) {
+  // Replace '(' and ')' with '.', which matches any character in a regexp.
+  for (char &Sym : Str)
+    Sym = ('(' == Sym || ')' == Sym) ? '.' : Sym;
+}
+
+int main() {
+  // The first of the four environment variables below that is set picks the mode.
+  // Print an allowlist keyed on platform name/version (no filter set for this run)
+  if (getenv("PRINT_PLATFORM_INFO")) {
+    for (const sycl::platform &Platform : sycl::platform::get_platforms())
+      if (!Platform.is_host()) {
+
+        std::string Name = Platform.get_info<sycl::info::platform::name>();
+        std::string Ver = Platform.get_info<sycl::info::platform::version>();
+        // The strings are used as regexp patterns, so '(' and ')' are replaced
+        // with '.' to keep them from being treated as grouping.
+        replaceSpecialCharacters(Name);
+        replaceSpecialCharacters(Ver);
+
+        std::cout << "SYCL_DEVICE_ALLOWLIST=PlatformName:{{" << Name
+                  << "}},PlatformVersion:{{" << Ver << "}}";
+
+        return 0;
+      }
+    throw std::runtime_error("Non host device is not found");
+  }
+
+  // Print an allowlist keyed on device name/driver version (no filter set)
+  if (getenv("PRINT_DEVICE_INFO")) {
+    for (const sycl::platform &Platform : sycl::platform::get_platforms())
+      if (!Platform.is_host()) {
+        const sycl::device Dev = Platform.get_devices().at(0);
+        std::string Name = Dev.get_info<sycl::info::device::name>();
+        std::string Ver = Dev.get_info<sycl::info::device::driver_version>();
+
+        // The strings are used as regexp patterns, so '(' and ')' are replaced
+        // with '.' to keep them from being treated as grouping.
+        replaceSpecialCharacters(Name);
+        replaceSpecialCharacters(Ver);
+
+        std::cout << "SYCL_DEVICE_ALLOWLIST=DeviceName:{{" << Name
+                  << "}},DriverVersion:{{" << Ver << "}}";
+
+        return 0;
+      }
+    throw std::runtime_error("Non host device is not found");
+  }
+
+  // Expected the allowlist to hold the output of one of the PRINT_* modes above
+  if (getenv("TEST_DEVICE_AVAILABLE")) {
+    for (const sycl::platform &Platform : sycl::platform::get_platforms())
+      if (!Platform.is_host()) {
+        if (Platform.get_devices().size() != 1)
+          throw std::runtime_error("Expected only one non host device.");
+
+        return 0;
+      }
+    throw std::runtime_error("Non host device is not found");
+  }
+
+  // Expected the allowlist to be set but to leave no non-host platform visible
+  if (getenv("TEST_DEVICE_IS_NOT_AVAILABLE")) {
+    for (const sycl::platform &Platform : sycl::platform::get_platforms())
+      if (!Platform.is_host())
+        throw std::runtime_error("Expected no non host device is available");
+    return 0;
+  }
+
+  throw std::runtime_error("Unhandled situation");
+}
diff --git a/SYCL/Basic/config/config.cpp b/SYCL/Basic/config/config.cpp
new file mode 100644
index 0000000000..d66e0392c6
--- /dev/null
+++ b/SYCL/Basic/config/config.cpp
@@ -0,0 +1,26 @@
+//==---- config.cpp --------------------------------------------------------==//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+// RUN: %clangxx -g -O0 -fsycl %s -o %t.out
+// RUN: echo "SYCL_PRINT_EXECUTION_GRAPH=always" > %t.cfg
+// RUN: env SYCL_CONFIG_FILE_NAME=%t.cfg %t.out
+// RUN: ls | grep dot
+// RUN: rm *.dot
+// RUN: env SYCL_PRINT_EXECUTION_GRAPH=always %t.out
+// RUN: ls | grep dot
+// RUN: rm *.dot
+// RUN: %t.out
+// RUN: ls | not grep dot
+
+#include <CL/sycl.hpp>
+
+using namespace cl;
+
+int main() {
+  sycl::buffer<int, 1> Buffer(sycl::range<1>(1)); // one-element buffer
+  auto HostAcc = Buffer.get_access<sycl::access::mode::read>(); // host read
+}
diff --git a/SYCL/Basic/device-code-split/Inputs/split-per-source-second-file.cpp b/SYCL/Basic/device-code-split/Inputs/split-per-source-second-file.cpp
new file mode 100644
index 0000000000..daa2258763
--- /dev/null
+++ b/SYCL/Basic/device-code-split/Inputs/split-per-source-second-file.cpp
@@ -0,0 +1,21 @@
+#include "split-per-source.h"
+
+void runKernelsFromFile2() {
+  cl::sycl::queue Queue;
+  int Result = 0;
+  {
+    cl::sycl::program Program(Queue.get_context());
+    cl::sycl::buffer<int, 1> DataBuf(&Result, cl::sycl::range<1>(1));
+    Program.build_with_kernel_type<File2Kern1>();
+    cl::sycl::kernel Kernel = Program.get_kernel<File2Kern1>();
+    // The program built for File2Kern1 must not contain the other kernels.
+    assert(!Program.has_kernel<File1Kern1>());
+    assert(!Program.has_kernel<File1Kern2>());
+    // Launch the pre-built kernel; it stores 3 into the one-element buffer.
+    Queue.submit([&](cl::sycl::handler &CGH) {
+      auto Out = DataBuf.get_access<cl::sycl::access::mode::read_write>(CGH);
+      CGH.single_task<File2Kern1>(Kernel, [=]() { Out[0] = 3; });
+    });
+  }
+  assert(Result == 3);
+}
diff --git a/SYCL/Basic/device-code-split/Inputs/split-per-source.h b/SYCL/Basic/device-code-split/Inputs/split-per-source.h
new file mode 100644
index 0000000000..fdb2dd4045
--- /dev/null
+++ b/SYCL/Basic/device-code-split/Inputs/split-per-source.h
@@ -0,0 +1,7 @@
+#include <CL/sycl.hpp>
+
+class File1Kern1;
+class File1Kern2;
+class File2Kern1;
+
+void runKernelsFromFile2();
diff --git a/SYCL/Basic/device-code-split/aot-accelerator.cpp b/SYCL/Basic/device-code-split/aot-accelerator.cpp
new file mode 100644
index 0000000000..823c647ad1
--- /dev/null
+++ b/SYCL/Basic/device-code-split/aot-accelerator.cpp
@@ -0,0 +1,5 @@
+// REQUIRES: aoc, accelerator
+
+// RUN: %clangxx -fsycl -fsycl-device-code-split=per_source -fsycl-targets=spir64_fpga-unknown-unknown-sycldevice -I %S/Inputs -o %t.out %S/split-per-source-main.cpp %S/Inputs/split-per-source-second-file.cpp
+// RUN: %ACC_RUN_PLACEHOLDER %t.out
+// XFAIL: *
diff --git a/SYCL/Basic/device-code-split/aot-cpu.cpp b/SYCL/Basic/device-code-split/aot-cpu.cpp
new file mode 100644
index 0000000000..78cd5df05d
--- /dev/null
+++ b/SYCL/Basic/device-code-split/aot-cpu.cpp
@@ -0,0 +1,4 @@
+// REQUIRES: opencl-aot, cpu
+
+// RUN: %clangxx -fsycl -fsycl-device-code-split=per_source -fsycl-targets=spir64_x86_64-unknown-unknown-sycldevice -I %S/Inputs -o %t.out %S/split-per-source-main.cpp %S/Inputs/split-per-source-second-file.cpp
+// RUN: %CPU_RUN_PLACEHOLDER %t.out
diff --git a/SYCL/Basic/device-code-split/aot-gpu.cpp b/SYCL/Basic/device-code-split/aot-gpu.cpp
new file mode 100644
index 0000000000..75c8aa15f6
--- /dev/null
+++ b/SYCL/Basic/device-code-split/aot-gpu.cpp
@@ -0,0 +1,11 @@
+// REQUIRES: ocloc, gpu
+// UNSUPPORTED: cuda
+// CUDA supports neither device code splitting nor SPIR.
+//
+// RUN: %clangxx -fsycl -fsycl-device-code-split=per_source \
+// RUN:   -fsycl-targets=spir64_gen-unknown-unknown-sycldevice \
+// RUN:   -Xsycl-target-backend=spir64_gen-unknown-unknown-sycldevice \
+// RUN:   "-device skl" -I %S/Inputs -o %t.out \
+// RUN:   %S/split-per-source-main.cpp \
+// RUN:   %S/Inputs/split-per-source-second-file.cpp
+// RUN: %GPU_RUN_PLACEHOLDER %t.out
diff --git a/SYCL/Basic/device-code-split/split-per-kernel.cpp b/SYCL/Basic/device-code-split/split-per-kernel.cpp
new file mode 100644
index 0000000000..f63e521b87
--- /dev/null
+++ b/SYCL/Basic/device-code-split/split-per-kernel.cpp
@@ -0,0 +1,68 @@
+// UNSUPPORTED: cuda
+// CUDA does not support device code splitting.
+//
+// RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple -fsycl-device-code-split=per_kernel -o %t.out %s
+// RUN: %CPU_RUN_PLACEHOLDER %t.out
+// RUN: %GPU_RUN_PLACEHOLDER %t.out
+// RUN: %ACC_RUN_PLACEHOLDER %t.out
+// REQUIRES: cpu, gpu, accelerator
+
+#include <CL/sycl.hpp>
+
+class Kern1;
+class Kern2;
+class Kern3;
+
+int main() {
+  cl::sycl::queue Queue;
+  int Result = 0;
+  {
+    cl::sycl::buffer<int, 1> DataBuf(&Result, cl::sycl::range<1>(1));
+    cl::sycl::program Program(Queue.get_context());
+    Program.build_with_kernel_type<Kern1>();
+    cl::sycl::kernel Kernel = Program.get_kernel<Kern1>();
+    // The program built for Kern1 must not contain the other two kernels.
+    assert(!Program.has_kernel<Kern2>());
+    assert(!Program.has_kernel<Kern3>());
+    // Launch the pre-built kernel; it stores 1 into the one-element buffer.
+    Queue.submit([&](cl::sycl::handler &CGH) {
+      auto Out = DataBuf.get_access<cl::sycl::access::mode::read_write>(CGH);
+      CGH.single_task<Kern1>(Kernel, [=]() { Out[0] = 1; });
+    });
+  }
+  assert(Result == 1);
+
+  {
+    cl::sycl::buffer<int, 1> DataBuf(&Result, cl::sycl::range<1>(1));
+    cl::sycl::program Program(Queue.get_context());
+    Program.build_with_kernel_type<Kern2>();
+    cl::sycl::kernel Kernel = Program.get_kernel<Kern2>();
+    // The program built for Kern2 must not contain the other two kernels.
+    assert(!Program.has_kernel<Kern1>());
+    assert(!Program.has_kernel<Kern3>());
+    // Launch the pre-built kernel; it stores 2 into the one-element buffer.
+    Queue.submit([&](cl::sycl::handler &CGH) {
+      auto Out = DataBuf.get_access<cl::sycl::access::mode::read_write>(CGH);
+      CGH.single_task<Kern2>(Kernel, [=]() { Out[0] = 2; });
+    });
+  }
+  assert(Result == 2);
+
+  {
+    cl::sycl::buffer<int, 1> DataBuf(&Result, cl::sycl::range<1>(1));
+    cl::sycl::program Program(Queue.get_context());
+    Program.build_with_kernel_type<Kern3>();
+    cl::sycl::kernel Kernel = Program.get_kernel<Kern3>();
+    // The program built for Kern3 must not contain the other two kernels.
+    assert(!Program.has_kernel<Kern1>());
+    assert(!Program.has_kernel<Kern2>());
+    // Launch the pre-built kernel; it stores 3 into the one-element buffer.
+    Queue.submit([&](cl::sycl::handler &CGH) {
+      auto Out = DataBuf.get_access<cl::sycl::access::mode::read_write>(CGH);
+      CGH.single_task<Kern3>(Kernel, [=]() { Out[0] = 3; });
+    });
+  }
+  assert(Result == 3);
+
+  return 0;
+}
diff --git a/SYCL/Basic/device-code-split/split-per-source-main.cpp b/SYCL/Basic/device-code-split/split-per-source-main.cpp
new file mode 100644
index 0000000000..e418451550
--- /dev/null
+++ b/SYCL/Basic/device-code-split/split-per-source-main.cpp
@@ -0,0 +1,54 @@
+// UNSUPPORTED: cuda
+// CUDA does not support device code splitting.
+//
+// RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple -fsycl-device-code-split=per_source -I %S/Inputs -o %t.out %s %S/Inputs/split-per-source-second-file.cpp
+// RUN: %CPU_RUN_PLACEHOLDER %t.out
+// RUN: %GPU_RUN_PLACEHOLDER %t.out
+// RUN: %ACC_RUN_PLACEHOLDER %t.out
+// REQUIRES: cpu, gpu, accelerator
+
+#include "Inputs/split-per-source.h"
+
+int main() {
+  cl::sycl::queue Queue;
+  int Result = 0;
+  {
+    cl::sycl::buffer<int, 1> DataBuf(&Result, cl::sycl::range<1>(1));
+    cl::sycl::program Program(Queue.get_context());
+    Program.build_with_kernel_type<File1Kern1>();
+    cl::sycl::kernel Kernel = Program.get_kernel<File1Kern1>();
+    // Both kernels of this source file must be present in the same program.
+    assert(Program.has_kernel<File1Kern2>());
+    // TODO uncomment once the KernelInfo in multiple translation units
+    // bug is fixed.
+    // assert(!Program.has_kernel<File2Kern1>());
+    // The pre-built kernel object is not passed to single_task for now.
+    Queue.submit([&](cl::sycl::handler &CGH) {
+      auto Out = DataBuf.get_access<cl::sycl::access::mode::read_write>(CGH);
+      CGH.single_task<File1Kern1>(/*Kernel,*/ [=]() { Out[0] = 1; });
+    });
+  }
+  assert(Result == 1);
+
+  {
+    cl::sycl::buffer<int, 1> DataBuf(&Result, cl::sycl::range<1>(1));
+    cl::sycl::program Program(Queue.get_context());
+    Program.build_with_kernel_type<File1Kern2>();
+    cl::sycl::kernel Kernel = Program.get_kernel<File1Kern2>();
+    // Both kernels of this source file must be present in the same program.
+    assert(Program.has_kernel<File1Kern1>());
+    // TODO uncomment once the KernelInfo in multiple translation units
+    // bug is fixed.
+    // assert(!Program.has_kernel<File2Kern1>());
+    // The pre-built kernel object is not passed to single_task for now.
+    Queue.submit([&](cl::sycl::handler &CGH) {
+      auto Out = DataBuf.get_access<cl::sycl::access::mode::read_write>(CGH);
+      CGH.single_task<File1Kern2>(/*Kernel,*/ [=]() { Out[0] = 2; });
+    });
+  }
+  assert(Result == 2);
+
+  runKernelsFromFile2();
+
+  return 0;
+}
diff --git a/SYCL/Basic/devicelib/assert-windows.cpp b/SYCL/Basic/devicelib/assert-windows.cpp
new file mode 100644
index 0000000000..67f8830523
--- /dev/null
+++ b/SYCL/Basic/devicelib/assert-windows.cpp
@@ -0,0 +1,75 @@
+// REQUIRES: cpu,windows
+//
+// RUN: %clangxx -fsycl -c %s -o %t.o
+// RUN: %clangxx -fsycl %t.o %sycl_libs_dir/../bin/libsycl-msvc.o -o %t.out
+//
+// MSVC implementation of assert does not call an unreachable built-in, so the
+// program doesn't terminate when fallback is used.
+//
+// FIXME: SPIR-V Unreachable should be called from the fallback
+// explicitly. Since the test is going to crash, we'll have to follow a similar
+// approach as on Linux - call the test in a subprocess.
+//
+// RUN: env SYCL_PI_TRACE=1 SYCL_DEVICELIB_INHIBIT_NATIVE=1 CL_CONFIG_USE_VECTORIZER=False SYCL_DEVICE_TYPE=CPU %t.out >%t.stdout.pi.fallback
+// RUN: env SHOULD_CRASH=1 SYCL_DEVICELIB_INHIBIT_NATIVE=1 CL_CONFIG_USE_VECTORIZER=False SYCL_DEVICE_TYPE=CPU %t.out >%t.stdout.msg.fallback
+//
+// RUN: FileCheck %s --check-prefix=CHECK-MESSAGE --input-file %t.stdout.msg.fallback
+// CHECK-MESSAGE: {{.*}}assert-windows.cpp:{{[0-9]+}}: (null): global id: [{{[0-3]}},0,0], local id: [{{[0-3]}},0,0] Assertion `accessorC[wiID] == 0 && "Invalid value"` failed.
+//
+// RUN: FileCheck %s --input-file %t.stdout.pi.fallback --check-prefix=CHECK-FALLBACK
+// CHECK-FALLBACK: ---> piProgramLink
+
+#include <CL/sycl.hpp>
+#include <array>
+#include <assert.h>
+
+using namespace cl::sycl;
+
+constexpr auto sycl_read = cl::sycl::access::mode::read;
+constexpr auto sycl_write = cl::sycl::access::mode::write;
+
+template <typename T, size_t N>
+void simple_vadd(const std::array<T, N> &VA, const std::array<T, N> &VB,
+                 std::array<T, N> &VC) { // element-wise VC = VA + VB
+  queue deviceQueue([](cl::sycl::exception_list ExceptionList) {
+    for (cl::sycl::exception_ptr_class ExceptionPtr : ExceptionList) {
+      try {
+        std::rethrow_exception(ExceptionPtr);
+      } catch (cl::sycl::exception &E) {
+        std::cerr << E.what() << std::endl;
+      } catch (...) {
+        std::cerr << "Unknown async exception was caught." << std::endl;
+      }
+    }
+  });
+  // 1 when the SHOULD_CRASH environment variable is set, 0 otherwise.
+  int shouldCrash = getenv("SHOULD_CRASH") ? 1 : 0;
+
+  cl::sycl::range<1> numOfItems{N};
+  cl::sycl::buffer<T, 1> bufferA(VA.data(), numOfItems);
+  cl::sycl::buffer<T, 1> bufferB(VB.data(), numOfItems);
+  cl::sycl::buffer<T, 1> bufferC(VC.data(), numOfItems);
+  // One work-item per element: C[i] = A[i] + B[i].
+  deviceQueue.submit([&](cl::sycl::handler &cgh) {
+    auto accessorA = bufferA.template get_access<sycl_read>(cgh);
+    auto accessorB = bufferB.template get_access<sycl_read>(cgh);
+    auto accessorC = bufferC.template get_access<sycl_write>(cgh);
+    // The assert text below is matched verbatim by the CHECK-MESSAGE line.
+    cgh.parallel_for<class SimpleVaddT>(numOfItems, [=](cl::sycl::id<1> wiID) {
+      accessorC[wiID] = accessorA[wiID] + accessorB[wiID];
+      if (shouldCrash) {
+        assert(accessorC[wiID] == 0 && "Invalid value");
+      }
+    });
+  });
+  deviceQueue.wait_and_throw();
+}
+
+int main() {
+  // Every element-wise sum is non-zero, so the device assert can fire.
+  std::array<int, 3> Lhs{{1, 2, 3}};
+  std::array<int, 3> Rhs{{1, 2, 3}};
+  std::array<int, 3> Sum{{0, 0, 0}};
+  simple_vadd(Lhs, Rhs, Sum);
+  return EXIT_SUCCESS;
+}
diff --git a/SYCL/Basic/devicelib/assert.cpp b/SYCL/Basic/devicelib/assert.cpp
new file mode 100644
index 0000000000..343d949b74
--- /dev/null
+++ b/SYCL/Basic/devicelib/assert.cpp
@@ -0,0 +1,215 @@
+// REQUIRES: cpu,linux
+// RUN: %clangxx -fsycl -c %s -o %t.o
+// RUN: %clangxx -fsycl %t.o %sycl_libs_dir/libsycl-glibc.o -o %t.out
+// (see the other RUN lines below; it is a bit complicated)
+//
+// assert() call in device code guarantees nothing: on some devices it behaves
+// in the usual way and terminates the program. On other devices it can print an
+// error message and *continue* execution. Less capable devices can even ignore
+// an assert!
+//
+// This makes testing assert() somewhat difficult, and we have to rely on
+// the implementation details to make sure that both "native" and "fallback"
+// implementations work as expected.
+//
+// This test works only on Intel OpenCL CPU implementation, which is known to
+// behave as follows:
+//
+//   Fallback mode (aka the best we can do by following the OpenCL spec):
+//     1. Assertion condition is printed to *stdout* by the OpenCL printf().
+//     2. Process (both host and device) is terminated by a SIGSEGV.
+//
+//   Native mode (same behavior as libc assert on CPU):
+//     1. Assertion condition is printed to *stderr*.
+//     2. Process (both host and device) is terminated by a SIGABRT.
+//
+// Other devices are "covered" by the assert-dummy.cpp test, which doesn't
+// verify anything except a successful compilation for a device.
+//
+// FIXME: assert-dummy.cpp is not implemented yet, so other devices are not
+// covered.
+//
+// How the test works:
+// -------------------
+//
+//   1. First we verify that a call sequence in SYCL Runtime is correct:
+//
+//      - in the fallback mode we have to link an additional library that
+//        provides a generic implementation of assert().
+//
+//      - in the native mode we don't link anything, and call clBuildProgram for
+//        a user program alone.
+//
+//   2. Then we test that there is actually a difference between the two
+//      modes. Since the CPU device is the only device that supports this
+//      extension natively, we catch the difference between the fallback and the
+//      native modes: SIGSEGV should occur in the fallback mode, SIGABRT in the
+//      native mode.
+//
+//      In order to check the signal we fork() and let the child die. Then we
+//      verify how it was terminated. EXPECTED_SIGNAL environment variable
+//      controls the expected result.
+//
+//   3. We also test that a message is printed to the corresponding fd: stdout
+//      for the fallback mode and stderr for the native mode. In the fallback
+//      mode the test process dies right after a call to the OpenCL printf(), so
+//      the message can still be buffered by stdio. We turn stdout buffering
+//      off explicitly.
+//
+//   4. We want to check both compilation flow in (1) and the message in (3),
+//      but these messages can interleave and fail to match. To avoid this,
+//      first run with SYCL_PI_TRACE and collect a trace, and then with
+//      SHOULD_CRASH (without SYCL_PI_TRACE) to collect an error message.
+//
+// SYCL_DEVICELIB_INHIBIT_NATIVE=1 environment variable is used to force a mode
+// in SYCL Runtime, so it doesn't look into the device extension list and always
+// links the fallback library.
+//
+//
+// We also skip the native test entirely (see SKIP_IF_NO_EXT), since the assert
+// extension is a new feature and may not be supported by the runtime used with
+// SYCL.
+//
+// Overall this sounds stable enough. What could possibly go wrong?
+//
+// RUN: env SYCL_PI_TRACE=2 SHOULD_CRASH=1 SYCL_DEVICE_TYPE=CPU EXPECTED_SIGNAL=SIGABRT SKIP_IF_NO_EXT=1 %t.out 2>%t.stderr.native >%t.stdout.native
+// RUN: FileCheck %s --input-file %t.stdout.native --check-prefixes=CHECK-NATIVE || FileCheck %s --input-file %t.stderr.native --check-prefix CHECK-NOTSUPPORTED
+// RUN: FileCheck %s --input-file %t.stderr.native --check-prefixes=CHECK-MESSAGE || FileCheck %s --input-file %t.stderr.native --check-prefix CHECK-NOTSUPPORTED
+//
+// RUN: env SYCL_PI_TRACE=2 SYCL_DEVICELIB_INHIBIT_NATIVE=cl_intel_devicelib_assert SYCL_DEVICE_TYPE=CPU  %t.out >%t.stdout.pi.fallback
+// RUN: env SYCL_DEVICELIB_INHIBIT_NATIVE=cl_intel_devicelib_assert SYCL_DEVICE_TYPE=CPU  %t.out >%t.stdout.msg.fallback
+// RUN: FileCheck %s --input-file %t.stdout.pi.fallback --check-prefixes=CHECK-FALLBACK
+// RUN: FileCheck %s --input-file %t.stdout.msg.fallback --check-prefixes=CHECK-MESSAGE
+//
+// CHECK-NATIVE:   ---> piProgramBuild
+// CHECK-FALLBACK: ---> piProgramLink
+//
+// Skip the test if the CPU RT doesn't support the extension yet:
+// CHECK-NOTSUPPORTED: Device has no support for cl_intel_devicelib_assert
+//
+// Anyway, the same message has to be printed for both the fallback and the
+// native modes (fallback prints to stdout, while native prints to stderr; we
+// already handled this difference in the RUN lines):
+//
+// CHECK-MESSAGE: {{.*}}assert.cpp:{{[0-9]+}}: auto simple_vadd(const std::array<int, 3UL> &, const std::array<int, 3UL> &, std::array<int, 3UL> &)::(anonymous class)::operator()(cl::sycl::handler &)::(anonymous class)::operator()(cl::sycl::id<1>) const: global id: [{{[0-3]}},0,0], local id: [{{[0-3]}},0,0] Assertion `accessorC[wiID] == 0 && "Invalid value"` failed.
+//
+// Note that the work-item that hits the assert first may vary, since the order
+// of execution is undefined. We catch only the first one (whatever id it is).
+
+#include <CL/sycl.hpp>
+#include <array>
+#include <assert.h>
+#include <stdlib.h>
+#include <sys/types.h>
+#include <sys/wait.h>
+#include <unistd.h>
+
+using namespace cl::sycl;
+
+constexpr auto sycl_read = cl::sycl::access::mode::read;
+constexpr auto sycl_write = cl::sycl::access::mode::write;
+
+const int EXIT_SKIP_TEST = 42;
+
+template <typename T, size_t N>
+void simple_vadd(const std::array<T, N> &VA, const std::array<T, N> &VB,
+                 std::array<T, N> &VC) {
+  // Asynchronous SYCL errors are printed to stderr; nothing is rethrown.
+  auto reportAsyncErrors = [](cl::sycl::exception_list Errors) {
+    for (cl::sycl::exception_ptr_class Err : Errors) {
+      try {
+        std::rethrow_exception(Err);
+      } catch (cl::sycl::exception &SyclErr) {
+        std::cerr << SyclErr.what() << std::endl;
+      } catch (...) {
+        std::cerr << "Unknown async exception was caught." << std::endl;
+      }
+    }
+  };
+  queue deviceQueue(reportAsyncErrors);
+
+  // Scan the extension list of the queue's device for native assert support.
+  device chosenDevice = deviceQueue.get_device();
+  bool hasNativeAssert = false;
+  for (const auto &extName : chosenDevice.get_info<info::device::extensions>())
+    hasNativeAssert = hasNativeAssert || extName == "cl_intel_devicelib_assert";
+  if (!hasNativeAssert && getenv("SKIP_IF_NO_EXT")) {
+    fprintf(stderr, "Device has no support for cl_intel_devicelib_assert, "
+                    "skipping the test\n");
+    exit(EXIT_SKIP_TEST);
+  }
+
+  cl::sycl::range<1> itemCount{N};
+  cl::sycl::buffer<T, 1> inBufA(VA.data(), itemCount);
+  cl::sycl::buffer<T, 1> inBufB(VB.data(), itemCount);
+  cl::sycl::buffer<T, 1> outBuf(VC.data(), itemCount);
+
+  deviceQueue.submit([&](cl::sycl::handler &cgh) {
+    auto inA = inBufA.template get_access<sycl_read>(cgh);
+    auto inB = inBufB.template get_access<sycl_read>(cgh);
+    auto accessorC = outBuf.template get_access<sycl_write>(cgh);
+
+    cgh.parallel_for<class SimpleVaddT>(itemCount, [=](cl::sycl::id<1> wiID) {
+      accessorC[wiID] = inA[wiID] + inB[wiID];
+      assert(accessorC[wiID] == 0 && "Invalid value");
+    });
+  });
+  deviceQueue.wait_and_throw();
+}
+
+int main() {
+  // The kernel runs in a forked child; the parent only judges how it ended.
+  pid_t child = fork();
+  if (child < 0) {
+    perror("fork failed");
+    return 1;
+  }
+  if (child > 0) {
+    int status = 0;
+    waitpid(child, &status, 0);
+    // The child exits with EXIT_SKIP_TEST when it decided to skip the test.
+    if (WIFEXITED(status) && WEXITSTATUS(status) == EXIT_SKIP_TEST)
+      return 0;
+    if (!getenv("SHOULD_CRASH")) {
+      // No crash expected: only termination by a signal is an error.
+      if (WIFSIGNALED(status)) {
+        fprintf(stderr, "error: process should not terminate\n");
+        return 1;
+      }
+      return 0;
+    }
+    if (!WIFSIGNALED(status)) {
+      fprintf(stderr, "error: process did not terminate by a signal\n");
+      return 1;
+    }
+    // Translate EXPECTED_SIGNAL; `expected` stays 0 when it is not set.
+    int expected = 0;
+    if (const char *env = getenv("EXPECTED_SIGNAL")) {
+      if (0 == strcmp(env, "SIGABRT"))
+        expected = SIGABRT;
+      else if (0 == strcmp(env, "SIGSEGV"))
+        expected = SIGSEGV;
+      if (!expected) {
+        fprintf(stderr, "EXPECTED_SIGNAL should be set to either \"SIGABRT\", "
+                        "or \"SIGSEGV\"!\n");
+        return 1;
+      }
+    }
+    int sig = WTERMSIG(status);
+    if (sig != expected) {
+      fprintf(stderr, "error: expected signal %d, got %d\n", expected, sig);
+      return 1;
+    }
+    return 0;
+  }
+  // Child: disable stdout buffering so that an assert message written to
+  // stdout is not lost when the process dies right after printing it.
+  if (setvbuf(stdout, NULL, _IONBF, 0)) {
+    perror("failed to turn off bufferization on stdout");
+    return 1;
+  }
+  std::array<int, 3> A = {1, 2, 3};
+  std::array<int, 3> B = {1, 2, 3};
+  std::array<int, 3> C = {0, 0, 0};
+  simple_vadd(A, B, C);
+}
diff --git a/SYCL/Basic/devicelib/c99_complex_math_fp64_test.cpp b/SYCL/Basic/devicelib/c99_complex_math_fp64_test.cpp
new file mode 100644
index 0000000000..c039025b11
--- /dev/null
+++ b/SYCL/Basic/devicelib/c99_complex_math_fp64_test.cpp
@@ -0,0 +1,256 @@
+// UNSUPPORTED: windows
+// RUN: %clangxx -fsycl -c %s -o %t.o
+// RUN: %clangxx -fsycl %t.o %sycl_libs_dir/libsycl-complex-fp64.o -o %t.out
+#include <CL/sycl.hpp>
+#include <cassert>
+#include <complex.h>
+#include "math_utils.hpp"
+#ifndef CMPLX // CMPLX was not defined by the headers above; build it by hand.
+#define CMPLX(r, i) ((double __complex__){(double)(r), (double)(i)})
+#endif // CMPLX
+// True when both the real and the imaginary parts pass approx_equal_fp().
+bool approx_equal_c99_cmplx(double __complex__ x, double __complex__ y) {
+  return approx_equal_fp(creal(x), creal(y)) &&
+         approx_equal_fp(cimag(x), cimag(y));
+}
+namespace s = cl::sycl;
+constexpr s::access::mode sycl_read = s::access::mode::read;
+constexpr s::access::mode sycl_write = s::access::mode::write;
+
+class DeviceComplexTimes;
+// Multiplies two complex arrays element-wise in a kernel and checks the products.
+void device_c99_complex_times(s::queue &deviceQueue) {
+  constexpr size_t numElems = 4;
+  double __complex__ lhs[numElems] = {CMPLX(0, 1), CMPLX(1, 1),
+                                      CMPLX(2, 3), CMPLX(4, 5)};
+  double __complex__ rhs[numElems] = {CMPLX(1, 1), CMPLX(2, 1),
+                                      CMPLX(2, 2), CMPLX(3, 4)};
+  double __complex__ expected[numElems] = {CMPLX(-1, 1), CMPLX(1, 3),
+                                           CMPLX(-2, 10), CMPLX(-8, 31)};
+  double __complex__ actual[numElems];
+  s::range<1> numOfItems{numElems};
+  // The buffers are destroyed at the end of this scope, before `actual` is read.
+  {
+    s::buffer<double __complex__, 1> lhsBuf(lhs, numOfItems);
+    s::buffer<double __complex__, 1> rhsBuf(rhs, numOfItems);
+    s::buffer<double __complex__, 1> outBuf(actual, numOfItems);
+    deviceQueue.submit([&](s::handler &cgh) {
+      auto lhsAcc = lhsBuf.get_access<sycl_read>(cgh);
+      auto rhsAcc = rhsBuf.get_access<sycl_read>(cgh);
+      auto outAcc = outBuf.get_access<sycl_write>(cgh);
+      cgh.parallel_for<class DeviceComplexTimes>(numOfItems, [=](s::id<1> wi) {
+        outAcc[wi] = lhsAcc[wi] * rhsAcc[wi];
+      });
+    });
+  }
+
+  for (size_t idx = 0; idx != numElems; ++idx)
+    assert(approx_equal_c99_cmplx(actual[idx], expected[idx]));
+}
+
+class DeviceComplexDivides;
+// Divides two complex arrays element-wise in a kernel and checks the quotients.
+void device_c99_complex_divides(s::queue &deviceQueue) {
+  constexpr size_t numElems = 8;
+  double __complex__ dividends[numElems] = {CMPLX(-1, 1), CMPLX(1, 3),
+                                            CMPLX(-2, 10), CMPLX(-8, 31),
+                                            CMPLX(4, 2), CMPLX(-1, 0),
+                                            CMPLX(0, 10), CMPLX(0, 0)};
+  double __complex__ divisors[numElems] = {CMPLX(0, 1), CMPLX(1, 1),
+                                           CMPLX(2, 3), CMPLX(4, 5),
+                                           CMPLX(2, 0), CMPLX(0, 1),
+                                           CMPLX(0, 5), CMPLX(1, 0)};
+  double __complex__ expected[numElems] = {CMPLX(1, 1), CMPLX(2, 1),
+                                           CMPLX(2, 2), CMPLX(3, 4),
+                                           CMPLX(2, 1), CMPLX(0, 1),
+                                           CMPLX(2, 0), CMPLX(0, 0)};
+  double __complex__ actual[numElems];
+  s::range<1> numOfItems{numElems};
+  // The buffers are destroyed at the end of this scope, before `actual` is read.
+  {
+    s::buffer<double __complex__, 1> dividendBuf(dividends, numOfItems);
+    s::buffer<double __complex__, 1> divisorBuf(divisors, numOfItems);
+    s::buffer<double __complex__, 1> outBuf(actual, numOfItems);
+    deviceQueue.submit([&](s::handler &cgh) {
+      auto dividendAcc = dividendBuf.get_access<sycl_read>(cgh);
+      auto divisorAcc = divisorBuf.get_access<sycl_read>(cgh);
+      auto outAcc = outBuf.get_access<sycl_write>(cgh);
+      cgh.parallel_for<class DeviceComplexDivides>(numOfItems, [=](s::id<1> wi) {
+        outAcc[wi] = dividendAcc[wi] / divisorAcc[wi];
+      });
+    });
+  }
+
+  for (size_t idx = 0; idx != numElems; ++idx)
+    assert(approx_equal_c99_cmplx(actual[idx], expected[idx]));
+}
+
+class DeviceComplexSqrt;
+// Computes csqrt() of each input in a kernel and checks the roots.
+void device_c99_complex_sqrt(s::queue &deviceQueue) {
+  constexpr size_t numElems = 4;
+  double __complex__ input[numElems] = {CMPLX(-1, 0), CMPLX(0, 2),
+                                        CMPLX(4, 0), CMPLX(-5, 12)};
+  double __complex__ expected[numElems] = {CMPLX(0, 1), CMPLX(1, 1),
+                                           CMPLX(2, 0), CMPLX(2, 3)};
+  double __complex__ actual[numElems];
+  s::range<1> numOfItems{numElems};
+  { // Buffers die at the end of this scope, before `actual` is read.
+    s::buffer<double __complex__, 1> inBuf(input, numOfItems);
+    s::buffer<double __complex__, 1> outBuf(actual, numOfItems);
+    deviceQueue.submit([&](s::handler &cgh) {
+      auto inAcc = inBuf.get_access<sycl_read>(cgh);
+      auto outAcc = outBuf.get_access<sycl_write>(cgh);
+      cgh.parallel_for<class DeviceComplexSqrt>(numOfItems, [=](s::id<1> wi) {
+        outAcc[wi] = csqrt(inAcc[wi]);
+      });
+    });
+  }
+
+  for (size_t idx = 0; idx != numElems; ++idx)
+    assert(approx_equal_c99_cmplx(actual[idx], expected[idx]));
+}
+
+class DeviceComplexAbs;
+// Computes cabs() of each input in a kernel and checks the magnitudes.
+void device_c99_complex_abs(s::queue &deviceQueue) {
+  constexpr size_t numElems = 4;
+  double __complex__ input[numElems] = {CMPLX(0, 0), CMPLX(3, 4),
+                                        CMPLX(12, 5), CMPLX(INFINITY, 1)};
+  double expected[numElems] = {0, 5, 13, INFINITY};
+  double actual[numElems];
+  s::range<1> numOfItems{numElems};
+  { // Buffers die at the end of this scope, before `actual` is read.
+    s::buffer<double __complex__, 1> inBuf(input, numOfItems);
+    s::buffer<double, 1> outBuf(actual, numOfItems);
+    deviceQueue.submit([&](s::handler &cgh) {
+      auto inAcc = inBuf.get_access<sycl_read>(cgh);
+      auto outAcc = outBuf.get_access<sycl_write>(cgh);
+      cgh.parallel_for<class DeviceComplexAbs>(numOfItems, [=](s::id<1> wi) {
+        outAcc[wi] = cabs(inAcc[wi]);
+      });
+    });
+  }
+
+  for (size_t idx = 0; idx != numElems; ++idx)
+    assert(approx_equal_fp(actual[idx], expected[idx]));
+}
+
+class DeviceComplexExp;
+// Computes cexp() of each input in a kernel and checks the results.
+void device_c99_complex_exp(s::queue &deviceQueue) {
+  constexpr size_t numElems = 4;
+  double __complex__ input[numElems] = {CMPLX(0, 0), CMPLX(0, M_PI_2),
+                                        CMPLX(0, M_PI), CMPLX(1, M_PI_2)};
+  double __complex__ expected[numElems] = {CMPLX(1, 0), CMPLX(0, 1),
+                                           CMPLX(-1, 0), CMPLX(0, M_E)};
+  double __complex__ actual[numElems];
+  s::range<1> numOfItems{numElems};
+  { // Buffers die at the end of this scope, before `actual` is read.
+    s::buffer<double __complex__, 1> inBuf(input, numOfItems);
+    s::buffer<double __complex__, 1> outBuf(actual, numOfItems);
+    deviceQueue.submit([&](s::handler &cgh) {
+      auto inAcc = inBuf.get_access<sycl_read>(cgh);
+      auto outAcc = outBuf.get_access<sycl_write>(cgh);
+      cgh.parallel_for<class DeviceComplexExp>(numOfItems, [=](s::id<1> wi) {
+        outAcc[wi] = cexp(inAcc[wi]);
+      });
+    });
+  }
+
+  for (size_t idx = 0; idx != numElems; ++idx)
+    assert(approx_equal_c99_cmplx(actual[idx], expected[idx]));
+}
+
+class DeviceComplexLog;
+// Computes clog() of each input in a kernel and checks the results.
+void device_c99_complex_log(s::queue &deviceQueue) {
+  constexpr size_t numElems = 4;
+  double __complex__ input[numElems] = {CMPLX(1, 0), CMPLX(0, 1),
+                                        CMPLX(-1, 0), CMPLX(0, M_E)};
+  double __complex__ expected[numElems] = {CMPLX(0, 0), CMPLX(0, M_PI_2),
+                                           CMPLX(0, M_PI), CMPLX(1, M_PI_2)};
+  double __complex__ actual[numElems];
+  s::range<1> numOfItems{numElems};
+  { // Buffers die at the end of this scope, before `actual` is read.
+    s::buffer<double __complex__, 1> inBuf(input, numOfItems);
+    s::buffer<double __complex__, 1> outBuf(actual, numOfItems);
+    deviceQueue.submit([&](s::handler &cgh) {
+      auto inAcc = inBuf.get_access<sycl_read>(cgh);
+      auto outAcc = outBuf.get_access<sycl_write>(cgh);
+      cgh.parallel_for<class DeviceComplexLog>(numOfItems, [=](s::id<1> wi) {
+        outAcc[wi] = ::clog(inAcc[wi]);
+      });
+    });
+  }
+
+  for (size_t idx = 0; idx != numElems; ++idx)
+    assert(approx_equal_c99_cmplx(actual[idx], expected[idx]));
+}
+
+class DeviceComplexSin;
+// Computes csin() of each input in a kernel and checks the results.
+void device_c99_complex_sin(s::queue &deviceQueue) {
+  constexpr size_t numElems = 2;
+  double __complex__ input[numElems] = {CMPLX(0, 0), CMPLX(M_PI_2, 0)};
+  double __complex__ expected[numElems] = {CMPLX(0, 0), CMPLX(1, 0)};
+  double __complex__ actual[numElems];
+  s::range<1> numOfItems{numElems};
+  { // Buffers die at the end of this scope, before `actual` is read.
+    s::buffer<double __complex__, 1> inBuf(input, numOfItems);
+    s::buffer<double __complex__, 1> outBuf(actual, numOfItems);
+    deviceQueue.submit([&](s::handler &cgh) {
+      auto inAcc = inBuf.get_access<sycl_read>(cgh);
+      auto outAcc = outBuf.get_access<sycl_write>(cgh);
+      cgh.parallel_for<class DeviceComplexSin>(numOfItems, [=](s::id<1> wi) {
+        outAcc[wi] = csin(inAcc[wi]);
+      });
+    });
+  }
+
+  for (size_t idx = 0; idx != numElems; ++idx)
+    assert(approx_equal_c99_cmplx(actual[idx], expected[idx]));
+}
+
+class DeviceComplexCos;
+// Computes ccos() of each input in a kernel and checks the results.
+void device_c99_complex_cos(s::queue &deviceQueue) {
+  constexpr size_t numElems = 2;
+  double __complex__ input[numElems] = {CMPLX(0, 0), CMPLX(M_PI, 0)};
+  double __complex__ expected[numElems] = {CMPLX(1, 0), CMPLX(-1, 0)};
+  double __complex__ actual[numElems];
+  s::range<1> numOfItems{numElems};
+  { // Buffers die at the end of this scope, before `actual` is read.
+    s::buffer<double __complex__, 1> inBuf(input, numOfItems);
+    s::buffer<double __complex__, 1> outBuf(actual, numOfItems);
+    deviceQueue.submit([&](s::handler &cgh) {
+      auto inAcc = inBuf.get_access<sycl_read>(cgh);
+      auto outAcc = outBuf.get_access<sycl_write>(cgh);
+      cgh.parallel_for<class DeviceComplexCos>(numOfItems, [=](s::id<1> wi) {
+        outAcc[wi] = ccos(inAcc[wi]);
+      });
+    });
+  }
+
+  for (size_t idx = 0; idx != numElems; ++idx)
+    assert(approx_equal_c99_cmplx(actual[idx], expected[idx]));
+}
+
+void device_c99_complex_test(s::queue &deviceQueue) {
+  // Run every C99 complex check, in a fixed order, on the same queue.
+  void (*const checks[])(s::queue &) = {
+      device_c99_complex_times, device_c99_complex_divides,
+      device_c99_complex_sqrt,  device_c99_complex_abs,
+      device_c99_complex_exp,   device_c99_complex_log,
+      device_c99_complex_sin,   device_c99_complex_cos};
+  for (auto check : checks)
+    check(deviceQueue);
+}
+
+int main() {
+  s::queue deviceQueue;
+  if (!deviceQueue.get_device().has_extension("cl_khr_fp64"))
+    return 0; // no cl_khr_fp64 on this device: nothing to test
+  device_c99_complex_test(deviceQueue);
+  std::cout << "Pass" << std::endl;
+}
diff --git a/SYCL/Basic/devicelib/c99_complex_math_test.cpp b/SYCL/Basic/devicelib/c99_complex_math_test.cpp
new file mode 100644
index 0000000000..704d80bd01
--- /dev/null
+++ b/SYCL/Basic/devicelib/c99_complex_math_test.cpp
@@ -0,0 +1,258 @@
+// UNSUPPORTED: windows
+// RUN: %clangxx -fsycl -c %s -o %t.o
+// RUN: %clangxx -fsycl %t.o %sycl_libs_dir/libsycl-complex.o -o %t.out
+#include <CL/sycl.hpp>
+#include <cassert>
+#include <complex.h>
+#include "math_utils.hpp"
+
+#ifndef CMPLXF // CMPLXF was not defined by the headers above; build it by hand.
+#define CMPLXF(r, i) ((float __complex__){(float)(r), (float)(i)})
+#endif // CMPLXF
+// True when both the real and the imaginary parts pass approx_equal_fp().
+bool approx_equal_c99_cmplxf(float __complex__ x, float __complex__ y) {
+  return approx_equal_fp(crealf(x), crealf(y)) &&
+         approx_equal_fp(cimagf(x), cimagf(y));
+}
+namespace s = cl::sycl;
+constexpr s::access::mode sycl_read = s::access::mode::read;
+constexpr s::access::mode sycl_write = s::access::mode::write;
+
+class DeviceComplexTimes;
+// Multiplies two complex arrays element-wise in a kernel and checks the products.
+void device_c99_complex_times(s::queue &deviceQueue) {
+  constexpr size_t numElems = 4;
+  float __complex__ lhs[numElems] = {CMPLXF(0, 1), CMPLXF(1, 1),
+                                     CMPLXF(2, 3), CMPLXF(4, 5)};
+  float __complex__ rhs[numElems] = {CMPLXF(1, 1), CMPLXF(2, 1),
+                                     CMPLXF(2, 2), CMPLXF(3, 4)};
+  float __complex__ expected[numElems] = {CMPLXF(-1, 1), CMPLXF(1, 3),
+                                          CMPLXF(-2, 10), CMPLXF(-8, 31)};
+  float __complex__ actual[numElems];
+  s::range<1> numOfItems{numElems};
+
+  // The buffers are destroyed at the end of this scope, before `actual` is read.
+  {
+    s::buffer<float __complex__, 1> lhsBuf(lhs, numOfItems);
+    s::buffer<float __complex__, 1> rhsBuf(rhs, numOfItems);
+    s::buffer<float __complex__, 1> outBuf(actual, numOfItems);
+    deviceQueue.submit([&](s::handler &cgh) {
+      auto lhsAcc = lhsBuf.get_access<sycl_read>(cgh);
+      auto rhsAcc = rhsBuf.get_access<sycl_read>(cgh);
+      auto outAcc = outBuf.get_access<sycl_write>(cgh);
+      cgh.parallel_for<class DeviceComplexTimes>(numOfItems, [=](s::id<1> wi) {
+        outAcc[wi] = lhsAcc[wi] * rhsAcc[wi];
+      });
+    });
+  }
+
+  for (size_t idx = 0; idx != numElems; ++idx)
+    assert(approx_equal_c99_cmplxf(actual[idx], expected[idx]));
+}
+
+class DeviceComplexDivides;
+// Divides two complex arrays element-wise in a kernel and checks the quotients.
+void device_c99_complex_divides(s::queue &deviceQueue) {
+  constexpr size_t numElems = 8;
+  float __complex__ dividends[numElems] = {CMPLXF(-1, 1), CMPLXF(1, 3),
+                                           CMPLXF(-2, 10), CMPLXF(-8, 31),
+                                           CMPLXF(4, 2), CMPLXF(-1, 0),
+                                           CMPLXF(0, 10), CMPLXF(0, 0)};
+  float __complex__ divisors[numElems] = {CMPLXF(0, 1), CMPLXF(1, 1),
+                                          CMPLXF(2, 3), CMPLXF(4, 5),
+                                          CMPLXF(2, 0), CMPLXF(0, 1),
+                                          CMPLXF(0, 5), CMPLXF(1, 0)};
+  float __complex__ expected[numElems] = {CMPLXF(1, 1), CMPLXF(2, 1),
+                                          CMPLXF(2, 2), CMPLXF(3, 4),
+                                          CMPLXF(2, 1), CMPLXF(0, 1),
+                                          CMPLXF(2, 0), CMPLXF(0, 0)};
+  float __complex__ actual[numElems];
+  s::range<1> numOfItems{numElems};
+  // The buffers are destroyed at the end of this scope, before `actual` is read.
+  {
+    s::buffer<float __complex__, 1> dividendBuf(dividends, numOfItems);
+    s::buffer<float __complex__, 1> divisorBuf(divisors, numOfItems);
+    s::buffer<float __complex__, 1> outBuf(actual, numOfItems);
+    deviceQueue.submit([&](s::handler &cgh) {
+      auto dividendAcc = dividendBuf.get_access<sycl_read>(cgh);
+      auto divisorAcc = divisorBuf.get_access<sycl_read>(cgh);
+      auto outAcc = outBuf.get_access<sycl_write>(cgh);
+      cgh.parallel_for<class DeviceComplexDivides>(numOfItems, [=](s::id<1> wi) {
+        outAcc[wi] = dividendAcc[wi] / divisorAcc[wi];
+      });
+    });
+  }
+
+  for (size_t idx = 0; idx != numElems; ++idx)
+    assert(approx_equal_c99_cmplxf(actual[idx], expected[idx]));
+}
+
+class DeviceComplexSqrt;
+// Computes csqrtf() of each input in a kernel and checks the roots.
+void device_c99_complex_sqrt(s::queue &deviceQueue) {
+  constexpr size_t numElems = 4;
+  float __complex__ input[numElems] = {CMPLXF(-1, 0), CMPLXF(0, 2),
+                                       CMPLXF(4, 0), CMPLXF(-5, 12)};
+  float __complex__ expected[numElems] = {CMPLXF(0, 1), CMPLXF(1, 1),
+                                          CMPLXF(2, 0), CMPLXF(2, 3)};
+  float __complex__ actual[numElems];
+  s::range<1> numOfItems{numElems};
+  // The buffers are destroyed at the end of this scope, before `actual` is read.
+  {
+    s::buffer<float __complex__, 1> inBuf(input, numOfItems);
+    s::buffer<float __complex__, 1> outBuf(actual, numOfItems);
+    deviceQueue.submit([&](s::handler &cgh) {
+      auto inAcc = inBuf.get_access<sycl_read>(cgh);
+      auto outAcc = outBuf.get_access<sycl_write>(cgh);
+      cgh.parallel_for<class DeviceComplexSqrt>(numOfItems, [=](s::id<1> wi) {
+        outAcc[wi] = csqrtf(inAcc[wi]);
+      });
+    });
+  }
+
+  for (size_t idx = 0; idx != numElems; ++idx)
+    assert(approx_equal_c99_cmplxf(actual[idx], expected[idx]));
+}
+
+class DeviceComplexAbs;
+// Computes cabsf() of each input in a kernel and checks the magnitudes.
+void device_c99_complex_abs(s::queue &deviceQueue) {
+  constexpr size_t numElems = 4;
+  float __complex__ input[numElems] = {CMPLXF(0, 0), CMPLXF(3, 4),
+                                       CMPLXF(12, 5), CMPLXF(INFINITY, 1)};
+  float expected[numElems] = {0, 5, 13, INFINITY};
+  float actual[numElems];
+  s::range<1> numOfItems{numElems};
+  // The buffers are destroyed at the end of this scope, before `actual` is read.
+  {
+    s::buffer<float __complex__, 1> inBuf(input, numOfItems);
+    s::buffer<float, 1> outBuf(actual, numOfItems);
+    deviceQueue.submit([&](s::handler &cgh) {
+      auto inAcc = inBuf.get_access<sycl_read>(cgh);
+      auto outAcc = outBuf.get_access<sycl_write>(cgh);
+      cgh.parallel_for<class DeviceComplexAbs>(numOfItems, [=](s::id<1> wi) {
+        outAcc[wi] = cabsf(inAcc[wi]);
+      });
+    });
+  }
+
+  for (size_t idx = 0; idx != numElems; ++idx)
+    assert(approx_equal_fp(actual[idx], expected[idx]));
+}
+
+class DeviceComplexExp;
+// Computes cexpf() of each input in a kernel and checks the results.
+void device_c99_complex_exp(s::queue &deviceQueue) {
+  constexpr size_t numElems = 4;
+  float __complex__ input[numElems] = {CMPLXF(0, 0), CMPLXF(0, M_PI_2),
+                                       CMPLXF(0, M_PI), CMPLXF(1, M_PI_2)};
+  float __complex__ expected[numElems] = {CMPLXF(1, 0), CMPLXF(0, 1),
+                                          CMPLXF(-1, 0), CMPLXF(0, M_E)};
+  float __complex__ actual[numElems];
+  s::range<1> numOfItems{numElems};
+  { // Buffers die at the end of this scope, before `actual` is read.
+    s::buffer<float __complex__, 1> inBuf(input, numOfItems);
+    s::buffer<float __complex__, 1> outBuf(actual, numOfItems);
+    deviceQueue.submit([&](s::handler &cgh) {
+      auto inAcc = inBuf.get_access<sycl_read>(cgh);
+      auto outAcc = outBuf.get_access<sycl_write>(cgh);
+      cgh.parallel_for<class DeviceComplexExp>(numOfItems, [=](s::id<1> wi) {
+        outAcc[wi] = cexpf(inAcc[wi]);
+      });
+    });
+  }
+
+  for (size_t idx = 0; idx != numElems; ++idx)
+    assert(approx_equal_c99_cmplxf(actual[idx], expected[idx]));
+}
+
+class DeviceComplexLog;
+// Computes clogf() of each input in a kernel and checks the results.
+void device_c99_complex_log(s::queue &deviceQueue) {
+  constexpr size_t numElems = 4;
+  float __complex__ input[numElems] = {CMPLXF(1, 0), CMPLXF(0, 1),
+                                       CMPLXF(-1, 0), CMPLXF(0, M_E)};
+  float __complex__ expected[numElems] = {CMPLXF(0, 0), CMPLXF(0, M_PI_2),
+                                          CMPLXF(0, M_PI), CMPLXF(1, M_PI_2)};
+  float __complex__ actual[numElems];
+  s::range<1> numOfItems{numElems};
+  { // Buffers die at the end of this scope, before `actual` is read.
+    s::buffer<float __complex__, 1> inBuf(input, numOfItems);
+    s::buffer<float __complex__, 1> outBuf(actual, numOfItems);
+    deviceQueue.submit([&](s::handler &cgh) {
+      auto inAcc = inBuf.get_access<sycl_read>(cgh);
+      auto outAcc = outBuf.get_access<sycl_write>(cgh);
+      cgh.parallel_for<class DeviceComplexLog>(numOfItems, [=](s::id<1> wi) {
+        outAcc[wi] = clogf(inAcc[wi]);
+      });
+    });
+  }
+
+  for (size_t idx = 0; idx != numElems; ++idx)
+    assert(approx_equal_c99_cmplxf(actual[idx], expected[idx]));
+}
+
+class DeviceComplexSin;
+// Computes csinf() of each input in a kernel and checks the results.
+void device_c99_complex_sin(s::queue &deviceQueue) {
+  constexpr size_t numElems = 2;
+  float __complex__ input[numElems] = {CMPLXF(0, 0), CMPLXF(M_PI_2, 0)};
+  float __complex__ expected[numElems] = {CMPLXF(0, 0), CMPLXF(1, 0)};
+  float __complex__ actual[numElems];
+  s::range<1> numOfItems{numElems};
+  { // Buffers die at the end of this scope, before `actual` is read.
+    s::buffer<float __complex__, 1> inBuf(input, numOfItems);
+    s::buffer<float __complex__, 1> outBuf(actual, numOfItems);
+    deviceQueue.submit([&](s::handler &cgh) {
+      auto inAcc = inBuf.get_access<sycl_read>(cgh);
+      auto outAcc = outBuf.get_access<sycl_write>(cgh);
+      cgh.parallel_for<class DeviceComplexSin>(numOfItems, [=](s::id<1> wi) {
+        outAcc[wi] = csinf(inAcc[wi]);
+      });
+    });
+  }
+
+  for (size_t idx = 0; idx != numElems; ++idx)
+    assert(approx_equal_c99_cmplxf(actual[idx], expected[idx]));
+}
+
+class DeviceComplexCos;
+// Computes ccosf() of each input in a kernel and checks the results.
+void device_c99_complex_cos(s::queue &deviceQueue) {
+  constexpr size_t numElems = 2;
+  float __complex__ input[numElems] = {CMPLXF(0, 0), CMPLXF(M_PI, 0)};
+  float __complex__ expected[numElems] = {CMPLXF(1, 0), CMPLXF(-1, 0)};
+  float __complex__ actual[numElems];
+  s::range<1> numOfItems{numElems};
+  { // Buffers die at the end of this scope, before `actual` is read.
+    s::buffer<float __complex__, 1> inBuf(input, numOfItems);
+    s::buffer<float __complex__, 1> outBuf(actual, numOfItems);
+    deviceQueue.submit([&](s::handler &cgh) {
+      auto inAcc = inBuf.get_access<sycl_read>(cgh);
+      auto outAcc = outBuf.get_access<sycl_write>(cgh);
+      cgh.parallel_for<class DeviceComplexCos>(numOfItems, [=](s::id<1> wi) {
+        outAcc[wi] = ccosf(inAcc[wi]);
+      });
+    });
+  }
+
+  for (size_t idx = 0; idx != numElems; ++idx)
+    assert(approx_equal_c99_cmplxf(actual[idx], expected[idx]));
+}
+
+void device_c99_complex_test(s::queue &deviceQueue) {
+  // Run every C99 complex check, in a fixed order, on the same queue.
+  void (*const checks[])(s::queue &) = {
+      device_c99_complex_times, device_c99_complex_divides,
+      device_c99_complex_sqrt,  device_c99_complex_abs,
+      device_c99_complex_exp,   device_c99_complex_log,
+      device_c99_complex_sin,   device_c99_complex_cos};
+  for (auto check : checks)
+    check(deviceQueue);
+}
+
+int main() {
+  s::queue deviceQueue; // default-constructed queue shared by all checks
+  device_c99_complex_test(deviceQueue); // the checks report failure via assert()
+  std::cout << "Pass" << std::endl;
+}
diff --git a/SYCL/Basic/devicelib/cmath_fp64_test.cpp b/SYCL/Basic/devicelib/cmath_fp64_test.cpp
new file mode 100644
index 0000000000..27da0dd11c
--- /dev/null
+++ b/SYCL/Basic/devicelib/cmath_fp64_test.cpp
@@ -0,0 +1,118 @@
+// UNSUPPORTED: windows
+// RUN: %clangxx -fsycl -c %s -o %t.o
+// RUN: %clangxx -fsycl %t.o %sycl_libs_dir/libsycl-cmath-fp64.o -o %t.out
+// RUN: %HOST_RUN_PLACEHOLDER %t.out
+// RUN: %CPU_RUN_PLACEHOLDER %t.out
+// RUN: %ACC_RUN_PLACEHOLDER %t.out
+// REQUIRES: host, cpu, accelerator
+
+#include <CL/sycl.hpp>
+#include <cmath>
+#include <iostream>
+#include "math_utils.hpp"
+
+namespace s = cl::sycl;
+constexpr s::access::mode sycl_read = s::access::mode::read;
+constexpr s::access::mode sycl_write = s::access::mode::write;
+
+#define TEST_NUM 38 // number of results the kernel below stores
+
+double ref[TEST_NUM] = {
+1, 0, 0, 0, 0, 0, 0, 1, 1, 0.5, // cos sin log acos asin atan atan2 cosh exp fmod
+0, 2, 0, 0, 1, 0, 2, 0, 0, 0, // frexp ldexp log10 modf pow sinh sqrt tan tanh acosh
+0, 0, 1, 0, 1, 2, 0, 1, 2, 5, // asinh atanh cbrt erf erfc exp2 expm1 fdim fma hypot
+0, 0, 0, 0, 0.5, 0.5, NAN, NAN,}; // ilogb log1p log2 logb remainder remquo tgamma lgamma
+
+double refIptr = 1; // expected integral part stored by modf(1.0, ...)
+
+// Calls each <cmath> function once in a single_task and checks against ref[].
+template <class T>
+void device_cmath_test(s::queue &deviceQueue) {
+  s::range<1> numOfItems{TEST_NUM};
+  // Poison every slot: `= {-1}` would set only result[0] and zero the rest,
+  // letting an unwritten slot match the many zero entries of ref[].
+  T result[TEST_NUM];
+  for (T &slot : result) slot = -1;
+  // Variable exponent is an integer value to store the exponent in frexp function
+  int exponent = -1;
+  // Variable iptr stores the integral part of float point in modf function
+  T iptr = -1;
+  // Variable quo stores the sign and some bits of x/y in remquo function
+  int quo = -1;
+  {
+    s::buffer<T, 1> buffer1(result, numOfItems);
+    s::buffer<int, 1> buffer2(&exponent, s::range<1>{1});
+    s::buffer<T, 1> buffer3(&iptr, s::range<1>{1});
+    s::buffer<int, 1> buffer4(&quo, s::range<1>{1});
+    deviceQueue.submit([&](cl::sycl::handler &cgh) {
+      auto res_access = buffer1.template get_access<sycl_write>(cgh);
+      auto exp_access = buffer2.template get_access<sycl_write>(cgh);
+      auto iptr_access = buffer3.template get_access<sycl_write>(cgh);
+      auto quo_access = buffer4.template get_access<sycl_write>(cgh);
+      cgh.single_task<class DeviceMathTest>([=]() {
+        int i = 0;
+        res_access[i++] = std::cos(0.0);
+        res_access[i++] = std::sin(0.0);
+        res_access[i++] = std::log(1.0);
+        res_access[i++] = std::acos(1.0);
+        res_access[i++] = std::asin(0.0);
+        res_access[i++] = std::atan(0.0);
+        res_access[i++] = std::atan2(0.0, 1.0);
+        res_access[i++] = std::cosh(0.0);
+        res_access[i++] = std::exp(0.0);
+        res_access[i++] = std::fmod(1.5, 1.0);
+        res_access[i++] = std::frexp(0.0, &exp_access[0]);
+        res_access[i++] = std::ldexp(1.0, 1);
+        res_access[i++] = std::log10(1.0);
+        res_access[i++] = std::modf(1.0, &iptr_access[0]);
+        res_access[i++] = std::pow(1.0, 1.0);
+        res_access[i++] = std::sinh(0.0);
+        res_access[i++] = std::sqrt(4.0);
+        res_access[i++] = std::tan(0.0);
+        res_access[i++] = std::tanh(0.0);
+        res_access[i++] = std::acosh(1.0);
+        res_access[i++] = std::asinh(0.0);
+        res_access[i++] = std::atanh(0.0);
+        res_access[i++] = std::cbrt(1.0);
+        res_access[i++] = std::erf(0.0);
+        res_access[i++] = std::erfc(0.0);
+        res_access[i++] = std::exp2(1.0);
+        res_access[i++] = std::expm1(0.0);
+        res_access[i++] = std::fdim(1.0, 0.0);
+        res_access[i++] = std::fma(1.0, 1.0, 1.0);
+        res_access[i++] = std::hypot(3.0, 4.0);
+        res_access[i++] = std::ilogb(1.0);
+        res_access[i++] = std::log1p(0.0);
+        res_access[i++] = std::log2(1.0);
+        res_access[i++] = std::logb(1.0);
+        res_access[i++] = std::remainder(0.5, 1.0);
+        res_access[i++] = std::remquo(0.5, 1.0, &quo_access[0]);
+        T a = NAN;
+        res_access[i++] = std::tgamma(a);
+        res_access[i++] = std::lgamma(a);
+      });
+    });
+  }
+  // Compare result with reference
+  for (int i = 0; i < TEST_NUM; ++i) {
+    assert(approx_equal_fp(result[i], ref[i]));
+  }
+
+  // Test modf integral part
+  assert(approx_equal_fp(iptr, refIptr));
+
+  // Test frexp exponent
+  assert(exponent == 0);
+
+  // Test remquo sign
+  assert(quo == 0);
+}
+
+int main() {
+  s::queue deviceQueue;
+  if (!deviceQueue.get_device().has_extension("cl_khr_fp64"))
+    return 0; // no cl_khr_fp64 on this device: nothing to test
+  device_cmath_test<double>(deviceQueue);
+  std::cout << "Pass" << std::endl;
+  return 0;
+}
diff --git a/SYCL/Basic/devicelib/cmath_test.cpp b/SYCL/Basic/devicelib/cmath_test.cpp
new file mode 100644
index 0000000000..550830b543
--- /dev/null
+++ b/SYCL/Basic/devicelib/cmath_test.cpp
@@ -0,0 +1,115 @@
+// UNSUPPORTED: windows
+// RUN: %clangxx -fsycl -c %s -o %t.o
+// RUN: %clangxx -fsycl %t.o %sycl_libs_dir/libsycl-cmath.o -o %t.out
+// RUN: %HOST_RUN_PLACEHOLDER %t.out
+// RUN: %CPU_RUN_PLACEHOLDER %t.out
+// RUN: %ACC_RUN_PLACEHOLDER %t.out
+// REQUIRES: host, cpu, accelerator
+#include <CL/sycl.hpp>
+#include <cmath>
+#include <iostream>
+#include "math_utils.hpp"
+
+namespace s = cl::sycl;
+constexpr s::access::mode sycl_read = s::access::mode::read;
+constexpr s::access::mode sycl_write = s::access::mode::write;
+
+#define TEST_NUM 38
+
+float ref[TEST_NUM] = {
+1, 0, 0, 0, 0, 0, 0, 1, 1, 0.5,
+0, 2, 0, 0, 1, 0, 2, 0, 0, 0,
+0, 0, 1, 0, 1, 2, 0, 1, 2, 5,
+0, 0, 0, 0, 0.5, 0.5, NAN, NAN,};
+
+float refIptr = 1;
+
+template <class T>
+void device_cmath_test(s::queue &deviceQueue) {
+  s::range<1> numOfItems{TEST_NUM};
+  T result[TEST_NUM] = {-1}; // only result[0] is -1; the rest start at 0
+
+  // exponent receives the value frexp stores through its int pointer argument
+  int exponent = -1;
+
+  // iptr receives the integral part that modf stores through its pointer
+  T iptr = -1;
+
+  // quo receives the quotient bits that remquo stores through its pointer
+  int quo = -1;
+  {
+    s::buffer<T, 1> buffer1(result, numOfItems);
+    s::buffer<int, 1> buffer2(&exponent, s::range<1>{1});
+    s::buffer<T, 1> buffer3(&iptr, s::range<1>{1});
+    s::buffer<int, 1> buffer4(&quo, s::range<1>{1});
+    deviceQueue.submit([&](cl::sycl::handler &cgh) {
+      auto res_access = buffer1.template get_access<sycl_write>(cgh);
+      auto exp_access = buffer2.template get_access<sycl_write>(cgh);
+      auto iptr_access = buffer3.template get_access<sycl_write>(cgh);
+      auto quo_access = buffer4.template get_access<sycl_write>(cgh);
+      cgh.single_task<class DeviceMathTest>([=]() {
+        int i = 0; // next slot in res_access; write order must match ref[]
+        res_access[i++] = std::cos(0.0f);
+        res_access[i++] = std::sin(0.0f);
+        res_access[i++] = std::log(1.0f);
+        res_access[i++] = std::acos(1.0f);
+        res_access[i++] = std::asin(0.0f);
+        res_access[i++] = std::atan(0.0f);
+        res_access[i++] = std::atan2(0.0f, 1.0f);
+        res_access[i++] = std::cosh(0.0f);
+        res_access[i++] = std::exp(0.0f);
+        res_access[i++] = std::fmod(1.5f, 1.0f);
+        res_access[i++] = std::frexp(0.0f, &exp_access[0]);
+        res_access[i++] = std::ldexp(1.0f, 1);
+        res_access[i++] = std::log10(1.0f);
+        res_access[i++] = std::modf(1.0f, &iptr_access[0]);
+        res_access[i++] = std::pow(1.0f, 1.0f);
+        res_access[i++] = std::sinh(0.0f);
+        res_access[i++] = std::sqrt(4.0f);
+        res_access[i++] = std::tan(0.0f);
+        res_access[i++] = std::tanh(0.0f);
+        res_access[i++] = std::acosh(1.0f);
+        res_access[i++] = std::asinh(0.0f);
+        res_access[i++] = std::atanh(0.0f);
+        res_access[i++] = std::cbrt(1.0f);
+        res_access[i++] = std::erf(0.0f);
+        res_access[i++] = std::erfc(0.0f);
+        res_access[i++] = std::exp2(1.0f);
+        res_access[i++] = std::expm1(0.0f);
+        res_access[i++] = std::fdim(1.0f, 0.0f);
+        res_access[i++] = std::fma(1.0f, 1.0f, 1.0f);
+        res_access[i++] = std::hypot(3.0f, 4.0f);
+        res_access[i++] = std::ilogb(1.0f);
+        res_access[i++] = std::log1p(0.0f);
+        res_access[i++] = std::log2(1.0f);
+        res_access[i++] = std::logb(1.0f);
+        res_access[i++] = std::remainder(0.5f, 1.0f);
+        res_access[i++] = std::remquo(0.5f, 1.0f, &quo_access[0]);
+        T a = NAN;
+        res_access[i++] = std::tgamma(a);
+        res_access[i++] = std::lgamma(a);
+      });
+    });
+  }
+
+  // Each result must match ref[] at the index where the kernel wrote it
+  for (int i = 0; i < TEST_NUM; ++i) {
+    assert(approx_equal_fp(result[i], ref[i]));
+  }
+
+  // Test modf integral part
+  assert(approx_equal_fp(iptr, refIptr));
+
+  // Test frexp exponent
+  assert(exponent == 0);
+
+  // Test remquo quotient output
+  assert(quo == 0);
+}
+
+int main() {
+  s::queue testQueue; // default-constructed queue
+  device_cmath_test<float>(testQueue);
+  std::cout << "Pass" << std::endl; // reached only after the test returned
+  return 0;
+}
diff --git a/SYCL/Basic/devicelib/math_fp64_test.cpp b/SYCL/Basic/devicelib/math_fp64_test.cpp
new file mode 100644
index 0000000000..4ba48151aa
--- /dev/null
+++ b/SYCL/Basic/devicelib/math_fp64_test.cpp
@@ -0,0 +1,115 @@
+// REQUIRES: (host || cpu || accelerator) && linux
+// RUN: %clangxx -fsycl -c %s -o %t.o
+// RUN: %clangxx -fsycl %t.o %sycl_libs_dir/libsycl-cmath-fp64.o -o %t.out
+// RUN: %HOST_RUN_PLACEHOLDER %t.out
+// RUN: %CPU_RUN_PLACEHOLDER %t.out
+// RUN: %ACC_RUN_PLACEHOLDER %t.out
+#include "math_utils.hpp"
+#include <CL/sycl.hpp>
+#include <iostream>
+#include <math.h>
+
+namespace s = cl::sycl;
+constexpr s::access::mode sycl_read = s::access::mode::read;
+constexpr s::access::mode sycl_write = s::access::mode::write;
+
+#define TEST_NUM 38
+
+double ref_val[TEST_NUM] = {
+    1, 0, 0, 0, 0, 0, 0, 1, 1, 0.5,
+    0, 2, 0, 0, 1, 0, 2, 0, 0, 0,
+    0, 0, 1, 0, 1, 2, 0, 1, 2, 5,
+    0, 0, 0, 0, 0.5, 0.5, NAN, NAN};
+
+double refIptr = 1;
+
+void device_math_test(s::queue &deviceQueue) {
+  s::range<1> numOfItems{TEST_NUM};
+  double result[TEST_NUM] = {-1}; // only result[0] is -1; the rest start at 0
+
+  // exponent receives the value frexp stores through its int pointer argument
+  int exponent = -1;
+
+  // iptr receives the integral part that modf stores through its pointer
+  double iptr = -1;
+
+  // quo receives the quotient bits that remquo stores through its pointer
+  int quo = -1;
+  {
+    s::buffer<double, 1> buffer1(result, numOfItems);
+    s::buffer<int, 1> buffer2(&exponent, s::range<1>{1});
+    s::buffer<double, 1> buffer3(&iptr, s::range<1>{1});
+    s::buffer<int, 1> buffer4(&quo, s::range<1>{1});
+    deviceQueue.submit([&](cl::sycl::handler &cgh) {
+      auto res_access = buffer1.template get_access<sycl_write>(cgh);
+      auto exp_access = buffer2.template get_access<sycl_write>(cgh);
+      auto iptr_access = buffer3.template get_access<sycl_write>(cgh);
+      auto quo_access = buffer4.template get_access<sycl_write>(cgh);
+      cgh.single_task<class DeviceMathTest>([=]() {
+        int i = 0; // next slot in res_access; write order must match ref_val[]
+        res_access[i++] = cos(0.0);
+        res_access[i++] = sin(0.0);
+        res_access[i++] = log(1.0);
+        res_access[i++] = acos(1.0);
+        res_access[i++] = asin(0.0);
+        res_access[i++] = atan(0.0);
+        res_access[i++] = atan2(0.0, 1.0);
+        res_access[i++] = cosh(0.0);
+        res_access[i++] = exp(0.0);
+        res_access[i++] = fmod(1.5, 1.0);
+        res_access[i++] = frexp(0.0, &exp_access[0]);
+        res_access[i++] = ldexp(1.0, 1);
+        res_access[i++] = log10(1.0);
+        res_access[i++] = modf(1.0, &iptr_access[0]);
+        res_access[i++] = pow(1.0, 1.0);
+        res_access[i++] = sinh(0.0);
+        res_access[i++] = sqrt(4.0);
+        res_access[i++] = tan(0.0);
+        res_access[i++] = tanh(0.0);
+        res_access[i++] = acosh(1.0);
+        res_access[i++] = asinh(0.0);
+        res_access[i++] = atanh(0.0);
+        res_access[i++] = cbrt(1.0);
+        res_access[i++] = erf(0.0);
+        res_access[i++] = erfc(0.0);
+        res_access[i++] = exp2(1.0);
+        res_access[i++] = expm1(0.0);
+        res_access[i++] = fdim(1.0, 0.0);
+        res_access[i++] = fma(1.0, 1.0, 1.0);
+        res_access[i++] = hypot(3.0, 4.0);
+        res_access[i++] = ilogb(1.0);
+        res_access[i++] = log1p(0.0);
+        res_access[i++] = log2(1.0);
+        res_access[i++] = logb(1.0);
+        res_access[i++] = remainder(0.5, 1.0);
+        res_access[i++] = remquo(0.5, 1.0, &quo_access[0]);
+        double a = NAN;
+        res_access[i++] = tgamma(a);
+        res_access[i++] = lgamma(a);
+      });
+    });
+  }
+
+  // Each result must match ref_val[] at the index where the kernel wrote it
+  for (int i = 0; i < TEST_NUM; ++i) {
+    assert(approx_equal_fp(result[i], ref_val[i]));
+  }
+
+  // Test modf integral part
+  assert(approx_equal_fp(iptr, refIptr));
+
+  // Test frexp exponent
+  assert(exponent == 0);
+
+  // Test remquo quotient output
+  assert(quo == 0);
+}
+
+int main() {
+  s::queue fp64Queue;
+  if (!fp64Queue.get_device().has_extension("cl_khr_fp64"))
+    return 0; // device has no fp64 support: skip without printing "Pass"
+  device_math_test(fp64Queue);
+  std::cout << "Pass" << std::endl;
+  return 0;
+}
diff --git a/SYCL/Basic/devicelib/math_fp64_windows_test.cpp b/SYCL/Basic/devicelib/math_fp64_windows_test.cpp
new file mode 100644
index 0000000000..e7cc317429
--- /dev/null
+++ b/SYCL/Basic/devicelib/math_fp64_windows_test.cpp
@@ -0,0 +1,132 @@
+// REQUIRES: (cpu || host || accelerator) && windows
+// RUN: %clangxx -fsycl -c %s -o %t.o
+// RUN: %clangxx -fsycl %t.o %sycl_libs_dir/../bin/libsycl-cmath-fp64.o -o %t.out
+// RUN: %HOST_RUN_PLACEHOLDER %t.out
+// RUN: %CPU_RUN_PLACEHOLDER %t.out
+// RUN: %ACC_RUN_PLACEHOLDER %t.out
+#include "math_utils.hpp"
+#include <CL/sycl.hpp>
+#include <iostream>
+#include <math.h>
+
+namespace s = cl::sycl;
+constexpr s::access::mode sycl_read = s::access::mode::read;
+constexpr s::access::mode sycl_write = s::access::mode::write;
+
+#define TEST_NUM 41
+
+double ref_val[TEST_NUM] = {
+    1, 0, 0, 0, 0, 0, 0, 1, 1, 0.5,
+    0, 2, 0, 0, 1, 0, 2, 0, 0, 0,
+    0, 0, 1, 0, 1, 2, 0, 1, 2, 5,
+    0, 0, 0, 0, 0.5, 0.5, NAN, NAN, 1, 2, 0};
+
+double refIptr = 1;
+
+void device_math_test(s::queue &deviceQueue) {
+  s::range<1> numOfItems{TEST_NUM};
+  double result[TEST_NUM] = {-1}; // only result[0] is -1; the rest start at 0
+
+  // exponent receives the value frexp stores through its int pointer argument
+  int exponent = -1;
+
+  // iptr receives the integral part that modf stores through its pointer
+  double iptr = -1;
+
+  // quo receives the quotient bits that remquo stores through its pointer
+  int quo = -1;
+
+  // enm stores the values returned by the MSVC-specific _Dtest and _Exp calls
+  short enm[2] = {10, 10};
+  {
+    s::buffer<double, 1> buffer1(result, numOfItems);
+    s::buffer<int, 1> buffer2(&exponent, s::range<1>{1});
+    s::buffer<double, 1> buffer3(&iptr, s::range<1>{1});
+    s::buffer<int, 1> buffer4(&quo, s::range<1>{1});
+    s::buffer<short, 1> buffer5(enm, s::range<1>{2});
+    deviceQueue.submit([&](cl::sycl::handler &cgh) {
+      auto res_access = buffer1.template get_access<sycl_write>(cgh);
+      auto exp_access = buffer2.template get_access<sycl_write>(cgh);
+      auto iptr_access = buffer3.template get_access<sycl_write>(cgh);
+      auto quo_access = buffer4.template get_access<sycl_write>(cgh);
+      auto enm_access = buffer5.template get_access<sycl_write>(cgh);
+      cgh.single_task<class DeviceMathTest>([=]() {
+        int i = 0; // next slot in res_access; write order must match ref_val[]
+        res_access[i++] = cos(0.0);
+        res_access[i++] = sin(0.0);
+        res_access[i++] = log(1.0);
+        res_access[i++] = acos(1.0);
+        res_access[i++] = asin(0.0);
+        res_access[i++] = atan(0.0);
+        res_access[i++] = atan2(0.0, 1.0);
+        res_access[i++] = cosh(0.0);
+        res_access[i++] = exp(0.0);
+        res_access[i++] = fmod(1.5, 1.0);
+        res_access[i++] = frexp(0.0, &exp_access[0]);
+        res_access[i++] = ldexp(1.0, 1);
+        res_access[i++] = log10(1.0);
+        res_access[i++] = modf(1.0, &iptr_access[0]);
+        res_access[i++] = pow(1.0, 1.0);
+        res_access[i++] = sinh(0.0);
+        res_access[i++] = sqrt(4.0);
+        res_access[i++] = tan(0.0);
+        res_access[i++] = tanh(0.0);
+        res_access[i++] = acosh(1.0);
+        res_access[i++] = asinh(0.0);
+        res_access[i++] = atanh(0.0);
+        res_access[i++] = cbrt(1.0);
+        res_access[i++] = erf(0.0);
+        res_access[i++] = erfc(0.0);
+        res_access[i++] = exp2(1.0);
+        res_access[i++] = expm1(0.0);
+        res_access[i++] = fdim(1.0, 0.0);
+        res_access[i++] = fma(1.0, 1.0, 1.0);
+        res_access[i++] = hypot(3.0, 4.0);
+        res_access[i++] = ilogb(1.0);
+        res_access[i++] = log1p(0.0);
+        res_access[i++] = log2(1.0);
+        res_access[i++] = logb(1.0);
+        res_access[i++] = remainder(0.5, 1.0);
+        res_access[i++] = remquo(0.5, 1.0, &quo_access[0]);
+        double a = NAN;
+        res_access[i++] = tgamma(a);
+        res_access[i++] = lgamma(a);
+        enm_access[0] = _Dtest(&a);
+        a = 0.0;
+        enm_access[1] = _Exp(&a, 1.0, 0);
+        res_access[i++] = a; // value left in a by _Exp, checked against ref_val
+        res_access[i++] = _Cosh(0.0, 2.0);
+        res_access[i++] = _Sinh(0.0, 1.0);
+      });
+    });
+  }
+
+  // Each result must match ref_val[] at the index where the kernel wrote it
+  for (int i = 0; i < TEST_NUM; ++i) {
+    assert(approx_equal_fp(result[i], ref_val[i]));
+  }
+
+  // Test modf integral part
+  assert(approx_equal_fp(iptr, refIptr));
+
+  // Test frexp exponent
+  assert(exponent == 0);
+
+  // Test remquo quotient output
+  assert(quo == 0);
+
+  // Test enum value returned by _Dtest
+  assert(enm[0] == _NANCODE);
+
+  // Test enum value returned by _Exp
+  assert(enm[1] == _FINITE);
+}
+
+int main() {
+  s::queue fp64Queue;
+  if (!fp64Queue.get_device().has_extension("cl_khr_fp64"))
+    return 0; // device has no fp64 support: skip without printing "Pass"
+  device_math_test(fp64Queue);
+  std::cout << "Pass" << std::endl;
+  return 0;
+}
diff --git a/SYCL/Basic/devicelib/math_override_test.cpp b/SYCL/Basic/devicelib/math_override_test.cpp
new file mode 100644
index 0000000000..829fc6360b
--- /dev/null
+++ b/SYCL/Basic/devicelib/math_override_test.cpp
@@ -0,0 +1,49 @@
+// UNSUPPORTED: windows
+// RUN: %clangxx -fsycl -c %s -o %t.o
+// RUN: %clangxx -fsycl %t.o %sycl_libs_dir/libsycl-cmath.o -o %t.out
+// RUN: %HOST_RUN_PLACEHOLDER %t.out
+// REQUIRES: host
+#include <CL/sycl.hpp>
+#include <iostream>
+#include <math.h>
+
+#include "math_utils.hpp"
+namespace s = cl::sycl;
+constexpr s::access::mode sycl_read = s::access::mode::read;
+constexpr s::access::mode sycl_write = s::access::mode::write;
+
+// User-supplied sinf meant to take precedence over the device library
+// version; the +100 offset makes the override visible in the result.
+SYCL_EXTERNAL
+extern "C" float sinf(float x) { return 100.f + x; }
+
+class DeviceTest;
+
+void device_test() {
+  float result_sin = 0;
+  float result_cos = 0;
+  s::queue q;
+  s::range<1> oneElement{1};
+  {
+    // Results are read on the host only after these buffers go out of scope.
+    s::buffer<float, 1> sinBuf(&result_sin, oneElement);
+    s::buffer<float, 1> cosBuf(&result_cos, oneElement);
+    q.submit([&](s::handler &cgh) {
+      auto sinOut = sinBuf.get_access<sycl_write>(cgh);
+      auto cosOut = cosBuf.get_access<sycl_write>(cgh);
+      cgh.single_task<class DeviceTest>([=]() {
+        // sinf must bind to the user-provided definition, not the library's.
+        sinOut[0] = sinf(0.f);
+        cosOut[0] = cosf(0.f);
+      });
+    });
+  }
+
+  assert(approx_equal_fp(result_sin, 100.f) && approx_equal_fp(result_cos, 1.f));
+}
+
+// Entry point: run the override test, then report success.
+int main() {
+  device_test();
+  std::cout << "Pass" << std::endl;
+}
diff --git a/SYCL/Basic/devicelib/math_test.cpp b/SYCL/Basic/devicelib/math_test.cpp
new file mode 100644
index 0000000000..94ac5cc307
--- /dev/null
+++ b/SYCL/Basic/devicelib/math_test.cpp
@@ -0,0 +1,113 @@
+// REQUIRES: ( host || accelerator || cpu ) && linux
+// RUN: %clangxx -fsycl -c %s -o %t.o
+// RUN: %clangxx -fsycl %t.o %sycl_libs_dir/libsycl-cmath.o -o %t.out
+// RUN: %HOST_RUN_PLACEHOLDER %t.out
+// RUN: %CPU_RUN_PLACEHOLDER %t.out
+// RUN: %ACC_RUN_PLACEHOLDER %t.out
+#include "math_utils.hpp"
+#include <CL/sycl.hpp>
+#include <iostream>
+#include <math.h>
+
+namespace s = cl::sycl;
+constexpr s::access::mode sycl_read = s::access::mode::read;
+constexpr s::access::mode sycl_write = s::access::mode::write;
+
+#define TEST_NUM 38
+
+float ref_val[TEST_NUM] = {
+    1, 0, 0, 0, 0, 0, 0, 1, 1, 0.5,
+    0, 2, 0, 0, 1, 0, 2, 0, 0, 0,
+    0, 0, 1, 0, 1, 2, 0, 1, 2, 5,
+    0, 0, 0, 0, 0.5, 0.5, NAN, NAN};
+
+float refIptr = 1;
+
+void device_math_test(s::queue &deviceQueue) {
+  s::range<1> numOfItems{TEST_NUM};
+  float result[TEST_NUM] = {-1}; // only result[0] is -1; the rest start at 0
+
+  // exponent receives the value frexpf stores through its int pointer argument
+  int exponent = -1;
+
+  // iptr receives the integral part that modff stores through its pointer
+  float iptr = -1;
+
+  // quo receives the quotient bits that remquof stores through its pointer
+  int quo = -1;
+  {
+    s::buffer<float, 1> buffer1(result, numOfItems);
+    s::buffer<int, 1> buffer2(&exponent, s::range<1>{1});
+    s::buffer<float, 1> buffer3(&iptr, s::range<1>{1});
+    s::buffer<int, 1> buffer4(&quo, s::range<1>{1});
+    deviceQueue.submit([&](cl::sycl::handler &cgh) {
+      auto res_access = buffer1.template get_access<sycl_write>(cgh);
+      auto exp_access = buffer2.template get_access<sycl_write>(cgh);
+      auto iptr_access = buffer3.template get_access<sycl_write>(cgh);
+      auto quo_access = buffer4.template get_access<sycl_write>(cgh);
+      cgh.single_task<class DeviceMathTest>([=]() {
+        int i = 0; // next slot in res_access; write order must match ref_val[]
+        res_access[i++] = cosf(0.0f);
+        res_access[i++] = sinf(0.0f);
+        res_access[i++] = logf(1.0f);
+        res_access[i++] = acosf(1.0f);
+        res_access[i++] = asinf(0.0f);
+        res_access[i++] = atanf(0.0f);
+        res_access[i++] = atan2f(0.0f, 1.0f);
+        res_access[i++] = coshf(0.0f);
+        res_access[i++] = expf(0.0f);
+        res_access[i++] = fmodf(1.5f, 1.0f);
+        res_access[i++] = frexpf(0.0f, &exp_access[0]);
+        res_access[i++] = ldexpf(1.0f, 1);
+        res_access[i++] = log10f(1.0f);
+        res_access[i++] = modff(1.0f, &iptr_access[0]);
+        res_access[i++] = powf(1.0f, 1.0f);
+        res_access[i++] = sinhf(0.0f);
+        res_access[i++] = sqrtf(4.0f);
+        res_access[i++] = tanf(0.0f);
+        res_access[i++] = tanhf(0.0f);
+        res_access[i++] = acoshf(1.0f);
+        res_access[i++] = asinhf(0.0f);
+        res_access[i++] = atanhf(0.0f);
+        res_access[i++] = cbrtf(1.0f);
+        res_access[i++] = erff(0.0f);
+        res_access[i++] = erfcf(0.0f);
+        res_access[i++] = exp2f(1.0f);
+        res_access[i++] = expm1f(0.0f);
+        res_access[i++] = fdimf(1.0f, 0.0f);
+        res_access[i++] = fmaf(1.0f, 1.0f, 1.0f);
+        res_access[i++] = hypotf(3.0f, 4.0f);
+        res_access[i++] = ilogbf(1.0f);
+        res_access[i++] = log1pf(0.0f);
+        res_access[i++] = log2f(1.0f);
+        res_access[i++] = logbf(1.0f);
+        res_access[i++] = remainderf(0.5f, 1.0f);
+        res_access[i++] = remquof(0.5f, 1.0f, &quo_access[0]);
+        float a = NAN;
+        res_access[i++] = tgammaf(a);
+        res_access[i++] = lgammaf(a);
+      });
+    });
+  }
+
+  // Each result must match ref_val[] at the index where the kernel wrote it
+  for (int i = 0; i < TEST_NUM; ++i) {
+    assert(approx_equal_fp(result[i], ref_val[i]));
+  }
+
+  // Test modf integral part
+  assert(approx_equal_fp(iptr, refIptr));
+
+  // Test frexp exponent
+  assert(exponent == 0);
+
+  // Test remquo quotient output
+  assert(quo == 0);
+}
+
+int main() {
+  s::queue testQueue; // default-constructed queue
+  device_math_test(testQueue);
+  std::cout << "Pass" << std::endl; // reached only after the test returned
+  return 0;
+}
diff --git a/SYCL/Basic/devicelib/math_utils.hpp b/SYCL/Basic/devicelib/math_utils.hpp
new file mode 100644
index 0000000000..eb4f5cae07
--- /dev/null
+++ b/SYCL/Basic/devicelib/math_utils.hpp
@@ -0,0 +1,29 @@
+#ifndef MATH_UTILS
+#define MATH_UTILS
+#include <cmath>
+#include <limits>
+
+// Approximate equality for floating-point values (T must be a floating-point
+// type). NaN only matches NaN, an infinity only matches the same infinity,
+// and finite values match within 100 * machine epsilon: relative to the
+// larger magnitude, or absolute when either value is exactly zero.
+template <typename T>
+bool approx_equal_fp(T x, T y) {
+  // At least one input is NaN: equal only if both are
+  if (std::isnan(x) || std::isnan(y))
+    return std::isnan(x) && std::isnan(y);
+
+  // At least one input is infinite: require an exact (signed) match
+  if (std::isinf(x) || std::isinf(y))
+    return (x == y);
+
+  // Both finite
+  T threshold = std::numeric_limits<T>::epsilon() * 100;
+  if (x != 0 && y != 0) {
+    T max_v = std::fmax(std::abs(x), std::abs(y));
+    return std::abs(x - y) < threshold * max_v;
+  }
+  return x != 0 ? std::abs(x) < threshold : std::abs(y) < threshold;
+}
+
+#endif // MATH_UTILS
diff --git a/SYCL/Basic/devicelib/math_windows_test.cpp b/SYCL/Basic/devicelib/math_windows_test.cpp
new file mode 100644
index 0000000000..bce34de651
--- /dev/null
+++ b/SYCL/Basic/devicelib/math_windows_test.cpp
@@ -0,0 +1,121 @@
+// REQUIRES: (accelerator || cpu || host) && windows
+// RUN: %clangxx -fsycl -c %s -o %t.o
+// RUN: %clangxx -fsycl %t.o %sycl_libs_dir/../bin/libsycl-cmath.o -o %t.out
+// RUN: %HOST_RUN_PLACEHOLDER %t.out
+// RUN: %CPU_RUN_PLACEHOLDER %t.out
+// RUN: %ACC_RUN_PLACEHOLDER %t.out
+#include "math_utils.hpp"
+#include <CL/sycl.hpp>
+#include <iostream>
+#include <math.h>
+
+namespace s = cl::sycl;
+constexpr s::access::mode sycl_read = s::access::mode::read;
+constexpr s::access::mode sycl_write = s::access::mode::write;
+
+#define TEST_NUM 39
+
+float ref_val[TEST_NUM] = {
+    1, 0, 0, 0, 0, 0, 0, 1, 1, 0.5,
+    0, 0, 1, 0, 2, 0, 0, 0, 0, 0,
+    1, 0, 1, 2, 0, 1, 2, 5, 0, 0,
+    0, 0, 0.5, 0.5, NAN, NAN, 1, 2, 0};
+
+float refIptr = 1;
+
+void device_math_test(s::queue &deviceQueue) {
+  s::range<1> numOfItems{TEST_NUM};
+  float result[TEST_NUM] = {-1}; // only result[0] is -1; the rest start at 0
+
+  // iptr receives the integral part that modff stores through its pointer
+  float iptr = -1;
+
+  // quo receives the quotient bits that remquof stores through its pointer
+  int quo = -1;
+
+  // enm stores the values returned by the MSVC-specific _FDtest and _FExp calls
+  short enm[2] = {10, 10};
+
+  {
+    s::buffer<float, 1> buffer1(result, numOfItems);
+    s::buffer<float, 1> buffer2(&iptr, s::range<1>{1});
+    s::buffer<int, 1> buffer3(&quo, s::range<1>{1});
+    s::buffer<short, 1> buffer4(enm, s::range<1>{2});
+    deviceQueue.submit([&](cl::sycl::handler &cgh) {
+      auto res_access = buffer1.template get_access<sycl_write>(cgh);
+      auto iptr_access = buffer2.template get_access<sycl_write>(cgh);
+      auto quo_access = buffer3.template get_access<sycl_write>(cgh);
+      auto enm_access = buffer4.template get_access<sycl_write>(cgh);
+      cgh.single_task<class DeviceMathTest>([=]() {
+        int i = 0; // next slot in res_access; write order must match ref_val[]
+        res_access[i++] = cosf(0.0f);
+        res_access[i++] = sinf(0.0f);
+        res_access[i++] = logf(1.0f);
+        res_access[i++] = acosf(1.0f);
+        res_access[i++] = asinf(0.0f);
+        res_access[i++] = atanf(0.0f);
+        res_access[i++] = atan2f(0.0f, 1.0f);
+        res_access[i++] = coshf(0.0f);
+        res_access[i++] = expf(0.0f);
+        res_access[i++] = fmodf(1.5f, 1.0f);
+        res_access[i++] = log10f(1.0f);
+        res_access[i++] = modff(1.0f, &iptr_access[0]);
+        res_access[i++] = powf(1.0f, 1.0f);
+        res_access[i++] = sinhf(0.0f);
+        res_access[i++] = sqrtf(4.0f);
+        res_access[i++] = tanf(0.0f);
+        res_access[i++] = tanhf(0.0f);
+        res_access[i++] = acoshf(1.0f);
+        res_access[i++] = asinhf(0.0f);
+        res_access[i++] = atanhf(0.0f);
+        res_access[i++] = cbrtf(1.0f);
+        res_access[i++] = erff(0.0f);
+        res_access[i++] = erfcf(0.0f);
+        res_access[i++] = exp2f(1.0f);
+        res_access[i++] = expm1f(0.0f);
+        res_access[i++] = fdimf(1.0f, 0.0f);
+        res_access[i++] = fmaf(1.0f, 1.0f, 1.0f);
+        res_access[i++] = hypotf(3.0f, 4.0f);
+        res_access[i++] = ilogbf(1.0f);
+        res_access[i++] = log1pf(0.0f);
+        res_access[i++] = log2f(1.0f);
+        res_access[i++] = logbf(1.0f);
+        res_access[i++] = remainderf(0.5f, 1.0f);
+        res_access[i++] = remquof(0.5f, 1.0f, &quo_access[0]);
+        float a = NAN;
+        res_access[i++] = tgammaf(a);
+        res_access[i++] = lgammaf(a);
+        enm_access[0] = _FDtest(&a);
+        a = 0.0f;
+        enm_access[1] = _FExp(&a, 1.0f, 0);
+        res_access[i++] = a; // value left in a by _FExp, checked against ref_val
+        res_access[i++] = _FCosh(0.0f, 2.0f);
+        res_access[i++] = _FSinh(0.0f, 1.0f);
+      });
+    });
+  }
+
+  // Each result must match ref_val[] at the index where the kernel wrote it
+  for (int i = 0; i < TEST_NUM; ++i) {
+    assert(approx_equal_fp(result[i], ref_val[i]));
+  }
+
+  // Test modf integral part
+  assert(approx_equal_fp(iptr, refIptr));
+
+  // Test remquo quotient output
+  assert(quo == 0);
+
+  // Test enum value returned by _FDtest
+  assert(enm[0] == _NANCODE);
+
+  // Test enum value returned by _FExp
+  assert(enm[1] == _FINITE);
+}
+
+int main() {
+  s::queue testQueue; // default-constructed queue
+  device_math_test(testQueue);
+  std::cout << "Pass" << std::endl; // reached only after the test returned
+  return 0;
+}
diff --git a/SYCL/Basic/devicelib/std_complex_math_fp64_test.cpp b/SYCL/Basic/devicelib/std_complex_math_fp64_test.cpp
new file mode 100644
index 0000000000..e94bf950e8
--- /dev/null
+++ b/SYCL/Basic/devicelib/std_complex_math_fp64_test.cpp
@@ -0,0 +1,206 @@
+// UNSUPPORTED: windows
+// RUN: %clangxx -fsycl -c %s -o %t.o
+// RUN: %clangxx -fsycl %t.o %sycl_libs_dir/libsycl-complex-fp64.o %sycl_libs_dir/libsycl-cmath-fp64.o -o %t.out
+// RUN: %HOST_RUN_PLACEHOLDER %t.out
+// RUN: %CPU_RUN_PLACEHOLDER %t.out
+// RUN: %ACC_RUN_PLACEHOLDER %t.out
+// XFAIL:cpu
+// REQUIRES: host, cpu, accelerator
+
+#include <CL/sycl.hpp>
+#include <array>
+#include <cassert>
+#include <complex>
+#include <iostream>
+#include "math_utils.hpp"
+
+using std::complex;
+namespace s = cl::sycl;
+constexpr s::access::mode sycl_read = s::access::mode::read;
+constexpr s::access::mode sycl_write = s::access::mode::write;
+
+template <typename T>
+bool approx_equal_cmplx(complex<T> x, complex<T> y) {
+  const bool realMatches = approx_equal_fp(x.real(), y.real());
+  return realMatches && approx_equal_fp(x.imag(), y.imag()); // both parts
+}
+
+static constexpr auto TestArraySize1 = 57;
+static constexpr auto TestArraySize2 = 10;
+
+std::array<complex<double>, TestArraySize1> ref1_results = {
+    complex<double>(-1., 1.),
+    complex<double>(1., 3.),
+    complex<double>(-2., 10.),
+    complex<double>(-8., 31.),
+    complex<double>(1., 1.),
+    complex<double>(2., 1.),
+    complex<double>(2., 2.),
+    complex<double>(3., 4.),
+    complex<double>(2., 1.),
+    complex<double>(0., 1.),
+    complex<double>(2., 0.),
+    complex<double>(0., 0.),
+    complex<double>(0., 1.),
+    complex<double>(1., 1.),
+    complex<double>(2., 0.),
+    complex<double>(2., 3.),
+    complex<double>(1., 0.),
+    complex<double>(0., 1.),
+    complex<double>(-1., 0.),
+    complex<double>(0., M_E),
+    complex<double>(0., 0.),
+    complex<double>(0., M_PI_2),
+    complex<double>(0., M_PI),
+    complex<double>(1., M_PI_2),
+    complex<double>(0., 0.),
+    complex<double>(1., 0.),
+    complex<double>(1., 0.),
+    complex<double>(-1., 0.),
+    complex<double>(-INFINITY, 0.),
+    complex<double>(1., 0.),
+    complex<double>(10., 0.),
+    complex<double>(100., 0.),
+    complex<double>(200., 0.),
+    complex<double>(1., 2.),
+    complex<double>(INFINITY, 0.),
+    complex<double>(INFINITY, 0.),
+    complex<double>(0., 1.),
+    complex<double>(M_PI_2, 0.),
+    complex<double>(0., 0.),
+    complex<double>(1., 0.),
+    complex<double>(INFINITY, 0.),
+    complex<double>(0., 0.),
+    complex<double>(1., 0.),
+    complex<double>(0., 0.),
+    complex<double>(INFINITY, M_PI_2),
+    complex<double>(INFINITY, 0.),
+    complex<double>(0., M_PI_2),
+    complex<double>(INFINITY, M_PI_2),
+    complex<double>(INFINITY, 0.),
+    complex<double>(0., 0.),
+    complex<double>(0., M_PI_2),
+
+    complex<double>(1., -4.),
+    complex<double>(18., -7.),
+    complex<double>(1.557407724654902, 0.),
+    complex<double>(0, 0.761594155955765),
+    complex<double>(M_PI_2, 0.),
+    complex<double>(M_PI_2, 0.549306144334055)};
+
+std::array<double, TestArraySize2> ref2_results = {0., 25., 169., INFINITY, 0.,
+                                                   5., 13., INFINITY, 0., M_PI_2};
+
+void device_complex_test(s::queue &deviceQueue) {
+  s::range<1> numOfItems1{TestArraySize1};
+  s::range<1> numOfItems2{TestArraySize2};
+  std::array<complex<double>, TestArraySize1> result1; // complex-valued outputs
+  std::array<double, TestArraySize2> result2; // real outputs (norm, abs, arg)
+  {
+    s::buffer<complex<double>, 1> buffer1(result1.data(), numOfItems1);
+    s::buffer<double, 1> buffer2(result2.data(), numOfItems2);
+    deviceQueue.submit([&](s::handler &cgh) {
+      auto buf_out1_access = buffer1.get_access<sycl_write>(cgh);
+      auto buf_out2_access = buffer2.get_access<sycl_write>(cgh);
+      cgh.single_task<class DeviceComplexTest>([=]() {
+        int index = 0; // write position; order must match ref1_results
+        buf_out1_access[index++] =
+            complex<double>(0., 1.) * complex<double>(1., 1.);
+        buf_out1_access[index++] =
+            complex<double>(1., 1.) * complex<double>(2., 1.);
+        buf_out1_access[index++] =
+            complex<double>(2., 3.) * complex<double>(2., 2.);
+        buf_out1_access[index++] =
+            complex<double>(4., 5.) * complex<double>(3., 4.);
+        buf_out1_access[index++] =
+            complex<double>(-1., 1.) / complex<double>(0., 1.);
+        buf_out1_access[index++] =
+            complex<double>(1., 3.) / complex<double>(1., 1.);
+        buf_out1_access[index++] =
+            complex<double>(-2., 10.) / complex<double>(2., 3.);
+        buf_out1_access[index++] =
+            complex<double>(-8., 31.) / complex<double>(4., 5.);
+        buf_out1_access[index++] =
+            complex<double>(4., 2.) / complex<double>(2., 0.);
+        buf_out1_access[index++] =
+            complex<double>(-1., 0.) / complex<double>(0., 1.);
+        buf_out1_access[index++] =
+            complex<double>(0., 10.) / complex<double>(0., 5.);
+        buf_out1_access[index++] =
+            complex<double>(0., 0.) / complex<double>(1., 0.);
+        buf_out1_access[index++] = std::sqrt(complex<double>(-1., 0.));
+        buf_out1_access[index++] = std::sqrt(complex<double>(0., 2.));
+        buf_out1_access[index++] = std::sqrt(complex<double>(4., 0.));
+        buf_out1_access[index++] = std::sqrt(complex<double>(-5., 12.));
+        buf_out1_access[index++] = std::exp(complex<double>(0., 0.));
+        buf_out1_access[index++] = std::exp(complex<double>(0., M_PI_2));
+        buf_out1_access[index++] = std::exp(complex<double>(0., M_PI));
+        buf_out1_access[index++] = std::exp(complex<double>(1., M_PI_2));
+        buf_out1_access[index++] = std::log(complex<double>(1., 0.));
+        buf_out1_access[index++] = std::log(complex<double>(0., 1.));
+        buf_out1_access[index++] = std::log(complex<double>(-1., 0.));
+        buf_out1_access[index++] = std::log(complex<double>(0., M_E));
+        buf_out1_access[index++] = std::sin(complex<double>(0., 0.));
+        buf_out1_access[index++] = std::sin(complex<double>(M_PI_2, 0.));
+        buf_out1_access[index++] = std::cos(complex<double>(0., 0.));
+        buf_out1_access[index++] = std::cos(complex<double>(M_PI, 0.));
+        buf_out1_access[index++] = std::log10(complex<double>(0., 0.));
+        buf_out1_access[index++] = std::polar(1.);
+        buf_out1_access[index++] = std::polar(10., 0.);
+        buf_out1_access[index++] = std::polar(100.);
+        buf_out1_access[index++] = std::polar(200., 0.);
+        buf_out1_access[index++] = std::proj(complex<double>(1., 2.));
+        buf_out1_access[index++] = std::proj(complex<double>(INFINITY, -1.));
+        buf_out1_access[index++] = std::proj(complex<double>(0., -INFINITY));
+        buf_out1_access[index++] = std::pow(complex<double>(-1., 0.), 0.5);
+        buf_out1_access[index++] = std::acos(complex<double>(0., 0.));
+        buf_out1_access[index++] = std::sinh(complex<double>(0., 0.));
+        buf_out1_access[index++] = std::cosh(complex<double>(0., 0.));
+        buf_out1_access[index++] = std::cosh(complex<double>(INFINITY, 0.));
+        buf_out1_access[index++] = std::tanh(complex<double>(0., 0.));
+        buf_out1_access[index++] = std::tanh(complex<double>(INFINITY, 1.));
+        buf_out1_access[index++] = std::asinh(complex<double>(0., 0.));
+        buf_out1_access[index++] = std::asinh(complex<double>(1., INFINITY));
+        buf_out1_access[index++] = std::asinh(complex<double>(INFINITY, 1.));
+        buf_out1_access[index++] = std::acosh(complex<double>(0., 0.));
+        buf_out1_access[index++] = std::acosh(complex<double>(1., INFINITY));
+        buf_out1_access[index++] = std::acosh(complex<double>(INFINITY, 1.));
+        buf_out1_access[index++] = std::atanh(complex<double>(0., 0.));
+        buf_out1_access[index++] = std::atanh(complex<double>(1., INFINITY));
+        buf_out1_access[index++] = std::conj(complex<double>(1., 4.));
+        buf_out1_access[index++] = std::conj(complex<double>(18., 7.));
+        buf_out1_access[index++] = std::tan(complex<double>(1., 0.));
+        buf_out1_access[index++] = std::tan(complex<double>(0., 1.));
+        buf_out1_access[index++] = std::asin(complex<double>(1., 0.));
+        buf_out1_access[index++] = std::atan(complex<double>(0., 2.));
+
+        index = 0; // restart: order below must match ref2_results
+        buf_out2_access[index++] = std::norm(complex<double>(0., 0.));
+        buf_out2_access[index++] = std::norm(complex<double>(3., 4.));
+        buf_out2_access[index++] = std::norm(complex<double>(12., 5.));
+        buf_out2_access[index++] = std::norm(complex<double>(INFINITY, 1.));
+        buf_out2_access[index++] = std::abs(complex<double>(0., 0.));
+        buf_out2_access[index++] = std::abs(complex<double>(3., 4.));
+        buf_out2_access[index++] = std::abs(complex<double>(12., 5.));
+        buf_out2_access[index++] = std::abs(complex<double>(INFINITY, 1.));
+        buf_out2_access[index++] = std::arg(complex<double>(1., 0.));
+        buf_out2_access[index++] = std::arg(complex<double>(0., 1.));
+      });
+    });
+  }
+
+  for (size_t idx = 0; idx < TestArraySize1; ++idx) { // complex results
+    assert(approx_equal_cmplx(result1[idx], ref1_results[idx]));
+  }
+  for (size_t idx = 0; idx < TestArraySize2; ++idx) { // real results
+    assert(approx_equal_fp(result2[idx], ref2_results[idx]));
+  }
+}
+
+int main() {
+  s::queue fp64Queue;
+  if (!fp64Queue.get_device().has_extension("cl_khr_fp64"))
+    return 0; // device has no fp64 support: skip without printing "Pass"
+  device_complex_test(fp64Queue);
+  std::cout << "Pass" << std::endl;
+}
diff --git a/SYCL/Basic/devicelib/std_complex_math_test.cpp b/SYCL/Basic/devicelib/std_complex_math_test.cpp
new file mode 100644
index 0000000000..c8e585ef6c
--- /dev/null
+++ b/SYCL/Basic/devicelib/std_complex_math_test.cpp
@@ -0,0 +1,204 @@
+// UNSUPPORTED: windows
+// RUN: %clangxx -fsycl -c %s -o %t.o
+// RUN: %clangxx -fsycl %t.o %sycl_libs_dir/libsycl-complex.o %sycl_libs_dir/libsycl-cmath.o -o %t.out
+// RUN: %HOST_RUN_PLACEHOLDER %t.out
+// RUN: %CPU_RUN_PLACEHOLDER %t.out
+// RUN: %ACC_RUN_PLACEHOLDER %t.out
+// XFAIL:cpu
+// REQUIRES: host, cpu, accelerator
+
+#include <CL/sycl.hpp>
+#include <array>
+#include <cassert>
+#include <complex>
+
+#include "math_utils.hpp"
+
+using std::complex;
+namespace s = cl::sycl;
+constexpr s::access::mode sycl_read = s::access::mode::read;
+constexpr s::access::mode sycl_write = s::access::mode::write;
+
+// True when both the real and the imaginary parts pass approx_equal_fp.
+template <typename T> bool approx_equal_cmplx(complex<T> x, complex<T> y) {
+  const bool real_ok = approx_equal_fp(x.real(), y.real());
+  return real_ok && approx_equal_fp(x.imag(), y.imag());
+}
+
+static constexpr auto TestArraySize1 = 57;
+static constexpr auto TestArraySize2 = 10;
+
+std::array<complex<float>, TestArraySize1> ref1_results = {
+    complex<float>(-1.f, 1.f),
+    complex<float>(1.f, 3.f),
+    complex<float>(-2.f, 10.f),
+    complex<float>(-8.f, 31.f),
+    complex<float>(1.f, 1.f),
+    complex<float>(2.f, 1.f),
+    complex<float>(2.f, 2.f),
+    complex<float>(3.f, 4.f),
+    complex<float>(2.f, 1.f),
+    complex<float>(0.f, 1.f),
+    complex<float>(2.f, 0.f),
+    complex<float>(0.f, 0.f),
+    complex<float>(0.f, 1.f),
+    complex<float>(1.f, 1.f),
+    complex<float>(2.f, 0.f),
+    complex<float>(2.f, 3.f),
+    complex<float>(1.f, 0.f),
+    complex<float>(0.f, 1.f),
+    complex<float>(-1.f, 0.f),
+    complex<float>(0.f, M_E),
+    complex<float>(0.f, 0.f),
+    complex<float>(0.f, M_PI_2),
+    complex<float>(0.f, M_PI),
+    complex<float>(1.f, M_PI_2),
+    complex<float>(0.f, 0.f),
+    complex<float>(1.f, 0.f),
+    complex<float>(1.f, 0.f),
+    complex<float>(-1.f, 0.f),
+    complex<float>(-INFINITY, 0.f),
+    complex<float>(1.f, 0.f),
+    complex<float>(10.f, 0.f),
+    complex<float>(100.f, 0.f),
+    complex<float>(200.f, 0.f),
+    complex<float>(1.f, 2.f),
+    complex<float>(INFINITY, 0.f),
+    complex<float>(INFINITY, 0.f),
+    complex<float>(0.f, 1.f),
+    complex<float>(M_PI_2, 0.f),
+    complex<float>(0.f, 0.f),
+    complex<float>(1.f, 0.f),
+    complex<float>(INFINITY, 0.f),
+    complex<float>(0.f, 0.f),
+    complex<float>(1.f, 0.f),
+    complex<float>(0.f, 0.f),
+    complex<float>(INFINITY, M_PI_2),
+    complex<float>(INFINITY, 0.f),
+    complex<float>(0.f, M_PI_2),
+    complex<float>(INFINITY, M_PI_2),
+    complex<float>(INFINITY, 0.f),
+    complex<float>(0.f, 0.f),
+    complex<float>(0.f, M_PI_2),
+
+    complex<float>(1.f, -4.f),
+    complex<float>(18.f, -7.f),
+    complex<float>(1.557408f, 0.f),
+    complex<float>(0.f, 0.761594f),
+    complex<float>(M_PI_2, 0.f),
+    complex<float>(M_PI_2, 0.549306f)};
+
+std::array<float, TestArraySize2> ref2_results = {0.f, 25.f, 169.f, INFINITY, 0.f,
+                                                  5.f, 13.f, INFINITY, 0.f, M_PI_2};
+
+void device_complex_test(s::queue &deviceQueue) {
+  s::range<1> numOfItems1{TestArraySize1};
+  s::range<1> numOfItems2{TestArraySize2};
+  std::array<complex<float>, TestArraySize1> result1;
+  std::array<float, TestArraySize2> result2;
+  {
+    s::buffer<complex<float>, 1> buffer1(result1.data(), numOfItems1);
+    s::buffer<float, 1> buffer2(result2.data(), numOfItems2);
+    deviceQueue.submit([&](s::handler &cgh) {
+      auto buf_out1_access = buffer1.get_access<sycl_write>(cgh);
+      auto buf_out2_access = buffer2.get_access<sycl_write>(cgh);
+      cgh.single_task<class DeviceComplexTest>([=]() {
+        int index = 0;
+        buf_out1_access[index++] =
+            complex<float>(0.f, 1.f) * complex<float>(1.f, 1.f);
+        buf_out1_access[index++] =
+            complex<float>(1.f, 1.f) * complex<float>(2.f, 1.f);
+        buf_out1_access[index++] =
+            complex<float>(2.f, 3.f) * complex<float>(2.f, 2.f);
+        buf_out1_access[index++] =
+            complex<float>(4.f, 5.f) * complex<float>(3.f, 4.f);
+        buf_out1_access[index++] =
+            complex<float>(-1.f, 1.f) / complex<float>(0.f, 1.f);
+        buf_out1_access[index++] =
+            complex<float>(1.f, 3.f) / complex<float>(1.f, 1.f);
+        buf_out1_access[index++] =
+            complex<float>(-2.f, 10.f) / complex<float>(2.f, 3.f);
+        buf_out1_access[index++] =
+            complex<float>(-8.f, 31.f) / complex<float>(4.f, 5.f);
+        buf_out1_access[index++] =
+            complex<float>(4.f, 2.f) / complex<float>(2.f, 0.f);
+        buf_out1_access[index++] =
+            complex<float>(-1.f, 0.f) / complex<float>(0.f, 1.f);
+        buf_out1_access[index++] =
+            complex<float>(0.f, 10.f) / complex<float>(0.f, 5.f);
+        buf_out1_access[index++] =
+            complex<float>(0.f, 0.f) / complex<float>(1.f, 0.f);
+        buf_out1_access[index++] = std::sqrt(complex<float>(-1.f, 0.f));
+        buf_out1_access[index++] = std::sqrt(complex<float>(0.f, 2.f));
+        buf_out1_access[index++] = std::sqrt(complex<float>(4.f, 0.f));
+        buf_out1_access[index++] = std::sqrt(complex<float>(-5.f, 12.f));
+        buf_out1_access[index++] = std::exp(complex<float>(0.f, 0.f));
+        buf_out1_access[index++] = std::exp(complex<float>(0.f, M_PI_2));
+        buf_out1_access[index++] = std::exp(complex<float>(0.f, M_PI));
+        buf_out1_access[index++] = std::exp(complex<float>(1.f, M_PI_2));
+        buf_out1_access[index++] = std::log(complex<float>(1.f, 0.f));
+        buf_out1_access[index++] = std::log(complex<float>(0.f, 1.f));
+        buf_out1_access[index++] = std::log(complex<float>(-1.f, 0.f));
+        buf_out1_access[index++] = std::log(complex<float>(0.f, M_E));
+        buf_out1_access[index++] = std::sin(complex<float>(0.f, 0.f));
+        buf_out1_access[index++] = std::sin(complex<float>(M_PI_2, 0.f));
+        buf_out1_access[index++] = std::cos(complex<float>(0.f, 0.f));
+        buf_out1_access[index++] = std::cos(complex<float>(M_PI, 0.f));
+        buf_out1_access[index++] = std::log10(complex<float>(0.f, 0.f));
+        buf_out1_access[index++] = std::polar(1.f);
+        buf_out1_access[index++] = std::polar(10.f, 0.f);
+        buf_out1_access[index++] = std::polar(100.f);
+        buf_out1_access[index++] = std::polar(200.f, 0.f);
+        buf_out1_access[index++] = std::proj(complex<float>(1.f, 2.f));
+        buf_out1_access[index++] = std::proj(complex<float>(INFINITY, -1.f));
+        buf_out1_access[index++] = std::proj(complex<float>(0.f, -INFINITY));
+        buf_out1_access[index++] = std::pow(complex<float>(-1.f, 0.f), 0.5f);
+        buf_out1_access[index++] = std::acos(complex<float>(0.f, 0.f));
+        buf_out1_access[index++] = std::sinh(complex<float>(0.f, 0.f));
+        buf_out1_access[index++] = std::cosh(complex<float>(0.f, 0.f));
+        buf_out1_access[index++] = std::cosh(complex<float>(INFINITY, 0.f));
+        buf_out1_access[index++] = std::tanh(complex<float>(0.f, 0.f));
+        buf_out1_access[index++] = std::tanh(complex<float>(INFINITY, 1.f));
+        buf_out1_access[index++] = std::asinh(complex<float>(0.f, 0.f));
+        buf_out1_access[index++] = std::asinh(complex<float>(1.f, INFINITY));
+        buf_out1_access[index++] = std::asinh(complex<float>(INFINITY, 1.f));
+        buf_out1_access[index++] = std::acosh(complex<float>(0.f, 0.f));
+        buf_out1_access[index++] = std::acosh(complex<float>(1.f, INFINITY));
+        buf_out1_access[index++] = std::acosh(complex<float>(INFINITY, 1.f));
+        buf_out1_access[index++] = std::atanh(complex<float>(0.f, 0.f));
+        buf_out1_access[index++] = std::atanh(complex<float>(1.f, INFINITY));
+        buf_out1_access[index++] = std::conj(complex<float>(1.f, 4.f));
+        buf_out1_access[index++] = std::conj(complex<float>(18.f, 7.f));
+        buf_out1_access[index++] = std::tan(complex<float>(1.f, 0.f));
+        buf_out1_access[index++] = std::tan(complex<float>(0.f, 1.f));
+        buf_out1_access[index++] = std::asin(complex<float>(1.f, 0.f));
+        buf_out1_access[index++] = std::atan(complex<float>(0.f, 2.f));
+
+        index = 0;
+        buf_out2_access[index++] = std::norm(complex<float>(0.f, 0.f));
+        buf_out2_access[index++] = std::norm(complex<float>(3.f, 4.f));
+        buf_out2_access[index++] = std::norm(complex<float>(12.f, 5.f));
+        buf_out2_access[index++] = std::norm(complex<float>(INFINITY, 1.f));
+        buf_out2_access[index++] = std::abs(complex<float>(0.f, 0.f));
+        buf_out2_access[index++] = std::abs(complex<float>(3.f, 4.f));
+        buf_out2_access[index++] = std::abs(complex<float>(12.f, 5.f));
+        buf_out2_access[index++] = std::abs(complex<float>(INFINITY, 1.f));
+        buf_out2_access[index++] = std::arg(complex<float>(1.f, 0.f));
+        buf_out2_access[index++] = std::arg(complex<float>(0.f, 1.f));
+      });
+    });
+  }
+
+  for (size_t idx = 0; idx < TestArraySize1; ++idx) {
+    assert(approx_equal_cmplx(result1[idx], ref1_results[idx]));
+  }
+  for (size_t idx = 0; idx < TestArraySize2; ++idx) {
+    assert(approx_equal_fp(result2[idx], ref2_results[idx]));
+  }
+}
+
+int main() {
+  s::queue deviceQueue; // default-constructed queue
+  device_complex_test(deviceQueue); // asserts on any result mismatch
+  std::cout << "Pass" << std::endl;
+}
diff --git a/SYCL/Basic/enqueue_barrier/enqueue_barrier.cpp b/SYCL/Basic/enqueue_barrier/enqueue_barrier.cpp
new file mode 100644
index 0000000000..5a1f72ca46
--- /dev/null
+++ b/SYCL/Basic/enqueue_barrier/enqueue_barrier.cpp
@@ -0,0 +1,78 @@
+// RUN: %clangxx -fsycl %s -o %t.out
+// RUN: env SYCL_PI_TRACE=2 %CPU_RUN_PLACEHOLDER %t.out 2>&1 %CPU_CHECK_PLACEHOLDER
+// RUN: env SYCL_PI_TRACE=2 %GPU_RUN_PLACEHOLDER %t.out 2>&1 %GPU_CHECK_PLACEHOLDER
+// RUN: env SYCL_PI_TRACE=2 %ACC_RUN_PLACEHOLDER %t.out 2>&1 %ACC_CHECK_PLACEHOLDER
+// REQUIRES: cpu, gpu, accelerator
+// UNSUPPORTED: cuda
+
+#include <CL/sycl.hpp>
+#include <CL/sycl/intel/fpga_device_selector.hpp>
+
+int main() {
+  // All three queues share one context and use the default device selector.
+  // Each of the four barriers below is matched by one CHECK line at the end.
+  sycl::context Context;
+  sycl::queue Q1(Context, sycl::default_selector{});
+
+  Q1.submit([&](sycl::handler &cgh) {
+    cgh.single_task<class kernel1>([]() {});
+  });
+  Q1.submit([&](sycl::handler &cgh) {
+    cgh.single_task<class kernel2>([]() {});
+  });
+
+  // 1st barrier: handler::barrier() with no wait list.
+  Q1.submit([&](sycl::handler &cgh) { cgh.barrier(); });
+
+  Q1.submit([&](sycl::handler &cgh) {
+    cgh.single_task<class kernel3>([]() {});
+  });
+  Q1.submit([&](sycl::handler &cgh) {
+    cgh.single_task<class kernel4>([]() {});
+  });
+
+  // 2nd barrier: queue::submit_barrier() with no wait list.
+  Q1.submit_barrier();
+
+  sycl::queue Q2(Context, sycl::default_selector{});
+  sycl::queue Q3(Context, sycl::default_selector{});
+
+  auto Event1 = Q1.submit([&](sycl::handler &cgh) {
+    cgh.single_task<class kernel5>([]() {});
+  });
+
+  auto Event2 = Q2.submit([&](sycl::handler &cgh) {
+    cgh.single_task<class kernel6>([]() {});
+  });
+
+  // 3rd barrier: handler::barrier(const vector_class<event> &WaitList),
+  // submitted to Q3 with events returned by Q1 and Q2.
+  Q3.submit([&](sycl::handler &cgh) { cgh.barrier({Event1, Event2}); });
+
+  Q3.submit([&](sycl::handler &cgh) {
+    cgh.single_task<class kernel7>([]() {});
+  });
+
+  auto Event3 = Q1.submit([&](sycl::handler &cgh) {
+    cgh.single_task<class kernel8>([]() {});
+  });
+
+  auto Event4 = Q2.submit([&](sycl::handler &cgh) {
+    cgh.single_task<class kernel9>([]() {});
+  });
+
+  // 4th barrier:
+  // queue::submit_barrier(const vector_class<event> &WaitList).
+  Q3.submit_barrier({Event3, Event4});
+
+  Q3.submit([&](sycl::handler &cgh) {
+    cgh.single_task<class kernel10>([]() {});
+  });
+
+  return 0;
+}
+
+// CHECK:---> piEnqueueEventsWaitWithBarrier
+// CHECK:---> piEnqueueEventsWaitWithBarrier
+// CHECK:---> piEnqueueEventsWaitWithBarrier
+// CHECK:---> piEnqueueEventsWaitWithBarrier
diff --git a/SYCL/Basic/feature-tests/inline-asm/asm_16_empty.cpp b/SYCL/Basic/feature-tests/inline-asm/asm_16_empty.cpp
new file mode 100644
index 0000000000..ad4285e8ec
--- /dev/null
+++ b/SYCL/Basic/feature-tests/inline-asm/asm_16_empty.cpp
@@ -0,0 +1,40 @@
+// UNSUPPORTED: cuda
+// REQUIRES: gpu,linux
+// RUN: %clangxx -fsycl %s -DINLINE_ASM -o %t.out
+// RUN: %t.out
+// RUN: %clangxx -fsycl %s -o %t.ref.out
+// RUN: %t.ref.out
+
+#include "include/asmhelper.h"
+#include <CL/sycl.hpp>
+#include <iostream>
+#include <vector>
+
+using dataType = cl::sycl::cl_int;
+
+template <typename T = dataType>
+struct KernelFunctor : WithOutputBuffer<T> {
+  KernelFunctor(size_t problem_size) : WithOutputBuffer<T>(problem_size) {}
+
+  void operator()(cl::sycl::handler &cgh) {
+    auto C = this->getOutputBuffer().template get_access<cl::sycl::access::mode::write>(cgh);
+    cgh.parallel_for<KernelFunctor<T>>(
+        cl::sycl::range<1>{this->getOutputBufferSize()}, [=](cl::sycl::id<1> wiID) [[cl::intel_reqd_sub_group_size(16)]] {
+          C[wiID] = 43;
+#if defined(INLINE_ASM) && defined(__SYCL_DEVICE_ONLY__)
+          asm volatile("");
+#endif
+        });
+  }
+};
+
+int main() {
+  KernelFunctor<> f(DEFAULT_PROBLEM_SIZE);
+  // A false result from the launcher ends the test with exit status 0.
+  if (!launchInlineASMTest(f))
+    return 0;
+
+  // Exit status is 0 only when the check against the value 43 succeeds.
+  auto &results = f.getOutputBufferData();
+  return verify_all_the_same(results, 43) ? 0 : 1;
+}
diff --git a/SYCL/Basic/feature-tests/inline-asm/asm_16_matrix_mult.cpp b/SYCL/Basic/feature-tests/inline-asm/asm_16_matrix_mult.cpp
new file mode 100644
index 0000000000..6ae1debb67
--- /dev/null
+++ b/SYCL/Basic/feature-tests/inline-asm/asm_16_matrix_mult.cpp
@@ -0,0 +1,44 @@
+// UNSUPPORTED: cuda
+// REQUIRES: gpu,linux
+// RUN: %clangxx -fsycl %s -DINLINE_ASM -o %t.out
+// RUN: %t.out
+// RUN: %clangxx -fsycl %s -o %t.ref.out
+// RUN: %t.ref.out
+
+#include "include/asmhelper.h"
+#include <CL/sycl.hpp>
+#include <iostream>
+#include <vector>
+
+using dataType = cl::sycl::cl_int;
+
+template <typename T = dataType>
+struct KernelFunctor : WithOutputBuffer<T> {
+  KernelFunctor(size_t problem_size) : WithOutputBuffer<T>(problem_size) {}
+
+  void operator()(cl::sycl::handler &cgh) {
+    auto C = this->getOutputBuffer().template get_access<cl::sycl::access::mode::write>(cgh);
+    cgh.parallel_for<KernelFunctor<T>>(
+        cl::sycl::range<1>{this->getOutputBufferSize()}, [=](cl::sycl::id<1> wiID) [[cl::intel_reqd_sub_group_size(16)]] {
+          volatile int output = 0;
+#if defined(INLINE_ASM) && defined(__SYCL_DEVICE_ONLY__)
+          asm volatile("mov (M1,16) %0(0,0)<1> 0x7:d"
+                       : "=rw"(output));
+#else
+          output = 7;
+#endif
+          C[wiID] = output;
+        });
+  }
+};
+
+int main() {
+  KernelFunctor<> f(DEFAULT_PROBLEM_SIZE);
+  if (!launchInlineASMTest(f))
+    return 0;
+
+  if (verify_all_the_same(f.getOutputBufferData(), 7))
+    return 0;
+
+  return 1;
+}
diff --git a/SYCL/Basic/feature-tests/inline-asm/asm_16_no_input_int.cpp b/SYCL/Basic/feature-tests/inline-asm/asm_16_no_input_int.cpp
new file mode 100644
index 0000000000..6ae1debb67
--- /dev/null
+++ b/SYCL/Basic/feature-tests/inline-asm/asm_16_no_input_int.cpp
@@ -0,0 +1,44 @@
+// UNSUPPORTED: cuda
+// REQUIRES: gpu,linux
+// RUN: %clangxx -fsycl %s -DINLINE_ASM -o %t.out
+// RUN: %t.out
+// RUN: %clangxx -fsycl %s -o %t.ref.out
+// RUN: %t.ref.out
+
+#include "include/asmhelper.h"
+#include <CL/sycl.hpp>
+#include <iostream>
+#include <vector>
+
+using dataType = cl::sycl::cl_int;
+
+template <typename T = dataType>
+struct KernelFunctor : WithOutputBuffer<T> {
+  KernelFunctor(size_t problem_size) : WithOutputBuffer<T>(problem_size) {}
+
+  void operator()(cl::sycl::handler &cgh) {
+    auto C = this->getOutputBuffer().template get_access<cl::sycl::access::mode::write>(cgh);
+    cgh.parallel_for<KernelFunctor<T>>(
+        cl::sycl::range<1>{this->getOutputBufferSize()}, [=](cl::sycl::id<1> wiID) [[cl::intel_reqd_sub_group_size(16)]] {
+          volatile int output = 0;
+#if defined(INLINE_ASM) && defined(__SYCL_DEVICE_ONLY__)
+          asm volatile("mov (M1,16) %0(0,0)<1> 0x7:d"
+                       : "=rw"(output));
+#else
+          output = 7;
+#endif
+          C[wiID] = output;
+        });
+  }
+};
+
+int main() {
+  KernelFunctor<> f(DEFAULT_PROBLEM_SIZE);
+  if (!launchInlineASMTest(f))
+    return 0;
+
+  if (verify_all_the_same(f.getOutputBufferData(), 7))
+    return 0;
+
+  return 1;
+}
diff --git a/SYCL/Basic/feature-tests/inline-asm/asm_16_no_opts.cpp b/SYCL/Basic/feature-tests/inline-asm/asm_16_no_opts.cpp
new file mode 100644
index 0000000000..4b6d5146fd
--- /dev/null
+++ b/SYCL/Basic/feature-tests/inline-asm/asm_16_no_opts.cpp
@@ -0,0 +1,45 @@
+// UNSUPPORTED: cuda
+// REQUIRES: gpu,linux
+// RUN: %clangxx -fsycl %s -DINLINE_ASM -o %t.out
+// RUN: %t.out
+// RUN: %clangxx -fsycl %s -o %t.ref.out
+// RUN: %t.ref.out
+
+#include "include/asmhelper.h"
+#include <CL/sycl.hpp>
+#include <iostream>
+#include <vector>
+
+using dataType = cl::sycl::cl_int;
+
+// Adds 0..9 to every output element; with INLINE_ASM the "fence_sw"
+// instruction is emitted before each addition.
+template <typename T = dataType>
+struct KernelFunctor : WithOutputBuffer<T> {
+  KernelFunctor(size_t problem_size) : WithOutputBuffer<T>(problem_size) {}
+
+  void operator()(cl::sycl::handler &cgh) {
+    // "+=" reads the element back, so write-only access is not sufficient.
+    auto C = this->getOutputBuffer().template get_access<cl::sycl::access::mode::read_write>(cgh);
+    cgh.parallel_for<KernelFunctor<T>>(
+        cl::sycl::range<1>{this->getOutputBufferSize()}, [=](cl::sycl::id<1> wiID) [[cl::intel_reqd_sub_group_size(16)]] {
+          for (int i = 0; i < 10; ++i) {
+#if defined(INLINE_ASM) && defined(__SYCL_DEVICE_ONLY__)
+            asm("fence_sw");
+#endif
+            C[wiID] += i;
+          }
+        });
+  }
+};
+
+int main() {
+  KernelFunctor<> f(DEFAULT_PROBLEM_SIZE);
+  if (!launchInlineASMTest(f))
+    return 0;
+
+  if (verify_all_the_same(f.getOutputBufferData(), 45))
+    return 0;
+
+  return 1;
+}
diff --git a/SYCL/Basic/feature-tests/inline-asm/asm_8_empty.cpp b/SYCL/Basic/feature-tests/inline-asm/asm_8_empty.cpp
new file mode 100644
index 0000000000..97fae0ed4e
--- /dev/null
+++ b/SYCL/Basic/feature-tests/inline-asm/asm_8_empty.cpp
@@ -0,0 +1,40 @@
+// UNSUPPORTED: cuda
+// REQUIRES: gpu,linux
+// RUN: %clangxx -fsycl %s -DINLINE_ASM -o %t.out
+// RUN: %t.out
+// RUN: %clangxx -fsycl %s -o %t.ref.out
+// RUN: %t.ref.out
+
+#include "include/asmhelper.h"
+#include <CL/sycl.hpp>
+#include <iostream>
+#include <vector>
+
+using dataType = cl::sycl::cl_int;
+
+template <typename T = dataType>
+struct KernelFunctor : WithOutputBuffer<T> {
+  KernelFunctor(size_t problem_size) : WithOutputBuffer<T>(problem_size) {}
+
+  void operator()(cl::sycl::handler &cgh) {
+    auto C = this->getOutputBuffer().template get_access<cl::sycl::access::mode::write>(cgh);
+    cgh.parallel_for<KernelFunctor<T>>(
+        cl::sycl::range<1>{this->getOutputBufferSize()}, [=](cl::sycl::id<1> wiID) [[cl::intel_reqd_sub_group_size(8)]] {
+          C[wiID] = 43;
+#if defined(INLINE_ASM) && defined(__SYCL_DEVICE_ONLY__)
+          asm volatile("");
+#endif
+        });
+  }
+};
+
+int main() {
+  KernelFunctor<> f(DEFAULT_PROBLEM_SIZE);
+  if (!launchInlineASMTest(f))
+    return 0;
+
+  if (verify_all_the_same(f.getOutputBufferData(), 43))
+    return 0;
+
+  return 1;
+}
diff --git a/SYCL/Basic/feature-tests/inline-asm/asm_8_no_input_int.cpp b/SYCL/Basic/feature-tests/inline-asm/asm_8_no_input_int.cpp
new file mode 100644
index 0000000000..6d1dcbb832
--- /dev/null
+++ b/SYCL/Basic/feature-tests/inline-asm/asm_8_no_input_int.cpp
@@ -0,0 +1,44 @@
+// UNSUPPORTED: cuda
+// REQUIRES: gpu,linux
+// RUN: %clangxx -fsycl %s -DINLINE_ASM -o %t.out
+// RUN: %t.out
+// RUN: %clangxx -fsycl %s -o %t.ref.out
+// RUN: %t.ref.out
+
+#include "include/asmhelper.h"
+#include <CL/sycl.hpp>
+#include <iostream>
+#include <vector>
+
+using dataType = cl::sycl::cl_int;
+
+template <typename T = dataType>
+struct KernelFunctor : WithOutputBuffer<T> {
+  KernelFunctor(size_t problem_size) : WithOutputBuffer<T>(problem_size) {}
+
+  void operator()(cl::sycl::handler &cgh) {
+    auto C = this->getOutputBuffer().template get_access<cl::sycl::access::mode::write>(cgh);
+    cgh.parallel_for<KernelFunctor<T>>(
+        cl::sycl::range<1>{this->getOutputBufferSize()}, [=](cl::sycl::id<1> wiID) [[cl::intel_reqd_sub_group_size(8)]] {
+          volatile int output = 0;
+#if defined(INLINE_ASM) && defined(__SYCL_DEVICE_ONLY__)
+          asm volatile("mov (M1,8) %0(0,0)<1> 0x7:d"
+                       : "=rw"(output));
+#else
+          output = 7;
+#endif
+          C[wiID] = output;
+        });
+  }
+};
+
+int main() {
+  KernelFunctor<> f(DEFAULT_PROBLEM_SIZE);
+  if (!launchInlineASMTest(f))
+    return 0;
+
+  if (verify_all_the_same(f.getOutputBufferData(), 7))
+    return 0;
+
+  return 1;
+}
diff --git a/SYCL/Basic/feature-tests/inline-asm/asm_arbitrary_ops_order.cpp b/SYCL/Basic/feature-tests/inline-asm/asm_arbitrary_ops_order.cpp
new file mode 100644
index 0000000000..28d0af1d45
--- /dev/null
+++ b/SYCL/Basic/feature-tests/inline-asm/asm_arbitrary_ops_order.cpp
@@ -0,0 +1,59 @@
+// UNSUPPORTED: cuda
+// REQUIRES: gpu,linux
+// RUN: %clangxx -fsycl %s -DINLINE_ASM -o %t.out
+// RUN: %t.out
+// RUN: %clangxx -fsycl %s -o %t.ref.out
+// RUN: %t.ref.out
+
+#include "include/asmhelper.h"
+#include <CL/sycl.hpp>
+#include <iostream>
+#include <vector>
+
+using dataType = cl::sycl::cl_int;
+
+template <typename T = dataType>
+struct KernelFunctor : WithInputBuffers<T, 3>, WithOutputBuffer<T> {
+  KernelFunctor(const std::vector<T> &input1, const std::vector<T> &input2, const std::vector<T> &input3) : WithInputBuffers<T, 3>(input1, input2, input3), WithOutputBuffer<T>(input1.size()) {}
+
+  void operator()(cl::sycl::handler &cgh) {
+    auto A = this->getInputBuffer(0).template get_access<cl::sycl::access::mode::read>(cgh);
+    auto B = this->getInputBuffer(1).template get_access<cl::sycl::access::mode::read>(cgh);
+    auto C = this->getInputBuffer(2).template get_access<cl::sycl::access::mode::read>(cgh);
+    auto D = this->getOutputBuffer().template get_access<cl::sycl::access::mode::write>(cgh);
+
+    cgh.parallel_for<KernelFunctor<T>>(
+        cl::sycl::range<1>{this->getOutputBufferSize()}, [=](cl::sycl::id<1> wiID) [[cl::intel_reqd_sub_group_size(8)]] {
+#if defined(INLINE_ASM) && defined(__SYCL_DEVICE_ONLY__)
+          asm("mad (M1, 8) %0(0, 0)<1> %3(0, 0)<1;1,0> %1(0, 0)<1;1,0> %2(0, 0)<1;1,0>"
+              : "=rw"(D[wiID])
+              : "rw"(B[wiID]), "rw"(C[wiID]), "rw"(A[wiID]));
+#else
+          D[wiID] = A[wiID] * B[wiID] + C[wiID];
+#endif
+        });
+  }
+};
+
+int main() {
+  std::vector<dataType> inputA(DEFAULT_PROBLEM_SIZE);
+  std::vector<dataType> inputB(DEFAULT_PROBLEM_SIZE);
+  std::vector<dataType> inputC(DEFAULT_PROBLEM_SIZE);
+  for (int k = 0; k < DEFAULT_PROBLEM_SIZE; ++k) {
+    inputA[k] = k;
+    inputB[k] = k;
+    inputC[k] = DEFAULT_PROBLEM_SIZE - k * k;
+  }
+
+  KernelFunctor<> f(inputA, inputB, inputC);
+  if (!launchInlineASMTest(f))
+    return 0;
+  auto &D = f.getOutputBufferData();
+  for (int k = 0; k < DEFAULT_PROBLEM_SIZE; ++k) {
+    // Host-side reference value for element k.
+    if (D[k] == inputA[k] * inputB[k] + inputC[k])
+      continue;
+    std::cerr << "At index: " << k << ". " << D[k] << " != " << inputA[k] * inputB[k] + inputC[k] << "\n";
+    return 1;
+  }
+  return 0;
+}
diff --git a/SYCL/Basic/feature-tests/inline-asm/asm_decl_in_scope.cpp b/SYCL/Basic/feature-tests/inline-asm/asm_decl_in_scope.cpp
new file mode 100644
index 0000000000..db30e20f5e
--- /dev/null
+++ b/SYCL/Basic/feature-tests/inline-asm/asm_decl_in_scope.cpp
@@ -0,0 +1,67 @@
+// UNSUPPORTED: cuda
+// REQUIRES: gpu,linux
+// RUN: %clangxx -fsycl %s -DINLINE_ASM -o %t.out
+// RUN: %t.out
+// RUN: %clangxx -fsycl %s -o %t.ref.out
+// RUN: %t.ref.out
+
+#include "include/asmhelper.h"
+#include <CL/sycl.hpp>
+#include <iostream>
+#include <vector>
+
+using dataType = cl::sycl::cl_int;
+
+template <typename T = dataType>
+struct KernelFunctor : WithInputBuffers<T, 2>, WithOutputBuffer<T> {
+  KernelFunctor(const std::vector<T> &input1, const std::vector<T> &input2) : WithInputBuffers<T, 2>(input1, input2), WithOutputBuffer<T>(input1.size()) {}
+
+  void operator()(cl::sycl::handler &cgh) {
+    auto A = this->getInputBuffer(0).template get_access<cl::sycl::access::mode::read>(cgh);
+    auto B = this->getInputBuffer(1).template get_access<cl::sycl::access::mode::read>(cgh);
+    // C is read back by the kernel ("+rw" operand, "*="), so it needs read_write.
+    auto C = this->getOutputBuffer().template get_access<cl::sycl::access::mode::read_write>(cgh);
+
+    cgh.parallel_for<KernelFunctor<T>>(
+        cl::sycl::range<1>{this->getOutputBufferSize()}, [=](cl::sycl::id<1> wiID) [[cl::intel_reqd_sub_group_size(16)]] {
+          // "temp" is declared twice: inside the braced asm scope and after it.
+#if defined(INLINE_ASM) && defined(__SYCL_DEVICE_ONLY__)
+          asm("{\n"
+              ".decl temp v_type=G type=d num_elts=16 align=GRF\n"
+              "mov (M1, 16) temp(0, 0)<1> %1(0, 0)<1;1,0>\n"
+              "mov (M1, 16) %0(0, 0)<1>  temp(0, 0)<1;1,0>\n"
+              "}\n"
+              ".decl temp v_type=G type=d num_elts=16 align=GRF\n"
+              "mul (M1, 16) temp(0, 0)<1> %2(0, 0)<1;1,0> %0(0, 0)<1;1,0>\n"
+              "mov (M1, 16) %0(0, 0)<1>  temp(0, 0)<1;1,0>\n"
+              : "+rw"(C[wiID])
+              : "rw"(A[wiID]), "rw"(B[wiID]));
+#else
+          C[wiID] = A[wiID];
+          C[wiID] *= B[wiID];
+#endif
+        });
+  }
+};
+
+int main() {
+  std::vector<dataType> inputA(DEFAULT_PROBLEM_SIZE), inputB(DEFAULT_PROBLEM_SIZE);
+  for (int i = 0; i < DEFAULT_PROBLEM_SIZE; i++) {
+    inputA[i] = i;
+    inputB[i] = 2;
+  }
+
+  KernelFunctor<> f(inputA, inputB);
+  if (!launchInlineASMTest(f))
+    return 0;
+
+  auto &C = f.getOutputBufferData();
+  for (int i = 0; i < DEFAULT_PROBLEM_SIZE; ++i) {
+    if (C[i] != inputA[i] * inputB[i]) {
+      std::cerr << "At index: " << i << ". ";
+      std::cerr << C[i] << " != " << inputA[i] * inputB[i] << "\n";
+      return 1;
+    }
+  }
+  return 0;
+}
diff --git a/SYCL/Basic/feature-tests/inline-asm/asm_float_add.cpp b/SYCL/Basic/feature-tests/inline-asm/asm_float_add.cpp
new file mode 100644
index 0000000000..c23b084317
--- /dev/null
+++ b/SYCL/Basic/feature-tests/inline-asm/asm_float_add.cpp
@@ -0,0 +1,59 @@
+// UNSUPPORTED: cuda
+// REQUIRES: gpu,linux
+// RUN: %clangxx -fsycl %s -DINLINE_ASM -o %t.out
+// RUN: %t.out
+// RUN: %clangxx -fsycl %s -o %t.ref.out
+// RUN: %t.ref.out
+
+#include "include/asmhelper.h"
+#include <CL/sycl.hpp>
+#include <cmath>
+#include <iostream>
+#include <vector>
+
+using dataType = cl::sycl::cl_double;
+
+template <typename T = dataType>
+struct KernelFunctor : WithInputBuffers<T, 2>, WithOutputBuffer<T> {
+  KernelFunctor(const std::vector<T> &input1, const std::vector<T> &input2) : WithInputBuffers<T, 2>(input1, input2), WithOutputBuffer<T>(input1.size()) {}
+
+  void operator()(cl::sycl::handler &cgh) {
+    auto A = this->getInputBuffer(0).template get_access<cl::sycl::access::mode::read>(cgh);
+    auto B = this->getInputBuffer(1).template get_access<cl::sycl::access::mode::read>(cgh);
+    auto C = this->getOutputBuffer().template get_access<cl::sycl::access::mode::write>(cgh);
+
+    cgh.parallel_for<KernelFunctor<T>>(
+        cl::sycl::range<1>{this->getOutputBufferSize()}, [=](cl::sycl::id<1> wiID) [[cl::intel_reqd_sub_group_size(8)]] {
+#if defined(INLINE_ASM) && defined(__SYCL_DEVICE_ONLY__)
+          asm("add (M1, 8) %0(0, 0)<1> %1(0, 0)<1;1,0> %2(0, 0)<1;1,0>"
+              : "=rw"(C[wiID])
+              : "rw"(A[wiID]), "rw"(B[wiID]));
+#else
+          C[wiID] = A[wiID] + B[wiID];
+#endif
+        });
+  }
+};
+
+int main() {
+  std::vector<dataType> inputA(DEFAULT_PROBLEM_SIZE), inputB(DEFAULT_PROBLEM_SIZE);
+  for (int i = 0; i < DEFAULT_PROBLEM_SIZE; i++) {
+    inputA[i] = (double)1 / std::pow(2, i);
+    inputB[i] = (double)2 / std::pow(2, i);
+  }
+
+  KernelFunctor<> f(inputA, inputB);
+  if (!launchInlineASMTest(f))
+    return 0;
+
+  auto &C = f.getOutputBufferData();
+  for (int i = 0; i < DEFAULT_PROBLEM_SIZE; i++) {
+    if (C[i] != inputA[i] + inputB[i]) {
+      std::cerr << "At index: " << i << ". ";
+      std::cerr << C[i] << " != " << inputA[i] + inputB[i] << "\n";
+      return 1;
+    }
+  }
+
+  return 0;
+}
diff --git a/SYCL/Basic/feature-tests/inline-asm/asm_float_imm_arg.cpp b/SYCL/Basic/feature-tests/inline-asm/asm_float_imm_arg.cpp
new file mode 100644
index 0000000000..c9683cf020
--- /dev/null
+++ b/SYCL/Basic/feature-tests/inline-asm/asm_float_imm_arg.cpp
@@ -0,0 +1,56 @@
+// UNSUPPORTED: cuda
+// REQUIRES: gpu,linux
+// RUN: %clangxx -fsycl %s -DINLINE_ASM -o %t.out
+// RUN: %t.out
+// RUN: %clangxx -fsycl %s -o %t.ref.out
+// RUN: %t.ref.out
+
+#include "include/asmhelper.h"
+#include <CL/sycl.hpp>
+#include <cmath>
+#include <iostream>
+#include <vector>
+
+constexpr double IMM_ARGUMENT = 0.5;
+using dataType = cl::sycl::cl_double;
+
+template <typename T = dataType>
+struct KernelFunctor : WithInputBuffers<T, 1>, WithOutputBuffer<T> {
+  KernelFunctor(const std::vector<T> &input) : WithInputBuffers<T, 1>(input), WithOutputBuffer<T>(input.size()) {}
+
+  void operator()(cl::sycl::handler &cgh) {
+    auto A = this->getInputBuffer(0).template get_access<cl::sycl::access::mode::read>(cgh);
+    auto B = this->getOutputBuffer().template get_access<cl::sycl::access::mode::write>(cgh);
+
+    cgh.parallel_for<KernelFunctor<T>>(
+        cl::sycl::range<1>{this->getOutputBufferSize()}, [=](cl::sycl::id<1> wiID) [[cl::intel_reqd_sub_group_size(8)]] {
+#if defined(INLINE_ASM) && defined(__SYCL_DEVICE_ONLY__)
+          asm("mul (M1, 8) %0(0, 0)<1> %1(0, 0)<1;1,0> %2"
+              : "=rw"(B[wiID])
+              : "rw"(A[wiID]), "rw"(IMM_ARGUMENT));
+#else
+          B[wiID] = A[wiID] * IMM_ARGUMENT;
+#endif
+        });
+  }
+};
+
+int main() {
+  std::vector<dataType> input(DEFAULT_PROBLEM_SIZE);
+  for (int i = 0; i < DEFAULT_PROBLEM_SIZE; i++)
+    input[i] = (double)1 / std::pow(2, i);
+
+  KernelFunctor<> f(input);
+  if (!launchInlineASMTest(f))
+    return 0;
+
+  auto &B = f.getOutputBufferData();
+  for (int i = 0; i < DEFAULT_PROBLEM_SIZE; ++i) {
+    if (B[i] != input[i] * IMM_ARGUMENT) {
+      std::cerr << "At index: " << i << ". ";
+      std::cerr << B[i] << " != " << input[i] * IMM_ARGUMENT << "\n";
+      return 1;
+    }
+  }
+  return 0;
+}
diff --git a/SYCL/Basic/feature-tests/inline-asm/asm_float_neg.cpp b/SYCL/Basic/feature-tests/inline-asm/asm_float_neg.cpp
new file mode 100644
index 0000000000..290b089890
--- /dev/null
+++ b/SYCL/Basic/feature-tests/inline-asm/asm_float_neg.cpp
@@ -0,0 +1,57 @@
+// UNSUPPORTED: cuda
+// REQUIRES: gpu,linux
+// RUN: %clangxx -fsycl %s -DINLINE_ASM -o %t.out
+// RUN: %t.out
+// RUN: %clangxx -fsycl %s -o %t.ref.out
+// RUN: %t.ref.out
+
+#include "include/asmhelper.h"
+#include <CL/sycl.hpp>
+#include <iostream>
+#include <vector>
+
+using dataType = cl::sycl::cl_float;
+
+// Negates every input element; with INLINE_ASM this is done by a "mov" whose
+// source operand is prefixed with "(-)" instead of the C++ unary minus.
+template <typename T = dataType>
+struct KernelFunctor : WithInputBuffers<T, 1>, WithOutputBuffer<T> {
+  KernelFunctor(const std::vector<T> &input) : WithInputBuffers<T, 1>(input), WithOutputBuffer<T>(input.size()) {}
+
+  void operator()(cl::sycl::handler &cgh) {
+    auto A = this->getInputBuffer(0).template get_access<cl::sycl::access::mode::read>(cgh);
+    auto B = this->getOutputBuffer().template get_access<cl::sycl::access::mode::write>(cgh);
+
+    cgh.parallel_for<KernelFunctor<T>>(
+        cl::sycl::range<1>{this->getOutputBufferSize()}, [=](cl::sycl::id<1> wiID) [[cl::intel_reqd_sub_group_size(8)]] {
+#if defined(INLINE_ASM) && defined(__SYCL_DEVICE_ONLY__)
+          asm("mov (M1, 8) %0(0, 0)<1> (-)%1(0, 0)<1;1,0>"
+              : "=rw"(B[wiID])
+              : "rw"(A[wiID]));
+#else
+          B[wiID] = -A[wiID];
+#endif
+        });
+  }
+};
+
+int main() {
+  std::vector<dataType> input(DEFAULT_PROBLEM_SIZE);
+  // Use 1/(i+1) so that no element is produced by a division by zero.
+  for (int i = 0; i < DEFAULT_PROBLEM_SIZE; i++)
+    input[i] = 1.0 / (i + 1);
+
+  KernelFunctor<> f(input);
+  if (!launchInlineASMTest(f))
+    return 0;
+
+  auto &R = f.getOutputBufferData();
+  for (int i = 0; i < DEFAULT_PROBLEM_SIZE; ++i) {
+    if (R[i] != -input[i]) {
+      std::cerr << "At index: " << i << ". ";
+      std::cerr << R[i] << " != " << -input[i] << "\n";
+      return 1;
+    }
+  }
+  return 0;
+}
diff --git a/SYCL/Basic/feature-tests/inline-asm/asm_imm_arg.cpp b/SYCL/Basic/feature-tests/inline-asm/asm_imm_arg.cpp
new file mode 100644
index 0000000000..2dba04d117
--- /dev/null
+++ b/SYCL/Basic/feature-tests/inline-asm/asm_imm_arg.cpp
@@ -0,0 +1,55 @@
+// UNSUPPORTED: cuda
+// REQUIRES: gpu,linux
+// RUN: %clangxx -fsycl %s -DINLINE_ASM -o %t.out
+// RUN: %t.out
+// RUN: %clangxx -fsycl %s -o %t.ref.out
+// RUN: %t.ref.out
+
+#include "include/asmhelper.h"
+#include <CL/sycl.hpp>
+#include <iostream>
+#include <vector>
+
+constexpr int CONST_ARGUMENT = 0xabc;
+using dataType = cl::sycl::cl_int;
+
+template <typename T = dataType>
+struct KernelFunctor : WithInputBuffers<T, 1>, WithOutputBuffer<T> { // computes B[i] = A[i] + CONST_ARGUMENT
+  KernelFunctor(const std::vector<T> &input) : WithInputBuffers<T, 1>(input), WithOutputBuffer<T>(input.size()) {}
+  // Command-group function: requests accessors and enqueues the kernel on cgh.
+  void operator()(cl::sycl::handler &cgh) {
+    auto A = this->getInputBuffer(0).template get_access<cl::sycl::access::mode::read>(cgh);
+    auto B = this->getOutputBuffer().template get_access<cl::sycl::access::mode::write>(cgh);
+    // One work-item per output element; the asm path is compiled only in the device pass of an INLINE_ASM build.
+    cgh.parallel_for<KernelFunctor<T>>(
+        cl::sycl::range<1>{this->getOutputBufferSize()}, [=](cl::sycl::id<1> wiID) [[cl::intel_reqd_sub_group_size(8)]] {
+#if defined(INLINE_ASM) && defined(__SYCL_DEVICE_ONLY__)
+          asm("add (M1, 8) %0(0, 0)<1> %1(0, 0)<1;1,0> %2" // %0 = B[wiID], %1 = A[wiID], %2 = CONST_ARGUMENT
+              : "=rw"(B[wiID])
+              : "rw"(A[wiID]), "rw"(CONST_ARGUMENT));
+#else
+          B[wiID] = A[wiID] + CONST_ARGUMENT; // plain C++ equivalent used otherwise
+#endif
+        });
+  }
+};
+
+int main() {
+  std::vector<dataType> input(DEFAULT_PROBLEM_SIZE);
+  for (int k = 0; k < DEFAULT_PROBLEM_SIZE; k++)
+    input[k] = k;
+
+  KernelFunctor<> functor(input);
+  if (!launchInlineASMTest(functor))
+    return 0; // the helper reported that the test was not launched
+  const auto &sums = functor.getOutputBufferData();
+  for (int k = 0; k < DEFAULT_PROBLEM_SIZE; ++k) {
+    const dataType expected = input[k] + CONST_ARGUMENT;
+    if (sums[k] == expected)
+      continue;
+    std::cerr << "At index: " << k << ". ";
+    std::cerr << sums[k] << " != " << expected << "\n";
+    return 1;
+  }
+  return 0;
+}
diff --git a/SYCL/Basic/feature-tests/inline-asm/asm_mul.cpp b/SYCL/Basic/feature-tests/inline-asm/asm_mul.cpp
new file mode 100644
index 0000000000..726abcf787
--- /dev/null
+++ b/SYCL/Basic/feature-tests/inline-asm/asm_mul.cpp
@@ -0,0 +1,57 @@
+// UNSUPPORTED: cuda
+// REQUIRES: gpu,linux
+// RUN: %clangxx -fsycl %s -DINLINE_ASM -o %t.out
+// RUN: %t.out
+// RUN: %clangxx -fsycl %s -o %t.ref.out
+// RUN: %t.ref.out
+
+#include "include/asmhelper.h"
+#include <CL/sycl.hpp>
+#include <iostream>
+#include <vector>
+
+using dataType = cl::sycl::cl_int;
+
+template <typename T = dataType>
+struct KernelFunctor : WithInputBuffers<T, 2>, WithOutputBuffer<T> { // computes C[i] = A[i] * B[i]
+  KernelFunctor(const std::vector<T> &input1, const std::vector<T> &input2) : WithInputBuffers<T, 2>(input1, input2), WithOutputBuffer<T>(input1.size()) {}
+  void operator()(cl::sycl::handler &cgh) { // command-group function: requests accessors, enqueues the kernel
+    auto A = this->getInputBuffer(0).template get_access<cl::sycl::access::mode::read>(cgh);
+    auto B = this->getInputBuffer(1).template get_access<cl::sycl::access::mode::read>(cgh);
+    auto C = this->getOutputBuffer().template get_access<cl::sycl::access::mode::write>(cgh);
+    // One work-item per output element; the output buffer is sized from input1.
+    cgh.parallel_for<KernelFunctor<T>>(
+        cl::sycl::range<1>{this->getOutputBufferSize()}, [=](cl::sycl::id<1> wiID) [[cl::intel_reqd_sub_group_size(8)]] {
+#if defined(INLINE_ASM) && defined(__SYCL_DEVICE_ONLY__)
+          asm("mul (M1, 8) %0(0, 0)<1> %1(0, 0)<1;1,0> %2(0, 0)<1;1,0>" // %0 = C[wiID], %1 = A[wiID], %2 = B[wiID]
+              : "=rw"(C[wiID])
+              : "rw"(A[wiID]), "rw"(B[wiID]));
+#else
+          C[wiID] = A[wiID] * B[wiID]; // plain C++ equivalent used otherwise
+#endif
+        });
+  }
+};
+
+int main() {
+  std::vector<dataType> inputA(DEFAULT_PROBLEM_SIZE), inputB(DEFAULT_PROBLEM_SIZE);
+  for (int k = 0; k < DEFAULT_PROBLEM_SIZE; k++) {
+    inputA[k] = k;
+    inputB[k] = DEFAULT_PROBLEM_SIZE - k;
+  }
+
+  KernelFunctor<> functor(inputA, inputB);
+  if (!launchInlineASMTest(functor))
+    return 0; // the helper reported that the test was not launched
+  // Compare every product against the value computed on the host.
+  const auto &products = functor.getOutputBufferData();
+  for (int k = 0; k < DEFAULT_PROBLEM_SIZE; ++k) {
+    const dataType expected = inputA[k] * inputB[k];
+    if (products[k] == expected)
+      continue;
+    std::cerr << "At index: " << k << ". ";
+    std::cerr << products[k] << " != " << expected << "\n";
+    return 1;
+  }
+  return 0;
+}
diff --git a/SYCL/Basic/feature-tests/inline-asm/asm_multiple_instructions.cpp b/SYCL/Basic/feature-tests/inline-asm/asm_multiple_instructions.cpp
new file mode 100644
index 0000000000..e8cf02a529
--- /dev/null
+++ b/SYCL/Basic/feature-tests/inline-asm/asm_multiple_instructions.cpp
@@ -0,0 +1,59 @@
+// UNSUPPORTED: cuda
+// REQUIRES: gpu,linux
+// RUN: %clangxx -fsycl %s -DINLINE_ASM -o %t.out
+// RUN: %t.out
+// RUN: %clangxx -fsycl %s -o %t.ref.out
+// RUN: %t.ref.out
+
+#include "include/asmhelper.h"
+#include <CL/sycl.hpp>
+#include <iostream>
+#include <vector>
+
+using dataType = cl::sycl::cl_int;
+
+template <typename T = dataType>
+struct KernelFunctor : WithInputBuffers<T, 3>, WithOutputBuffer<T> { // computes A += B; A += C; D = A
+  KernelFunctor(const std::vector<T> &input1, const std::vector<T> &input2, const std::vector<T> &input3) : WithInputBuffers<T, 3>(input1, input2, input3), WithOutputBuffer<T>(input1.size()) {}
+  // Command-group function: requests accessors and enqueues the kernel on cgh.
+  void operator()(cl::sycl::handler &cgh) {
+    auto A = this->getInputBuffer(0).template get_access<cl::sycl::access::mode::read_write>(cgh); // read_write: A is updated in place
+    auto B = this->getInputBuffer(1).template get_access<cl::sycl::access::mode::read>(cgh);
+    auto C = this->getInputBuffer(2).template get_access<cl::sycl::access::mode::read>(cgh);
+    auto D = this->getOutputBuffer().template get_access<cl::sycl::access::mode::write>(cgh);
+    // One work-item per output element.
+    cgh.parallel_for<KernelFunctor<T>>(
+        cl::sycl::range<1>{this->getOutputBufferSize()}, [=](cl::sycl::id<1> wiID) [[cl::intel_reqd_sub_group_size(8)]] {
+#if defined(INLINE_ASM) && defined(__SYCL_DEVICE_ONLY__)
+          asm("{\n" // operands: %0 = D[wiID], %1 = A[wiID] (read and written), %2 = B[wiID], %3 = C[wiID]
+              "add (M1, 8) %1(0, 0)<1> %1(0, 0)<1;1,0> %2(0, 0)<1;1,0>\n"
+              "add (M1, 8) %1(0, 0)<1> %1(0, 0)<1;1,0> %3(0, 0)<1;1,0>\n"
+              "mov (M1, 8) %0(0, 0)<1> %1(0, 0)<1;1,0>\n"
+              "}\n"
+              : "=rw"(D[wiID]), "+rw"(A[wiID])
+              : "rw"(B[wiID]), "rw"(C[wiID]));
+#else
+          A[wiID] += B[wiID]; // plain C++ equivalent of the three asm instructions
+          A[wiID] += C[wiID];
+          D[wiID] = A[wiID];
+#endif
+        });
+  }
+};
+
+int main() {
+  std::vector<dataType> inputA(DEFAULT_PROBLEM_SIZE), inputB(DEFAULT_PROBLEM_SIZE), inputC(DEFAULT_PROBLEM_SIZE);
+  for (int k = 0; k < DEFAULT_PROBLEM_SIZE; k++) {
+    inputB[k] = k;
+    inputA[k] = inputB[k];
+    inputC[k] = DEFAULT_PROBLEM_SIZE - 2 * k; // A[k] + B[k] + C[k] = DEFAULT_PROBLEM_SIZE
+  }
+
+  KernelFunctor<> functor(inputA, inputB, inputC);
+  if (!launchInlineASMTest(functor))
+    return 0; // the helper reported that the test was not launched
+
+  // Every output element is expected to equal DEFAULT_PROBLEM_SIZE.
+  const bool allMatch = verify_all_the_same(functor.getOutputBufferData(), (dataType)DEFAULT_PROBLEM_SIZE);
+  return allMatch ? 0 : 1;
+}
diff --git a/SYCL/Basic/feature-tests/inline-asm/asm_no_operands.cpp b/SYCL/Basic/feature-tests/inline-asm/asm_no_operands.cpp
new file mode 100644
index 0000000000..3a3a919caa
--- /dev/null
+++ b/SYCL/Basic/feature-tests/inline-asm/asm_no_operands.cpp
@@ -0,0 +1,34 @@
+// UNSUPPORTED: cuda
+// REQUIRES: gpu,linux
+// RUN: %clangxx -fsycl %s -DINLINE_ASM -o %t.out
+// RUN: %t.out
+// RUN: %clangxx -fsycl %s -o %t.ref.out
+// RUN: %t.ref.out
+
+#include "include/asmhelper.h"
+#include <CL/sycl.hpp>
+class no_operands_kernel;
+
+int main() {
+  // Creating SYCL queue
+  cl::sycl::queue Queue;
+  cl::sycl::device Device = Queue.get_device();
+
+  if (!isInlineASMSupported(Device) || !Device.has_extension("cl_intel_required_subgroup_size")) {
+    std::cout << "Skipping test\n";
+    return 0;
+  }
+  // Size of index space for kernel
+  cl::sycl::range<1> NumOfWorkItems{16};
+
+  // Submitting command group(work) to queue and waiting for it to complete
+  Queue.submit([&](cl::sycl::handler &cgh) {
+    // Executing kernel: no operands, the asm statement is the whole body
+    cgh.parallel_for<no_operands_kernel>(
+        NumOfWorkItems, [=](cl::sycl::id<1> WIid) [[cl::intel_reqd_sub_group_size(8)]] {
+#if defined(INLINE_ASM) && defined(__SYCL_DEVICE_ONLY__)
+          asm("barrier");
+#endif
+        });
+  }).wait(); // without this, main could return before the kernel has run
+}
diff --git a/SYCL/Basic/feature-tests/inline-asm/asm_no_output.cpp b/SYCL/Basic/feature-tests/inline-asm/asm_no_output.cpp
new file mode 100644
index 0000000000..ff6c65d48b
--- /dev/null
+++ b/SYCL/Basic/feature-tests/inline-asm/asm_no_output.cpp
@@ -0,0 +1,47 @@
+// UNSUPPORTED: cuda
+// REQUIRES: gpu,linux
+// RUN: %clangxx -fsycl %s -DINLINE_ASM -o %t.out
+// RUN: %t.out
+// RUN: %clangxx -fsycl %s -o %t.ref.out
+// RUN: %t.ref.out
+
+#include "include/asmhelper.h"
+#include <CL/sycl.hpp>
+#include <iostream>
+#include <vector>
+
+using dataType = cl::sycl::cl_int;
+
+template <typename T = dataType>
+struct KernelFunctor : WithOutputBuffer<T> { // kernel with an input-only asm statement; never stores to the output buffer
+  KernelFunctor(size_t problem_size) : WithOutputBuffer<T>(problem_size) {}
+  // Command-group function: requests the accessor and enqueues the kernel on cgh.
+  void operator()(cl::sycl::handler &cgh) {
+    auto C = this->getOutputBuffer().template get_access<cl::sycl::access::mode::write>(cgh);
+    cgh.parallel_for<KernelFunctor<T>>(
+        cl::sycl::range<1>{this->getOutputBufferSize()}, [=](cl::sycl::id<1> wiID) [[cl::intel_reqd_sub_group_size(8)]] {
+          volatile int local_var = 47;
+          local_var += C[0]; // NOTE(review): reads through a write-mode accessor - confirm this is intended
+#if defined(INLINE_ASM) && defined(__SYCL_DEVICE_ONLY__)
+          asm volatile("{\n" // no outputs; the only operand %0 is local_var
+                       ".decl temp v_type=G type=w num_elts=8 align=GRF\n"
+                       "mov (M1,16) temp(0, 0)<1> %0(0,0)<1;1,0>\n" // NOTE(review): width 16 here vs num_elts=8 and sub-group size 8 - confirm
+                       "}\n" ::"rw"(local_var));
+#else
+          volatile int temp = 0; // plain C++ counterpart: copy local_var into a scratch variable
+          temp = local_var;
+#endif
+        });
+  }
+};
+
+int main() {
+  KernelFunctor<> functor(DEFAULT_PROBLEM_SIZE);
+  if (!launchInlineASMTest(functor))
+    return 0; // the helper reported that the test was not launched
+
+  // The kernel only reads C[0] and never stores to the output buffer, so the
+  // zero-filled host data is expected to come back unchanged.
+  const bool allZero = verify_all_the_same(functor.getOutputBufferData(), 0);
+  return allZero ? 0 : 1;
+}
diff --git a/SYCL/Basic/feature-tests/inline-asm/asm_plus_mod.cpp b/SYCL/Basic/feature-tests/inline-asm/asm_plus_mod.cpp
new file mode 100644
index 0000000000..f65cda777e
--- /dev/null
+++ b/SYCL/Basic/feature-tests/inline-asm/asm_plus_mod.cpp
@@ -0,0 +1,58 @@
+// UNSUPPORTED: cuda
+// REQUIRES: gpu,linux
+// RUN: %clangxx -fsycl %s -DINLINE_ASM -o %t.out
+// RUN: %t.out
+// RUN: %clangxx -fsycl %s -o %t.ref.out
+// RUN: %t.ref.out
+
+#include "include/asmhelper.h"
+#include <CL/sycl.hpp>
+#include <iostream>
+#include <vector>
+
+using dataType = cl::sycl::cl_int;
+
+template <typename T = dataType>
+struct KernelFunctor : WithInputBuffers<T, 1>, WithOutputBuffer<T> { // computes B[i] += A[i]
+  KernelFunctor(const std::vector<T> &input1, const std::vector<T> &input2) : WithInputBuffers<T, 1>(input1), WithOutputBuffer<T>(input2) {} // input2 seeds the output buffer
+  // Command-group function: requests accessors and enqueues the kernel on cgh.
+  void operator()(cl::sycl::handler &cgh) {
+    auto A = this->getInputBuffer(0).template get_access<cl::sycl::access::mode::read>(cgh);
+    auto B = this->getOutputBuffer().template get_access<cl::sycl::access::mode::write>(cgh); // NOTE(review): B is also read below despite write mode - confirm
+    // One work-item per output element.
+    cgh.parallel_for<KernelFunctor<T>>(
+        cl::sycl::range<1>{this->getOutputBufferSize()}, [=](cl::sycl::id<1> wiID) [[cl::intel_reqd_sub_group_size(16)]] {
+#if defined(INLINE_ASM) && defined(__SYCL_DEVICE_ONLY__)
+          asm("add (M1, 16) %0(0, 0)<1> %0(0, 0)<1;1,0> %1(0, 0)<1;1,0>" // %0 = B[wiID] (read and written), %1 = A[wiID]
+              : "+rw"(B[wiID])
+              : "rw"(A[wiID]));
+#else
+          B[wiID] += A[wiID]; // plain C++ equivalent used otherwise
+#endif
+        });
+  }
+};
+
+int main() {
+  std::vector<dataType> inputA(DEFAULT_PROBLEM_SIZE), inputB(DEFAULT_PROBLEM_SIZE), expected(DEFAULT_PROBLEM_SIZE);
+  for (int k = 0; k < DEFAULT_PROBLEM_SIZE; k++) {
+    inputA[k] = k;
+    inputB[k] = DEFAULT_PROBLEM_SIZE - k;
+    expected[k] = inputA[k] + inputB[k];
+  }
+
+  // inputB seeds the output buffer; the kernel adds inputA to it in place.
+  KernelFunctor<> functor(inputA, inputB);
+  if (!launchInlineASMTest(functor))
+    return 0; // the helper reported that the test was not launched
+
+  const auto &actual = functor.getOutputBufferData();
+  for (int k = 0; k < DEFAULT_PROBLEM_SIZE; ++k) {
+    if (actual[k] == expected[k])
+      continue;
+    std::cerr << "At index: " << k << ". ";
+    std::cerr << actual[k] << " != " << expected[k] << "\n";
+    return 1;
+  }
+  return 0;
+}
diff --git a/SYCL/Basic/feature-tests/inline-asm/include/asmhelper.h b/SYCL/Basic/feature-tests/inline-asm/include/asmhelper.h
new file mode 100644
index 0000000000..75585e1611
--- /dev/null
+++ b/SYCL/Basic/feature-tests/inline-asm/include/asmhelper.h
@@ -0,0 +1,128 @@
+#include <CL/sycl.hpp>
+
+#include <iostream>
+#include <memory>
+#include <vector>
+
+constexpr const size_t DEFAULT_PROBLEM_SIZE = 16;
+
+template <typename T>
+struct WithOutputBuffer { // owns host storage plus a SYCL buffer created over that storage
+  WithOutputBuffer(size_t size) {
+    _output_buffer_data.resize(size);
+    _output_buffer.reset(new cl::sycl::buffer<T>(_output_buffer_data.data(), _output_buffer_data.size()));
+  }
+  // Variant that starts from a copy of data instead of resize()-created elements.
+  WithOutputBuffer(const std::vector<T> &data) {
+    _output_buffer_data = data;
+    _output_buffer.reset(new cl::sycl::buffer<T>(_output_buffer_data.data(), _output_buffer_data.size()));
+  }
+
+  const std::vector<T> &getOutputBufferData() {
+    // We cannot access the data while the buffer is still alive
+    _output_buffer.reset();
+    return _output_buffer_data;
+  }
+
+  size_t getOutputBufferSize() const {
+    return _output_buffer_data.size();
+  }
+
+protected:
+  cl::sycl::buffer<T> &getOutputBuffer() {
+    return *_output_buffer; // null once getOutputBufferData() has been called on this object
+  }
+
+  // Functor is being passed by-copy into cl::sycl::queue::submit and destroyed
+  // one more time in there. We need to make sure that buffer is only released
+  // once.
+  std::shared_ptr<cl::sycl::buffer<T>> _output_buffer = nullptr;
+  std::vector<T> _output_buffer_data;
+};
+
+template <typename T, size_t N>
+struct WithInputBuffers {
+  // Expects exactly N arguments (checked by the static_assert), one per input buffer.
+  template <typename... Args>
+  WithInputBuffers(Args... inputs) {
+    static_assert(sizeof...(Args) == N, "All input buffers must be initialized");
+    constructorHelper<0>(inputs...);
+  }
+  // Returns the i-th input buffer; i is not range-checked here.
+  cl::sycl::buffer<T> &getInputBuffer(size_t i = 0) {
+    return *_input_buffers[i];
+  }
+
+protected:
+  std::shared_ptr<cl::sycl::buffer<T>> _input_buffers[N] = {nullptr};
+  std::vector<T> _input_buffers_data[N]; // host copies that the buffers are created over
+
+private:
+  template <int Index, typename... Args>
+  void constructorHelper(const std::vector<T> &data, Args... rest) { // handles input number Index, then recurses on the rest
+    _input_buffers_data[Index] = data;
+    _input_buffers[Index].reset(new cl::sycl::buffer<T>(_input_buffers_data[Index].data(), _input_buffers_data[Index].size()));
+    constructorHelper<Index + 1>(rest...);
+  }
+
+  template <int Index>
+  void constructorHelper() {
+    // nothing to do, recursion stop
+  }
+};
+
+// True for an Intel device whose driver version "NN.NN..." is at least 20.12.
+// TODO: query for some extension/capability/whatever once interface is defined
+bool isInlineASMSupported(sycl::device Device) {
+  sycl::string_class DriverVersion = Device.get_info<sycl::info::device::driver_version>();
+  sycl::string_class DeviceVendorName = Device.get_info<sycl::info::device::vendor>();
+  if (DeviceVendorName.find("Intel") == sycl::string_class::npos)
+    return false;
+  if (DriverVersion.length() < 5 || DriverVersion[2] != '.')
+    return false;
+  auto IsDigit = [&](size_t Pos) { return DriverVersion[Pos] >= '0' && DriverVersion[Pos] <= '9'; };
+  if (!IsDigit(0) || !IsDigit(1) || !IsDigit(3) || !IsDigit(4))
+    return false; // unexpected format; std::stoi below would throw on non-digits
+  const int Major = std::stoi(DriverVersion.substr(0, 2), nullptr, 10);
+  const int Minor = std::stoi(DriverVersion.substr(3, 2), nullptr, 10);
+  return Major > 20 || (Major == 20 && Minor >= 12); // compare as a (major, minor) pair so e.g. 21.05 is accepted
+}
+
+/// Checks whether the GPU device supports what the test needs and, if so,
+/// submits \p f to a GPU queue and waits for it.
+/// \returns false if test wasn't launched (i.e. was skipped) and true otherwise
+template <typename F>
+bool launchInlineASMTest(F &f, bool requires_particular_sg_size = true) {
+  try {
+    cl::sycl::queue gpuQueue(cl::sycl::gpu_selector{});
+    cl::sycl::device gpuDevice = gpuQueue.get_device();
+
+    bool skip = false;
+#if defined(INLINE_ASM)
+    skip = !isInlineASMSupported(gpuDevice);
+#endif
+    if (!skip && requires_particular_sg_size)
+      skip = !gpuDevice.has_extension("cl_intel_required_subgroup_size");
+
+    if (skip) {
+      std::cout << "Skipping test\n";
+      return false;
+    }
+
+    gpuQueue.submit(f).wait();
+  } catch (cl::sycl::exception &e) { // only reported; the function still returns true
+    std::cerr << "Caught exception: " << e.what() << std::endl;
+  }
+  return true;
+}
+
+// True if every element equals reference_value; else logs the first mismatch to std::cerr and returns false.
+template <typename T>
+bool verify_all_the_same(const std::vector<T> &input, T reference_value) {
+  for (size_t i = 0; i < input.size(); ++i) { // size_t index: same type as size(), no signed/unsigned comparison
+    if (input[i] != reference_value) {
+      std::cerr << "At index: " << i << " " << input[i] << " != " << reference_value << "\n";
+      return false;
+    }
+  }
+  return true;
+}
diff --git a/SYCL/Basic/feature-tests/inline-asm/letter_example.cpp b/SYCL/Basic/feature-tests/inline-asm/letter_example.cpp
new file mode 100644
index 0000000000..22bf26648e
--- /dev/null
+++ b/SYCL/Basic/feature-tests/inline-asm/letter_example.cpp
@@ -0,0 +1,66 @@
+// UNSUPPORTED: cuda
+// REQUIRES: gpu,linux
+// RUN: %clangxx -fsycl %s -DINLINE_ASM -o %t.out
+// RUN: %t.out
+// RUN: %clangxx -fsycl %s -o %t.ref.out
+// RUN: %t.ref.out
+
+#include "include/asmhelper.h"
+#include <CL/sycl.hpp>
+#include <iostream>
+
+constexpr size_t problem_size = 16;
+
+class kernel_name;
+
+int main() {
+  cl::sycl::queue Queue;
+  cl::sycl::device Device = Queue.get_device();
+
+  // Skip when the device lacks inline asm support or the required sub-group
+  // size extension.
+  if (!isInlineASMSupported(Device) || !Device.has_extension("cl_intel_required_subgroup_size")) {
+    std::cout << "Skipping test\n";
+    return 0;
+  }
+
+  // Shared allocation initialised so that a[k] == k.
+  auto ctx = Queue.get_context();
+  int *a = (int *)malloc_shared(sizeof(int) * problem_size, Queue.get_device(), ctx);
+  for (int k = 0; k < problem_size; k++)
+    a[k] = k;
+
+  // Each work-item increments its own element of a.
+  Queue.submit([&](cl::sycl::handler &cgh) {
+     cgh.parallel_for<kernel_name>(
+         cl::sycl::range<1>(problem_size), [=](cl::sycl::id<1> idx)
+                                               [[cl::intel_reqd_sub_group_size(16)]] {
+#if defined(INLINE_ASM) && defined(__SYCL_DEVICE_ONLY__)
+                                                 int i = idx[0];
+                                                 asm volatile("{\n.decl V52 v_type=G type=d num_elts=16 align=GRF\n"
+                                                              "svm_gather.4.1 (M1, 16) %0.0 V52.0\n"
+                                                              "add(M1, 16) V52(0, 0)<1> V52(0, 0)<1; 1, 0> 0x1:w\n"
+                                                              "svm_scatter.4.1 (M1, 16) %0.0 V52.0\n}"
+                                                              :
+                                                              : "rw"(&a[i]));
+#else
+                                                 a[idx[0]]++;
+#endif
+                                               });
+   }).wait();
+
+  // Check a[k] == k + 1, reporting only the first mismatch.
+  bool correct = true;
+  for (int k = 0; correct && k < problem_size; k++) {
+    if (a[k] == (k + 1))
+      continue;
+    correct = false;
+    std::cerr << "error in a[" << k << "]="
+              << a[k] << "!=" << (k + 1) << std::endl;
+  }
+
+  // Single exit path: same message, cleanup and exit code for each outcome.
+  std::cerr << (correct ? "Pass" : "Error") << std::endl;
+  cl::sycl::free(a, ctx);
+  return correct ? 0 : 1;
+}
diff --git a/SYCL/Basic/feature-tests/inline-asm/malloc_shared_32.cpp b/SYCL/Basic/feature-tests/inline-asm/malloc_shared_32.cpp
new file mode 100644
index 0000000000..8f058851c2
--- /dev/null
+++ b/SYCL/Basic/feature-tests/inline-asm/malloc_shared_32.cpp
@@ -0,0 +1,92 @@
+// UNSUPPORTED: cuda
+// REQUIRES: gpu,linux
+// RUN: %clangxx -fsycl %s -DINLINE_ASM -o %t.out
+// RUN: %t.out
+// RUN: %clangxx -fsycl %s -o %t.ref.out
+// RUN: %t.ref.out
+
+#include "include/asmhelper.h"
+#include <CL/sycl.hpp>
+#include <iostream>
+
+constexpr size_t problem_size = 32;
+
+class kernel_name;
+
+int main() {
+  cl::sycl::queue q;
+
+  cl::sycl::device Device = q.get_device();
+
+  if (!isInlineASMSupported(Device) || !Device.has_extension("cl_intel_required_subgroup_size")) {
+    std::cout << "Skipping test\n";
+    return 0;
+  }
+  // Shared allocations: a and c are the factors, b receives the products.
+  auto ctx = q.get_context();
+  int *a = (int *)malloc_shared(sizeof(int) * problem_size, q.get_device(), ctx);
+  int *b = (int *)malloc_shared(sizeof(int) * problem_size, q.get_device(), ctx);
+  int *c = (int *)malloc_shared(sizeof(int) * problem_size, q.get_device(), ctx);
+  for (int i = 0; i < problem_size; i++) {
+    b[i] = -10;
+    a[i] = i;
+    c[i] = i;
+  }
+  // Kernel: b[i] = a[i] * c[i]; the asm path handles the data as two groups of 16.
+  q.submit([&](cl::sycl::handler &cgh) {
+     cgh.parallel_for<kernel_name>(
+         cl::sycl::range<1>(problem_size),
+         [=](cl::sycl::id<1> idx)
+             [[cl::intel_reqd_sub_group_size(32)]] {
+               int i = idx[0];
+#if defined(INLINE_ASM) && defined(__SYCL_DEVICE_ONLY__)
+               asm volatile(R"a(
+    {
+        .decl V52 v_type=G type=d num_elts=16 align=GRF
+        .decl V53 v_type=G type=d num_elts=16 align=GRF
+        .decl V54 v_type=G type=d num_elts=16 align=GRF
+        .decl V55 v_type=G type=d num_elts=16 align=GRF
+        .decl V56 v_type=G type=d num_elts=16 align=GRF
+        .decl V57 v_type=G type=d num_elts=16 align=GRF
+        svm_gather.4.1 (M1, 16) %2.0 V54.0
+        svm_gather.4.1 (M1, 16) %3.0 V55.0
+        svm_gather.4.1 (M1, 16) %4.0 V56.0
+        svm_gather.4.1 (M1, 16) %5.0 V57.0
+        mul (M1, 16) V52(0,0)<1> V54(0,0)<1;1,0> V56(0,0)<1;1,0>
+        mul (M1, 16) V53(0,0)<1> V55(0,0)<1;1,0> V57(0,0)<1;1,0>
+        svm_scatter.4.1 (M1, 16) %0.0 V52.0
+        svm_scatter.4.1 (M1, 16) %1.0 V53.0
+    }
+    )a" ::"rw"(&b[i]),
+                            "rw"(&b[i] + 16), "rw"(&a[i]), "rw"(&a[i] + 16), "rw"(&c[i]),
+                            "rw"(&c[i] + 16));
+#else
+               b[i] = a[i] * c[i];
+#endif
+             });
+   }).wait();
+  // Verify against a[i] * c[i], which is what the kernel computes (not a[i] * b[i]).
+  bool correct = true;
+  for (int i = 0; i < problem_size; i++) {
+    if (b[i] != a[i] * c[i]) {
+      correct = false;
+      std::cerr << "error in a[" << i << "]="
+                << b[i] << "!=" << a[i] * c[i] << std::endl;
+      break;
+    }
+  }
+
+  if (!correct) {
+    std::cerr << "Error" << std::endl;
+    cl::sycl::free(a, ctx);
+    cl::sycl::free(b, ctx);
+    cl::sycl::free(c, ctx);
+    return 1;
+  }
+
+  std::cerr << "Pass" << std::endl;
+  cl::sycl::free(a, ctx);
+  cl::sycl::free(b, ctx);
+  cl::sycl::free(c, ctx);
+  return 0;
+}
diff --git a/SYCL/Basic/feature-tests/inline-asm/malloc_shared_in_out_dif.cpp b/SYCL/Basic/feature-tests/inline-asm/malloc_shared_in_out_dif.cpp
new file mode 100644
index 0000000000..a6994bd379
--- /dev/null
+++ b/SYCL/Basic/feature-tests/inline-asm/malloc_shared_in_out_dif.cpp
@@ -0,0 +1,69 @@
+// UNSUPPORTED: cuda
+// REQUIRES: gpu,linux
+// RUN: %clangxx -fsycl %s -DINLINE_ASM -o %t.out
+// RUN: %t.out
+
+#include "include/asmhelper.h"
+#include <CL/sycl.hpp>
+#include <iostream>
+
+constexpr size_t problem_size = 100;
+
+class kernel_name;
+
+int main() {
+  cl::sycl::queue Queue;
+  cl::sycl::device Device = Queue.get_device();
+
+  // Skip when the device lacks inline asm support or the required sub-group
+  // size extension.
+  if (!isInlineASMSupported(Device) || !Device.has_extension("cl_intel_required_subgroup_size")) {
+    std::cout << "Skipping test\n";
+    return 0;
+  }
+
+  // Two shared allocations: a[k] == k, and b[k] == -1 as the starting value
+  // that the kernel updates in place.
+  auto ctx = Queue.get_context();
+  int *a = (int *)malloc_shared(sizeof(int) * problem_size, Queue.get_device(), ctx);
+  int *b = (int *)malloc_shared(sizeof(int) * problem_size, Queue.get_device(), ctx);
+  for (int k = 0; k < problem_size; k++) {
+    a[k] = k;
+    b[k] = -1;
+  }
+
+  // Kernel: b[i] += a[i] + 1, so the initial -1 in b[i] is expected to end up
+  // equal to a[i].
+  Queue.submit([&](cl::sycl::handler &cgh) {
+     cgh.parallel_for<kernel_name>(
+         cl::sycl::range<1>(problem_size), [=](cl::sycl::id<1> idx) [[cl::intel_reqd_sub_group_size(16)]] {
+           int i = idx[0];
+           volatile int tmp = a[i];
+           tmp += 1;
+#if defined(INLINE_ASM) && defined(__SYCL_DEVICE_ONLY__)
+           asm volatile(" add (M1, 16) %0(0,0)<1> %0(0,0)<1;1,0> %1(0,0)<1;1,0>"
+                        : "+rw"(b[i])
+                        : "rw"(tmp));
+#else
+           b[i] += tmp;
+#endif
+         });
+   }).wait();
+
+  // Check b[k] == a[k], reporting only the first mismatch.
+  bool correct = true;
+  for (int k = 0; correct && k < problem_size; k++) {
+    if (b[k] == a[k])
+      continue;
+    correct = false;
+    std::cerr << "error in a[" << k << "]="
+              << b[k] << "!=" << a[k] << std::endl;
+  }
+
+  // Single exit path: same message, cleanup and exit code for each outcome.
+  std::cerr << (correct ? "Pass" : "Error") << std::endl;
+  cl::sycl::free(a, ctx);
+  cl::sycl::free(b, ctx);
+  return correct ? 0 : 1;
+}
diff --git a/SYCL/Basic/feature-tests/inline-asm/malloc_shared_no_input.cpp b/SYCL/Basic/feature-tests/inline-asm/malloc_shared_no_input.cpp
new file mode 100644
index 0000000000..22cd47abd6
--- /dev/null
+++ b/SYCL/Basic/feature-tests/inline-asm/malloc_shared_no_input.cpp
@@ -0,0 +1,61 @@
+// UNSUPPORTED: cuda
+// REQUIRES: gpu,linux
+// RUN: %clangxx -fsycl %s -DINLINE_ASM -o %t.out
+// RUN: %t.out
+// RUN: %clangxx -fsycl %s -o %t.ref.out
+// RUN: %t.ref.out
+
+#include "include/asmhelper.h"
+#include <CL/sycl.hpp>
+#include <iostream>
+
+constexpr size_t problem_size = 16;
+
+class kernel_name;
+
+int main() {
+  cl::sycl::queue Queue;
+  cl::sycl::device Device = Queue.get_device();
+
+  // Skip when the device lacks inline asm support or the required sub-group
+  // size extension.
+  if (!isInlineASMSupported(Device) || !Device.has_extension("cl_intel_required_subgroup_size")) {
+    std::cout << "Skipping test\n";
+    return 0;
+  }
+
+  // Shared allocation initialised so that a[k] == k.
+  auto ctx = Queue.get_context();
+  int *a = (int *)malloc_shared(sizeof(int) * problem_size, Queue.get_device(), ctx);
+  for (int k = 0; k < problem_size; k++)
+    a[k] = k;
+
+  // Kernel: overwrite every element with the constant 7 (no asm inputs).
+  Queue.submit([&](cl::sycl::handler &cgh) {
+     cgh.parallel_for<kernel_name>(
+         cl::sycl::range<1>(problem_size), [=](cl::sycl::id<1> idx) [[cl::intel_reqd_sub_group_size(16)]] {
+           int i = idx[0];
+#if defined(INLINE_ASM) && defined(__SYCL_DEVICE_ONLY__)
+           asm volatile("mov (M1, 16) %0(0,0)<1> 0x7:d"
+                        : "=rw"(a[i]));
+#else
+           a[i] = 7;
+#endif
+         });
+   }).wait();
+
+  // Check a[k] == 7, reporting only the first mismatch.
+  bool correct = true;
+  for (int k = 0; correct && k < problem_size; k++) {
+    if (a[k] == 7)
+      continue;
+    correct = false;
+    std::cerr << "error in a[" << k << "]="
+              << a[k] << "!=" << 7 << std::endl;
+  }
+
+  // Single exit path: same message, cleanup and exit code for each outcome.
+  std::cerr << (correct ? "Pass" : "Error") << std::endl;
+  cl::sycl::free(a, ctx);
+  return correct ? 0 : 1;
+}
diff --git a/SYCL/Basic/fpga_tests/Inputs/fpga_device.cpp b/SYCL/Basic/fpga_tests/Inputs/fpga_device.cpp
new file mode 100644
index 0000000000..7a5658c8fe
--- /dev/null
+++ b/SYCL/Basic/fpga_tests/Inputs/fpga_device.cpp
@@ -0,0 +1,24 @@
+//==--------------- fpga_device.cpp - AOT compilation for fpga -------------==//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "CL/sycl.hpp"
+
+using namespace cl::sycl;
+
+const double big[] = {3, 2, 1, 5, 6, 7};
+// Submits a single-task kernel to q that stores big[x] into result.
+void foo(double &result, queue q, int x) {
+  constexpr size_t big_len = sizeof(big) / sizeof(big[0]);
+  buffer<double> buf(&result, 1);
+  buffer<double, 1> big_buf(big, big_len);
+  q.submit([&](handler &cgh) {
+    auto out = buf.get_access<access::mode::discard_write>(cgh);
+    auto table = big_buf.get_access<access::mode::read>(cgh);
+    cgh.single_task<class test>([=]() { out[0] = table[x]; });
+  });
+}
diff --git a/SYCL/Basic/fpga_tests/Inputs/fpga_host.cpp b/SYCL/Basic/fpga_tests/Inputs/fpga_host.cpp
new file mode 100644
index 0000000000..ab24b26c22
--- /dev/null
+++ b/SYCL/Basic/fpga_tests/Inputs/fpga_host.cpp
@@ -0,0 +1,23 @@
+//==--------------- fpga_host.cpp - AOT compilation for fpga ---------------==//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "CL/sycl.hpp"
+#include <cassert>
+
+using namespace cl::sycl;
+
+void foo(double &, queue q, int x);
+
+int main(void) {
+  // foo() is asked for entry 3 of its table; the assert expects the value 5.
+  double kernelOutput;
+  queue acceleratorQueue(accelerator_selector{});
+  foo(kernelOutput, acceleratorQueue, 3);
+  assert(kernelOutput == 5);
+  return 0;
+}
diff --git a/SYCL/Basic/fpga_tests/fpga_aocx.cpp b/SYCL/Basic/fpga_tests/fpga_aocx.cpp
new file mode 100644
index 0000000000..a5c8a3d5ce
--- /dev/null
+++ b/SYCL/Basic/fpga_tests/fpga_aocx.cpp
@@ -0,0 +1,24 @@
+//==----- fpga_aocx.cpp - AOT compilation for fpga using aoc with aocx -----==//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+// REQUIRES: aoc, accelerator
+
+/// E2E test for AOCX creation/use/run for FPGA
+// Produce an archive with device (AOCX) image
+// RUN: %clangxx -fsycl -fintelfpga -fsycl-link=image %S/Inputs/fpga_device.cpp -o %t_image.a
+// Produce a host object
+// RUN: %clangxx -fsycl -fintelfpga %S/Inputs/fpga_host.cpp -c -o %t.o
+
+// AOCX with source
+// RUN: %clangxx -fsycl -fintelfpga %S/Inputs/fpga_host.cpp %t_image.a -o %t_aocx_src.out
+// AOCX with object
+// RUN: %clangxx -fsycl -fintelfpga %t.o %t_image.a -o %t_aocx_obj.out
+//
+// RUN: env SYCL_DEVICE_TYPE=ACC %t_aocx_src.out
+// RUN: env SYCL_DEVICE_TYPE=ACC %t_aocx_obj.out
+// XFAIL:*
diff --git a/SYCL/Basic/fpga_tests/fpga_aocx_win.cpp b/SYCL/Basic/fpga_tests/fpga_aocx_win.cpp
new file mode 100644
index 0000000000..675cebebcd
--- /dev/null
+++ b/SYCL/Basic/fpga_tests/fpga_aocx_win.cpp
@@ -0,0 +1,24 @@
+//==--- fpga_aocx_win.cpp - AOT compilation for fpga using aoc with aocx ---==//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+// REQUIRES: aoc, accelerator
+// REQUIRES: system-windows
+
+/// E2E test for AOCX creation/use/run for FPGA
+// Produce an archive with device (AOCX) image
+// RUN: %clang_cl -fsycl -fintelfpga -fsycl-link=image %S/Inputs/fpga_device.cpp -o %t_image.lib
+// Produce a host object
+// RUN: %clang_cl -fsycl -fintelfpga -DHOST_PART %S/Inputs/fpga_host.cpp -c -o %t.obj
+
+// AOCX with source
+// RUN: %clang_cl -fsycl -fintelfpga -DHOST_PART %S/Inputs/fpga_host.cpp %t_image.lib -o %t_aocx_src.out
+// AOCX with object
+// RUN: %clang_cl -fsycl -fintelfpga %t.obj %t_image.lib -o %t_aocx_obj.out
+//
+// RUN: env SYCL_DEVICE_TYPE=ACC %t_aocx_src.out
+// RUN: env SYCL_DEVICE_TYPE=ACC %t_aocx_obj.out
diff --git a/SYCL/Basic/fpga_tests/fpga_io_pipes.cpp b/SYCL/Basic/fpga_tests/fpga_io_pipes.cpp
new file mode 100644
index 0000000000..9826e3cd93
--- /dev/null
+++ b/SYCL/Basic/fpga_tests/fpga_io_pipes.cpp
@@ -0,0 +1,134 @@
+// RUN: %clangxx -fsycl %s -o %t.out
+// RUN: %ACC_RUN_PLACEHOLDER %t.out
+//==------------ fpga_io_pipes.cpp - SYCL FPGA pipes test ------------------==//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+// REQUIRES: accelerator
+// XFAIL: accelerator
+#include <CL/sycl.hpp>
+#include <CL/sycl/intel/fpga_extensions.hpp>
+#include <fstream>
+#include <iostream>
+
+#include "io_pipe_def.h"
+
+// TODO: run is disabled, since no support added in FPGA backend yet. Check
+// implementation correctness from CXX and SYCL languages perspective.
+
+// This test is supposed to be run only on Intel FPGA emulator. Change it when
+// we have more experience with IO pipe feature in SYCL.
+// The emulator creates files (one for I pipe, another for O pipe) with the
+// appropriate naming, where the data flowing through a pipe can be stored.
+// So in the test we need to create these files and use them appropriately.
+// The name is taken as IO pipe ID.
+const size_t InputData = 42; // the single value sent through both IO pipes
+const std::string InputFileName = "0.txt";  // file the test pre-fills as input
+const std::string OutputFileName = "1.txt"; // file checked after the kernels
+
+void createInputFile(const std::string &filename) {
+  // Best effort: nothing is written when the file cannot be opened.
+  std::ofstream InputStream(filename);
+  if (!InputStream.is_open())
+    return;
+  InputStream << InputData; // the stream is closed by its destructor
+}
+
+int validateOutputFile(const std::string &filename) {
+  // Returns 0 if the file holds exactly one line equal to InputData, else -1.
+  std::ifstream Output(filename);
+  std::string Line;
+  std::vector<size_t> Result;
+  // In the test we write only one number into the pipe, but a backend might
+  // have a bug of incorrect interpretation of capacity of the pipe. In this
+  // case let's read all the lines of the output file to catch this.
+  while (Output.is_open() && std::getline(Output, Line))
+    Result.push_back(std::stoi(Line));
+  if (Result.size() == 1 && Result[0] == InputData)
+    return 0;
+  // Result is empty when the file is missing or has no lines, so Result[0]
+  // must not be read unconditionally while reporting the mismatch.
+  const std::string Got =
+      Result.empty() ? std::string("<nothing>") : std::to_string(Result[0]);
+  std::cout << "Result mismatches " << Got << " Vs expected " << InputData
+            << std::endl;
+  return -1;
+}
+
+// Test for simple non-blocking IO pipes. Returns 0 on success, -1 on mismatch.
+int test_io_nb_pipe(cl::sycl::queue Queue) {
+  createInputFile(InputFileName);
+
+  cl::sycl::buffer<int, 1> writeBuf(1);
+  Queue.submit([&](cl::sycl::handler &cgh) {
+    auto write_acc = writeBuf.get_access<cl::sycl::access::mode::write>(cgh);
+    // Retry each non-blocking operation until its success flag is set.
+    cgh.single_task<class nb_io_transfer>([=]() {
+      bool SuccessCodeI = false;
+      do {
+        write_acc[0] = intelfpga::ethernet_read_pipe::read(SuccessCodeI);
+      } while (!SuccessCodeI);
+      bool SuccessCodeO = false;
+      do {
+        intelfpga::ethernet_write_pipe::write(write_acc[0], SuccessCodeO);
+      } while (!SuccessCodeO);
+    });
+  });
+  // Check on the host the value that the kernel read from the input pipe.
+  auto readHostBuffer = writeBuf.get_access<cl::sycl::access::mode::read>();
+  if (readHostBuffer[0] != InputData) {
+    std::cout << "Read from a file mismatches " << readHostBuffer[0]
+              << " Vs expected " << InputData << std::endl;
+
+    return -1;
+  }
+
+  return validateOutputFile(OutputFileName);
+}
+
+// Test for simple blocking IO pipes. Returns 0 on success, -1 on mismatch.
+int test_io_bl_pipe(cl::sycl::queue Queue) {
+  createInputFile(InputFileName);
+
+  cl::sycl::buffer<int, 1> writeBuf(1);
+  Queue.submit([&](cl::sycl::handler &cgh) {
+    auto write_acc = writeBuf.get_access<cl::sycl::access::mode::write>(cgh);
+    // Read one value from the input pipe and forward it to the output pipe.
+    cgh.single_task<class bl_io_transfer>([=]() {
+      write_acc[0] = intelfpga::ethernet_read_pipe::read();
+      intelfpga::ethernet_write_pipe::write(write_acc[0]);
+    });
+  });
+  // Check on the host the value that the kernel read from the input pipe.
+  auto readHostBuffer = writeBuf.get_access<cl::sycl::access::mode::read>();
+  if (readHostBuffer[0] != InputData) {
+    std::cout << "Read from a file mismatches " << readHostBuffer[0]
+              << " Vs expected " << InputData << std::endl;
+
+    return -1;
+  }
+
+  return validateOutputFile(OutputFileName);
+}
+
+int main() {
+  cl::sycl::queue Queue{cl::sycl::intel::fpga_emulator_selector{}};
+
+  if (!Queue.get_device()
+           .get_info<cl::sycl::info::device::kernel_kernel_pipe_support>()) {
+    std::cout << "SYCL_INTEL_data_flow_pipes not supported, skipping"
+              << std::endl;
+    return 0;
+  }
+
+  // Non-blocking pipes
+  int Result = test_io_nb_pipe(Queue);
+
+  // Blocking pipes. Each test returns 0 or -1, so accumulate with |=: with &=
+  // a single passing test (0) would clear a failure reported by the other.
+  Result |= test_io_bl_pipe(Queue);
+  return Result;
+}
diff --git a/SYCL/Basic/fpga_tests/fpga_pipes.cpp b/SYCL/Basic/fpga_tests/fpga_pipes.cpp
new file mode 100644
index 0000000000..8872f296c9
--- /dev/null
+++ b/SYCL/Basic/fpga_tests/fpga_pipes.cpp
@@ -0,0 +1,326 @@
+// RUN: %clangxx -fsycl %s -o %t.out -fsycl-targets=%sycl_triple
+// RUN: %CPU_RUN_PLACEHOLDER %t.out
+// RUN: %GPU_RUN_PLACEHOLDER %t.out
+// RUN: %ACC_RUN_PLACEHOLDER %t.out
+// REQUIRES: cpu, gpu, accelerator
+//==------------- fpga_pipes.cpp - SYCL FPGA pipes test --------------------==//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+#include <CL/sycl.hpp>
+#include <CL/sycl/intel/fpga_extensions.hpp>
+#include <iostream>
+
+// Size of an array passing through a pipe
+constexpr size_t N = 10;
+
+// For simple non-blocking pipes with explicit type
+class some_nb_pipe;
+
+// For non-blocking pipes created with namespaces set
+namespace some {
+class nb_pipe;
+} // namespace some
+
+// For non-blocking template pipes
+template<int N> // this N is the template parameter, not the array size above
+class templ_nb_pipe;
+
+// For non-blocking multiple pipes
+template<int N>
+using PipeMulNb = cl::sycl::intel::pipe<class templ_nb_pipe<N>, int>;
+
+// For simple blocking pipes with explicit type
+class some_bl_pipe;
+
+// For blocking pipes created with namespaces set
+namespace some {
+class bl_pipe;
+} // namespace some
+
+// For blocking template pipes
+template<int N> // this N is the template parameter, not the array size above
+class templ_bl_pipe;
+
+// For blocking multiple pipes
+template<int N>
+using PipeMulBl = cl::sycl::intel::pipe<class templ_bl_pipe<N>, int>;
+
+// Kernel names: TestNumber selects the test, KernelNumber one of its kernels
+template <int TestNumber, int KernelNumber = 0>
+class writer;
+template <int TestNumber, int KernelNumber = 0>
+class reader;
+
+// Test for simple non-blocking pipes: one kernel writes 42, another reads it.
+template<typename PipeName, int TestNumber>
+int test_simple_nb_pipe(cl::sycl::queue Queue) {
+  int data[] = {0};
+
+  using Pipe = cl::sycl::intel::pipe<PipeName, int>;
+
+  // The writer kernel needs no buffer: it only pushes a constant into the pipe.
+  Queue.submit([&](cl::sycl::handler &cgh) {
+    cgh.single_task<class writer<TestNumber>>([=]() {
+      bool SuccessCode = false;
+      do {
+        Pipe::write(42, SuccessCode);
+      } while (!SuccessCode);
+    });
+  });
+
+  cl::sycl::buffer<int, 1> writeBuf(data, 1);
+  Queue.submit([&](cl::sycl::handler &cgh) {
+    auto write_acc = writeBuf.get_access<cl::sycl::access::mode::write>(cgh);
+
+    cgh.single_task<class reader<TestNumber>>([=]() {
+      bool SuccessCode = false;
+      do {
+        write_acc[0] = Pipe::read(SuccessCode);
+      } while (!SuccessCode);
+    });
+  });
+
+  auto readHostBuffer = writeBuf.get_access<cl::sycl::access::mode::read>();
+  if (readHostBuffer[0] != 42) {
+    std::cout << "Test: " << TestNumber << "\nResult mismatches "
+              << readHostBuffer[0] << " Vs expected " << 42 << std::endl;
+
+    return -1;
+  }
+
+  return 0;
+}
+
+// Test for multiple non-blocking pipes. Returns 0 on success, -1 on mismatch.
+template<int TestNumber>
+int test_multiple_nb_pipe(cl::sycl::queue Queue) {
+  int data[] = {0};
+  // Two writer kernels push 19 and 23 into two distinct pipes.
+  Queue.submit([&](cl::sycl::handler &cgh) {
+    cgh.single_task<class writer<TestNumber, /*KernelNumber*/ 1>>([=]() {
+      bool SuccessCode = false;
+      do {
+        PipeMulNb<1>::write(19, SuccessCode);
+      } while (!SuccessCode);
+    });
+  });
+
+  Queue.submit([&](cl::sycl::handler &cgh) {
+    cgh.single_task<class writer<TestNumber, /*KernelNumber*/ 2>>([=]() {
+      bool SuccessCode = false;
+      do {
+        PipeMulNb<2>::write(23, SuccessCode);
+      } while (!SuccessCode);
+    });
+  });
+  // The reader drains both pipes and stores the sum (19 + 23) in writeBuf.
+  cl::sycl::buffer<int, 1> writeBuf(data, 1);
+  Queue.submit([&](cl::sycl::handler &cgh) {
+    auto write_acc = writeBuf.get_access<cl::sycl::access::mode::write>(cgh);
+    cgh.single_task<class reader<TestNumber>>([=]() {
+      bool SuccessCodeA = false;
+      int Value = 0;
+      do {
+        Value = PipeMulNb<1>::read(SuccessCodeA);
+      } while (!SuccessCodeA);
+      write_acc[0] = Value;
+      bool SuccessCodeB = false;
+      do {
+        Value = PipeMulNb<2>::read(SuccessCodeB);
+      } while (!SuccessCodeB);
+      write_acc[0] += Value;
+    });
+  });
+
+  auto readHostBuffer = writeBuf.get_access<cl::sycl::access::mode::read>();
+  if (readHostBuffer[0] != 42) {
+    std::cout << "Test: " << TestNumber << "\nResult mismatches "
+              << readHostBuffer[0] << " Vs expected " << 42 << std::endl;
+
+    return -1;
+  }
+
+  return 0;
+}
+
+// Test for array passing through a non-blocking pipe
+template<int TestNumber>
+int test_array_th_nb_pipe(cl::sycl::queue Queue) {
+  int data[N] = {0};
+  using AnotherNbPipe = cl::sycl::intel::pipe<class another_nb_pipe, int>;
+
+  Queue.submit([&](cl::sycl::handler &cgh) {
+    cgh.single_task<class writer<TestNumber>>([=]() {
+      bool SuccessCode = false;
+      for (size_t i = 0; i != N; ++i) {
+        do {
+          AnotherNbPipe::write(i, SuccessCode);
+        } while (!SuccessCode);
+      }
+    });
+  });
+
+  cl::sycl::buffer<int, 1> writeBuf(data, N);
+  Queue.submit([&](cl::sycl::handler &cgh) {
+    auto write_acc = writeBuf.get_access<cl::sycl::access::mode::write>(cgh);
+    cgh.single_task<class reader<TestNumber>>([=]() {
+      for (size_t i = 0; i != N; ++i) {
+        bool SuccessCode = false;
+        do {
+          write_acc[i] = AnotherNbPipe::read(SuccessCode);
+        } while (!SuccessCode);
+      }
+    });
+  });
+  // Every element must equal its index; fail only on an actual mismatch.
+  auto readHostBuffer = writeBuf.get_access<cl::sycl::access::mode::read>();
+  for (size_t i = 0; i != N; ++i) {
+    if (readHostBuffer[i] != i) {
+      std::cout << "Test: " << TestNumber << "\nResult mismatches "
+                << readHostBuffer[i] << " Vs expected " << i << std::endl;
+      return -1;
+    }
+  }
+  return 0;
+}
+
+// Test for simple blocking pipes: one kernel writes 42, another reads it.
+template<typename PipeName, int TestNumber>
+int test_simple_bl_pipe(cl::sycl::queue Queue) {
+  int data[] = {0};
+
+  using Pipe = cl::sycl::intel::pipe<PipeName, int>;
+
+  // The writer kernel needs no buffer: it only pushes a constant into the pipe.
+  Queue.submit([&](cl::sycl::handler &cgh) {
+    cgh.single_task<class writer<TestNumber>>([=]() {
+      Pipe::write(42);
+    });
+  });
+
+  cl::sycl::buffer<int, 1> writeBuf(data, 1);
+  Queue.submit([&](cl::sycl::handler &cgh) {
+    auto write_acc = writeBuf.get_access<cl::sycl::access::mode::write>(cgh);
+
+    cgh.single_task<class reader<TestNumber>>([=]() {
+      write_acc[0] = Pipe::read();
+    });
+  });
+
+  auto readHostBuffer = writeBuf.get_access<cl::sycl::access::mode::read>();
+  if (readHostBuffer[0] != 42) {
+    std::cout << "Test: " << TestNumber << "\nResult mismatches "
+              << readHostBuffer[0] << " Vs expected " << 42 << std::endl;
+
+    return -1;
+  }
+
+  return 0;
+}
+
+// Test for multiple blocking pipes. Returns 0 on success, -1 on mismatch.
+template<int TestNumber>
+int test_multiple_bl_pipe(cl::sycl::queue Queue) {
+  int data[] = {0};
+  // Two writer kernels push 19 and 23 into two distinct pipes.
+  Queue.submit([&](cl::sycl::handler &cgh) {
+    cgh.single_task<class writer<TestNumber, /*KernelNumber*/ 1>>([=]() {
+      PipeMulBl<1>::write(19);
+    });
+  });
+
+  Queue.submit([&](cl::sycl::handler &cgh) {
+    cgh.single_task<class writer<TestNumber, /*KernelNumber*/ 2>>([=]() {
+      PipeMulBl<2>::write(23);
+    });
+  });
+  // The reader drains both pipes and stores the sum (19 + 23) in writeBuf.
+  cl::sycl::buffer<int, 1> writeBuf(data, 1);
+  Queue.submit([&](cl::sycl::handler &cgh) {
+    auto write_acc = writeBuf.get_access<cl::sycl::access::mode::write>(cgh);
+    cgh.single_task<class reader<TestNumber>>([=]() {
+      write_acc[0] = PipeMulBl<1>::read();
+      write_acc[0] += PipeMulBl<2>::read();
+    });
+  });
+
+  auto readHostBuffer = writeBuf.get_access<cl::sycl::access::mode::read>();
+  if (readHostBuffer[0] != 42) {
+    std::cout << "Test: " << TestNumber << "\nResult mismatches "
+              << readHostBuffer[0] << " Vs expected " << 42 << std::endl;
+
+    return -1;
+  }
+
+  return 0;
+}
+
+// Test for array passing through a blocking pipe
+template<int TestNumber>
+int test_array_th_bl_pipe(cl::sycl::queue Queue) {
+  int data[N] = {0};
+  using AnotherBlPipe = cl::sycl::intel::pipe<class another_bl_pipe, int>;
+
+  Queue.submit([&](cl::sycl::handler &cgh) {
+    cgh.single_task<class writer<TestNumber>>([=]() {
+      for (size_t i = 0; i != N; ++i)
+        AnotherBlPipe::write(i);
+    });
+  });
+
+  cl::sycl::buffer<int, 1> writeBuf(data, N);
+  Queue.submit([&](cl::sycl::handler &cgh) {
+    auto write_acc = writeBuf.get_access<cl::sycl::access::mode::write>(cgh);
+    cgh.single_task<class reader<TestNumber>>([=]() {
+      for (size_t i = 0; i != N; ++i)
+        write_acc[i] = AnotherBlPipe::read();
+    });
+  });
+  // Every element must equal its index; fail only on an actual mismatch.
+  auto readHostBuffer = writeBuf.get_access<cl::sycl::access::mode::read>();
+  for (size_t i = 0; i != N; ++i) {
+    if (readHostBuffer[i] != i) {
+      std::cout << "Test: " << TestNumber << "\nResult mismatches "
+                << readHostBuffer[i] << " Vs expected " << i << std::endl;
+      return -1;
+    }
+  }
+  return 0;
+}
+
+int main() {
+  cl::sycl::queue Queue;
+
+  if (!Queue.get_device()
+           .get_info<cl::sycl::info::device::kernel_kernel_pipe_support>()) {
+    std::cout << "SYCL_INTEL_data_flow_pipes not supported, skipping"
+              << std::endl;
+    return 0;
+  }
+  // Every test returns 0 on success and -1 on failure, so results are merged
+  // with |= (with &=, one passing test would clear all recorded failures).
+  // Non-blocking pipes
+  int Result = test_simple_nb_pipe<some_nb_pipe, /*test number*/ 1>(Queue);
+  Result |= test_simple_nb_pipe<some::nb_pipe, /*test number*/ 2>(Queue);
+  class forward_nb_pipe;
+  Result |= test_simple_nb_pipe<forward_nb_pipe, /*test number*/ 3>(Queue);
+  Result |= test_simple_nb_pipe<templ_nb_pipe<0>, /*test number*/ 4>(Queue);
+  Result |= test_multiple_nb_pipe</*test number*/ 5>(Queue);
+
+  // Blocking pipes
+  Result |= test_simple_bl_pipe<some_bl_pipe, /*test number*/ 6>(Queue);
+  Result |= test_simple_bl_pipe<some::bl_pipe, /*test number*/ 7>(Queue);
+  class forward_bl_pipe;
+  Result |= test_simple_bl_pipe<forward_bl_pipe, /*test number*/ 8>(Queue);
+  Result |= test_simple_bl_pipe<templ_bl_pipe<0>, /*test number*/ 9>(Queue);
+  Result |= test_multiple_bl_pipe</*test number*/ 10>(Queue);
+
+  // Test for an array data passing through a pipe
+  Result |= test_array_th_nb_pipe</*test number*/ 11>(Queue);
+  Result |= test_array_th_bl_pipe</*test number*/ 12>(Queue);
+  return Result;
+}
diff --git a/SYCL/Basic/fpga_tests/fpga_pipes_legacy_ns.cpp b/SYCL/Basic/fpga_tests/fpga_pipes_legacy_ns.cpp
new file mode 100644
index 0000000000..254103fdf5
--- /dev/null
+++ b/SYCL/Basic/fpga_tests/fpga_pipes_legacy_ns.cpp
@@ -0,0 +1,63 @@
+// RUN: %clangxx -fsycl %s -o %t.out
+// RUN: %ACC_RUN_PLACEHOLDER %t.out
+// REQUIRES: accelerator
+// XFAIL:*
+//==-------- fpga_pipes_legacy_ns.cpp - SYCL FPGA pipes test ---------------==//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+#include <CL/sycl.hpp>
+#include <iostream>
+
+class some_nb_pipe;
+
+// Test for simple non-blocking pipes in legacy namespace (cl::sycl::)
+template <typename PipeName>
+int test_simple_nb_pipe(cl::sycl::queue Queue) {
+  int data[] = {0};
+
+  using Pipe = cl::sycl::pipe<PipeName, int>;
+
+  // The writer kernel needs no buffer: it only pushes a constant into the pipe.
+  Queue.submit([&](cl::sycl::handler &cgh) {
+    cgh.single_task<class writer>([=]() {
+      bool SuccessCode = false;
+      do {
+        Pipe::write(42, SuccessCode);
+      } while (!SuccessCode);
+    });
+  });
+
+  cl::sycl::buffer<int, 1> writeBuf(data, 1);
+  Queue.submit([&](cl::sycl::handler &cgh) {
+    auto write_acc = writeBuf.get_access<cl::sycl::access::mode::write>(cgh);
+
+    cgh.single_task<class reader>([=]() {
+      bool SuccessCode = false;
+      do {
+        write_acc[0] = Pipe::read(SuccessCode);
+      } while (!SuccessCode);
+    });
+  });
+
+  auto readHostBuffer = writeBuf.get_access<cl::sycl::access::mode::read>();
+  if (readHostBuffer[0] != 42) {
+    std::cout << "Result mismatches " << readHostBuffer[0] << " Vs expected "
+              << 42 << std::endl;
+
+    return -1;
+  }
+
+  return 0;
+}
+
+
+int main() {
+  // Non-blocking pipes: the exit code is the status of the only test.
+  cl::sycl::queue Queue;
+  const int Status = test_simple_nb_pipe<some_nb_pipe>(Queue);
+  return Status;
+}
diff --git a/SYCL/Basic/fpga_tests/fpga_queue.cpp b/SYCL/Basic/fpga_tests/fpga_queue.cpp
new file mode 100644
index 0000000000..01d2e3cf08
--- /dev/null
+++ b/SYCL/Basic/fpga_tests/fpga_queue.cpp
@@ -0,0 +1,168 @@
+// REQUIRES: opencl
+
+// RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple %s -o %t.out -L %opencl_libs_dir -lOpenCL
+// RUN: %ACC_RUN_PLACEHOLDER %t.out
+// RUN: %CPU_RUN_PLACEHOLDER %t.out
+// RUN: %GPU_RUN_PLACEHOLDER %t.out
+
+//==------------- fpga_queue.cpp - SYCL FPGA queues test -------------------==//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+#include <CL/sycl.hpp>
+#include <iostream>
+#include <set>
+
+using namespace cl::sycl;
+
+const int dataSize = 32;       // number of ints in each test buffer
+const int maxNumQueues = 256;  // default queue count expected by the limit check
+
+void GetCLQueue(event sycl_event, std::set<cl_command_queue>& cl_queues) {
+  // Records in cl_queues the OpenCL queue reported for sycl_event.
+  try {
+    cl_command_queue cl_queue;
+    cl_event ocl_event = sycl_event.get(); // renamed: no longer hides the type
+    cl_int error = clGetEventInfo(ocl_event, CL_EVENT_COMMAND_QUEUE,
+                                  sizeof(cl_queue), &cl_queue, nullptr);
+    assert(CL_SUCCESS == error && "Failed to obtain queue from OpenCL event");
+    cl_queues.insert(cl_queue);
+  } catch (const invalid_object_error &e) { // by reference: no copy or slicing
+    std::cout << "Failed to get OpenCL queue from SYCL event: " << e.what()
+              << std::endl;
+  }
+}
+
+int getExpectedQueueNumber(cl_device_id device_id, int default_value) {
+  // Expect 1 queue if host queues support out-of-order execution.
+  cl_command_queue_properties QueueProps;
+  cl_int Status =
+      clGetDeviceInfo(device_id, CL_DEVICE_QUEUE_ON_HOST_PROPERTIES,
+                      sizeof(QueueProps), &QueueProps, NULL);
+  assert(CL_SUCCESS == Status && "Failed to obtain queue info from ocl device");
+  if (QueueProps & CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE)
+    return 1;
+  return default_value;
+}
+
+int main() {
+  int data[dataSize] = {0};
+
+  {
+    queue Queue;
+    std::set<cl_command_queue> cl_queues;
+    event sycl_event;
+
+    // Purpose of this test is to check how many OpenCL queues are being
+    // created from 1 SYCL queue for FPGA device. For that we submit 3 kernels
+    // expecting 3 OpenCL queues created as a result.
+    buffer<int, 1> bufA (data, range<1>(dataSize));
+    buffer<int, 1> bufB (data, range<1>(dataSize));
+    buffer<int, 1> bufC (data, range<1>(dataSize));
+
+    sycl_event = Queue.submit([&](handler& cgh) {
+      auto writeBuffer = bufA.get_access<access::mode::write>(cgh);
+
+      // Create a range.
+      auto myRange = range<1>(dataSize);
+
+      // Create a kernel.
+      auto myKernel = ([=](id<1> idx) {
+        writeBuffer[idx] = idx[0];
+      });
+
+      cgh.parallel_for<class fpga_writer_1>(myRange, myKernel);
+    });
+    GetCLQueue(sycl_event, cl_queues);
+
+    sycl_event = Queue.submit([&](handler& cgh) {
+      auto writeBuffer = bufB.get_access<access::mode::write>(cgh);
+
+      // Create a range.
+      auto myRange = range<1>(dataSize);
+
+      // Create a kernel.
+      auto myKernel = ([=](id<1> idx) {
+        writeBuffer[idx] = idx[0];
+      });
+
+      cgh.parallel_for<class fpga_writer_2>(myRange, myKernel);
+    });
+    GetCLQueue(sycl_event, cl_queues);
+
+    sycl_event = Queue.submit([&](handler& cgh) {
+      auto readBufferA = bufA.get_access<access::mode::read>(cgh);
+      auto readBufferB = bufB.get_access<access::mode::read>(cgh);
+      auto writeBuffer = bufC.get_access<access::mode::write>(cgh);
+
+      // Create a range.
+      auto myRange = range<1>(dataSize);
+
+      // Create a kernel.
+      auto myKernel = ([=](id<1> idx) {
+        writeBuffer[idx] = readBufferA[idx] + readBufferB[idx];
+      });
+
+      cgh.parallel_for<class fpga_calculator>(myRange, myKernel);
+    });
+    GetCLQueue(sycl_event, cl_queues);
+
+    int result = cl_queues.size();
+    device dev = Queue.get_device();
+    int expected_result = dev.is_host() ? 0 : getExpectedQueueNumber(dev.get(), 3);
+
+    if (expected_result != result) {
+      std::cout << "Result Num of queues = " << result << std::endl
+                << "Expected Num of queues = "<< expected_result << std::endl;
+      return -1;
+    }
+
+    // A wrong value is a test failure, not only a diagnostic message.
+    auto readBufferC = bufC.get_access<access::mode::read>();
+    for (size_t i = 0; i != dataSize; ++i) {
+      if (readBufferC[i] != 2 * i) {
+        std::cout << "Result mismatches " << readBufferC[i] << " Vs expected "
+                  << 2 * i << " for index " << i << std::endl;
+        return -1;
+      }
+    }
+  }
+
+  {
+    queue Queue;
+    std::set<cl_command_queue> cl_queues;
+    event sycl_event;
+
+    // Check limits of OpenCL queues creation for accelerator device.
+    buffer<int, 1> buf (&data[0], range<1>(1));
+
+    for (size_t i = 0; i != maxNumQueues + 1; ++i) {
+      sycl_event = Queue.submit([&](handler& cgh) {
+        auto Buffer = buf.get_access<access::mode::write>(cgh);
+
+        // Create a kernel.
+        auto myKernel = ([=]() {
+          Buffer[0] = 0;
+        });
+
+        cgh.single_task<class fpga_kernel>(myKernel);
+      });
+      GetCLQueue(sycl_event, cl_queues);
+    }
+
+    int result = cl_queues.size();
+    device dev = Queue.get_device();
+    int expected_result = dev.is_host() ? 0 : getExpectedQueueNumber(dev.get(), maxNumQueues);
+
+    if (expected_result != result) {
+      std::cout << "Result Num of queues = " << result << std::endl
+                << "Expected Num of queues = " << expected_result << std::endl;
+      return -1;
+    }
+  }
+
+  return 0;
+}
diff --git a/SYCL/Basic/fpga_tests/global_fpga_device_selector.cpp b/SYCL/Basic/fpga_tests/global_fpga_device_selector.cpp
new file mode 100644
index 0000000000..ee387a3710
--- /dev/null
+++ b/SYCL/Basic/fpga_tests/global_fpga_device_selector.cpp
@@ -0,0 +1,18 @@
+// REQUIRES: aoc, accelerator
+
+// RUN: %clangxx -fsycl -fintelfpga -std=c++17 %s -o %t.out
+// RUN: %ACC_RUN_PLACEHOLDER %t.out
+// XFAIL:*
+
+#include <CL/sycl.hpp>
+#include <CL/sycl/intel/fpga_extensions.hpp>
+
+// Check that FPGA emulator device is found if we try to initialize inline global
+// variable using fpga_emulator_selector parameter.
+
+inline cl::sycl::queue fpga_emu_queue_inlined{
+    cl::sycl::intel::fpga_emulator_selector{}};
+
+int main() {
+  return 0; // nothing to do here: the test is the global's initialization
+}
diff --git a/SYCL/Basic/fpga_tests/io_pipe_def.h b/SYCL/Basic/fpga_tests/io_pipe_def.h
new file mode 100644
index 0000000000..bbfa2f3a0a
--- /dev/null
+++ b/SYCL/Basic/fpga_tests/io_pipe_def.h
@@ -0,0 +1,12 @@
+#include <CL/sycl/intel/fpga_extensions.hpp>
+
+namespace intelfpga {
+template <unsigned ID> struct ethernet_pipe_id { // tag type exposing ID as ::id
+  static constexpr unsigned id = ID;
+};
+// IO pipe aliases carrying int: ID 0 is kernel-readable, ID 1 kernel-writeable.
+using ethernet_read_pipe =
+    sycl::intel::kernel_readable_io_pipe<ethernet_pipe_id<0>, int, 0>;
+using ethernet_write_pipe =
+    sycl::intel::kernel_writeable_io_pipe<ethernet_pipe_id<1>, int, 0>;
+} // namespace intelfpga
diff --git a/SYCL/Basic/fpga_tests/pipes_info.cpp b/SYCL/Basic/fpga_tests/pipes_info.cpp
new file mode 100644
index 0000000000..58180c50bc
--- /dev/null
+++ b/SYCL/Basic/fpga_tests/pipes_info.cpp
@@ -0,0 +1,36 @@
+// RUN: %clangxx -fsycl %s -o %t.out
+// RUN: %HOST_RUN_PLACEHOLDER %t.out
+// RUN: %CPU_RUN_PLACEHOLDER %t.out
+// RUN: %GPU_RUN_PLACEHOLDER %t.out
+// RUN: %ACC_RUN_PLACEHOLDER %t.out
+//==--------- pipes_info.cpp - SYCL device pipe info test --*- C++ -*-------==//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include <CL/sycl.hpp>
+
+int main() {
+  cl::sycl::queue Queue;
+  cl::sycl::device Device = Queue.get_device();
+
+  // Only Intel FPGA platforms are expected to support the
+  // SYCL_INTEL_data_flow_pipes extension; identify them by platform name.
+  const std::string PlatformName =
+      Device.get_platform().get_info<cl::sycl::info::platform::name>();
+  const bool ExpectSupport =
+      PlatformName == "Intel(R) FPGA Emulation Platform for OpenCL(TM)" ||
+      PlatformName == "Intel(R) FPGA SDK for OpenCL(TM)";
+
+  // Query if the device supports kernel to kernel pipe feature
+  const bool HasSupport =
+      Device.get_info<cl::sycl::info::device::kernel_kernel_pipe_support>();
+
+  // Exit code 0 when the report matches the expectation, 1 otherwise.
+  if (ExpectSupport == HasSupport)
+    return 0;
+  return 1;
+}
diff --git a/SYCL/Basic/functor/kernel_functor.cpp b/SYCL/Basic/functor/kernel_functor.cpp
new file mode 100644
index 0000000000..995208c80a
--- /dev/null
+++ b/SYCL/Basic/functor/kernel_functor.cpp
@@ -0,0 +1,180 @@
+// RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple -o %t.out %s
+// RUN: cd %T
+// RUN: %HOST_RUN_PLACEHOLDER %t.out
+// RUN: %CPU_RUN_PLACEHOLDER %t.out
+// RUN: %GPU_RUN_PLACEHOLDER %t.out
+// REQUIRES: cpu, host, accelerator
+
+//==--- kernel_functor.cpp - Functors as SYCL kernel test ------------------==//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include <CL/sycl.hpp>
+
+#include <cassert>
+
+constexpr auto sycl_read_write = cl::sycl::access::mode::read_write;
+constexpr auto sycl_global_buffer = cl::sycl::access::target::global_buffer;
+
+// Case 1:
+// - functor class is defined in an anonymous namespace
+// - the '()' operator:
+//   * does not have parameters (to be used in 'single_task').
+//   * has no 'const' qualifier
+namespace {
+class Functor1 {
+public:
+  Functor1(
+      int X_,
+      cl::sycl::accessor<int, 1, sycl_read_write, sycl_global_buffer> &Acc_)
+      : X(X_), Acc(Acc_) {}
+
+  void operator()() { Acc[0] += X; }
+
+private:
+  int X; // amount added to Acc[0] by operator()
+  cl::sycl::accessor<int, 1, sycl_read_write, sycl_global_buffer> Acc;
+};
+} // namespace
+
+// Case 2:
+// - functor class is defined in a namespace
+// - the '()' operator:
+//   * does not have parameters (to be used in 'single_task').
+//   * has the 'const' qualifier
+namespace ns {
+class Functor2 {
+public:
+  Functor2(
+      int X_,
+      cl::sycl::accessor<int, 1, sycl_read_write, sycl_global_buffer> &Acc_)
+      : X(X_), Acc(Acc_) {}
+
+  // cl::sycl::accessor's operator [] is const, hence 'const' is possible below
+  void operator()() const { Acc[0] += X; }
+
+private:
+  int X; // amount added to Acc[0] by operator()
+  cl::sycl::accessor<int, 1, sycl_read_write, sycl_global_buffer> Acc;
+};
+} // namespace ns
+
+// Case 3:
+// - functor class is templated and defined in the translation unit scope
+// - the '()' operator:
+//   * has a parameter of type cl::sycl::id<1> (to be used in 'parallel_for').
+//   * has no 'const' qualifier
+template <typename T> class TmplFunctor {
+public:
+  TmplFunctor(
+      T X_, cl::sycl::accessor<T, 1, sycl_read_write, sycl_global_buffer> &Acc_)
+      : X(X_), Acc(Acc_) {}
+
+  void operator()(cl::sycl::id<1> id) { Acc[id] += X; }
+
+private:
+  T X; // amount added to Acc[id] by operator()
+  cl::sycl::accessor<T, 1, sycl_read_write, sycl_global_buffer> Acc;
+};
+
+// Case 4:
+// - functor class is templated and defined in the translation unit scope
+// - the '()' operator:
+//   * has a parameter of type cl::sycl::id<1> (to be used in 'parallel_for').
+//   * has the 'const' qualifier
+template <typename T> class TmplConstFunctor {
+public:
+  TmplConstFunctor(
+      T X_, cl::sycl::accessor<T, 1, sycl_read_write, sycl_global_buffer> &Acc_)
+      : X(X_), Acc(Acc_) {}
+
+  void operator()(cl::sycl::id<1> id) const { Acc[id] += X; }
+
+private:
+  T X; // amount added to Acc[id] by operator()
+  cl::sycl::accessor<T, 1, sycl_read_write, sycl_global_buffer> Acc;
+};
+
+// Exercise non-templated functors in 'single_task'.
+int foo(int X) {
+  int A[] = { 10 };
+  {
+    cl::sycl::queue Q;
+    cl::sycl::buffer<int, 1> Buf(A, 1);
+    // Three kernels each add X to Buf[0]: Functor1 once, ns::Functor2 twice.
+    Q.submit([&](cl::sycl::handler &cgh) {
+      auto Acc = Buf.get_access<sycl_read_write, sycl_global_buffer>(cgh);
+      Functor1 F(X, Acc);
+
+      cgh.single_task(F);
+    });
+    Q.submit([&](cl::sycl::handler &cgh) {
+      auto Acc = Buf.get_access<sycl_read_write, sycl_global_buffer>(cgh);
+      ns::Functor2 F(X, Acc);
+
+      cgh.single_task(F);
+    });
+    Q.submit([&](cl::sycl::handler &cgh) {
+      auto Acc = Buf.get_access<sycl_read_write, sycl_global_buffer>(cgh);
+      ns::Functor2 F(X, Acc);
+
+      cgh.single_task(F);
+    });
+  }
+  return A[0]; // read only after Q and Buf have gone out of scope
+}
+
+#define ARR_LEN(x) (sizeof(x) / sizeof((x)[0])) // element count of array x
+
+// Exercise templated functors in 'parallel_for'.
+template <typename T> T bar(T X) {
+  T A[] = {(T)10, (T)10 };
+  {
+    cl::sycl::queue Q;
+    cl::sycl::buffer<T, 1> Buf(A, ARR_LEN(A));
+    // Three parallel_for kernels each add X to every element of Buf.
+    Q.submit([&](cl::sycl::handler &cgh) {
+      auto Acc =
+          Buf.template get_access<sycl_read_write, sycl_global_buffer>(cgh);
+      TmplFunctor<T> F(X, Acc);
+
+      cgh.parallel_for(cl::sycl::range<1>(ARR_LEN(A)), F);
+    });
+    // Spice with lambdas to make sure functors and lambdas work together.
+    Q.submit([&](cl::sycl::handler &cgh) {
+      auto Acc =
+          Buf.template get_access<sycl_read_write, sycl_global_buffer>(cgh);
+      cgh.parallel_for<class LambdaKernel>(
+          cl::sycl::range<1>(ARR_LEN(A)),
+          [=](cl::sycl::id<1> id) { Acc[id] += X; });
+    });
+    Q.submit([&](cl::sycl::handler &cgh) {
+      auto Acc =
+          Buf.template get_access<sycl_read_write, sycl_global_buffer>(cgh);
+      TmplConstFunctor<T> F(X, Acc);
+
+      cgh.parallel_for(cl::sycl::range<1>(ARR_LEN(A)), F);
+    });
+  }
+  T res = (T)0;
+  // The return value is the sum of all elements of A.
+  for (int i = 0; i < ARR_LEN(A); i++)
+    res += A[i];
+  return res;
+}
+
+int main() {
+  // Expected: foo -> 10 + 3 * 10; bar -> 2 elements * (10 + 3 * 10).
+  const int Gold1 = 40, Gold2 = 80;
+
+  const int Res1 = foo(10);
+  const int Res2 = bar(10);
+
+  assert(Res1 == Gold1);
+  assert(Res2 == Gold2);
+  return 0;
+}
diff --git a/SYCL/Basic/group-algorithm/all_of.cpp b/SYCL/Basic/group-algorithm/all_of.cpp
new file mode 100644
index 0000000000..be37442d32
--- /dev/null
+++ b/SYCL/Basic/group-algorithm/all_of.cpp
@@ -0,0 +1,77 @@
+// UNSUPPORTED: cuda
+// OpenCL C 2.x-like work-group functions are not yet supported by CUDA.
+//
+// RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple %s -o %t.out
+// RUN: %HOST_RUN_PLACEHOLDER %t.out
+// RUN: %CPU_RUN_PLACEHOLDER %t.out
+// RUN: %GPU_RUN_PLACEHOLDER %t.out
+// RUN: %ACC_RUN_PLACEHOLDER %t.out
+
+#include <CL/sycl.hpp>
+#include <algorithm>
+#include <cassert>
+#include <numeric>
+using namespace sycl;
+using namespace sycl::intel;
+
+template <class Predicate>
+class all_of_kernel;
+
+struct GeZero { // holds for every non-negative value
+  bool operator()(int value) const { return 0 <= value; }
+};
+struct IsEven { // holds when the value is divisible by two
+  bool operator()(int value) const { return value % 2 == 0; }
+};
+struct LtZero { // holds for strictly negative values
+  bool operator()(int value) const { return value < 0; }
+};
+
+template <typename InputContainer, typename OutputContainer, class Predicate>
+void test(queue q, InputContainer input, OutputContainer output,
+          Predicate pred) {
+  typedef class all_of_kernel<Predicate> kernel_name;
+  size_t N = input.size(); // range length for the pointer-based overload
+  size_t G = 16;           // work-group size; a single group is launched
+  { // results are copied back into `output` when the buffers are destroyed
+    buffer<int> in_buf(input.data(), input.size());
+    buffer<bool> out_buf(output.data(), output.size());
+    q.submit([&](handler &cgh) {
+      auto in = in_buf.get_access<access::mode::read>(cgh);
+      auto out = out_buf.get_access<access::mode::discard_write>(cgh);
+      cgh.parallel_for<kernel_name>(nd_range<1>(G, G), [=](nd_item<1> it) {
+        group<1> g = it.get_group();
+        int lid = it.get_local_id(0);
+        out[0] = all_of(g, pred(in[lid]));
+        out[1] = all_of(g, in[lid], pred);
+        out[2] = all_of(g, in.get_pointer(), in.get_pointer() + N, pred);
+      });
+    });
+  }
+  // out[0]/out[1] combine only in[0..G); out[2] covers the whole input.
+  bool group_expected = std::all_of(input.begin(), input.begin() + G, pred);
+  assert(output[0] == group_expected);
+  assert(output[1] == group_expected);
+  assert(output[2] == std::all_of(input.begin(), input.end(), pred));
+}
+
+int main() {
+  queue q;
+  const std::string min_version("2.0");
+  if (q.get_device().get_info<info::device::version>() < min_version) {
+    std::cout << "Skipping test\n";
+    return 0;
+  }
+  // Input is 0..N-1; the three result slots start out false.
+  constexpr int N = 32;
+  std::array<int, N> input;
+  std::array<bool, 3> output;
+  std::iota(input.begin(), input.end(), 0);
+  output.fill(false);
+
+  test(q, input, output, GeZero());
+  test(q, input, output, IsEven());
+  test(q, input, output, LtZero());
+
+  std::cout << "Test passed." << std::endl;
+}
diff --git a/SYCL/Basic/group-algorithm/any_of.cpp b/SYCL/Basic/group-algorithm/any_of.cpp
new file mode 100644
index 0000000000..c9607e9159
--- /dev/null
+++ b/SYCL/Basic/group-algorithm/any_of.cpp
@@ -0,0 +1,79 @@
+// UNSUPPORTED: cuda
+// OpenCL C 2.x-like work-group functions are not yet supported by CUDA.
+//
+// RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple %s -o %t.out
+// RUN: %HOST_RUN_PLACEHOLDER %t.out
+// RUN: %CPU_RUN_PLACEHOLDER %t.out
+// RUN: %GPU_RUN_PLACEHOLDER %t.out
+// RUN: %ACC_RUN_PLACEHOLDER %t.out
+
+#include <CL/sycl.hpp>
+#include <algorithm>
+#include <cassert>
+#include <numeric>
+using namespace sycl;
+using namespace sycl::intel;
+
+template <class Predicate>
+class any_of_kernel;
+
+struct GeZero { // holds for every non-negative value
+  bool operator()(int value) const { return 0 <= value; }
+};
+struct IsEven { // holds when the value is divisible by two
+  bool operator()(int value) const { return value % 2 == 0; }
+};
+struct LtZero { // holds for strictly negative values
+  bool operator()(int value) const { return value < 0; }
+};
+
+template <typename InputContainer, typename OutputContainer, class Predicate>
+void test(queue q, InputContainer input, OutputContainer output,
+          Predicate pred) {
+  typedef typename InputContainer::value_type InputT;
+  typedef typename OutputContainer::value_type OutputT;
+  typedef class any_of_kernel<Predicate> kernel_name;
+  size_t N = input.size(); // range length for the pointer-based overload
+  size_t G = 16;           // work-group size; a single group is launched
+  { // results are copied back into `output` when the buffers are destroyed
+    buffer<InputT> in_buf(input.data(), input.size());
+    buffer<OutputT> out_buf(output.data(), output.size());
+    q.submit([&](handler &cgh) {
+      auto in = in_buf.template get_access<access::mode::read>(cgh);
+      auto out = out_buf.template get_access<access::mode::discard_write>(cgh);
+      cgh.parallel_for<kernel_name>(nd_range<1>(G, G), [=](nd_item<1> it) {
+        group<1> g = it.get_group();
+        int lid = it.get_local_id(0);
+        out[0] = any_of(g, pred(in[lid]));
+        out[1] = any_of(g, in[lid], pred);
+        out[2] = any_of(g, in.get_pointer(), in.get_pointer() + N, pred);
+      });
+    });
+  }
+  // out[0]/out[1] combine only in[0..G); out[2] covers the whole input.
+  bool group_expected = std::any_of(input.begin(), input.begin() + G, pred);
+  assert(output[0] == group_expected);
+  assert(output[1] == group_expected);
+  assert(output[2] == std::any_of(input.begin(), input.end(), pred));
+}
+
+int main() {
+  queue q;
+  const std::string min_version("2.0");
+  if (q.get_device().get_info<info::device::version>() < min_version) {
+    std::cout << "Skipping test\n";
+    return 0;
+  }
+  // Input is 0..N-1; the three result slots start out false.
+  constexpr int N = 32;
+  std::array<int, N> input;
+  std::array<bool, 3> output;
+  std::iota(input.begin(), input.end(), 0);
+  output.fill(false);
+
+  test(q, input, output, GeZero());
+  test(q, input, output, IsEven());
+  test(q, input, output, LtZero());
+
+  std::cout << "Test passed." << std::endl;
+}
diff --git a/SYCL/Basic/group-algorithm/broadcast.cpp b/SYCL/Basic/group-algorithm/broadcast.cpp
new file mode 100644
index 0000000000..387ae8430c
--- /dev/null
+++ b/SYCL/Basic/group-algorithm/broadcast.cpp
@@ -0,0 +1,65 @@
+// UNSUPPORTED: cuda
+// OpenCL C 2.x-like work-group functions are not yet supported by CUDA.
+//
+// RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple %s -o %t.out
+// RUN: %HOST_RUN_PLACEHOLDER %t.out
+// RUN: %CPU_RUN_PLACEHOLDER %t.out
+// RUN: %GPU_RUN_PLACEHOLDER %t.out
+// RUN: %ACC_RUN_PLACEHOLDER %t.out
+
+#include <CL/sycl.hpp>
+#include <algorithm>
+#include <cassert>
+#include <numeric>
+using namespace sycl;
+using namespace sycl::intel;
+
+class broadcast_kernel;
+
+template <typename InputContainer, typename OutputContainer>
+void test(queue q, InputContainer input, OutputContainer output) {
+  typedef typename InputContainer::value_type InputT;
+  typedef typename OutputContainer::value_type OutputT;
+  typedef class broadcast_kernel kernel_name;
+  // One G x G work-group; each work-item offers in[its local linear id].
+  size_t G = 4;
+  range<2> R(G, G);
+  {
+    buffer<InputT> in_buf(input.data(), input.size());
+    buffer<OutputT> out_buf(output.data(), output.size());
+
+    q.submit([&](handler &cgh) {
+      auto in = in_buf.template get_access<access::mode::read>(cgh);
+      auto out = out_buf.template get_access<access::mode::discard_write>(cgh);
+      cgh.parallel_for<kernel_name>(nd_range<2>(R, R), [=](nd_item<2> it) {
+        group<2> g = it.get_group();
+        int lid = it.get_local_linear_id();
+        out[0] = broadcast(g, in[lid]);
+        out[1] = broadcast(g, in[lid], group<2>::id_type(1, 2));
+        out[2] = broadcast(g, in[lid], group<2>::linear_id_type(2 * G + 1));
+      });
+    });
+  }
+  assert(output[0] == input[0]);
+  assert(output[1] == input[1 * G + 2]);
+  assert(output[2] == input[2 * G + 1]);
+}
+
+int main() {
+  queue q;
+  std::string version = q.get_device().get_info<info::device::version>();
+  if (version < std::string("2.0")) { // skip devices reporting below 2.0
+    std::cout << "Skipping test\n";
+    return 0;
+  }
+  // N matches the 4 x 4 work-group launched by test(); inputs are 1..N.
+  constexpr int N = 16;
+  std::array<int, N> input;
+  std::array<int, N> output;
+  std::iota(input.begin(), input.end(), 1);
+  std::fill(output.begin(), output.end(), 0); // int elements: 0, not false
+
+  test(q, input, output);
+
+  std::cout << "Test passed." << std::endl;
+}
diff --git a/SYCL/Basic/group-algorithm/exclusive_scan.cpp b/SYCL/Basic/group-algorithm/exclusive_scan.cpp
new file mode 100644
index 0000000000..22d0644355
--- /dev/null
+++ b/SYCL/Basic/group-algorithm/exclusive_scan.cpp
@@ -0,0 +1,147 @@
+// UNSUPPORTED: cuda
+// OpenCL C 2.x-like work-group functions are not yet supported by CUDA.
+//
+// RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple %s -o %t.out
+// RUN: %HOST_RUN_PLACEHOLDER %t.out
+// RUN: %CPU_RUN_PLACEHOLDER %t.out
+// RUN: %GPU_RUN_PLACEHOLDER %t.out
+// RUN: %ACC_RUN_PLACEHOLDER %t.out
+
+#include <CL/sycl.hpp>
+#include <algorithm>
+#include <cassert>
+#include <limits>
+#include <numeric>
+#include <vector>
+using namespace sycl;
+using namespace sycl::intel;
+
+template <class BinaryOperation, int TestNumber>
+class exclusive_scan_kernel;
+
+// std::exclusive_scan isn't implemented yet, so use serial implementation
+// instead
+namespace emu {
+// Serial reference: element i is init folded with inputs 0..i-1 only.
+template <typename InputIterator, typename OutputIterator,
+          class BinaryOperation, typename T>
+OutputIterator exclusive_scan(InputIterator first, InputIterator last,
+                              OutputIterator result, T init,
+                              BinaryOperation binary_op) {
+  for (T running = init; first != last; ++first, ++result) {
+    *result = running;
+    running = binary_op(running, *first);
+  }
+  return result;
+}
+} // namespace emu
+
+template <typename InputContainer, typename OutputContainer,
+          class BinaryOperation>
+void test(queue q, InputContainer input, OutputContainer output,
+          BinaryOperation binary_op,
+          typename OutputContainer::value_type identity) {
+  typedef typename InputContainer::value_type InputT;
+  typedef typename OutputContainer::value_type OutputT;
+  typedef class exclusive_scan_kernel<BinaryOperation, 0> kernel_name0;
+  typedef class exclusive_scan_kernel<BinaryOperation, 1> kernel_name1;
+  typedef class exclusive_scan_kernel<BinaryOperation, 2> kernel_name2;
+  typedef class exclusive_scan_kernel<BinaryOperation, 3> kernel_name3;
+  OutputT init = 42; // seed used by the overloads that take an initial value
+  size_t N = input.size(); // range length for the pointer-based overloads
+  size_t G = 16; // work-group size; each kernel launches one group
+  std::vector<OutputT> expected(N); // host reference, recomputed per case
+  { // Case 0: per-work-item value, no seed (reference starts from identity)
+    buffer<InputT> in_buf(input.data(), input.size());
+    buffer<OutputT> out_buf(output.data(), output.size());
+    q.submit([&](handler &cgh) {
+      auto in = in_buf.template get_access<access::mode::read>(cgh);
+      auto out = out_buf.template get_access<access::mode::discard_write>(cgh);
+      cgh.parallel_for<kernel_name0>(nd_range<1>(G, G), [=](nd_item<1> it) {
+        group<1> g = it.get_group();
+        int lid = it.get_local_id(0);
+        out[lid] = exclusive_scan(g, in[lid], binary_op);
+      });
+    });
+  }
+  emu::exclusive_scan(input.begin(), input.begin() + G, expected.begin(),
+                      identity, binary_op);
+  assert(std::equal(output.begin(), output.begin() + G, expected.begin()));
+
+  { // Case 1: per-work-item value, seeded with init
+    buffer<InputT> in_buf(input.data(), input.size());
+    buffer<OutputT> out_buf(output.data(), output.size());
+    q.submit([&](handler &cgh) {
+      auto in = in_buf.template get_access<access::mode::read>(cgh);
+      auto out = out_buf.template get_access<access::mode::discard_write>(cgh);
+      cgh.parallel_for<kernel_name1>(nd_range<1>(G, G), [=](nd_item<1> it) {
+        group<1> g = it.get_group();
+        int lid = it.get_local_id(0);
+        out[lid] = exclusive_scan(g, in[lid], init, binary_op);
+      });
+    });
+  }
+  emu::exclusive_scan(input.begin(), input.begin() + G, expected.begin(), init,
+                      binary_op);
+  assert(std::equal(output.begin(), output.begin() + G, expected.begin()));
+
+  { // Case 2: pointer range of N elements, no seed
+    buffer<InputT> in_buf(input.data(), input.size());
+    buffer<OutputT> out_buf(output.data(), output.size());
+    q.submit([&](handler &cgh) {
+      auto in = in_buf.template get_access<access::mode::read>(cgh);
+      auto out = out_buf.template get_access<access::mode::discard_write>(cgh);
+      cgh.parallel_for<kernel_name2>(nd_range<1>(G, G), [=](nd_item<1> it) {
+        group<1> g = it.get_group();
+        exclusive_scan(g, in.get_pointer(), in.get_pointer() + N,
+                       out.get_pointer(), binary_op);
+      });
+    });
+  }
+  emu::exclusive_scan(input.begin(), input.begin() + N, expected.begin(),
+                      identity, binary_op);
+  assert(std::equal(output.begin(), output.begin() + N, expected.begin()));
+
+  { // Case 3: pointer range of N elements, seeded with init
+    buffer<InputT> in_buf(input.data(), input.size());
+    buffer<OutputT> out_buf(output.data(), output.size());
+    q.submit([&](handler &cgh) {
+      auto in = in_buf.template get_access<access::mode::read>(cgh);
+      auto out = out_buf.template get_access<access::mode::discard_write>(cgh);
+      cgh.parallel_for<kernel_name3>(nd_range<1>(G, G), [=](nd_item<1> it) {
+        group<1> g = it.get_group();
+        exclusive_scan(g, in.get_pointer(), in.get_pointer() + N,
+                       out.get_pointer(), init, binary_op);
+      });
+    });
+  }
+  emu::exclusive_scan(input.begin(), input.begin() + N, expected.begin(), init,
+                      binary_op);
+  assert(std::equal(output.begin(), output.begin() + N, expected.begin()));
+}
+
+int main() {
+  queue q;
+  const std::string min_version("2.0");
+  if (q.get_device().get_info<info::device::version>() < min_version) {
+    std::cout << "Skipping test\n";
+    return 0;
+  }
+  // Each operation below is passed together with its identity value.
+  constexpr int N = 32;
+  constexpr int int_max = std::numeric_limits<int>::max();
+  constexpr int int_lowest = std::numeric_limits<int>::lowest();
+  std::array<int, N> input, output;
+  std::iota(input.begin(), input.end(), 0);
+  output.fill(0);
+#if __cplusplus >= 201402L
+  test(q, input, output, plus<>(), 0);
+  test(q, input, output, minimum<>(), int_max);
+  test(q, input, output, maximum<>(), int_lowest);
+#endif
+  test(q, input, output, plus<int>(), 0);
+  test(q, input, output, minimum<int>(), int_max);
+  test(q, input, output, maximum<int>(), int_lowest);
+
+  std::cout << "Test passed." << std::endl;
+}
diff --git a/SYCL/Basic/group-algorithm/inclusive_scan.cpp b/SYCL/Basic/group-algorithm/inclusive_scan.cpp
new file mode 100644
index 0000000000..edea0142ef
--- /dev/null
+++ b/SYCL/Basic/group-algorithm/inclusive_scan.cpp
@@ -0,0 +1,147 @@
+// UNSUPPORTED: cuda
+// OpenCL C 2.x-like work-group functions are not yet supported by CUDA.
+//
+// RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple %s -o %t.out
+// RUN: %HOST_RUN_PLACEHOLDER %t.out
+// RUN: %CPU_RUN_PLACEHOLDER %t.out
+// RUN: %GPU_RUN_PLACEHOLDER %t.out
+// RUN: %ACC_RUN_PLACEHOLDER %t.out
+
+#include <CL/sycl.hpp>
+#include <algorithm>
+#include <cassert>
+#include <limits>
+#include <numeric>
+#include <vector>
+using namespace sycl;
+using namespace sycl::intel;
+
+template <class BinaryOperation, int TestNumber>
+class inclusive_scan_kernel;
+
+// std::inclusive_scan isn't implemented yet, so use serial implementation
+// instead
+namespace emu {
+// Serial reference: element i is init folded with inputs 0..i inclusive.
+template <typename InputIterator, typename OutputIterator,
+          class BinaryOperation, typename T>
+OutputIterator inclusive_scan(InputIterator first, InputIterator last,
+                              OutputIterator result, BinaryOperation binary_op,
+                              T init) {
+  for (T running = init; first != last; ++first, ++result) {
+    running = binary_op(running, *first);
+    *result = running;
+  }
+  return result;
+}
+} // namespace emu
+
+template <typename InputContainer, typename OutputContainer,
+          class BinaryOperation>
+void test(queue q, InputContainer input, OutputContainer output,
+          BinaryOperation binary_op,
+          typename OutputContainer::value_type identity) {
+  typedef typename InputContainer::value_type InputT;
+  typedef typename OutputContainer::value_type OutputT;
+  typedef class inclusive_scan_kernel<BinaryOperation, 0> kernel_name0;
+  typedef class inclusive_scan_kernel<BinaryOperation, 1> kernel_name1;
+  typedef class inclusive_scan_kernel<BinaryOperation, 2> kernel_name2;
+  typedef class inclusive_scan_kernel<BinaryOperation, 3> kernel_name3;
+  OutputT init = 42; // seed used by the overloads that take an initial value
+  size_t N = input.size(); // range length for the pointer-based overloads
+  size_t G = 16; // work-group size; each kernel launches one group
+  std::vector<OutputT> expected(N); // host reference, recomputed per case
+  { // Case 0: per-work-item value, no seed (reference starts from identity)
+    buffer<InputT> in_buf(input.data(), input.size());
+    buffer<OutputT> out_buf(output.data(), output.size());
+    q.submit([&](handler &cgh) {
+      auto in = in_buf.template get_access<access::mode::read>(cgh);
+      auto out = out_buf.template get_access<access::mode::discard_write>(cgh);
+      cgh.parallel_for<kernel_name0>(nd_range<1>(G, G), [=](nd_item<1> it) {
+        group<1> g = it.get_group();
+        int lid = it.get_local_id(0);
+        out[lid] = inclusive_scan(g, in[lid], binary_op);
+      });
+    });
+  }
+  emu::inclusive_scan(input.begin(), input.begin() + G, expected.begin(),
+                      binary_op, identity);
+  assert(std::equal(output.begin(), output.begin() + G, expected.begin()));
+
+  { // Case 1: per-work-item value, seeded with init
+    buffer<InputT> in_buf(input.data(), input.size());
+    buffer<OutputT> out_buf(output.data(), output.size());
+    q.submit([&](handler &cgh) {
+      auto in = in_buf.template get_access<access::mode::read>(cgh);
+      auto out = out_buf.template get_access<access::mode::discard_write>(cgh);
+      cgh.parallel_for<kernel_name1>(nd_range<1>(G, G), [=](nd_item<1> it) {
+        group<1> g = it.get_group();
+        int lid = it.get_local_id(0);
+        out[lid] = inclusive_scan(g, in[lid], binary_op, init);
+      });
+    });
+  }
+  emu::inclusive_scan(input.begin(), input.begin() + G, expected.begin(),
+                      binary_op, init);
+  assert(std::equal(output.begin(), output.begin() + G, expected.begin()));
+
+  { // Case 2: pointer range of N elements, no seed
+    buffer<InputT> in_buf(input.data(), input.size());
+    buffer<OutputT> out_buf(output.data(), output.size());
+    q.submit([&](handler &cgh) {
+      auto in = in_buf.template get_access<access::mode::read>(cgh);
+      auto out = out_buf.template get_access<access::mode::discard_write>(cgh);
+      cgh.parallel_for<kernel_name2>(nd_range<1>(G, G), [=](nd_item<1> it) {
+        group<1> g = it.get_group();
+        inclusive_scan(g, in.get_pointer(), in.get_pointer() + N,
+                       out.get_pointer(), binary_op);
+      });
+    });
+  }
+  emu::inclusive_scan(input.begin(), input.begin() + N, expected.begin(),
+                      binary_op, identity);
+  assert(std::equal(output.begin(), output.begin() + N, expected.begin()));
+
+  { // Case 3: pointer range of N elements, seeded with init
+    buffer<InputT> in_buf(input.data(), input.size());
+    buffer<OutputT> out_buf(output.data(), output.size());
+    q.submit([&](handler &cgh) {
+      auto in = in_buf.template get_access<access::mode::read>(cgh);
+      auto out = out_buf.template get_access<access::mode::discard_write>(cgh);
+      cgh.parallel_for<kernel_name3>(nd_range<1>(G, G), [=](nd_item<1> it) {
+        group<1> g = it.get_group();
+        inclusive_scan(g, in.get_pointer(), in.get_pointer() + N,
+                       out.get_pointer(), binary_op, init);
+      });
+    });
+  }
+  emu::inclusive_scan(input.begin(), input.begin() + N, expected.begin(),
+                      binary_op, init);
+  assert(std::equal(output.begin(), output.begin() + N, expected.begin()));
+}
+
+int main() {
+  queue q;
+  const std::string min_version("2.0");
+  if (q.get_device().get_info<info::device::version>() < min_version) {
+    std::cout << "Skipping test\n";
+    return 0;
+  }
+  // Each operation below is passed together with its identity value.
+  constexpr int N = 32;
+  constexpr int int_max = std::numeric_limits<int>::max();
+  constexpr int int_lowest = std::numeric_limits<int>::lowest();
+  std::array<int, N> input, output;
+  std::iota(input.begin(), input.end(), 0);
+  output.fill(0);
+#if __cplusplus >= 201402L
+  test(q, input, output, plus<>(), 0);
+  test(q, input, output, minimum<>(), int_max);
+  test(q, input, output, maximum<>(), int_lowest);
+#endif
+  test(q, input, output, plus<int>(), 0);
+  test(q, input, output, minimum<int>(), int_max);
+  test(q, input, output, maximum<int>(), int_lowest);
+
+  std::cout << "Test passed." << std::endl;
+}
diff --git a/SYCL/Basic/group-algorithm/leader.cpp b/SYCL/Basic/group-algorithm/leader.cpp
new file mode 100644
index 0000000000..f6c645f610
--- /dev/null
+++ b/SYCL/Basic/group-algorithm/leader.cpp
@@ -0,0 +1,50 @@
+// UNSUPPORTED: cuda
+// OpenCL C 2.x-like work-group functions are not yet supported by CUDA.
+//
+// RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple %s -o %t.out
+// RUN: %HOST_RUN_PLACEHOLDER %t.out
+// RUN: %CPU_RUN_PLACEHOLDER %t.out
+// RUN: %GPU_RUN_PLACEHOLDER %t.out
+// RUN: %ACC_RUN_PLACEHOLDER %t.out
+
+#include <CL/sycl.hpp>
+#include <cassert>
+using namespace sycl;
+using namespace sycl::intel;
+
+class leader_kernel;
+
+void test(queue q) {
+  using kernel_name = leader_kernel;
+  constexpr size_t G = 4;
+  const range<2> R(G, G); // one G x G work-group
+  int leader_count = 0;
+
+  { // counter_buf writes back into leader_count when the scope ends
+    buffer<int> counter_buf(&leader_count, 1);
+
+    q.submit([&](handler &cgh) {
+      auto counter =
+          counter_buf.template get_access<access::mode::read_write>(cgh);
+      cgh.parallel_for<kernel_name>(nd_range<2>(R, R), [=](nd_item<2> item) {
+        // Only the work-item identified as leader bumps the counter.
+        if (leader(item.get_group()))
+          counter[0] += 1;
+      });
+    });
+  }
+  assert(leader_count == 1);
+}
+
+int main() {
+  queue q;
+  const std::string min_version("2.0");
+  if (q.get_device().get_info<info::device::version>() < min_version) {
+    std::cout << "Skipping test\n";
+    return 0;
+  }
+
+  test(q);
+
+  std::cout << "Test passed." << std::endl;
+}
diff --git a/SYCL/Basic/group-algorithm/none_of.cpp b/SYCL/Basic/group-algorithm/none_of.cpp
new file mode 100644
index 0000000000..51a68ab9c7
--- /dev/null
+++ b/SYCL/Basic/group-algorithm/none_of.cpp
@@ -0,0 +1,77 @@
+// UNSUPPORTED: cuda
+// OpenCL C 2.x-like work-group functions are not yet supported by CUDA.
+//
+// RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple %s -o %t.out
+// RUN: %HOST_RUN_PLACEHOLDER %t.out
+// RUN: %CPU_RUN_PLACEHOLDER %t.out
+// RUN: %GPU_RUN_PLACEHOLDER %t.out
+// RUN: %ACC_RUN_PLACEHOLDER %t.out
+
+#include <CL/sycl.hpp>
+#include <algorithm>
+#include <cassert>
+#include <numeric>
+using namespace sycl;
+using namespace sycl::intel;
+
+template <class Predicate>
+class none_of_kernel;
+
+struct GeZero { // holds for every non-negative value
+  bool operator()(int value) const { return 0 <= value; }
+};
+struct IsEven { // holds when the value is divisible by two
+  bool operator()(int value) const { return value % 2 == 0; }
+};
+struct LtZero { // holds for strictly negative values
+  bool operator()(int value) const { return value < 0; }
+};
+
+template <typename InputContainer, typename OutputContainer, class Predicate>
+void test(queue q, InputContainer input, OutputContainer output,
+          Predicate pred) {
+  typedef class none_of_kernel<Predicate> kernel_name;
+  size_t N = input.size(); // range length for the pointer-based overload
+  size_t G = 16;           // work-group size; a single group is launched
+  { // results are copied back into `output` when the buffers are destroyed
+    buffer<int> in_buf(input.data(), input.size());
+    buffer<bool> out_buf(output.data(), output.size());
+    q.submit([&](handler &cgh) {
+      auto in = in_buf.get_access<access::mode::read>(cgh);
+      auto out = out_buf.get_access<access::mode::discard_write>(cgh);
+      cgh.parallel_for<kernel_name>(nd_range<1>(G, G), [=](nd_item<1> it) {
+        group<1> g = it.get_group();
+        int lid = it.get_local_id(0);
+        out[0] = none_of(g, pred(in[lid]));
+        out[1] = none_of(g, in[lid], pred);
+        out[2] = none_of(g, in.get_pointer(), in.get_pointer() + N, pred);
+      });
+    });
+  }
+  // out[0]/out[1] combine only in[0..G); out[2] covers the whole input.
+  bool group_expected = std::none_of(input.begin(), input.begin() + G, pred);
+  assert(output[0] == group_expected);
+  assert(output[1] == group_expected);
+  assert(output[2] == std::none_of(input.begin(), input.end(), pred));
+}
+
+int main() {
+  queue q;
+  const std::string min_version("2.0");
+  if (q.get_device().get_info<info::device::version>() < min_version) {
+    std::cout << "Skipping test\n";
+    return 0;
+  }
+  // Input is 0..N-1; the three result slots start out false.
+  constexpr int N = 32;
+  std::array<int, N> input;
+  std::array<bool, 3> output;
+  std::iota(input.begin(), input.end(), 0);
+  output.fill(false);
+
+  test(q, input, output, GeZero());
+  test(q, input, output, IsEven());
+  test(q, input, output, LtZero());
+
+  std::cout << "Test passed." << std::endl;
+}
diff --git a/SYCL/Basic/group-algorithm/reduce.cpp b/SYCL/Basic/group-algorithm/reduce.cpp
new file mode 100644
index 0000000000..10a458b019
--- /dev/null
+++ b/SYCL/Basic/group-algorithm/reduce.cpp
@@ -0,0 +1,85 @@
+// UNSUPPORTED: cuda
+// OpenCL C 2.x-like work-group functions are not yet supported by CUDA.
+//
+// RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple %s -o %t.out
+// RUN: %HOST_RUN_PLACEHOLDER %t.out
+// RUN: %CPU_RUN_PLACEHOLDER %t.out
+// RUN: %GPU_RUN_PLACEHOLDER %t.out
+// RUN: %ACC_RUN_PLACEHOLDER %t.out
+
+#include <CL/sycl.hpp>
+#include <algorithm>
+#include <cassert>
+#include <limits>
+#include <numeric>
+using namespace sycl;
+using namespace sycl::intel;
+
+template <class BinaryOperation>
+class reduce_kernel;
+
+template <typename InputContainer, typename OutputContainer,
+          class BinaryOperation>
+void test(queue q, InputContainer input, OutputContainer output,
+          BinaryOperation binary_op,
+          typename OutputContainer::value_type identity) {
+  using InputT = typename InputContainer::value_type;
+  using OutputT = typename OutputContainer::value_type;
+  using kernel_name = reduce_kernel<BinaryOperation>;
+  const OutputT init = 42;
+  const size_t N = input.size(); // length of the pointer-range reductions
+  const size_t G = 16;           // work-group size; one group is launched
+  { // the buffers write back to `output` when this scope closes
+    buffer<InputT> src_buf(input.data(), input.size());
+    buffer<OutputT> dst_buf(output.data(), output.size());
+    // dst[0]/dst[1]: per-item values; dst[2]/dst[3]: the whole N-element range.
+    q.submit([&](handler &cgh) {
+      auto src = src_buf.template get_access<access::mode::read>(cgh);
+      auto dst = dst_buf.template get_access<access::mode::discard_write>(cgh);
+      cgh.parallel_for<kernel_name>(nd_range<1>(G, G), [=](nd_item<1> item) {
+        group<1> grp = item.get_group();
+        int local = item.get_local_id(0);
+        auto base = src.get_pointer();
+        dst[0] = reduce(grp, src[local], binary_op);
+        dst[1] = reduce(grp, src[local], init, binary_op);
+        dst[2] = reduce(grp, base, base + N, binary_op);
+        dst[3] = reduce(grp, base, base + N, init, binary_op);
+      });
+    });
+  }
+  // std::reduce is not implemented yet, so use std::accumulate instead
+  // [first, group_end) is what the work-items contribute; [first, last) is all.
+  const auto first = input.begin();
+  const auto group_end = first + G;
+  const auto last = input.end();
+  assert(output[0] == std::accumulate(first, group_end, identity, binary_op));
+  assert(output[1] == std::accumulate(first, group_end, init, binary_op));
+  assert(output[2] == std::accumulate(first, last, identity, binary_op));
+  assert(output[3] == std::accumulate(first, last, init, binary_op));
+}
+
+int main() {
+  queue q;
+  const std::string min_version("2.0");
+  if (q.get_device().get_info<info::device::version>() < min_version) {
+    std::cout << "Skipping test\n";
+    return 0;
+  }
+  constexpr int N = 32;
+  constexpr int int_max = std::numeric_limits<int>::max();
+  constexpr int int_lowest = std::numeric_limits<int>::lowest();
+  std::array<int, N> input;
+  std::array<int, 4> output; // one slot per reduce overload under test
+  std::iota(input.begin(), input.end(), 0);
+  output.fill(0);
+#if __cplusplus >= 201402L
+  test(q, input, output, plus<>(), 0);
+  test(q, input, output, minimum<>(), int_max);
+  test(q, input, output, maximum<>(), int_lowest);
+#endif
+  test(q, input, output, plus<int>(), 0);
+  test(q, input, output, minimum<int>(), int_max);
+  test(q, input, output, maximum<int>(), int_lowest);
+
+  std::cout << "Test passed." << std::endl;
+}
diff --git a/SYCL/Basic/helpers.hpp b/SYCL/Basic/helpers.hpp
new file mode 100644
index 0000000000..e5ca8f768f
--- /dev/null
+++ b/SYCL/Basic/helpers.hpp
@@ -0,0 +1,76 @@
+//==------------------- helpers.hpp -  test helpers ------------------------==//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include <CL/sycl.hpp>
+
+
+using namespace cl;
+
+template <class VecT, int EndIdx = VecT::get_count(), int StartIdx = 0>
+class VecPrinter { // printable view of elements [StartIdx, EndIdx) of a VecT
+public:
+  VecPrinter(const VecT &Vec) : MVec(Vec) {}
+  // Writes "[ e0, e1, ... ]" for the stored vector to the given stream.
+  void print(std::ostream &Out) const {
+    Out << "[ ";
+    printHelper<StartIdx>(Out, MVec);
+    Out << " ]";
+  }
+  // Convenience overload: prints Elem1 to std::cout without storing it.
+  static void print(const VecT &Elem1) {
+    std::cout << "[ ";
+    printHelper<StartIdx>(std::cout, Elem1);
+    std::cout << " ]";
+  }
+
+private:
+  // Prints element Idx to Out, then recurses; the EndIdx case ends it.
+  template <int Idx>
+  static void printHelper(std::ostream &Out, const VecT &Elem1) {
+    Out << (typename VecT::element_type)(Elem1.template swizzle<Idx>());
+    if (Idx + 1 != EndIdx)
+      Out << ", ";
+    printHelper<Idx + 1>(Out, Elem1);
+  }
+  template <>
+  static void printHelper<EndIdx>(std::ostream &Out, const VecT &Elem1) {}
+  VecT MVec;
+};
+
+template <class VecT, int EndIdx = VecT::get_count(), int StartIdx = 0>
+VecPrinter<VecT, EndIdx, StartIdx> printableVec(const VecT &Vec) {
+  return {Vec}; // built through the VecPrinter(const VecT &) constructor
+}
+
+// Stream-insertion adaptor: forwards to VecPrinter::print and returns Out.
+template <class VecT, int EndIdx, int StartIdx>
+std::ostream &operator<<(std::ostream &Out,
+                         const VecPrinter<VecT, EndIdx, StartIdx> &VecP) {
+  return VecP.print(Out), Out;
+}
+
+class TestQueue : public sycl::queue {
+  // Prints every asynchronous SYCL error to stderr, then aborts the test.
+  static void reportAndAbort(sycl::exception_list ExceptionList) {
+    for (sycl::exception_ptr_class ExceptionPtr : ExceptionList) {
+      try {
+        std::rethrow_exception(ExceptionPtr);
+      } catch (sycl::exception &E) {
+        std::cerr << E.what() << std::endl;
+      }
+    }
+    abort();
+  }
+public:
+  TestQueue(const sycl::device_selector &DevSelector,
+            const sycl::property_list &PropList = {})
+      : sycl::queue(DevSelector, &TestQueue::reportAndAbort, PropList) {}
+
+  // Waits for outstanding work on destruction via wait_and_throw().
+  ~TestQueue() { wait_and_throw(); }
+};
diff --git a/SYCL/Basic/host-interop-task/host-task-dependency.cpp b/SYCL/Basic/host-interop-task/host-task-dependency.cpp
new file mode 100644
index 0000000000..8432950101
--- /dev/null
+++ b/SYCL/Basic/host-interop-task/host-task-dependency.cpp
@@ -0,0 +1,200 @@
+// RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple %s -o %t.out %threads_lib
+// RUN: %CPU_RUN_PLACEHOLDER SYCL_PI_TRACE=-1 %t.out 2>&1 %CPU_CHECK_PLACEHOLDER
+// RUN: %GPU_RUN_PLACEHOLDER SYCL_PI_TRACE=-1 %t.out 2>&1 %GPU_CHECK_PLACEHOLDER
+// RUN: %ACC_RUN_PLACEHOLDER SYCL_PI_TRACE=-1 %t.out 2>&1 %ACC_CHECK_PLACEHOLDER
+//
+// TODO: Behaviour is unstable for level zero on Windows. Enable when fixed.
+// UNSUPPORTED: windows && level0
+// REQUIRES: cpu, gpu, accelerator
+
+#include <atomic>
+#include <condition_variable>
+#include <future>
+#include <mutex>
+#include <thread>
+
+#include <CL/sycl.hpp>
+
+namespace S = cl::sycl;
+
+struct Context {
+  std::atomic_bool Flag;
+  S::queue &Queue;
+  S::buffer<int, 1> Buf1;
+  S::buffer<int, 1> Buf2;
+  S::buffer<int, 1> Buf3;
+  std::mutex Mutex;
+  std::condition_variable CV;
+};
+
+// Thread 1: runs the kernel -> host task -> kernel chain and checks buffer 3.
+void Thread1Fn(Context *Ctx) {
+  // 0. initialize the buffers with a priori wrong results
+  {
+    S::accessor<int, 1, S::access::mode::write,
+                S::access::target::host_buffer>
+        Acc(Ctx->Buf1);
+
+    for (size_t Idx = 0; Idx < Acc.get_count(); ++Idx)
+      Acc[Idx] = -1;
+  }
+
+  {
+    S::accessor<int, 1, S::access::mode::write,
+                S::access::target::host_buffer>
+        Acc(Ctx->Buf2);
+
+    for (size_t Idx = 0; Idx < Acc.get_count(); ++Idx)
+      Acc[Idx] = -2;
+  }
+
+  {
+    S::accessor<int, 1, S::access::mode::write,
+                S::access::target::host_buffer>
+        Acc(Ctx->Buf3);
+
+    for (size_t Idx = 0; Idx < Acc.get_count(); ++Idx)
+      Acc[Idx] = -3;
+  }
+
+  // 1. submit task writing to buffer 1
+  Ctx->Queue.submit([&](S::handler &CGH) {
+    S::accessor<int, 1, S::access::mode::write,
+                S::access::target::global_buffer>
+        GeneratorAcc(Ctx->Buf1, CGH);
+
+    auto GeneratorKernel = [GeneratorAcc] {
+      for (size_t Idx = 0; Idx < GeneratorAcc.get_count(); ++Idx)
+        GeneratorAcc[Idx] = Idx;
+    };
+
+    CGH.single_task<class GeneratorTask>(GeneratorKernel);
+  });
+
+  // 2. submit host task writing from buf 1 to buf 2
+  auto HostTaskEvent = Ctx->Queue.submit([&](S::handler &CGH) {
+    S::accessor<int, 1, S::access::mode::read,
+                S::access::target::host_buffer>
+        CopierSrcAcc(Ctx->Buf1, CGH);
+    S::accessor<int, 1, S::access::mode::write,
+                S::access::target::host_buffer>
+        CopierDstAcc(Ctx->Buf2, CGH);
+
+    auto CopierHostTask = [CopierSrcAcc, CopierDstAcc, &Ctx] {
+      for (size_t Idx = 0; Idx < CopierDstAcc.get_count(); ++Idx)
+        CopierDstAcc[Idx] = CopierSrcAcc[Idx];
+
+      // Flip the flag outside of assert() so it also happens under NDEBUG.
+      bool Expected = false;
+      if (!Ctx->Flag.compare_exchange_strong(Expected, true))
+        assert(false && "Flag must be flipped exactly once");
+
+      {
+        std::lock_guard<std::mutex> Lock(Ctx->Mutex);
+        Ctx->CV.notify_all();
+      }
+    };
+
+    CGH.codeplay_host_task(CopierHostTask);
+  });
+
+  // 3. submit simple task to move data between two buffers
+  Ctx->Queue.submit([&](S::handler &CGH) {
+    S::accessor<int, 1, S::access::mode::read,
+                S::access::target::global_buffer>
+        SrcAcc(Ctx->Buf2, CGH);
+    S::accessor<int, 1, S::access::mode::write,
+                S::access::target::global_buffer>
+        DstAcc(Ctx->Buf3, CGH);
+
+    CGH.depends_on(HostTaskEvent);
+
+    auto CopierKernel = [SrcAcc, DstAcc] {
+      for (size_t Idx = 0; Idx < DstAcc.get_count(); ++Idx)
+        DstAcc[Idx] = SrcAcc[Idx];
+    };
+
+    CGH.single_task<class CopierTask>(CopierKernel);
+  });
+
+  // 4. check data in buffer #3
+  {
+    S::accessor<int, 1, S::access::mode::read,
+                S::access::target::host_buffer>
+        Acc(Ctx->Buf3);
+
+    bool Failure = false;
+    for (size_t Idx = 0; Idx < Acc.get_count(); ++Idx) {
+      fprintf(stderr, "Third buffer [%3zu] = %i\n", Idx, Acc[Idx]);
+      Failure |= (Acc[Idx] != Idx);
+    }
+
+    assert(!Failure && "Invalid data in third buffer");
+  }
+}
+
+void Thread2Fn(Context *Ctx) {
+  std::unique_lock<std::mutex> Lock(Ctx->Mutex);
+
+  // T2.1. Wait until flag F is set eq true.
+  Ctx->CV.wait(Lock, [Ctx] { return Ctx->Flag.load(); });
+
+  assert(Ctx->Flag.load());
+}
+
+// Runs both threads against one shared Context, then validates buffer 2.
+void test() {
+  auto EH = [](S::exception_list EL) {
+    // Rethrow the stored exception; `throw E` would throw the handle itself.
+    for (const std::exception_ptr &E : EL)
+      std::rethrow_exception(E);
+  };
+
+  S::queue Queue(EH);
+
+  Context Ctx{{false}, Queue, {10}, {10}, {10}, {}, {}};
+
+  // 0. setup: T1 submits the tasks, T2 waits on the flag; flag F starts false
+  auto A1 = std::async(std::launch::async, Thread1Fn, &Ctx);
+  auto A2 = std::async(std::launch::async, Thread2Fn, &Ctx);
+
+  A1.get();
+  A2.get();
+
+  assert(Ctx.Flag.load());
+
+  // 3. check via host accessor that buf 2 contains valid data
+  {
+    S::accessor<int, 1, S::access::mode::read,
+                S::access::target::host_buffer>
+        ResultAcc(Ctx.Buf2);
+
+    bool Failure = false;
+    for (size_t Idx = 0; Idx < ResultAcc.get_count(); ++Idx) {
+      fprintf(stderr, "Second buffer [%3zu] = %i\n", Idx, ResultAcc[Idx]);
+      Failure |= (ResultAcc[Idx] != Idx);
+    }
+
+    assert(!Failure && "Invalid data in result buffer");
+  }
+}
+
+int main() {
+  test();
+
+  return 0;
+}
+
+// launch of GeneratorTask kernel
+// CHECK:---> piKernelCreate(
+// CHECK: GeneratorTask
+// CHECK:---> piEnqueueKernelLaunch(
+// prepare for host task
+// CHECK:---> piEnqueueMemBufferMap(
+// launch of CopierTask kernel
+// CHECK:---> piKernelCreate(
+// CHECK: CopierTask
+// CHECK:---> piEnqueueKernelLaunch(
+// TODO need to check for piEventsWait as "wait on dependencies of host task".
+// At the same time this piEventsWait may occur anywhere after
+// piEnqueueMemBufferMap ("prepare for host task").
diff --git a/SYCL/Basic/host-interop-task/host-task-two-queues.cpp b/SYCL/Basic/host-interop-task/host-task-two-queues.cpp
new file mode 100644
index 0000000000..08c1aa313e
--- /dev/null
+++ b/SYCL/Basic/host-interop-task/host-task-two-queues.cpp
@@ -0,0 +1,82 @@
+// RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple %s -o %t.out
+// RUN: %CPU_RUN_PLACEHOLDER %t.out
+// RUN: %GPU_RUN_PLACEHOLDER %t.out
+// RUN: %ACC_RUN_PLACEHOLDER %t.out
+//
+// TODO: Flaky failures on Level Zero; temporarily marked as unsupported.
+// UNSUPPORTED: level0, opencl
+// REQUIRES: cpu, gpu, accelerator
+
+#include <CL/sycl.hpp>
+#include <vector>
+
+namespace S = cl::sycl;
+
+#define WIDTH 5
+#define HEIGHT 5
+
+// Interleaves a device kernel and a host task on two queues sharing buffers.
+void test() {
+  auto EH = [](S::exception_list EL) {
+    // Rethrow the stored exception; `throw E` would throw the handle itself.
+    for (const std::exception_ptr &E : EL)
+      std::rethrow_exception(E);
+  };
+
+  S::queue Q1(EH);
+  S::queue Q2(EH);
+
+  std::vector<int> DataA(WIDTH * HEIGHT, 2);
+  std::vector<int> DataB(WIDTH * HEIGHT, 3);
+  std::vector<int> DataC(WIDTH * HEIGHT, 1);
+
+  S::buffer<int, 2> BufA{DataA.data(), S::range<2>{WIDTH, HEIGHT}};
+  S::buffer<int, 2> BufB{DataB.data(), S::range<2>{WIDTH, HEIGHT}};
+  S::buffer<int, 2> BufC{DataC.data(), S::range<2>{WIDTH, HEIGHT}};
+
+  auto CG1 = [&](S::handler &CGH) {
+    auto AccA = BufA.get_access<S::access::mode::read>(CGH);
+    auto AccB = BufB.get_access<S::access::mode::read>(CGH);
+    auto AccC = BufC.get_access<S::access::mode::read_write>(CGH);
+    auto Kernel = [=](S::nd_item<2> Item) {
+      size_t W = Item.get_global_id(0);
+      size_t H = Item.get_global_id(1);
+      AccC[W][H] += AccA[W][H] * AccB[W][H];
+    };
+    CGH.parallel_for<class K1>(S::nd_range<2>({WIDTH, HEIGHT}, {1, 1}), Kernel);
+  };
+
+  auto CG2 = [&](S::handler &CGH) {
+    auto AccA = BufA.get_access<S::access::mode::read>(CGH);
+    auto AccB = BufB.get_access<S::access::mode::read>(CGH);
+    auto AccC = BufC.get_access<S::access::mode::read_write>(CGH);
+    CGH.codeplay_host_task([=] {
+      for (size_t I = 0; I < WIDTH; ++I)
+        for (size_t J = 0; J < HEIGHT; ++J)
+          std::cout << "C[" << I << "][" << J << "] = " << AccC[I][J]
+                    << std::endl;
+    });
+  };
+
+  static const size_t NTIMES = 4;
+  for (size_t Idx = 0; Idx < NTIMES; ++Idx) {
+    Q1.submit(CG1);
+    Q2.submit(CG2);
+    Q2.submit(CG1);
+    Q1.submit(CG2);
+  }
+
+  Q1.wait_and_throw();
+  Q2.wait_and_throw();
+
+  // BufC owns DataC's memory while it is alive, so read via a host accessor.
+  auto ResC = BufC.get_access<S::access::mode::read>();
+  for (size_t I = 0; I < WIDTH; ++I)
+    for (size_t J = 0; J < HEIGHT; ++J)
+      assert(ResC[I][J] == (1 + 2 * 3 * NTIMES * 2));
+}
+
+int main(void) {
+  test();
+  return 0;
+}
diff --git a/SYCL/Basic/lit.cfg.py b/SYCL/Basic/lit.cfg.py
new file mode 100644
index 0000000000..968b64f77b
--- /dev/null
+++ b/SYCL/Basic/lit.cfg.py
@@ -0,0 +1,210 @@
+# -*- Python -*-
+
+import os
+import platform
+import re
+import subprocess
+import tempfile
+from distutils.spawn import find_executable
+
+import lit.formats
+import lit.util
+
+from lit.llvm import llvm_config
+
+# Configuration file for the 'lit' test runner.
+
+# name: The name of this test suite.
+config.name = 'SYCL'
+
+# testFormat: The test format to use to interpret tests.
+#
+# For now we require '&&' between commands, until they get globally killed and
+# the test runner updated.
+config.test_format = lit.formats.ShTest()
+
+# suffixes: A list of file extensions to treat as test files.
+config.suffixes = ['.c', '.cpp'] #add .spv. Currently not clear what to do with those
+
+config.excludes = ['Inputs']
+
+# test_source_root: The root path where tests are located.
+config.test_source_root = os.path.dirname(__file__)
+
+# test_exec_root: The root path where tests should be run.
+config.test_exec_root = os.path.join(config.sycl_obj_root, 'test')
+
+# Propagate some variables from the host environment.
+llvm_config.with_system_environment(['PATH', 'OCL_ICD_FILENAME', 'SYCL_DEVICE_ALLOWLIST', 'SYCL_CONFIG_FILE_NAME'])
+
+config.substitutions.append( ('%clang_cc1', ' ' +  config.dpcpp_compiler + ' -cc1 ') )
+config.substitutions.append( ('%clangxx', ' ' + config.dpcpp_compiler) )
+config.substitutions.append( ('%clang_cl', ' ' + config.dpcpp_compiler) )
+config.substitutions.append( ('%clang', ' ' + config.dpcpp_compiler) )
+config.substitutions.append( ('%threads_lib', config.sycl_threads_lib) )
+
+llvm_config.with_environment('PATH', config.lit_tools_dir, append_path=True)
+
+# Configure LD_LIBRARY_PATH or corresponding os-specific alternatives
+if platform.system() == "Linux":
+    config.available_features.add('linux')
+    llvm_config.with_system_environment('LD_LIBRARY_PATH')
+    llvm_config.with_environment('LD_LIBRARY_PATH', config.sycl_libs_dir, append_path=True)
+
+elif platform.system() == "Windows":
+    config.available_features.add('windows')
+    llvm_config.with_system_environment('LIB')
+    llvm_config.with_environment('LIB', config.sycl_libs_dir, append_path=True)
+
+elif platform.system() == "Darwin":
+    # FIXME: surely there is a more elegant way to instantiate the Xcode directories.
+    llvm_config.with_system_environment('CPATH')
+    llvm_config.with_environment('CPATH', "/Applications/Xcode.app/Contents/Developer/Toolchains/XcodeDefault.xctoolchain/usr/include/c++/v1", append_path=True)
+    llvm_config.with_environment('CPATH', "/Applications/Xcode.app/Contents/Developer/Platforms/MacOSX.platform/Developer/SDKs/MacOSX.sdk/usr/include/", append_path=True)
+    llvm_config.with_environment('DYLD_LIBRARY_PATH', config.sycl_libs_dir)
+
+llvm_config.with_environment('PATH', config.sycl_tools_dir, append_path=True)
+
+config.substitutions.append( ('%sycl_libs_dir',  config.sycl_libs_dir ) )
+config.substitutions.append( ('%sycl_include',  config.sycl_include ) )
+#config.substitutions.append( ('%sycl_source_dir', config.sycl_source_dir) )
+config.substitutions.append( ('%opencl_libs_dir',  config.opencl_libs_dir) )
+config.substitutions.append( ('%opencl_include_dir',  config.opencl_include_dir) )
+#config.substitutions.append( ('%cuda_toolkit_include',  config.cuda_toolkit_include) )
+
+llvm_config.use_clang()
+
+llvm_config.add_tool_substitutions(['llvm-spirv'], [config.sycl_tools_dir])
+
+if not config.sycl_be:
+    config.sycl_be='PI_OPENCL'
+
+config.substitutions.append( ('%sycl_be', config.sycl_be) )
+lit_config.note("Backend: {BACKEND}".format(BACKEND=config.sycl_be))
+
+if config.dump_ir_supported:
+   config.available_features.add('dump_ir')
+
+cuda = False
+if ( config.sycl_be == "PI_OPENCL" and (
+        'cpu' in config.target_devices.split(',') or
+        'gpu' in config.target_devices.split(',') or
+        'acc' in config.target_devices.split(','))):
+    config.available_features.add('opencl')
+elif ( config.sycl_be == "PI_CUDA" ):
+    config.available_features.add('cuda')
+    cuda = True
+elif ( config.sycl_be == "PI_LEVEL0" ):
+    config.available_features.add('level0')
+
+# Configure device-specific substitutions based on availability of corresponding
+# devices/runtimes
+
+found_at_least_one_device = False
+
+host_run_substitute = "true"
+host_run_on_linux_substitute = "true "
+host_check_substitute = ""
+host_check_on_linux_substitute = ""
+
+if 'host' in config.target_devices.split(','):
+    found_at_least_one_device = True
+    lit_config.note("Test HOST device")
+    host_run_substitute = "env SYCL_DEVICE_TYPE=HOST SYCL_BE={SYCL_BE} ".format(SYCL_BE=config.sycl_be)
+    host_check_substitute = "| FileCheck %s"
+    config.available_features.add('host')
+    if platform.system() == "Linux":
+        host_run_on_linux_substitute = "env SYCL_DEVICE_TYPE=HOST SYCL_BE={SYCL_BE} ".format(SYCL_BE=config.sycl_be)
+        host_check_on_linux_substitute = "| FileCheck %s"
+else:
+    lit_config.warning("HOST device not used")
+
+config.substitutions.append( ('%HOST_RUN_PLACEHOLDER',  host_run_substitute) )
+config.substitutions.append( ('%HOST_RUN_ON_LINUX_PLACEHOLDER',  host_run_on_linux_substitute) )
+config.substitutions.append( ('%HOST_CHECK_PLACEHOLDER',  host_check_substitute) )
+config.substitutions.append( ('%HOST_CHECK_ON_LINUX_PLACEHOLDER',  host_check_on_linux_substitute) )
+
+cpu_run_substitute = "true"
+cpu_run_on_linux_substitute = "true "
+cpu_check_substitute = ""
+cpu_check_on_linux_substitute = ""
+
+if 'cpu' in config.target_devices.split(','):
+    found_at_least_one_device = True
+    lit_config.note("Test CPU device")
+    cpu_run_substitute = "env SYCL_DEVICE_TYPE=CPU SYCL_BE={SYCL_BE} ".format(SYCL_BE=config.sycl_be)
+    cpu_check_substitute = "| FileCheck %s"
+    config.available_features.add('cpu')
+    if platform.system() == "Linux":
+        cpu_run_on_linux_substitute = "env SYCL_DEVICE_TYPE=CPU SYCL_BE={SYCL_BE} ".format(SYCL_BE=config.sycl_be)
+        cpu_check_on_linux_substitute = "| FileCheck %s"
+else:
+    lit_config.warning("CPU device not used")
+
+config.substitutions.append( ('%CPU_RUN_PLACEHOLDER',  cpu_run_substitute) )
+config.substitutions.append( ('%CPU_RUN_ON_LINUX_PLACEHOLDER',  cpu_run_on_linux_substitute) )
+config.substitutions.append( ('%CPU_CHECK_PLACEHOLDER',  cpu_check_substitute) )
+config.substitutions.append( ('%CPU_CHECK_ON_LINUX_PLACEHOLDER',  cpu_check_on_linux_substitute) )
+
+gpu_run_substitute = "true"
+gpu_run_on_linux_substitute = "true "
+gpu_check_substitute = ""
+gpu_check_on_linux_substitute = ""
+
+if 'gpu' in config.target_devices.split(','):
+    found_at_least_one_device = True
+    lit_config.note("Test GPU device")
+    gpu_run_substitute = " env SYCL_DEVICE_TYPE=GPU SYCL_BE={SYCL_BE} ".format(SYCL_BE=config.sycl_be)
+    gpu_check_substitute = "| FileCheck %s"
+    config.available_features.add('gpu')
+
+    if platform.system() == "Linux":
+        gpu_run_on_linux_substitute = "env SYCL_DEVICE_TYPE=GPU SYCL_BE={SYCL_BE} ".format(SYCL_BE=config.sycl_be)
+        gpu_check_on_linux_substitute = "| FileCheck %s"
+else:
+    lit_config.warning("GPU device not used")
+
+config.substitutions.append( ('%GPU_RUN_PLACEHOLDER',  gpu_run_substitute) )
+config.substitutions.append( ('%GPU_RUN_ON_LINUX_PLACEHOLDER',  gpu_run_on_linux_substitute) )
+config.substitutions.append( ('%GPU_CHECK_PLACEHOLDER',  gpu_check_substitute) )
+config.substitutions.append( ('%GPU_CHECK_ON_LINUX_PLACEHOLDER',  gpu_check_on_linux_substitute) )
+
+acc_run_substitute = "true"
+acc_check_substitute = ""
+if 'acc' in config.target_devices.split(','):
+    found_at_least_one_device = True
+    lit_config.note("Tests accelerator device")
+    acc_run_substitute = " env SYCL_DEVICE_TYPE=ACC "
+    acc_check_substitute = "| FileCheck %s"
+    config.available_features.add('accelerator')
+else:
+    lit_config.warning("Accelerator device not used")
+config.substitutions.append( ('%ACC_RUN_PLACEHOLDER',  acc_run_substitute) )
+config.substitutions.append( ('%ACC_CHECK_PLACEHOLDER',  acc_check_substitute) )
+
+if cuda:
+    config.substitutions.append( ('%sycl_triple',  "nvptx64-nvidia-cuda-sycldevice" ) )
+else:
+    config.substitutions.append( ('%sycl_triple',  "spir64-unknown-linux-sycldevice" ) )
+
+if find_executable('sycl-ls'):
+    config.available_features.add('sycl-ls')
+
+# Device AOT compilation tools aren't part of the SYCL project,
+# so they need to be pre-installed on the machine
+aot_tools = ["ocloc", "aoc", "opencl-aot"]
+
+for aot_tool in aot_tools:
+    if find_executable(aot_tool) is not None:
+        lit_config.note("Found pre-installed AOT device compiler " + aot_tool)
+        config.available_features.add(aot_tool)
+    else:
+        lit_config.warning("Couldn't find pre-installed AOT device compiler " + aot_tool)
+
+# Limit each test to 1 minute; the timeout is only set when psutil is available
+try:
+    import psutil
+    lit_config.maxIndividualTestTime = 60
+except ImportError:
+    pass
+
diff --git a/SYCL/Basic/lit.site.cfg.py.in b/SYCL/Basic/lit.site.cfg.py.in
new file mode 100644
index 0000000000..e93c4e7386
--- /dev/null
+++ b/SYCL/Basic/lit.site.cfg.py.in
@@ -0,0 +1,29 @@
+@LIT_SITE_CFG_IN_HEADER@
+
+import sys
+import platform
+
+dpcpp_root_dir=os.path.dirname(os.path.dirname("@CMAKE_CXX_COMPILER@"))
+
+config.llvm_tools_dir = os.path.join(dpcpp_root_dir, 'bin')
+config.lit_tools_dir = os.path.dirname("@TEST_SUITE_LIT@")
+config.dump_ir_supported = "@DUMP_IR_SUPPORTED@" if "@DUMP_IR_SUPPORTED@" else False
+config.sycl_tools_dir = config.llvm_tools_dir
+config.sycl_include = os.path.join(dpcpp_root_dir, 'include', 'sycl')
+config.sycl_obj_root = "@CMAKE_CURRENT_BINARY_DIR@"
+#config.sycl_source_dir = "@SYCL_SOURCE_DIR@/source"
+config.sycl_libs_dir =  os.path.join(dpcpp_root_dir, ('bin' if platform.system() == "Windows" else 'lib'))
+config.target_triple = "x86_64-unknown-unknown-gnu"
+config.host_triple = "x86_64-unknown-unknown-gnu" 
+config.opencl_libs_dir = config.sycl_libs_dir
+config.opencl_include_dir = config.sycl_include
+config.target_devices = lit_config.params.get("target_devices", "@SYCL_TARGET_DEVICES@")
+config.sycl_be = lit_config.params.get("sycl_be", "@SYCL_BE@")
+config.sycl_threads_lib = '@SYCL_THREADS_LIB@'
+
+config.dpcpp_compiler = "@CMAKE_CXX_COMPILER@"
+
+import lit.llvm
+lit.llvm.initialize(lit_config, config)
+
+lit_config.load_config(config, "@CMAKE_CURRENT_SOURCE_DIR@/lit.cfg.py")
diff --git a/SYCL/Basic/spec_const/spec_const_hw.cpp b/SYCL/Basic/spec_const/spec_const_hw.cpp
new file mode 100644
index 0000000000..251f862b40
--- /dev/null
+++ b/SYCL/Basic/spec_const/spec_const_hw.cpp
@@ -0,0 +1,121 @@
+// RUN: %clangxx -fsycl %s -o %t.out
+// RUN: %HOST_RUN_PLACEHOLDER %t.out
+// RUN: %CPU_RUN_PLACEHOLDER %t.out
+// RUN: %GPU_RUN_PLACEHOLDER %t.out
+// RUN: %ACC_RUN_PLACEHOLDER %t.out
+// UNSUPPORTED: cuda || level0
+//
+//==----------- spec_const_hw.cpp ------------------------------------------==//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+// The test checks that the specialization constant feature works correctly -
+// tool chain processes them correctly and runtime can correctly execute the
+// program.
+
+#include <CL/sycl.hpp>
+
+#include <iostream>
+#include <vector>
+
+class MyInt32Const;
+class MyFloatConst;
+
+using namespace sycl;
+
+class KernelAAAi;
+class KernelBBBf;
+
+int val = 10;
+
+// Fetch a value at runtime.
+int get_value() { return val; }
+
+float foo(
+    const cl::sycl::experimental::spec_constant<float, MyFloatConst> &f32) {
+  return f32;
+}
+
+// Sets an int and a float spec constant and checks kernels read them back.
+int main(int argc, char **argv) {
+  val = argc + 16;
+
+  cl::sycl::queue q(default_selector{}, [](exception_list l) {
+    for (auto ep : l) {
+      try {
+        std::rethrow_exception(ep);
+      } catch (cl::sycl::exception &e0) {
+        std::cout << e0.what();
+      } catch (std::exception &e1) {
+        std::cout << e1.what();
+      } catch (...) {
+        std::cout << "*** catch (...)\n";
+      }
+    }
+  });
+
+  std::cout << "Running on " << q.get_device().get_info<info::device::name>()
+            << "\n";
+  std::cout << "val = " << val << "\n";
+  cl::sycl::program program1(q.get_context());
+  cl::sycl::program program2(q.get_context());
+
+  int goldi = (int)get_value();
+  // TODO make this floating point once supported by the compiler
+  float goldf = (float)get_value();
+
+  cl::sycl::experimental::spec_constant<int32_t, MyInt32Const> i32 =
+      program1.set_spec_constant<MyInt32Const>(goldi);
+
+  cl::sycl::experimental::spec_constant<float, MyFloatConst> f32 =
+      program2.set_spec_constant<MyFloatConst>(goldf);
+
+  program1.build_with_kernel_type<KernelAAAi>();
+  // Use an option (does not matter which exactly) to test different internal
+  // SYCL RT execution path
+  program2.build_with_kernel_type<KernelBBBf>("-cl-fast-relaxed-math");
+
+  std::vector<int> veci(1);
+  std::vector<float> vecf(1);
+  try {
+    cl::sycl::buffer<int, 1> bufi(veci.data(), veci.size());
+    cl::sycl::buffer<float, 1> buff(vecf.data(), vecf.size());
+
+    q.submit([&](cl::sycl::handler &cgh) {
+      auto acci = bufi.get_access<cl::sycl::access::mode::write>(cgh);
+      cgh.single_task<KernelAAAi>(
+          program1.get_kernel<KernelAAAi>(),
+          [=]() {
+            acci[0] = i32.get();
+          });
+    });
+    q.submit([&](cl::sycl::handler &cgh) {
+      auto accf = buff.get_access<cl::sycl::access::mode::write>(cgh);
+      cgh.single_task<KernelBBBf>(
+          program2.get_kernel<KernelBBBf>(),
+          [=]() {
+            accf[0] = foo(f32);
+          });
+    });
+  } catch (cl::sycl::exception &e) {
+    std::cout << "*** Exception caught: " << e.what() << "\n";
+    return 1;
+  }
+  bool passed = true;
+  int vali = veci[0];
+  if (vali != goldi) {
+    std::cout << "*** ERROR: " << vali << " != " << goldi << "(gold)\n";
+    passed = false;
+  }
+  // Compare as float: storing the result in an int would truncate it first.
+  float valf = vecf[0];
+  if (valf != goldf) {
+    std::cout << "*** ERROR: " << valf << " != " << goldf << "(gold)\n";
+    passed = false;
+  }
+  std::cout << (passed ? "passed\n" : "FAILED\n");
+  return passed ? 0 : 1;
+}
diff --git a/SYCL/Basic/spec_const/spec_const_redefine.cpp b/SYCL/Basic/spec_const/spec_const_redefine.cpp
new file mode 100644
index 0000000000..075b33c70c
--- /dev/null
+++ b/SYCL/Basic/spec_const/spec_const_redefine.cpp
@@ -0,0 +1,112 @@
+// RUN: %clangxx -fsycl %s -o %t.out
+// RUN: env SYCL_PI_TRACE=2 %CPU_RUN_PLACEHOLDER %t.out 2>&1 %CPU_CHECK_PLACEHOLDER
+// RUN: env SYCL_PI_TRACE=2 %GPU_RUN_PLACEHOLDER %t.out 2>&1 %GPU_CHECK_PLACEHOLDER
+// RUN: env SYCL_PI_TRACE=2 %ACC_RUN_PLACEHOLDER %t.out 2>&1 %ACC_CHECK_PLACEHOLDER
+// UNSUPPORTED: cuda || level0 || host || accelerator
+//
+//==----------- spec_const_redefine.cpp ------------------------------------==//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+// The test checks that:
+// - a specialization constant can be redefined and correct new value is used
+//   after redefinition.
+// - the program is JITted only once per a unique set of specialization
+//   constants values.
+
+#include <CL/sycl.hpp>
+
+#include <iostream>
+#include <vector>
+
+class SC0;
+class SC1;
+class KernelAAA;
+
+using namespace sycl;
+
+int val = 0;
+
+// Fetch a value at runtime.
+int get_value() { return val; }
+
+int main(int argc, char **argv) {
+  val = argc;
+
+  cl::sycl::queue q(default_selector{}, [](exception_list l) {
+    for (auto ep : l) {
+      try {
+        std::rethrow_exception(ep);
+      } catch (cl::sycl::exception &e0) {
+        std::cout << e0.what();
+      } catch (std::exception &e1) {
+        std::cout << e1.what();
+      } catch (...) {
+        std::cout << "*** catch (...)\n";
+      }
+    }
+  });
+
+  std::cout << "Running on " << q.get_device().get_info<info::device::name>()
+            << "\n";
+  bool passed = true;
+  int x = get_value();
+
+  const int sc_vals[][2] = {
+      {1 + x, 2 + x},
+      {2 + x, 3 + x},
+      {1 + x, 2 + x}, // same as first - program in cache must be used
+      {2 + x, 3 + x}  // same as second - program in cache must be used
+  };
+  constexpr int n_sc_sets = sizeof(sc_vals) / sizeof(sc_vals[0]);
+  std::vector<int> vec(n_sc_sets);
+
+  for (int i = 0; i < n_sc_sets; i++) {
+    cl::sycl::program program(q.get_context());
+    const int *sc_set = &sc_vals[i][0];
+    cl::sycl::experimental::spec_constant<int32_t, SC0> sc0 =
+        program.set_spec_constant<SC0>(sc_set[0]);
+    cl::sycl::experimental::spec_constant<int32_t, SC1> sc1 =
+        program.set_spec_constant<SC1>(sc_set[1]);
+
+    program.build_with_kernel_type<KernelAAA>();
+
+    try {
+      cl::sycl::buffer<int, 1> buf(vec.data(), vec.size());
+
+      q.submit([&](cl::sycl::handler &cgh) {
+        auto acc = buf.get_access<cl::sycl::access::mode::write>(cgh);
+        cgh.single_task<KernelAAA>(
+            program.get_kernel<KernelAAA>(),
+            [=]() {
+              acc[i] = sc0.get() + sc1.get();
+            });
+      });
+    } catch (cl::sycl::exception &e) {
+      std::cout << "*** Exception caught: " << e.what() << "\n";
+      return 1;
+    }
+    int val = vec[i];
+    int gold = sc_set[0] + sc_set[1];
+
+    std::cout << "val = " << val << " gold = " << gold << "\n";
+
+    if (val != gold) {
+      std::cout << "*** ERROR[" << i << "]: " << val << " != " << gold << "(gold)\n";
+      passed = false;
+    }
+  }
+  std::cout << (passed ? "passed\n" : "FAILED\n");
+  return passed ? 0 : 1;
+}
+
+// --- Check that only two JIT compilation happened:
+// CHECK-NOT: ---> piProgramLink
+// CHECK: ---> piProgramLink
+// CHECK: ---> piProgramLink
+// CHECK-NOT: ---> piProgramLink
+// --- Check that the test completed with expected results:
+// CHECK: passed
diff --git a/SYCL/Basic/struct_param/non-standard-layout.cpp b/SYCL/Basic/struct_param/non-standard-layout.cpp
new file mode 100644
index 0000000000..f5db9cc0f8
--- /dev/null
+++ b/SYCL/Basic/struct_param/non-standard-layout.cpp
@@ -0,0 +1,45 @@
+// RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple %s -o %t.out
+// RUN: %HOST_RUN_PLACEHOLDER %t.out
+// RUN: %CPU_RUN_PLACEHOLDER %t.out
+// RUN: %GPU_RUN_PLACEHOLDER %t.out
+// RUN: %ACC_RUN_PLACEHOLDER %t.out
+
+#include <CL/sycl.hpp>
+
+using namespace cl::sycl;
+
+struct F1 {};
+struct F2 {};
+struct F : F1, F2 {
+  cl::sycl::cl_char x;
+};
+
+bool test0() {
+  F S;
+  S.x = 0;
+  F S0;
+  S0.x = 1;
+  {
+    buffer<F, 1> Buf(&S0, range<1>(1));
+    queue myQueue;
+    myQueue.submit([&](handler &cgh) {
+      auto B = Buf.get_access<access::mode::write>(cgh);
+      cgh.single_task<class NonStandard>([=] { B[0] = S; });
+    });
+  }
+  bool Passed = (S.x == S0.x);
+
+  if (!Passed) {
+    std::cout << "test0 failed" << std::endl;
+  }
+  return Passed;
+}
+
+int main() {
+
+  bool Pass = test0();
+
+  std::cout << "Test " << (Pass ? "passed" : "FAILED") << std::endl;
+  return Pass ? 0 : 1;
+
+}
diff --git a/SYCL/Basic/struct_param/struct_kernel_param.cpp b/SYCL/Basic/struct_param/struct_kernel_param.cpp
new file mode 100644
index 0000000000..9ffe4724ce
--- /dev/null
+++ b/SYCL/Basic/struct_param/struct_kernel_param.cpp
@@ -0,0 +1,137 @@
+// RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple %s -o %t.out
+// RUN: %HOST_RUN_PLACEHOLDER %t.out
+// RUN: %CPU_RUN_PLACEHOLDER %t.out
+// RUN: %GPU_RUN_PLACEHOLDER %t.out
+// RUN: %ACC_RUN_PLACEHOLDER %t.out
+
+//==-struct_kernel_param.cpp-Checks passing structs as kernel params--------==//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include <CL/sycl.hpp>
+#include <algorithm>
+#include <cstring>
+#include <iostream>
+#include <iterator>
+
+using namespace cl::sycl;
+
+struct MyNestedStruct {
+  bool operator==(const MyNestedStruct &Rhs) {
+    return (FldArr[0] == Rhs.FldArr[0] && FldFloat == Rhs.FldFloat);
+  }
+  cl::sycl::cl_char FldArr[1];
+  cl::sycl::cl_float FldFloat;
+};
+
+struct MyStruct {
+  bool operator==(const MyStruct &Rhs) {
+    return (FldChar == Rhs.FldChar && FldLong == Rhs.FldLong &&
+            FldShort == Rhs.FldShort && FldUint == Rhs.FldUint &&
+            FldStruct == Rhs.FldStruct &&
+            std::equal(std::begin(FldArr), std::end(FldArr),
+                       std::begin(Rhs.FldArr)) &&
+            FldInt == Rhs.FldInt);
+  }
+  cl::sycl::cl_char FldChar;
+  cl::sycl::cl_long FldLong;
+  cl::sycl::cl_short FldShort;
+  cl::sycl::cl_uint FldUint;
+  MyNestedStruct FldStruct;
+  cl::sycl::cl_short FldArr[3];
+  cl::sycl::cl_int FldInt;
+};
+
+MyStruct GlobS;
+
+static void printStruct(const MyStruct &S0) {
+  std::cout << "{ " << (int)S0.FldChar << ", " << S0.FldLong << ", "
+            << S0.FldShort << ", " << S0.FldUint << " { { "
+            << (int)S0.FldStruct.FldArr[0] << " }, " << S0.FldStruct.FldFloat
+            << " }, { " << S0.FldArr[0] << ", " << S0.FldArr[1] << ", "
+            << S0.FldArr[2] << " }, " << S0.FldInt << " }";
+}
+
+bool test0() {
+  MyStruct S = GlobS;
+  MyStruct S0 = {0};
+  {
+    buffer<MyStruct, 1> Buf(&S0, range<1>(1));
+    queue myQueue;
+    myQueue.submit([&](handler &cgh) {
+      auto B = Buf.get_access<access::mode::write>(cgh);
+      cgh.single_task<class MyKernel>([=] { B[0] = S; });
+    });
+  }
+  bool Passed = (S == S0);
+
+  if (!Passed) {
+    std::cout << "test0 failed" << std::endl;
+    std::cout << "test0 input:" << std::endl;
+    printStruct(S);
+    std::cout << std::endl;
+    std::cout << "test0 result:\n";
+    printStruct(S0);
+    std::cout << std::endl;
+  }
+  return Passed;
+}
+
+bool test1() {
+  range<3> ice(8, 9, 10);
+  uint ice2 = 888;
+  uint result[4] = {0};
+
+  {
+    buffer<unsigned int, 1> Buffer((unsigned int *)result, range<1>(4));
+    queue myQueue;
+    myQueue.submit([&](handler &cgh) {
+      auto B = Buffer.get_access<access::mode::write>(cgh);
+      cgh.parallel_for<class bufferByRange_cap>(range<1>{4}, [=](id<1> index) {
+        B[index.get(0)] = index.get(0) > 2 ? ice2 : ice.get(index.get(0));
+      });
+    });
+  }
+
+  bool Passed = true;
+
+  for (unsigned long i = 0; i < 4; ++i) {
+    if (i <= 2) {
+      if (result[i] != ice[i])
+        Passed = false;
+    } else {
+      if (result[i] != ice2)
+        Passed = false;
+    }
+  }
+  if (!Passed)
+    std::cout << "test1 failed" << std::endl;
+
+  return Passed;
+}
+
+int main(int argc, char **argv) {
+  cl::sycl::cl_char PartChar = argc;
+  cl::sycl::cl_short PartShort = argc << 8;
+  cl::sycl::cl_int PartInt = argc << 16;
+  cl::sycl::cl_uint PartUint = argc << 16;
+  cl::sycl::cl_long PartLong = ((cl::sycl::cl_long)argc) << 32;
+  cl::sycl::cl_float PartFloat = argc;
+
+  GlobS = {PartChar,
+           PartLong,
+           PartShort,
+           PartUint,
+           {{PartChar}, PartFloat},
+           {PartShort, PartShort, PartShort},
+           PartInt};
+
+  bool Pass = test0() & test1();
+
+  std::cout << "Test " << (Pass ? "passed" : "FAILED") << std::endl;
+  return Pass ? 0 : 1;
+}
diff --git a/SYCL/Basic/sub_group/attributes.cpp b/SYCL/Basic/sub_group/attributes.cpp
new file mode 100644
index 0000000000..ac7e655532
--- /dev/null
+++ b/SYCL/Basic/sub_group/attributes.cpp
@@ -0,0 +1,125 @@
+// UNSUPPORTED: cuda
+// CUDA compilation and runtime do not yet support sub-groups.
+//
+// RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple %s -o %t.out
+// RUN: %HOST_RUN_PLACEHOLDER %t.out
+// RUN: %CPU_RUN_PLACEHOLDER %t.out
+// RUN: %GPU_RUN_PLACEHOLDER %t.out
+// RUN: %ACC_RUN_PLACEHOLDER %t.out
+//==------- attributes.cpp - SYCL sub_group attributes test ----*- C++ -*---==//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "helper.hpp"
+
+#include <CL/sycl.hpp>
+
+#define KERNEL_FUNCTOR_WITH_SIZE(SIZE)                                         \
+  class KernelFunctor##SIZE {                                                  \
+  public:                                                                      \
+    [[cl::intel_reqd_sub_group_size(SIZE)]] void                               \
+    operator()(cl::sycl::nd_item<1> Item) {                                    \
+      const auto GID = Item.get_global_id();                                   \
+    }                                                                          \
+  };
+
+KERNEL_FUNCTOR_WITH_SIZE(1);
+KERNEL_FUNCTOR_WITH_SIZE(2);
+KERNEL_FUNCTOR_WITH_SIZE(4);
+KERNEL_FUNCTOR_WITH_SIZE(8);
+KERNEL_FUNCTOR_WITH_SIZE(16);
+
+#undef KERNEL_FUNCTOR_WITH_SIZE
+
+inline uint32_t flp2(uint32_t X) {
+  X = X | (X >> 1);
+  X = X | (X >> 2);
+  X = X | (X >> 4);
+  X = X | (X >> 8);
+  X = X | (X >> 16);
+  return X - (X >> 1);
+}
+
+template <typename Fn> inline void submit(cl::sycl::queue &Q) {
+  Q.submit([](cl::sycl::handler &cgh) {
+    Fn F;
+    cgh.parallel_for(cl::sycl::nd_range<1>{64, 16}, F);
+  });
+}
+
+int main() {
+  queue Queue;
+  device Device = Queue.get_device();
+
+  // According to specification, this kernel query requires `cl_khr_subgroups`
+  // or `cl_intel_subgroups`, and also `cl_intel_required_subgroup_size`
+  if ((!Device.has_extension("cl_intel_subgroups") &&
+       !Device.has_extension("cl_khr_subgroups")) ||
+      !Device.has_extension("cl_intel_required_subgroup_size")) {
+    std::cout << "Skipping test\n";
+    return 0;
+  }
+
+  try {
+    const auto SGSizes = Device.get_info<info::device::sub_group_sizes>();
+
+    for (const auto SGSize : SGSizes) {
+      // Round down to a power of two so it matches one of the functors above
+      auto ReqdSize = flp2(SGSize);
+
+      cl::sycl::program Prog(Queue.get_context());
+
+      // Store the `cl::sycl::kernel` into a vector because `cl::sycl::kernel`
+      // doesn't have default constructor
+      cl::sycl::vector_class<cl::sycl::kernel> TheKernel;
+
+      switch (ReqdSize) {
+      case 16:
+        Prog.build_with_kernel_type<KernelFunctor16>();
+        TheKernel.push_back(Prog.get_kernel<KernelFunctor16>());
+        submit<KernelFunctor16>(Queue);
+        break;
+      case 8:
+        Prog.build_with_kernel_type<KernelFunctor8>();
+        TheKernel.push_back(Prog.get_kernel<KernelFunctor8>());
+        submit<KernelFunctor8>(Queue);
+        break;
+      case 4:
+        Prog.build_with_kernel_type<KernelFunctor4>();
+        TheKernel.push_back(Prog.get_kernel<KernelFunctor4>());
+        submit<KernelFunctor4>(Queue);
+        break;
+      case 2:
+        Prog.build_with_kernel_type<KernelFunctor2>();
+        TheKernel.push_back(Prog.get_kernel<KernelFunctor2>());
+        submit<KernelFunctor2>(Queue);
+        break;
+      case 1:
+        Prog.build_with_kernel_type<KernelFunctor1>();
+        TheKernel.push_back(Prog.get_kernel<KernelFunctor1>());
+        submit<KernelFunctor1>(Queue);
+        break;
+      default:
+        throw feature_not_supported("sub-group size is not supported",
+                                    PI_INVALID_OPERATION);
+      }
+
+      auto Kernel = TheKernel[0];
+
+      auto Res = Kernel.get_sub_group_info<
+          cl::sycl::info::kernel_sub_group::compile_sub_group_size>(Device);
+
+      exit_if_not_equal<size_t>(Res, ReqdSize, "compile_sub_group_size");
+    }
+  } catch (const exception &e) { // by reference: no copy, no slicing
+    std::cout << "SYCL exception caught: " << e.what();
+    return 1;
+  }
+
+  std::cout << "Test passed.\n";
+  return 0;
+}
diff --git a/SYCL/Basic/sub_group/barrier.cpp b/SYCL/Basic/sub_group/barrier.cpp
new file mode 100644
index 0000000000..cafe008512
--- /dev/null
+++ b/SYCL/Basic/sub_group/barrier.cpp
@@ -0,0 +1,90 @@
+// UNSUPPORTED: cuda
+// CUDA compilation and runtime do not yet support sub-groups.
+//
+// RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple %s -o %t.out
+// RUN: %HOST_RUN_PLACEHOLDER %t.out
+// RUN: %CPU_RUN_PLACEHOLDER %t.out
+// RUN: %GPU_RUN_PLACEHOLDER %t.out
+// RUN: %ACC_RUN_PLACEHOLDER %t.out
+
+//==---------- barrier.cpp - SYCL sub_group barrier test -------*- C++ -*---==//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "helper.hpp"
+#include <CL/sycl.hpp>
+#include <limits>
+#include <numeric>
+template <typename T> class sycl_subgr;
+using namespace cl::sycl;
+template <typename T> void check(queue &Queue, size_t G = 240, size_t L = 60) {
+  try {
+    nd_range<1> NdRange(G, L);
+    std::vector<T> data(G);
+    std::iota(data.begin(), data.end(), sizeof(T));
+    buffer<T> addbuf(data.data(), range<1>(G));
+    buffer<size_t> sgsizebuf(1);
+    Queue.submit([&](handler &cgh) {
+      auto addacc = addbuf.template get_access<access::mode::read_write>(cgh);
+      auto sgsizeacc = sgsizebuf.get_access<access::mode::read_write>(cgh);
+
+      cgh.parallel_for<sycl_subgr<T>>(NdRange, [=](nd_item<1> NdItem) {
+        intel::sub_group SG = NdItem.get_sub_group();
+        size_t lid = SG.get_local_id().get(0);
+        size_t gid = NdItem.get_global_id(0);
+        size_t SGoff = gid - lid;
+
+        T res = 0;
+        for (size_t i = 0; i <= lid; i++) {
+          res += addacc[SGoff + i];
+        }
+        SG.barrier(access::fence_space::global_space);
+        addacc[gid] = res;
+        if (NdItem.get_global_id(0) == 0)
+          sgsizeacc[0] = SG.get_max_local_range()[0];
+      });
+    });
+    auto addacc = addbuf.template get_access<access::mode::read_write>();
+    auto sgsizeacc = sgsizebuf.get_access<access::mode::read_write>();
+
+    size_t sg_size = sgsizeacc[0];
+    int WGid = -1, SGid = 0;
+    T add = 0;
+    for (size_t j = 0; j < G; j++) { // size_t: same type as G, no sign mixing
+      if (j % L % sg_size == 0) {
+        SGid++;
+        add = 0;
+      }
+      if (j % L == 0) {
+        WGid++;
+        SGid = 0;
+      }
+      add += j + sizeof(T);
+      exit_if_not_equal<T>(addacc[j], add, "barrier");
+    }
+  } catch (const exception &e) { // by reference: no copy, no slicing
+    std::cout << "SYCL exception caught: " << e.what();
+    exit(1);
+  }
+}
+int main() {
+  queue Queue;
+  if (!core_sg_supported(Queue.get_device())) {
+    std::cout << "Skipping test\n";
+    return 0;
+  }
+  check<int>(Queue);
+  check<unsigned int>(Queue);
+  check<long>(Queue);
+  check<unsigned long>(Queue);
+  check<float>(Queue);
+  if (Queue.get_device().has_extension("cl_khr_fp64")) {
+    check<double>(Queue);
+  }
+  std::cout << "Test passed." << std::endl;
+  return 0;
+}
diff --git a/SYCL/Basic/sub_group/broadcast.cpp b/SYCL/Basic/sub_group/broadcast.cpp
new file mode 100644
index 0000000000..fba93ee7a2
--- /dev/null
+++ b/SYCL/Basic/sub_group/broadcast.cpp
@@ -0,0 +1,87 @@
+// UNSUPPORTED: cuda
+// CUDA compilation and runtime do not yet support sub-groups.
+
+// RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple %s -o %t.out
+// RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple -D SG_GPU %s -o %t_gpu.out
+// RUN: %HOST_RUN_PLACEHOLDER %t.out
+// RUN: %CPU_RUN_PLACEHOLDER %t.out
+// RUN: %GPU_RUN_PLACEHOLDER %t_gpu.out
+// RUN: %ACC_RUN_PLACEHOLDER %t.out
+
+//==--------- broadcast.cpp - SYCL sub_group broadcast test ----*- C++ -*---==//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "helper.hpp"
+#include <CL/sycl.hpp>
+template <typename T>
+class sycl_subgr;
+using namespace cl::sycl;
+template <typename T>
+void check(queue &Queue) {
+  const int G = 240, L = 60;
+  try {
+    nd_range<1> NdRange(G, L);
+    buffer<T> syclbuf(G);
+    buffer<size_t> sgsizebuf(1);
+    Queue.submit([&](handler &cgh) {
+      auto syclacc = syclbuf.template get_access<access::mode::read_write>(cgh);
+      auto sgsizeacc = sgsizebuf.get_access<access::mode::read_write>(cgh);
+      cgh.parallel_for<sycl_subgr<T>>(NdRange, [=](nd_item<1> NdItem) {
+        intel::sub_group SG = NdItem.get_sub_group();
+        /*Broadcast GID of element with SGLID == SGID */
+        syclacc[NdItem.get_global_id()] =
+            broadcast(SG, T(NdItem.get_global_id(0)), SG.get_group_id());
+        if (NdItem.get_global_id(0) == 0)
+          sgsizeacc[0] = SG.get_max_local_range()[0];
+      });
+    });
+    auto syclacc = syclbuf.template get_access<access::mode::read_write>();
+    auto sgsizeacc = sgsizebuf.get_access<access::mode::read_write>();
+    size_t sg_size = sgsizeacc[0];
+    if (sg_size == 0) // avoid a modulo by zero in the loop below
+      sg_size = L;
+    int WGid = -1, SGid = 0;
+    for (int j = 0; j < G; j++) {
+      if (j % L % sg_size == 0) {
+        SGid++;
+      }
+      if (j % L == 0) {
+        WGid++;
+        SGid = 0;
+      }
+      exit_if_not_equal<T>(syclacc[j], L * WGid + SGid + SGid * sg_size,
+                           "broadcasted value");
+    }
+  } catch (const exception &e) { // by reference: no copy, no slicing
+    std::cout << "SYCL exception caught: " << e.what();
+    exit(1);
+  }
+}
+int main() {
+  queue Queue;
+  if (!core_sg_supported(Queue.get_device())) {
+    std::cout << "Skipping test\n";
+    return 0;
+  }
+  check<int>(Queue);
+  check<unsigned int>(Queue);
+  check<long>(Queue);
+  check<unsigned long>(Queue);
+  check<float>(Queue);
+  // broadcast half type is not supported in OCL CPU RT
+#ifdef SG_GPU
+  if (Queue.get_device().has_extension("cl_khr_fp16")) {
+    check<cl::sycl::half>(Queue);
+  }
+#endif
+  if (Queue.get_device().has_extension("cl_khr_fp64")) {
+    check<double>(Queue);
+  }
+  std::cout << "Test passed." << std::endl;
+  return 0;
+}
diff --git a/SYCL/Basic/sub_group/common.cpp b/SYCL/Basic/sub_group/common.cpp
new file mode 100644
index 0000000000..b9b526709c
--- /dev/null
+++ b/SYCL/Basic/sub_group/common.cpp
@@ -0,0 +1,93 @@
+// UNSUPPORTED: cuda
+// CUDA compilation and runtime do not yet support sub-groups.
+//
+// RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple %s -o %t.out
+// RUN: %HOST_RUN_PLACEHOLDER %t.out
+// RUN: %CPU_RUN_PLACEHOLDER %t.out
+// RUN: %GPU_RUN_PLACEHOLDER %t.out
+// RUN: %ACC_RUN_PLACEHOLDER %t.out
+
+//==-------------- common.cpp - SYCL sub_group common test -----*- C++ -*---==//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "helper.hpp"
+#include <CL/sycl.hpp>
+using namespace cl::sycl;
+struct Data {
+  unsigned int local_id;
+  unsigned int local_range;
+  unsigned int max_local_range;
+  unsigned int group_id;
+  unsigned int group_range;
+  unsigned int uniform_group_range;
+};
+
+void check(queue &Queue, unsigned int G, unsigned int L) {
+
+  try {
+    nd_range<1> NdRange(G, L);
+    buffer<struct Data, 1> syclbuf(G);
+
+    Queue.submit([&](handler &cgh) {
+      auto syclacc = syclbuf.get_access<access::mode::read_write>(cgh);
+      cgh.parallel_for<class sycl_subgr>(NdRange, [=](nd_item<1> NdItem) {
+        intel::sub_group SG = NdItem.get_sub_group();
+        syclacc[NdItem.get_global_id()].local_id = SG.get_local_id().get(0);
+        syclacc[NdItem.get_global_id()].local_range =
+            SG.get_local_range().get(0);
+        syclacc[NdItem.get_global_id()].max_local_range =
+            SG.get_max_local_range().get(0);
+        syclacc[NdItem.get_global_id()].group_id = SG.get_group_id().get(0);
+        syclacc[NdItem.get_global_id()].group_range = SG.get_group_range();
+        syclacc[NdItem.get_global_id()].uniform_group_range =
+            SG.get_uniform_group_range();
+      });
+    });
+    auto syclacc = syclbuf.get_access<access::mode::read_write>();
+    unsigned int max_sg = get_sg_size(Queue.get_device());
+    unsigned int num_sg = L / max_sg + (L % max_sg ? 1 : 0);
+    for (unsigned int j = 0; j < G; j++) { // unsigned: same type as G and L
+      unsigned int group_id = j % L / max_sg;
+      unsigned int local_range =
+          (group_id + 1 == num_sg) ? (L - group_id * max_sg) : max_sg;
+      exit_if_not_equal(syclacc[j].local_id, j % L % max_sg, "local_id");
+      exit_if_not_equal(syclacc[j].local_range, local_range, "local_range");
+      // TODO: Currently workgroup size affects this parameter on CPU and does
+      // not on GPU. Remove the `if` once the behavior is aligned.
+      if (Queue.get_device().get_info<info::device::device_type>() ==
+          info::device_type::cpu) {
+        exit_if_not_equal(syclacc[j].max_local_range, std::min(max_sg, L),
+                          "max_local_range");
+      } else {
+        exit_if_not_equal(syclacc[j].max_local_range, max_sg,
+                          "max_local_range");
+      }
+      exit_if_not_equal(syclacc[j].group_id, group_id, "group_id");
+      exit_if_not_equal(syclacc[j].group_range, num_sg, "group_range");
+      exit_if_not_equal(syclacc[j].uniform_group_range, num_sg,
+                        "uniform_group_range");
+    }
+  } catch (const exception &e) { // by reference: no copy, no slicing
+    std::cout << "SYCL exception caught: " << e.what();
+    exit(1);
+  }
+}
+int main() {
+  queue Queue;
+  if (!core_sg_supported(Queue.get_device())) {
+    std::cout << "Skipping test\n";
+    return 0;
+  }
+
+  check(Queue, 240, 80);
+  check(Queue, 8, 4);
+  check(Queue, 24, 12);
+  check(Queue, 1024, 256);
+  std::cout << "Test passed." << std::endl;
+  return 0;
+}
diff --git a/SYCL/Basic/sub_group/common_ocl.cpp b/SYCL/Basic/sub_group/common_ocl.cpp
new file mode 100644
index 0000000000..fd38c84969
--- /dev/null
+++ b/SYCL/Basic/sub_group/common_ocl.cpp
@@ -0,0 +1,111 @@
+// REQUIRES: opencl
+
+// RUN: %clang_cc1 -x cl -cl-std=CL2.0 %S/sg.cl -triple spir64-unknown-unknown -emit-llvm-bc -o %T/kernel_ocl.bc -include opencl-c.h
+// RUN: llvm-spirv %T/kernel_ocl.bc -o %T/kernel_ocl.spv
+// RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple %s -o %t.out -L %opencl_libs_dir -lOpenCL
+// RUN: %HOST_RUN_PLACEHOLDER %t.out
+// RUN: %CPU_RUN_PLACEHOLDER %t.out %T/kernel_ocl.spv
+// RUN: %GPU_RUN_PLACEHOLDER %t.out %T/kernel_ocl.spv
+// RUN: %ACC_RUN_PLACEHOLDER %t.out %T/kernel_ocl.spv
+
+//==--- common_ocl.cpp - basic SG methods in SYCL vs OpenCL  ---*- C++ -*---==//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "helper.hpp"
+#include <CL/sycl.hpp>
+#include <cstring>
+#include <fstream>
+#include <iostream>
+
+using namespace cl::sycl;
+struct Data {
+  unsigned int local_id;
+  unsigned int local_range;
+  unsigned int max_local_range;
+  unsigned int group_id;
+  unsigned int group_range;
+  unsigned int uniform_group_range;
+};
+
+void check(queue &Queue, const int G, const int L, const char *SpvFile) {
+  try {
+    nd_range<1> NdRange(G, L);
+    buffer<struct Data, 1> oclbuf(G);
+    buffer<struct Data, 1> syclbuf(G);
+
+    std::ifstream File(SpvFile, std::ios::binary);
+    if (!File.is_open()) {
+      std::cerr << std::strerror(errno);
+      throw compile_program_error("Cannot open SPIRV file\n", PI_INVALID_VALUE);
+    }
+    File.seekg(0, std::ios::end);
+    vector_class<char> Spv(File.tellg());
+    File.seekg(0);
+    File.read(Spv.data(), Spv.size());
+    File.close();
+    cl_int Err; // OpenCL status type expected by clCreateProgramWithIL
+    cl_program ClProgram = clCreateProgramWithIL(Queue.get_context().get(),
+                                                 Spv.data(), Spv.size(), &Err);
+    CHECK_OCL_CODE(Err);
+    CHECK_OCL_CODE(
+        clBuildProgram(ClProgram, 0, nullptr, nullptr, nullptr, nullptr));
+    program Prog(Queue.get_context(), ClProgram);
+    Queue.submit([&](handler &cgh) {
+      auto oclacc = oclbuf.get_access<access::mode::read_write>(cgh);
+      cgh.set_args(oclacc);
+      cgh.parallel_for(NdRange, Prog.get_kernel("ocl_subgr"));
+    });
+    auto oclacc = oclbuf.get_access<access::mode::read_write>();
+
+    Queue.submit([&](handler &cgh) {
+      auto syclacc = syclbuf.get_access<access::mode::read_write>(cgh);
+      cgh.parallel_for<class sycl_subgr>(NdRange, [=](nd_item<1> NdItem) {
+        intel::sub_group SG = NdItem.get_sub_group();
+        syclacc[NdItem.get_global_id()].local_id = SG.get_local_id().get(0);
+        syclacc[NdItem.get_global_id()].local_range =
+            SG.get_local_range().get(0);
+        syclacc[NdItem.get_global_id()].max_local_range =
+            SG.get_max_local_range().get(0);
+        syclacc[NdItem.get_global_id()].group_id = SG.get_group_id().get(0);
+        syclacc[NdItem.get_global_id()].group_range = SG.get_group_range();
+        syclacc[NdItem.get_global_id()].uniform_group_range =
+            SG.get_uniform_group_range();
+      });
+    });
+    auto syclacc = syclbuf.get_access<access::mode::read_write>();
+    for (int j = 0; j < G; j++) {
+      exit_if_not_equal(syclacc[j].local_id, oclacc[j].local_id, "local_id");
+      exit_if_not_equal(syclacc[j].local_range, oclacc[j].local_range,
+                        "local_range");
+      exit_if_not_equal(syclacc[j].max_local_range, oclacc[j].max_local_range,
+                        "max_local_range");
+      exit_if_not_equal(syclacc[j].group_id, oclacc[j].group_id, "group_id");
+      exit_if_not_equal(syclacc[j].group_range, oclacc[j].group_range,
+                        "group_range");
+      exit_if_not_equal(syclacc[j].uniform_group_range,
+                        oclacc[j].uniform_group_range, "uniform_group_range");
+    }
+  } catch (const exception &e) { // by reference: no copy, no slicing
+    std::cout << "SYCL exception caught: " << e.what();
+    exit(1);
+  }
+}
+int main(int argc, char **argv) {
+  queue Queue;
+  if (!core_sg_supported(Queue.get_device()) || argc != 2) {
+    std::cout << "Skipping test\n";
+    return 0;
+  }
+
+  check(Queue, 240, 80, argv[1]);
+  check(Queue, 8, 4, argv[1]);
+  check(Queue, 24, 12, argv[1]);
+  check(Queue, 1024, 256, argv[1]);
+  std::cout << "Test passed." << std::endl;
+  return 0;
+}
diff --git a/SYCL/Basic/sub_group/helper.hpp b/SYCL/Basic/sub_group/helper.hpp
new file mode 100644
index 0000000000..2476ed999d
--- /dev/null
+++ b/SYCL/Basic/sub_group/helper.hpp
@@ -0,0 +1,157 @@
+//==---------- helper.hpp - SYCL sub_group helper functions ----------------==//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+#include <CL/sycl.hpp>
+#include <cmath>
+#include <iostream>
+
+using namespace cl::sycl;
+
+template <typename T1, int N> struct utils {
+  static T1 add_vec(const vec<T1, N> &v);
+  static bool cmp_vec(const vec<T1, N> &v, const vec<T1, N> &r);
+  static std::string stringify_vec(const vec<T1, N> &v);
+};
+template <typename T2> struct utils<T2, 1> {
+  static T2 add_vec(const vec<T2, 1> &v) { return v.s0(); }
+  static bool cmp_vec(const vec<T2, 1> &v, const vec<T2, 1> &r) {
+    return v.s0() == r.s0();
+  }
+  static std::string stringify_vec(const vec<T2, 1> &v) {
+    return std::to_string((T2)v.s0());
+  }
+};
+template <typename T2> struct utils<T2, 2> {
+  static T2 add_vec(const vec<T2, 2> &v) { return v.s0() + v.s1(); }
+  static bool cmp_vec(const vec<T2, 2> &v, const vec<T2, 2> &r) {
+    return v.s0() == r.s0() && v.s1() == r.s1();
+  }
+  static std::string stringify_vec(const vec<T2, 2> &v) {
+    return std::string("(") + std::to_string((T2)v.s0()) + ", " +
+           std::to_string((T2)v.s1()) + " )";
+  }
+};
+template <typename T2> struct utils<T2, 4> { // helpers for 4-element vec
+  static T2 add_vec(const vec<T2, 4> &v) {
+    return v.s0() + v.s1() + v.s2() + v.s3();
+  }
+  static bool cmp_vec(const vec<T2, 4> &v, const vec<T2, 4> &r) {
+    return v.s0() == r.s0() && v.s1() == r.s1() && v.s2() == r.s2() &&
+           v.s3() == r.s3();
+  }
+  static std::string stringify_vec(const vec<T2, 4> &v) { // "(a, b, c, d )"
+    return std::string("(") + std::to_string((T2)v.s0()) + ", " +
+           std::to_string((T2)v.s1()) + ", " + std::to_string((T2)v.s2()) + ", " +
+           std::to_string((T2)v.s3()) + " )";
+  }
+};
+template <typename T2> struct utils<T2, 8> { // helpers for 8-element vec
+  static T2 add_vec(const vec<T2, 8> &v) {
+    return v.s0() + v.s1() + v.s2() + v.s3() + v.s4() + v.s5() + v.s6() +
+           v.s7();
+  }
+  static bool cmp_vec(const vec<T2, 8> &v, const vec<T2, 8> &r) {
+    return v.s0() == r.s0() && v.s1() == r.s1() && v.s2() == r.s2() &&
+           v.s3() == r.s3() && v.s4() == r.s4() && v.s5() == r.s5() &&
+           v.s6() == r.s6() && v.s7() == r.s7();
+  }
+  static std::string stringify_vec(const vec<T2, 8> &v) { // "(a, b, ..., h )"
+    return std::string("(") + std::to_string((T2)v.s0()) + ", " +
+           std::to_string((T2)v.s1()) + ", " + std::to_string((T2)v.s2()) + ", " +
+           std::to_string((T2)v.s3()) + ", " + std::to_string((T2)v.s4()) + ", " +
+           std::to_string((T2)v.s5()) + ", " + std::to_string((T2)v.s6()) + ", " +
+           std::to_string((T2)v.s7()) + " )";
+  }
+};
+
+template <typename T2> struct utils<T2, 16> { // helpers for 16-element vec
+  static T2 add_vec(const vec<T2, 16> &v) {
+    return v.s0() + v.s1() + v.s2() + v.s3() + v.s4() + v.s5() + v.s6() +
+           v.s7() + v.s8() + v.s9() + v.sA() + v.sB() + v.sC() + v.sD() +
+           v.sE() + v.sF();
+  }
+  static bool cmp_vec(const vec<T2, 16> &v, const vec<T2, 16> &r) {
+    return v.s0() == r.s0() && v.s1() == r.s1() && v.s2() == r.s2() &&
+           v.s3() == r.s3() && v.s4() == r.s4() && v.s5() == r.s5() &&
+           v.s6() == r.s6() && v.s7() == r.s7() && v.s8() == r.s8() &&
+           v.s9() == r.s9() && v.sA() == r.sA() && v.sB() == r.sB() &&
+           v.sC() == r.sC() && v.sD() == r.sD() && v.sE() == r.sE() &&
+           v.sF() == r.sF();
+  }
+  static std::string stringify_vec(const vec<T2, 16> &v) { // "(a, b, ..., p )"
+    return std::string("(") + std::to_string((T2)v.s0()) + ", " +
+           std::to_string((T2)v.s1()) + ", " + std::to_string((T2)v.s2()) + ", " +
+           std::to_string((T2)v.s3()) + ", " + std::to_string((T2)v.s4()) + ", " +
+           std::to_string((T2)v.s5()) + ", " + std::to_string((T2)v.s6()) + ", " +
+           std::to_string((T2)v.s7()) + ", " + std::to_string((T2)v.s8()) + ", " +
+           std::to_string((T2)v.s9()) + ", " + std::to_string((T2)v.sA()) + ", " +
+           std::to_string((T2)v.sB()) + ", " + std::to_string((T2)v.sC()) + ", " +
+           std::to_string((T2)v.sD()) + ", " + std::to_string((T2)v.sE()) + ", " +
+           std::to_string((T2)v.sF()) + " )";
+  }
+};
+
+template <typename T> void exit_if_not_equal(T val, T ref, const char *name) {
+  if (std::is_floating_point<T>::value) {
+    if (std::fabs(val - ref) > 0.01) {
+      std::cout << "Unexpected result for " << name << ": " << (double)val
+                << " expected value: " << (double)ref << std::endl;
+      exit(1);
+    }
+  } else {
+    if ((val - ref) != 0) {
+      std::cout << "Unexpected result for " << name << ": " << (long)val
+                << " expected value: " << (long)ref << std::endl;
+      exit(1);
+    }
+  }
+}
+
+template <> inline void exit_if_not_equal(half val, half ref, const char *name) {
+  int16_t cmp_val = reinterpret_cast<int16_t&>(val); // raw 16-bit pattern
+  int16_t cmp_ref = reinterpret_cast<int16_t&>(ref); // raw 16-bit pattern
+  if (std::abs(cmp_val - cmp_ref) > 1) { // tolerate a bit-pattern delta of 1
+    std::cout << "Unexpected result for " << name << ": " << (float)val
+              << " expected value: " << (float)ref << std::endl;
+    exit(1);
+  }
+}
+
+template <typename T, int N>
+void exit_if_not_equal_vec(vec<T, N> val, vec<T, N> ref, const char *name) {
+  if (!utils<T, N>::cmp_vec(ref, val)) {
+    std::cout << "Unexpected result for " << name << ": "
+              << utils<T, N>::stringify_vec(val)
+              << " expected value: " << utils<T, N>::stringify_vec(ref)
+              << std::endl;
+
+    exit(1);
+  }
+}
+
+/* CPU returns max number of SG, GPU returns max SG size for
+ * CL_DEVICE_MAX_NUM_SUB_GROUPS device parameter. This function aligns the
+ * value.
+ * */
+inline size_t get_sg_size(const device &Device) {
+  size_t max_num_sg = Device.get_info<info::device::max_num_sub_groups>();
+  if (Device.get_info<info::device::device_type>() == info::device_type::cpu) {
+    size_t max_wg_size = Device.get_info<info::device::max_work_group_size>();
+    return max_wg_size / max_num_sg; // CPU: derive the size from the count
+  }
+  if (Device.get_info<info::device::device_type>() == info::device_type::gpu) {
+    return max_num_sg; // GPU: the query result is used as the size directly
+  }
+  std::cout << "Unexpected device type" << std::endl;
+  exit(1); // neither CPU nor GPU: abort the test
+}
+
+inline bool core_sg_supported(const device &Device) { // inline: lives in a header
+  return (Device.has_extension("cl_khr_subgroups") ||
+          Device.get_info<info::device::version>().find(" 2.1") !=
+              string_class::npos);
+}
diff --git a/SYCL/Basic/sub_group/info.cpp b/SYCL/Basic/sub_group/info.cpp
new file mode 100644
index 0000000000..58fc06bd5e
--- /dev/null
+++ b/SYCL/Basic/sub_group/info.cpp
@@ -0,0 +1,93 @@
+// REQUIRES: opencl
+
+// RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple %s -o %t.out
+// RUN: %HOST_RUN_PLACEHOLDER %t.out
+// RUN: %CPU_RUN_PLACEHOLDER %t.out
+// RUN: %GPU_RUN_PLACEHOLDER %t.out
+// RUN: %ACC_RUN_PLACEHOLDER %t.out
+
+//==------------- info.cpp - SYCL sub_group parameters test ----*- C++ -*---==//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "helper.hpp"
+#include <CL/sycl.hpp>
+class kernel_sg;
+using namespace cl::sycl;
+
+int main() {
+  queue Queue;
+  device Device = Queue.get_device();
+
+  /* Basic sub-group functionality is supported as part of cl_khr_subgroups
+   * extension or as core OpenCL 2.1 feature. */
+  if (!core_sg_supported(Device)) {
+    std::cout << "Skipping test\n";
+    return 0;
+  }
+  /* Check info::device parameters. */
+  Device.get_info<info::device::sub_group_independent_forward_progress>();
+  Device.get_info<info::device::max_num_sub_groups>();
+  /* sub_group_sizes can be queried only if cl_intel_required_subgroup_size
+   * extension is supported by the device. */
+  if (Device.has_extension("cl_intel_required_subgroup_size"))
+    Device.get_info<info::device::sub_group_sizes>();
+
+  try {
+    size_t max_sg_num = get_sg_size(Device);
+    size_t max_wg_size = Device.get_info<info::device::max_work_group_size>();
+    program Prog(Queue.get_context());
+    /* TODO: replace with pure SYCL code when fixed problem with consumption
+     * kernels defined using program objects on GPU device
+    Prog.build_with_kernel_type<kernel_sg>();
+    kernel Kernel = Prog.get_kernel<kernel_sg>();
+
+    Queue.submit([&](cl::sycl::handler &cgh) {
+      cgh.parallel_for<kernel_sg>(
+          nd_range<2>(range<2>(50, 40), range<2>(10, 20)), Kernel,
+          [=](nd_item<2> index) {});
+    });*/
+    Prog.build_with_source("kernel void "
+                           "kernel_sg(global double* a, global double* b, "
+                           "global double* c) {*a=*b+*c; }\n");
+    kernel Kernel = Prog.get_kernel("kernel_sg");
+    uint32_t Res = 0;
+    for (auto r : {range<3>(3, 4, 5), range<3>(1, 1, 1), range<3>(4, 2, 1),
+                   range<3>(32, 3, 4), range<3>(7, 9, 11)}) {
+      Res = Kernel.get_sub_group_info<
+          info::kernel_sub_group::max_sub_group_size>(Device, r);
+      bool Expected = (Res == r.size() || Res == max_sg_num);
+      exit_if_not_equal<bool>(Expected, true,
+                              "max_sub_group_size");
+    }
+
+    Res = Kernel.get_sub_group_info<
+        info::kernel_sub_group::compile_num_sub_groups>(Device);
+
+    /* Sub-group size is not specified in kernel or IL*/
+    exit_if_not_equal<uint32_t>(Res, 0, "compile_num_sub_groups");
+
+    // According to specification, this kernel query requires `cl_khr_subgroups`
+    // or `cl_intel_subgroups`
+    if ((Device.has_extension("cl_khr_subgroups") ||
+         Device.has_extension("cl_intel_subgroups")) &&
+        Device.has_extension("cl_intel_required_subgroup_size")) {
+      Res = Kernel.get_sub_group_info<
+          info::kernel_sub_group::compile_sub_group_size>(Device);
+
+      /* Required sub-group size is not specified in kernel or IL*/
+      exit_if_not_equal<uint32_t>(Res, 0, "compile_sub_group_size");
+    }
+
+  } catch (const exception &e) { // by reference: no copy, no slicing
+    std::cout << "SYCL exception caught: " << e.what();
+    return 1;
+  }
+
+  std::cout << "Test passed.\n";
+  return 0;
+}
diff --git a/SYCL/Basic/sub_group/load_store.cpp b/SYCL/Basic/sub_group/load_store.cpp
new file mode 100644
index 0000000000..109ae20336
--- /dev/null
+++ b/SYCL/Basic/sub_group/load_store.cpp
@@ -0,0 +1,205 @@
+// UNSUPPORTED: cuda
+// CUDA compilation and runtime do not yet support sub-groups.
+//
+// RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple %s -o %t.out
+// RUN: %HOST_RUN_PLACEHOLDER %t.out
+// RUN: %CPU_RUN_PLACEHOLDER %t.out
+// RUN: %GPU_RUN_PLACEHOLDER %t.out
+// RUN: %ACC_RUN_PLACEHOLDER %t.out
+//
+//==----------- load_store.cpp - SYCL sub_group load/store test ------------==//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "helper.hpp"
+#include <CL/sycl.hpp>
+template <typename T, int N> class sycl_subgr;
+
+using namespace cl::sycl;
+
+// Checks sub-group block load/store of N-element vectors of type T.
+template <typename T, int N> void check(queue &Queue) {
+  const int G = 1024, L = 64;
+  try {
+    nd_range<1> NdRange(G, L);
+    buffer<T> syclbuf(G);
+    buffer<size_t> sgsizebuf(1);
+    {
+      auto acc = syclbuf.template get_access<access::mode::read_write>();
+      for (int i = 0; i < G; i++) {
+        acc[i] = i;
+        acc[i] += 0.1; // Check that floating point types are not cast to int
+      }
+    }
+    Queue.submit([&](handler &cgh) {
+      auto acc = syclbuf.template get_access<access::mode::read_write>(cgh);
+      auto sgsizeacc = sgsizebuf.get_access<access::mode::read_write>(cgh);
+      accessor<T, 1, access::mode::read_write, access::target::local> LocalMem(
+          {L}, cgh);
+      cgh.parallel_for<sycl_subgr<T, N>>(NdRange, [=](nd_item<1> NdItem) {
+        intel::sub_group SG = NdItem.get_sub_group();
+        if (SG.get_group_id().get(0) % N == 0) { // only every N-th sub-group
+          size_t SGOffset =
+              SG.get_group_id().get(0) * SG.get_max_local_range().get(0);
+          size_t WGSGoffset = NdItem.get_group(0) * L + SGOffset;
+          multi_ptr<T, access::address_space::global_space> mp(
+              &acc[WGSGoffset]);
+          multi_ptr<T, access::address_space::local_space> MPL(
+              &LocalMem[SGOffset]);
+          // Add all values in read block
+          vec<T, N> v(utils<T, N>::add_vec(SG.load<N, T>(mp)));
+          SG.store<N, T>(MPL, v);
+          vec<T, N> t(utils<T, N>::add_vec(SG.load<N, T>(MPL)));
+          SG.store<N, T>(mp, t);
+        }
+        if (NdItem.get_global_id(0) == 0)
+          sgsizeacc[0] = SG.get_max_local_range()[0];
+      });
+    });
+    auto acc = syclbuf.template get_access<access::mode::read_write>();
+    auto sgsizeacc = sgsizebuf.get_access<access::mode::read_write>();
+    size_t sg_size = sgsizeacc[0];
+    int SGid = 0; // sub-group index of element j inside its work-group
+    for (int j = 0; j < (G - (sg_size * N)); j++) {
+      if (j % L % sg_size == 0) {
+        SGid++;
+      }
+      if (j % L == 0) {
+        SGid = 0;
+      }
+      T ref = 0;
+      if (SGid % N) {
+        ref = acc[j - (SGid % N) * sg_size];
+      } else {
+        for (int i = 0; i < N; i++) {
+          ref += (T)(j + i * sg_size) + 0.1;
+        }
+        ref *= N;
+      }
+      /* There is no defined out-of-range behavior for these functions. */
+      if ((SGid + N) * sg_size < L) {
+        std::string s("Vector<");
+        s += std::string(typeid(ref).name()) + std::string(",") +
+             std::to_string(N) + std::string(">[") + std::to_string(j) +
+             std::string("]");
+        exit_if_not_equal<T>(acc[j], ref, s.c_str());
+      }
+    }
+  } catch (const exception &e) { // by reference: no slicing, no copy
+    std::cout << "SYCL exception caught: " << e.what();
+    exit(1);
+  }
+}
+// Checks scalar sub-group block load/store for element type T.
+// Every work-item loads one element from global memory, adds its sub-group
+// local id, round-trips the value through local memory, adds the local id
+// again and stores it back; the host then verifies each element.
+template <typename T> void check(queue &Queue) {
+  const int G = 128, L = 64;
+  try {
+    nd_range<1> NdRange(G, L);
+    buffer<T> syclbuf(G);
+    buffer<size_t> sgsizebuf(1);
+    // Host-side initialisation: element i starts as i (+ 0.1).
+    {
+      auto acc = syclbuf.template get_access<access::mode::read_write>();
+      for (int i = 0; i < G; i++) {
+        acc[i] = i;
+        acc[i] += 0.1; // Check that floating point types are not cast to int
+      }
+    }
+
+    Queue.submit([&](handler &cgh) {
+      auto acc = syclbuf.template get_access<access::mode::read_write>(cgh);
+      auto sgsizeacc = sgsizebuf.get_access<access::mode::read_write>(cgh);
+      accessor<T, 1, access::mode::read_write, access::target::local> LocalMem(
+          {L}, cgh);
+      cgh.parallel_for<sycl_subgr<T, 0>>(NdRange, [=](nd_item<1> NdItem) {
+        intel::sub_group SG = NdItem.get_sub_group();
+        // Record SG.get_max_local_range() for the host-side verification.
+        if (NdItem.get_global_id(0) == 0)
+          sgsizeacc[0] = SG.get_max_local_range()[0];
+        size_t SGOffset =
+            SG.get_group_id().get(0) * SG.get_max_local_range().get(0);
+        size_t WGSGoffset = NdItem.get_group(0) * L + SGOffset;
+        multi_ptr<T, access::address_space::global_space> mp(&acc[WGSGoffset]);
+        multi_ptr<T, access::address_space::local_space> MPL(
+            &LocalMem[SGOffset]);
+        T s = SG.load<T>(mp) + (T)SG.get_local_id().get(0);
+        SG.store<T>(MPL, s);
+        T t = SG.load<T>(MPL) + (T)SG.get_local_id().get(0);
+        SG.store<T>(mp, t);
+      });
+    });
+    auto acc = syclbuf.template get_access<access::mode::read_write>();
+    auto sgsizeacc = sgsizebuf.get_access<access::mode::read_write>();
+    size_t sg_size = sgsizeacc[0];
+    // The check takes j % L % sg_size as the sub-group local id of element j,
+    // which the kernel added twice to the initial value.
+    for (int j = 0; j < G; j++) {
+      std::string s("Scalar<");
+      s += std::string(typeid(acc[j]).name()) + std::string(">[") +
+           std::to_string(j) + std::string("]");
+
+      exit_if_not_equal<T>(acc[j], (T)(j + 2 * (j % L % sg_size)) + 0.1,
+                           s.c_str());
+    }
+
+  } catch (const exception &e) { // by reference: no slicing, no copy
+    std::cout << "SYCL exception caught: " << e.what();
+    exit(1);
+  }
+}
+
+int main() { // runs the load/store checks the default device supports
+  queue Queue;
+  if (!Queue.get_device().has_extension("cl_intel_subgroups") &&
+      !Queue.get_device().has_extension("cl_intel_subgroups_short")) {
+    std::cout << "Skipping test\n"; // neither extension is reported
+    return 0;
+  }
+  if (Queue.get_device().has_extension("cl_intel_subgroups")) {
+    typedef bool aligned_char __attribute__((aligned(16))); // NOTE(review): bool, not char?
+    check<aligned_char>(Queue); // scalar check only for this type
+    typedef int aligned_int __attribute__((aligned(16)));
+    check<aligned_int>(Queue);    // check<T>: scalar load/store
+    check<aligned_int, 1>(Queue); // check<T, N>: N-element vector load/store
+    check<aligned_int, 2>(Queue);
+    check<aligned_int, 4>(Queue);
+    check<aligned_int, 8>(Queue);
+    typedef unsigned int aligned_uint __attribute__((aligned(16)));
+    check<aligned_uint>(Queue);
+    check<aligned_uint, 1>(Queue);
+    check<aligned_uint, 2>(Queue);
+    check<aligned_uint, 4>(Queue);
+    check<aligned_uint, 8>(Queue);
+    typedef float aligned_float __attribute__((aligned(16)));
+    check<aligned_float>(Queue);
+    check<aligned_float, 1>(Queue);
+    check<aligned_float, 2>(Queue);
+    check<aligned_float, 4>(Queue);
+    check<aligned_float, 8>(Queue);
+  }
+  if (Queue.get_device().has_extension("cl_intel_subgroups_short")) {
+    typedef short aligned_short __attribute__((aligned(16)));
+    check<aligned_short>(Queue);
+    check<aligned_short, 1>(Queue);
+    check<aligned_short, 2>(Queue);
+    check<aligned_short, 4>(Queue);
+    check<aligned_short, 8>(Queue);
+    if (Queue.get_device().has_extension("cl_khr_fp16")) { // half needs fp16 too
+      typedef half aligned_half __attribute__((aligned(16)));
+      check<aligned_half>(Queue);
+      check<aligned_half, 1>(Queue);
+      check<aligned_half, 2>(Queue);
+      check<aligned_half, 4>(Queue);
+      check<aligned_half, 8>(Queue);
+    }
+  }
+  std::cout << "Test passed." << std::endl;
+  return 0;
+}
diff --git a/SYCL/Basic/sub_group/reduce.cpp b/SYCL/Basic/sub_group/reduce.cpp
new file mode 100644
index 0000000000..03ac01362b
--- /dev/null
+++ b/SYCL/Basic/sub_group/reduce.cpp
@@ -0,0 +1,125 @@
+// UNSUPPORTED: cuda
+// CUDA compilation and runtime do not yet support sub-groups.
+//
+// RUN: %clangxx -fsycl -std=c++14 %s -o %t.out
+// RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple -std=c++14 -D SG_GPU %s -o %t_gpu.out
+// RUN: %HOST_RUN_PLACEHOLDER %t.out
+// RUN: %CPU_RUN_PLACEHOLDER %t.out
+// RUN: %GPU_RUN_PLACEHOLDER %t_gpu.out
+// RUN: %ACC_RUN_PLACEHOLDER %t.out
+
+//==--------------- reduce.cpp - SYCL sub_group reduce test ----*- C++ -*---==//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "helper.hpp"
+#include <CL/sycl.hpp>
+
+template <typename T, class BinaryOperation>
+class sycl_subgr;
+
+using namespace cl::sycl;
+
+// Runs a sub-group reduce() with BinaryOperation over the global ids of an
+// nd_range<1>(G, L) and compares every element against a host reference.
+// With skip_init set, the kernel calls the overload without an initial value
+// while the host reference is still seeded with init.
+template <typename T, class BinaryOperation>
+void check_op(queue &Queue, T init, BinaryOperation op, bool skip_init = false,
+              size_t G = 240, size_t L = 60) {
+  try {
+    nd_range<1> NdRange(G, L);
+    buffer<T> buf(G);
+    Queue.submit([&](handler &cgh) {
+      auto acc = buf.template get_access<access::mode::read_write>(cgh);
+      cgh.parallel_for<sycl_subgr<T, BinaryOperation>>(
+          NdRange, [=](nd_item<1> NdItem) {
+            intel::sub_group sg = NdItem.get_sub_group();
+            if (skip_init) {
+              acc[NdItem.get_global_id(0)] =
+                  reduce(sg, T(NdItem.get_global_id(0)), op);
+            } else {
+              acc[NdItem.get_global_id(0)] =
+                  reduce(sg, T(NdItem.get_global_id(0)), init, op);
+            }
+          });
+    });
+    auto acc = buf.template get_access<access::mode::read_write>();
+    size_t sg_size = get_sg_size(Queue.get_device());
+    const std::string name =
+        std::string("reduce_") + typeid(BinaryOperation).name();
+    T result = init;
+    for (int j = 0; j < G; j++) {
+      if (j % L % sg_size == 0) {
+        // First work-item of a sub-group: fold the whole sub-group on the host.
+        result = init;
+        // The fold stops at the next sub-group or work-group boundary.
+        for (int i = j; (i % L && i % L % sg_size) || (i == j); i++) {
+          result = op(result, T(i));
+        }
+      }
+      exit_if_not_equal<T>(acc[j], result, name.c_str());
+    }
+  } catch (const exception &e) { // by reference: no slicing, no copy
+    std::cout << "SYCL exception caught: " << e.what();
+    exit(1);
+  }
+}
+
+template <typename T>
+void check(queue &Queue, size_t G = 240, size_t L = 60) {
+  // limit data range for half to avoid rounding issues
+  if (std::is_same<T, cl::sycl::half>::value) {
+    G = 64;
+    L = 32;
+  }
+  // plus: explicit init T(L); then skip_init, host reference seeded with T(0)
+  check_op<T>(Queue, T(L), intel::plus<T>(), false, G, L);
+  check_op<T>(Queue, T(0), intel::plus<T>(), true, G, L);
+  // minimum: explicit init T(0); then skip_init, host reference seeded T(G)
+  check_op<T>(Queue, T(0), intel::minimum<T>(), false, G, L);
+  check_op<T>(Queue, T(G), intel::minimum<T>(), true, G, L);
+  // maximum: explicit init T(G); then skip_init, host reference seeded T(0)
+  check_op<T>(Queue, T(G), intel::maximum<T>(), false, G, L);
+  check_op<T>(Queue, T(0), intel::maximum<T>(), true, G, L);
+  // Repeat with the <> forms of the function objects (C++14 and later only).
+#if __cplusplus >= 201402L
+  check_op<T>(Queue, T(L), intel::plus<>(), false, G, L);
+  check_op<T>(Queue, T(0), intel::plus<>(), true, G, L);
+
+  check_op<T>(Queue, T(0), intel::minimum<>(), false, G, L);
+  check_op<T>(Queue, T(G), intel::minimum<>(), true, G, L);
+
+  check_op<T>(Queue, T(G), intel::maximum<>(), false, G, L);
+  check_op<T>(Queue, T(0), intel::maximum<>(), true, G, L);
+#endif
+}
+
+int main() {
+  queue Queue;
+  if (!core_sg_supported(Queue.get_device())) {
+    std::cout << "Skipping test\n";
+    return 0;
+  }
+  // Integer and float types always run; half and double depend on the device.
+  check<int>(Queue);
+  check<unsigned int>(Queue);
+  check<long>(Queue);
+  check<unsigned long>(Queue);
+  check<float>(Queue);
+  // reduce half type is not supported in OCL CPU RT
+#ifdef SG_GPU // set with -D SG_GPU on the RUN line that builds %t_gpu.out
+  if (Queue.get_device().has_extension("cl_khr_fp16")) {
+    check<cl::sycl::half>(Queue);
+  }
+#endif
+  if (Queue.get_device().has_extension("cl_khr_fp64")) {
+    check<double>(Queue);
+  }
+  std::cout << "Test passed." << std::endl;
+  return 0;
+}
diff --git a/SYCL/Basic/sub_group/scan.cpp b/SYCL/Basic/sub_group/scan.cpp
new file mode 100644
index 0000000000..70a5115cd4
--- /dev/null
+++ b/SYCL/Basic/sub_group/scan.cpp
@@ -0,0 +1,160 @@
+// UNSUPPORTED: cuda
+// CUDA compilation and runtime do not yet support sub-groups.
+//
+// RUN: %clangxx -fsycl -std=c++14 %s -o %t.out
+// RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple -std=c++14 -D SG_GPU %s -o %t_gpu.out
+// RUN: %HOST_RUN_PLACEHOLDER %t.out
+// RUN: %CPU_RUN_PLACEHOLDER %t.out
+// RUN: %GPU_RUN_PLACEHOLDER %t_gpu.out
+// RUN: %ACC_RUN_PLACEHOLDER %t.out
+
+//==--------------- scan.cpp - SYCL sub_group scan test --------*- C++ -*---==//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "helper.hpp"
+#include <CL/sycl.hpp>
+#include <limits>
+
+template <typename T, class BinaryOperation>
+class sycl_subgr;
+
+using namespace cl::sycl;
+
+// Runs sub-group exclusive_scan() and inclusive_scan() with BinaryOperation
+// over the global ids of an nd_range<1>(G, L) and compares both results with
+// a host-side running value. With skip_init set, the kernel calls the
+// overloads without an initial value; the host is still seeded with init.
+template <typename T, class BinaryOperation>
+void check_op(queue &Queue, T init, BinaryOperation op, bool skip_init = false,
+              size_t G = 120, size_t L = 60) {
+  try {
+    nd_range<1> NdRange(G, L);
+    buffer<T> exbuf(G), inbuf(G);
+    Queue.submit([&](handler &cgh) {
+      auto exacc = exbuf.template get_access<access::mode::read_write>(cgh);
+      auto inacc = inbuf.template get_access<access::mode::read_write>(cgh);
+      cgh.parallel_for<sycl_subgr<T, BinaryOperation>>(
+          NdRange, [=](nd_item<1> NdItem) {
+            intel::sub_group sg = NdItem.get_sub_group();
+            if (skip_init) {
+              exacc[NdItem.get_global_id(0)] =
+                  exclusive_scan(sg, T(NdItem.get_global_id(0)), op);
+              inacc[NdItem.get_global_id(0)] =
+                  inclusive_scan(sg, T(NdItem.get_global_id(0)), op);
+            } else {
+              // Argument order differs: exclusive_scan is called with
+              // (init, op), inclusive_scan with (op, init).
+              exacc[NdItem.get_global_id(0)] =
+                  exclusive_scan(sg, T(NdItem.get_global_id(0)), init, op);
+              inacc[NdItem.get_global_id(0)] =
+                  inclusive_scan(sg, T(NdItem.get_global_id(0)), op, init);
+            }
+          });
+    });
+    auto exacc = exbuf.template get_access<access::mode::read_write>();
+    auto inacc = inbuf.template get_access<access::mode::read_write>();
+    size_t sg_size = get_sg_size(Queue.get_device());
+    const std::string exname =
+        std::string("scan_exc_") + typeid(BinaryOperation).name();
+    const std::string inname =
+        std::string("scan_inc_") + typeid(BinaryOperation).name();
+    T result = init;
+    for (int j = 0; j < G; j++) {
+      if (j % L % sg_size == 0) {
+        result = init;
+      }
+      exit_if_not_equal<T>(exacc[j], result, exname.c_str());
+      result = op(result, T(j));
+      exit_if_not_equal<T>(inacc[j], result, inname.c_str());
+    }
+  } catch (const exception &e) { // by reference: no slicing, no copy
+    std::cout << "SYCL exception caught: " << e.what();
+    exit(1);
+  }
+}
+
+// Runs every sub-group scan check for element type T: plus, minimum and
+// maximum, each once with an explicit initial value (skip_init == false) and
+// once without (skip_init == true). In the second form the init argument
+// only seeds the host-side reference inside check_op, so it has to be
+// neutral for the operation.
+// G and L are the global and local range of the nd_range<1> that check_op
+// launches; both are overridden for half.
+template <typename T>
+void check(queue &Queue, size_t G = 120, size_t L = 60) {
+  // limit data range for half to avoid rounding issues
+  const bool IsHalf = std::is_same<T, cl::sycl::half>::value;
+  if (IsHalf) {
+    G = 64;
+    L = 32;
+  }
+
+  // Neutral seeds for minimum/maximum: the extreme representable values for
+  // integer types and +/-infinity for floating point types. half is tested
+  // for explicitly, in addition to std::is_floating_point.
+  const bool IsFloatLike = std::is_floating_point<T>::value || IsHalf;
+  T MinSeed = std::numeric_limits<T>::max();
+  T MaxSeed = std::numeric_limits<T>::min();
+  if (IsFloatLike) {
+    MinSeed = std::numeric_limits<T>::infinity();
+    MaxSeed = -std::numeric_limits<T>::infinity();
+  }
+
+  // Function objects with an explicit element type.
+  // plus: initial value T(L), then seed T(0)
+  check_op<T>(Queue, T(L), intel::plus<T>(), false, G, L);
+  check_op<T>(Queue, T(0), intel::plus<T>(), true, G, L);
+
+  // minimum: initial value T(0), then seed MinSeed
+  check_op<T>(Queue, T(0), intel::minimum<T>(), false, G, L);
+  check_op<T>(Queue, MinSeed, intel::minimum<T>(), true, G, L);
+
+  // maximum: initial value T(G), then seed MaxSeed
+  check_op<T>(Queue, T(G), intel::maximum<T>(), false, G, L);
+  check_op<T>(Queue, MaxSeed, intel::maximum<T>(), true, G, L);
+
+#if __cplusplus >= 201402L
+  // Same sequence with the <> forms of the function objects, which are only
+  // compiled for C++14 and later.
+  // plus
+  check_op<T>(Queue, T(L), intel::plus<>(), false, G, L);
+  check_op<T>(Queue, T(0), intel::plus<>(), true, G, L);
+
+  // minimum
+  check_op<T>(Queue, T(0), intel::minimum<>(), false, G, L);
+  check_op<T>(Queue, MinSeed, intel::minimum<>(), true, G, L);
+
+  // maximum
+  check_op<T>(Queue, T(G), intel::maximum<>(), false, G, L);
+  check_op<T>(Queue, MaxSeed, intel::maximum<>(), true, G, L);
+#endif
+}
+
+int main() {
+  queue Queue;
+  if (!core_sg_supported(Queue.get_device())) { // helper from helper.hpp
+    std::cout << "Skipping test\n";
+    return 0;
+  }
+  check<int>(Queue); // integer and float types always run
+  check<unsigned int>(Queue);
+  check<long>(Queue);
+  check<unsigned long>(Queue);
+  check<float>(Queue);
+  // scan half type is not supported in OCL CPU RT
+#ifdef SG_GPU // set with -D SG_GPU on the RUN line that builds %t_gpu.out
+  if (Queue.get_device().has_extension("cl_khr_fp16")) {
+    check<cl::sycl::half>(Queue);
+  }
+#endif
+  if (Queue.get_device().has_extension("cl_khr_fp64")) { // double is optional
+    check<double>(Queue);
+  }
+  std::cout << "Test passed." << std::endl;
+  return 0;
+}
diff --git a/SYCL/Basic/sub_group/sg.cl b/SYCL/Basic/sub_group/sg.cl
new file mode 100644
index 0000000000..0dcee41298
--- /dev/null
+++ b/SYCL/Basic/sub_group/sg.cl
@@ -0,0 +1,25 @@
+//==-------------- sg.cl - OpenCL reference kernel file --------*- C++ -*---==//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+// ===--------------------------------------------------------------------=== //
+#pragma OPENCL EXTENSION cl_khr_subgroups : enable
+struct Data { // per-work-item record of sub-group queries, filled by ocl_subgr
+  uint local_id;            // get_sub_group_local_id()
+  uint local_range;         // get_sub_group_size()
+  uint max_local_range;     // get_max_sub_group_size()
+  uint group_id;            // get_sub_group_id()
+  uint group_range;         // get_num_sub_groups()
+  uint uniform_group_range; // also written from get_num_sub_groups()
+};
+__kernel void ocl_subgr(__global struct Data *a) { // one record per work-item
+  uint id = get_global_id(0); // index of this work-item's record in a
+  a[id].local_id = get_sub_group_local_id();
+  a[id].local_range = get_sub_group_size();
+  a[id].max_local_range = get_max_sub_group_size();
+  a[id].group_id = get_sub_group_id();
+  a[id].group_range = get_num_sub_groups();
+  a[id].uniform_group_range = get_num_sub_groups(); // NOTE(review): same call as group_range; get_enqueued_num_sub_groups() may be intended - confirm
+}
diff --git a/SYCL/Basic/sub_group/shuffle.cpp b/SYCL/Basic/sub_group/shuffle.cpp
new file mode 100644
index 0000000000..bd7e11c89e
--- /dev/null
+++ b/SYCL/Basic/sub_group/shuffle.cpp
@@ -0,0 +1,265 @@
+// UNSUPPORTED: cuda
+// CUDA compilation and runtime do not yet support sub-groups.
+//
+// RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple %s -o %t.out
+// RUN: %HOST_RUN_PLACEHOLDER %t.out
+// RUN: %CPU_RUN_PLACEHOLDER %t.out
+// RUN: %GPU_RUN_PLACEHOLDER %t.out
+// RUN: %ACC_RUN_PLACEHOLDER %t.out
+//
+//==------------ shuffle.cpp - SYCL sub_group shuffle test -----*- C++ -*---==//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "helper.hpp"
+#include <CL/sycl.hpp>
+template <typename T, int N> class sycl_subgr;
+
+using namespace cl::sycl;
+
+// TODO remove this workaround when clang will support correct generation of
+// half typename in integration header
+struct wa_half;
+
+// Checks the sub-group shuffle, shuffle_up, shuffle_down and shuffle_xor
+// operations (one- and two-input forms) on vec<T, N> values. The kernel
+// shuffles vectors built from the global id and the host recomputes the
+// expected value for every element of an nd_range<1>(G, L).
+template <typename T, int N>
+void check(queue &Queue, size_t G = 240, size_t L = 60) {
+  try {
+    nd_range<1> NdRange(G, L);
+    buffer<vec<T, N>> buf2(G);
+    buffer<vec<T, N>> buf2_up(G);
+    buffer<vec<T, N>> buf2_down(G);
+    buffer<vec<T, N>> buf(G);
+    buffer<vec<T, N>> buf_up(G);
+    buffer<vec<T, N>> buf_down(G);
+    buffer<vec<T, N>> buf_xor(G);
+    buffer<size_t> sgsizebuf(1);
+    Queue.submit([&](handler &cgh) {
+      auto acc2 = buf2.template get_access<access::mode::read_write>(cgh);
+      auto acc2_up = buf2_up.template get_access<access::mode::read_write>(cgh);
+      auto acc2_down =
+          buf2_down.template get_access<access::mode::read_write>(cgh);
+
+      auto acc = buf.template get_access<access::mode::read_write>(cgh);
+      auto acc_up = buf_up.template get_access<access::mode::read_write>(cgh);
+      auto acc_down =
+          buf_down.template get_access<access::mode::read_write>(cgh);
+      auto acc_xor = buf_xor.template get_access<access::mode::read_write>(cgh);
+      auto sgsizeacc = sgsizebuf.get_access<access::mode::read_write>(cgh);
+
+      cgh.parallel_for<sycl_subgr<T, N>>(NdRange, [=](nd_item<1> NdItem) {
+        intel::sub_group SG = NdItem.get_sub_group();
+        uint32_t wggid = NdItem.get_global_id(0);
+        uint32_t sgid = SG.get_group_id().get(0);
+        vec<T, N> vwggid(wggid), vsgid(sgid);
+        if (wggid == 0)
+          sgsizeacc[0] = SG.get_max_local_range()[0];
+        /* 1 for odd subgroups and 2 for even*/
+        acc2[NdItem.get_global_id()] =
+            SG.shuffle(vec<T, N>(1), vec<T, N>(2),
+                       (sgid % 2) ? 1 : SG.get_max_local_range()[0]);
+        /* GID-SGID */
+        acc2_up[NdItem.get_global_id()] = SG.shuffle_up(vwggid, vwggid, sgid);
+        /* GID+SGID or SGLID if GID+SGID > SGsize*/
+        acc2_down[NdItem.get_global_id()] =
+            SG.shuffle_down(vwggid, vec<T, N>(SG.get_local_id().get(0)), sgid);
+
+        /*GID of middle element in every subgroup*/
+        acc[NdItem.get_global_id()] =
+            SG.shuffle(vwggid, SG.get_max_local_range()[0] / 2);
+        /* Save GID-SGID */
+        acc_up[NdItem.get_global_id()] = SG.shuffle_up(vwggid, sgid);
+        /* Save GID+SGID */
+        acc_down[NdItem.get_global_id()] = SG.shuffle_down(vwggid, sgid);
+        /* Save GID XOR SGID */
+        acc_xor[NdItem.get_global_id()] = SG.shuffle_xor(vwggid, sgid);
+      });
+    });
+    auto acc = buf.template get_access<access::mode::read_write>();
+    auto acc_up = buf_up.template get_access<access::mode::read_write>();
+    auto acc_down = buf_down.template get_access<access::mode::read_write>();
+    auto acc2 = buf2.template get_access<access::mode::read_write>();
+    auto acc2_up = buf2_up.template get_access<access::mode::read_write>();
+    auto acc2_down = buf2_down.template get_access<access::mode::read_write>();
+    auto acc_xor = buf_xor.template get_access<access::mode::read_write>();
+    auto sgsizeacc = sgsizebuf.get_access<access::mode::read_write>();
+
+    size_t sg_size = sgsizeacc[0];
+    for (int j = 0; j < G; j++) {
+      // Sub-group index of element j inside its work-group: it grows by one
+      // every sg_size elements and restarts at 0 on each work-group boundary.
+      const int SGid = static_cast<int>((j % L) / sg_size);
+      /*GID of middle element in every subgroup*/
+      exit_if_not_equal_vec<T, N>(
+          acc[j], vec<T, N>(j / L * L + SGid * sg_size + sg_size / 2),
+          "shuffle");
+      /* 1 for odd subgroups and 2 for even*/
+      exit_if_not_equal_vec<T, N>(acc2[j], vec<T, N>((SGid % 2) ? 1 : 2),
+                                  "shuffle2");
+      /* Value GID+SGID for all element except last SGID in SG*/
+      if (j % L % sg_size + SGid < sg_size && j % L + SGid < L) {
+        exit_if_not_equal_vec(acc_down[j], vec<T, N>(j + SGid), "shuffle_down");
+        exit_if_not_equal_vec(acc2_down[j], vec<T, N>(j + SGid),
+                              "shuffle2_down");
+      } else {                /* SGLID for GID+SGid */
+        if (j % L + SGid < L) /* Do not go outside the work-group */
+          exit_if_not_equal_vec<T, N>(acc2_down[j],
+                                      vec<T, N>((j + SGid) % L % sg_size),
+                                      "shuffle2_down");
+      }
+      /* Value GID-SGID for all element except first SGID in SG*/
+      if (j % L % sg_size >= SGid) {
+        exit_if_not_equal_vec(acc_up[j], vec<T, N>(j - SGid), "shuffle_up");
+        exit_if_not_equal_vec(acc2_up[j], vec<T, N>(j - SGid), "shuffle2_up");
+      } else {                          /* SGLID for GID-SGid */
+        if (j % L - SGid + sg_size < L) /* Do not go outside the work-group */
+          exit_if_not_equal_vec(acc2_up[j], vec<T, N>(j - SGid + sg_size),
+                                "shuffle2_up");
+      }
+      /* GID XOR SGID */
+      exit_if_not_equal_vec(acc_xor[j], vec<T, N>(j ^ SGid), "shuffle_xor");
+    }
+  } catch (const exception &e) { // by reference: no slicing, no copy
+    std::cout << "SYCL exception caught: " << e.what();
+    exit(1);
+  }
+}
+
+// Checks the sub-group shuffle, shuffle_up, shuffle_down and shuffle_xor
+// operations (one- and two-input forms) on scalar values of type T. The
+// kernel shuffles the global id and the host recomputes the expected value
+// for every element of an nd_range<1>(G, L).
+template <typename T> void check(queue &Queue, size_t G = 240, size_t L = 60) {
+  try {
+    nd_range<1> NdRange(G, L);
+    buffer<T> buf2(G);
+    buffer<T> buf2_up(G);
+    buffer<T> buf2_down(G);
+    buffer<T> buf(G);
+    buffer<T> buf_up(G);
+    buffer<T> buf_down(G);
+    buffer<T> buf_xor(G);
+    buffer<size_t> sgsizebuf(1);
+    Queue.submit([&](handler &cgh) {
+      auto acc2 = buf2.template get_access<access::mode::read_write>(cgh);
+      auto acc2_up = buf2_up.template get_access<access::mode::read_write>(cgh);
+      auto acc2_down =
+          buf2_down.template get_access<access::mode::read_write>(cgh);
+
+      auto acc = buf.template get_access<access::mode::read_write>(cgh);
+      auto acc_up = buf_up.template get_access<access::mode::read_write>(cgh);
+      auto acc_down =
+          buf_down.template get_access<access::mode::read_write>(cgh);
+      auto acc_xor = buf_xor.template get_access<access::mode::read_write>(cgh);
+      auto sgsizeacc = sgsizebuf.get_access<access::mode::read_write>(cgh);
+
+      cgh.parallel_for<sycl_subgr<T, 0>>(NdRange, [=](nd_item<1> NdItem) {
+        intel::sub_group SG = NdItem.get_sub_group();
+        uint32_t wggid = NdItem.get_global_id(0);
+        uint32_t sgid = SG.get_group_id().get(0);
+        if (wggid == 0)
+          sgsizeacc[0] = SG.get_max_local_range()[0];
+        /* 1 for odd subgroups and 2 for even*/
+        acc2[NdItem.get_global_id()] =
+            SG.shuffle<T>(1, 2, (sgid % 2) ? 1 : SG.get_max_local_range()[0]);
+        /* GID-SGID */
+        acc2_up[NdItem.get_global_id()] = SG.shuffle_up<T>(wggid, wggid, sgid);
+        /* GID+SGID or SGLID if GID+SGID > SGsize*/
+        acc2_down[NdItem.get_global_id()] =
+            SG.shuffle_down<T>(wggid, SG.get_local_id().get(0), sgid);
+
+        /*GID of middle element in every subgroup*/
+        acc[NdItem.get_global_id()] =
+            SG.shuffle<T>(wggid, SG.get_max_local_range()[0] / 2);
+        /* Save GID-SGID */
+        acc_up[NdItem.get_global_id()] = SG.shuffle_up<T>(wggid, sgid);
+        /* Save GID+SGID */
+        acc_down[NdItem.get_global_id()] = SG.shuffle_down<T>(wggid, sgid);
+        /* Save GID XOR SGID */
+        acc_xor[NdItem.get_global_id()] = SG.shuffle_xor<T>(wggid, sgid);
+      });
+    });
+    auto acc = buf.template get_access<access::mode::read_write>();
+    auto acc_up = buf_up.template get_access<access::mode::read_write>();
+    auto acc_down = buf_down.template get_access<access::mode::read_write>();
+    auto acc2 = buf2.template get_access<access::mode::read_write>();
+    auto acc2_up = buf2_up.template get_access<access::mode::read_write>();
+    auto acc2_down = buf2_down.template get_access<access::mode::read_write>();
+    auto acc_xor = buf_xor.template get_access<access::mode::read_write>();
+    auto sgsizeacc = sgsizebuf.get_access<access::mode::read_write>();
+
+    size_t sg_size = sgsizeacc[0];
+    for (int j = 0; j < G; j++) {
+      // Sub-group index of element j inside its work-group: it grows by one
+      // every sg_size elements and restarts at 0 on each work-group boundary.
+      const int SGid = static_cast<int>((j % L) / sg_size);
+      /*GID of middle element in every subgroup*/
+      exit_if_not_equal<T>(acc[j], j / L * L + SGid * sg_size + sg_size / 2,
+                           "shuffle");
+      /* 1 for odd subgroups and 2 for even*/
+      exit_if_not_equal<T>(acc2[j], (SGid % 2) ? 1 : 2, "shuffle2");
+      /* Value GID+SGID for all element except last SGID in SG*/
+      if (j % L % sg_size + SGid < sg_size && j % L + SGid < L) {
+        exit_if_not_equal<T>(acc_down[j], j + SGid, "shuffle_down");
+        exit_if_not_equal<T>(acc2_down[j], j + SGid, "shuffle2_down");
+      } else {                /* SGLID for GID+SGid */
+        if (j % L + SGid < L) /* Do not go outside the work-group */
+          exit_if_not_equal<T>(acc2_down[j], (j + SGid) % L % sg_size,
+                               "shuffle2_down");
+      }
+      /* Value GID-SGID for all element except first SGID in SG*/
+      if (j % L % sg_size >= SGid) {
+        exit_if_not_equal<T>(acc_up[j], j - SGid, "shuffle_up");
+        exit_if_not_equal<T>(acc2_up[j], j - SGid, "shuffle2_up");
+      } else {                          /* SGLID for GID-SGid */
+        if (j % L - SGid + sg_size < L) /* Do not go outside the work-group */
+          exit_if_not_equal<T>(acc2_up[j], j - SGid + sg_size, "shuffle2_up");
+      }
+      /* GID XOR SGID */
+      exit_if_not_equal<T>(acc_xor[j], j ^ SGid, "shuffle_xor");
+    }
+  } catch (const exception &e) { // by reference: no slicing, no copy
+    std::cout << "SYCL exception caught: " << e.what();
+    exit(1);
+  }
+}
+int main() {
+  queue Queue;
+  if (!Queue.get_device().has_extension("cl_intel_subgroups")) {
+    std::cout << "Skipping test\n";
+    return 0;
+  }
+  // check<T> is the scalar test, check<T, N> the vec<T, N> test.
+  if (Queue.get_device().has_extension("cl_intel_subgroups_short")) {
+    check<short>(Queue);
+    check<unsigned short>(Queue);
+  }
+  check<int>(Queue);
+  check<int, 2>(Queue);
+  check<int, 4>(Queue);
+  check<int, 8>(Queue);
+  check<int, 16>(Queue);
+  check<unsigned int>(Queue);
+  check<unsigned int, 2>(Queue);
+  check<unsigned int, 4>(Queue);
+  check<unsigned int, 8>(Queue);
+  check<unsigned int, 16>(Queue);
+  check<long>(Queue);
+  check<unsigned long>(Queue);
+  if (Queue.get_device().has_extension("cl_khr_fp16")) { // half is optional
+    check<half>(Queue);
+  }
+  check<float>(Queue);
+  if (Queue.get_device().has_extension("cl_khr_fp64")) { // double is optional
+    check<double>(Queue);
+  }
+  std::cout << "Test passed." << std::endl;
+  return 0;
+}
diff --git a/SYCL/Basic/sub_group/vote.cpp b/SYCL/Basic/sub_group/vote.cpp
new file mode 100644
index 0000000000..df6c5595fb
--- /dev/null
+++ b/SYCL/Basic/sub_group/vote.cpp
@@ -0,0 +1,89 @@
+// UNSUPPORTED: cuda
+// CUDA compilation and runtime do not yet support sub-groups.
+//
+// RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple %s -o %t.out
+// RUN: %HOST_RUN_PLACEHOLDER %t.out
+// RUN: %CPU_RUN_PLACEHOLDER %t.out
+// RUN: %GPU_RUN_PLACEHOLDER %t.out
+// RUN: %ACC_RUN_PLACEHOLDER %t.out
+
+//==--------------- vote.cpp - SYCL sub_group vote test --*- C++ -*---------==//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "helper.hpp"
+#include <CL/sycl.hpp>
+using namespace cl::sycl;
+
+// Checks sub-group any_of/all_of over G work-items in work-groups of L.
+// "any" votes on (sub-group local id % D == R), "all" on (... % D < R);
+// the per-item flags are compared on the host via exit_if_not_equal.
+void check(queue Queue, const int G, const int L, const int D, const int R) {
+  try {
+    range<1> GRange(G), LRange(L);
+    nd_range<1> NdRange(GRange, LRange);
+    buffer<int, 1> sganybuf(G);
+    buffer<int, 1> sgallbuf(G);
+
+    // Initialise buffer with zeros
+    Queue.submit([&](handler &cgh) {
+      auto sganyacc = sganybuf.get_access<access::mode::read_write>(cgh);
+      auto sgallacc = sgallbuf.get_access<access::mode::read_write>(cgh);
+      cgh.parallel_for<class init>(range<1>{(unsigned)G}, [=](id<1> index) {
+        sganyacc[index] = 0;
+        sgallacc[index] = 0;
+      });
+    });
+
+    Queue.submit([&](handler &cgh) {
+      auto sganyacc = sganybuf.get_access<access::mode::read_write>(cgh);
+      auto sgallacc = sgallbuf.get_access<access::mode::read_write>(cgh);
+      cgh.parallel_for<class init_bufs>(NdRange, [=](nd_item<1> NdItem) {
+        sganyacc[NdItem.get_global_id()] = 0;
+        sgallacc[NdItem.get_global_id()] = 0;
+      });
+    });
+
+    Queue.submit([&](handler &cgh) {
+      auto sganyacc = sganybuf.get_access<access::mode::read_write>(cgh);
+      auto sgallacc = sgallbuf.get_access<access::mode::read_write>(cgh);
+      cgh.parallel_for<class subgr>(NdRange, [=](nd_item<1> NdItem) {
+        intel::sub_group SG = NdItem.get_sub_group();
+        /* Set to 1 if any local ID in subgroup divided by D has remainder R */
+        if (any_of(SG, SG.get_local_id().get(0) % D == R)) {
+          sganyacc[NdItem.get_global_id()] = 1;
+        }
+        /* Set to 1 if remainder of division of subgroup local ID by D is less
+         * than R for all work items in subgroup */
+        if (all_of(SG, SG.get_local_id().get(0) % D < R)) {
+          sgallacc[NdItem.get_global_id()] = 1;
+        }
+      });
+    });
+    auto sganyacc = sganybuf.get_access<access::mode::read_write>();
+    auto sgallacc = sgallbuf.get_access<access::mode::read_write>();
+    for (int j = 0; j < G; j++) {
+      exit_if_not_equal(sganyacc[j], (int)(D > R), "any");
+      exit_if_not_equal(sgallacc[j], (int)(D <= R), "all");
+    }
+
+  } catch (const exception &e) { // by reference: no copy of the exception
+    std::cout << "SYCL exception caught: " << e.what();
+    exit(1);
+  }
+}
+int main() { // Drives check() with three (G, L, D, R) configurations.
+  queue Queue;
+  if (!core_sg_supported(Queue.get_device())) { // presumably from helper.hpp
+    std::cout << "Skipping test\n";
+    return 0;
+  }
+  check(Queue, 240, 80, 9, 8);   // D > R: check() expects any=1, all=0
+  check(Queue, 24, 12, 9, 10);   // D <= R: check() expects any=0, all=1
+  check(Queue, 1024, 256, 9, 8); // D > R again, larger ranges
+  std::cout << "Test passed." << std::endl;
+}
diff --git a/SYCL/Basic/usm/allocator_vector.cpp b/SYCL/Basic/usm/allocator_vector.cpp
new file mode 100644
index 0000000000..e111ce873e
--- /dev/null
+++ b/SYCL/Basic/usm/allocator_vector.cpp
@@ -0,0 +1,130 @@
+// XFAIL: cuda || level0
+// piextUSM*Alloc functions for CUDA are not behaving as described in
+// https://github.com/intel/llvm/blob/sycl/sycl/doc/extensions/USM/USM.adoc
+// https://github.com/intel/llvm/blob/sycl/sycl/doc/extensions/USM/cl_intel_unified_shared_memory.asciidoc
+//
+// RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple %s -o %t1.out
+// RUN: %HOST_RUN_PLACEHOLDER %t1.out
+// RUN: %CPU_RUN_PLACEHOLDER %t1.out
+// RUN: %GPU_RUN_PLACEHOLDER %t1.out
+
+//==---- allocator_vector.cpp - Allocator Container test -------------------==//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include <CL/sycl.hpp>
+
+#include <vector>
+
+using namespace cl::sycl;
+
+const int N = 8;
+
+class foo;
+int main() {
+  queue q;
+  auto dev = q.get_device();
+  auto ctxt = q.get_context();
+  // Host USM: the vector is filled on the host, the kernel sums into vec[0].
+  if (dev.get_info<info::device::usm_host_allocations>()) {
+    usm_allocator<int, usm::alloc::host> alloc(ctxt, dev);
+
+    std::vector<int, decltype(alloc)> vec(alloc);
+    vec.resize(N);
+
+    for (int i = 0; i < N; i++) {
+      vec[i] = i;
+    }
+
+    int *res = &vec[0]; // res and vals alias the same storage
+    int *vals = &vec[0];
+
+    auto e1 = q.submit([=](handler &h) {
+      h.single_task<class foo>([=]() {
+        for (int i = 1; i < N; i++) {
+          res[0] += vals[i]; // vec[0] is 0 before, so this sums 1..N-1
+        }
+      });
+    });
+
+    e1.wait();
+
+    int answer = (N * (N - 1)) / 2;
+
+    if (vec[0] != answer)
+      return -1;
+  }
+  // Shared USM: same scenario as above with a shared-allocation vector.
+  if (dev.get_info<info::device::usm_shared_allocations>()) {
+    usm_allocator<int, usm::alloc::shared> alloc(ctxt, dev);
+
+    std::vector<int, decltype(alloc)> vec(alloc);
+    vec.resize(N);
+
+    for (int i = 0; i < N; i++) {
+      vec[i] = i;
+    }
+
+    int *res = &vec[0];
+    int *vals = &vec[0];
+
+    auto e1 = q.submit([=](handler &h) {
+      h.single_task<class bar>([=]() {
+        for (int i = 1; i < N; i++) {
+          res[0] += vals[i];
+        }
+      });
+    });
+
+    e1.wait();
+
+    int answer = (N * (N - 1)) / 2;
+
+    if (vec[0] != answer)
+      return -1;
+  }
+  // Device USM: the host never touches the elements; kernels fill and sum.
+  if (dev.get_info<info::device::usm_device_allocations>()) {
+    usm_allocator<int, usm::alloc::device> alloc(ctxt, dev);
+
+    std::vector<int, decltype(alloc)> vec(alloc);
+    vec.resize(N);
+
+    int *res = &vec[0];
+    int *vals = &vec[0];
+    // First kernel writes 0..N-1 into the storage (res[0] ends up as 0).
+    auto e0 = q.submit([=](handler &h) {
+      h.single_task<class baz_init>([=]() {
+        res[0] = 0;
+        for (int i = 0; i < N; i++) {
+          vals[i] = i;
+        }
+      });
+    });
+
+    auto e1 = q.submit([=](handler &h) {
+      h.depends_on(e0); // the sum must not start before initialisation ends
+      h.single_task<class baz>([=]() {
+        for (int i = 1; i < N; i++) {
+          res[0] += vals[i];
+        }
+      });
+    });
+
+    e1.wait();
+
+    int answer = (N * (N - 1)) / 2;
+    int result;
+    q.memcpy(&result, res, sizeof(int)); // copy the sum back to the host
+    q.wait();
+
+    if (result != answer)
+      return -1;
+  }
+
+  return 0;
+}
diff --git a/SYCL/Basic/usm/allocator_vector_fail.cpp b/SYCL/Basic/usm/allocator_vector_fail.cpp
new file mode 100644
index 0000000000..5a310c97ff
--- /dev/null
+++ b/SYCL/Basic/usm/allocator_vector_fail.cpp
@@ -0,0 +1,48 @@
+// XFAIL: cuda
+// piextUSM*Alloc functions for CUDA are not behaving as described in
+// https://github.com/intel/llvm/blob/sycl/sycl/doc/extensions/USM/USM.adoc
+// https://github.com/intel/llvm/blob/sycl/sycl/doc/extensions/USM/cl_intel_unified_shared_memory.asciidoc
+//
+// RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple %s -o %t1.out
+// RUN: %HOST_RUN_PLACEHOLDER %t1.out
+// RUN: %CPU_RUN_PLACEHOLDER %t1.out
+// RUN: %GPU_RUN_PLACEHOLDER %t1.out
+
+//==-- allocator_vector_fail.cpp - Device Memory Allocator fail test -------==//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include <CL/sycl.hpp>
+
+#include <vector>
+
+using namespace cl::sycl;
+
+const int N = 8;
+
+class foo;
+int main() {
+  queue q;
+  auto dev = q.get_device();
+  auto ctxt = q.get_context();
+  // Returns 0 when device USM is unavailable or assign() throws as expected.
+  if (dev.get_info<info::device::usm_device_allocations>()) {
+    try {
+      usm_allocator<int, usm::alloc::device> alloc(ctxt, dev);
+      std::vector<int, decltype(alloc)> vec(alloc);
+
+      // This statement should throw an exception since
+      // device pointers may not be accessed on the host.
+      vec.assign(N, 42);
+    } catch (const feature_not_supported &) { // by reference: no copy
+      return 0;
+    }
+
+    return -1;
+  }
+  return 0;
+}
diff --git a/SYCL/Basic/usm/allocatorll.cpp b/SYCL/Basic/usm/allocatorll.cpp
new file mode 100644
index 0000000000..f40a1bb84f
--- /dev/null
+++ b/SYCL/Basic/usm/allocatorll.cpp
@@ -0,0 +1,88 @@
+// XFAIL: cuda
+// piextUSM*Alloc functions for CUDA are not behaving as described in
+// https://github.com/intel/llvm/blob/sycl/sycl/doc/extensions/USM/USM.adoc
+// https://github.com/intel/llvm/blob/sycl/sycl/doc/extensions/USM/cl_intel_unified_shared_memory.asciidoc
+//
+// RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple %s -o %t1.out
+// RUN: %HOST_RUN_PLACEHOLDER %t1.out
+// RUN: %CPU_RUN_PLACEHOLDER %t1.out
+// RUN: %GPU_RUN_PLACEHOLDER %t1.out
+
+//==---- allocatorll.cpp - Device Memory Linked List Allocator test --------==//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include <CL/sycl.hpp>
+
+using namespace cl::sycl;
+
+int numNodes = 4;
+
+struct Node { // element of the singly linked list built in main()
+  Node() : pNext(nullptr), Num(0xDEADBEEF) {} // detached, placeholder Num
+
+  Node *pNext;  // next element; main() stores nullptr in the last one
+  uint32_t Num; // payload that the kernel rewrites as Num * 2 + 1
+};
+
+class foo;
+int main() {
+  queue q;
+  auto dev = q.get_device();
+  auto ctxt = q.get_context();
+
+  if (!dev.get_info<info::device::usm_device_allocations>())
+    return 0; // nothing to test without device USM
+
+  usm_allocator<Node, usm::alloc::device> alloc(ctxt, dev);
+  Node h_cur; // host staging copy of the node being written or read
+
+  Node *d_head = alloc.allocate(1);
+  Node *d_cur = d_head;
+  // Build the list: node i carries 2*i and is copied to the device via h_cur.
+  for (int i = 0; i < numNodes; i++) {
+    h_cur.Num = i * 2;
+
+    if (i != (numNodes - 1)) {
+      h_cur.pNext = alloc.allocate(1);
+    } else {
+      h_cur.pNext = nullptr;
+    }
+
+    event e0 = q.memcpy(d_cur, &h_cur, sizeof(Node));
+    e0.wait();
+
+    d_cur = h_cur.pNext;
+  }
+  // Device side: walk the list and replace every Num with Num * 2 + 1.
+  auto e1 = q.submit([=](handler &cgh) {
+    cgh.single_task<class foo>([=]() {
+      Node *pHead = d_head;
+      while (pHead) {
+        pHead->Num = pHead->Num * 2 + 1;
+        pHead = pHead->pNext;
+      }
+    });
+  });
+
+  e1.wait();
+  // Read every node back, release it, and compare Num against 4*i + 1.
+  int status = 0;
+  d_cur = d_head;
+  for (int i = 0; i < numNodes; i++) {
+    event c = q.memcpy(&h_cur, d_cur, sizeof(Node));
+    c.wait();
+    alloc.deallocate(d_cur, 1);
+
+    const uint32_t want = i * 4 + 1; // same type as Node::Num
+    if (h_cur.Num != want)
+      status = -2; // keep walking so the remaining nodes are released too
+    d_cur = h_cur.pNext;
+  }
+
+  return status;
+}
diff --git a/SYCL/Basic/usm/badmalloc.cpp b/SYCL/Basic/usm/badmalloc.cpp
new file mode 100644
index 0000000000..2c14c41676
--- /dev/null
+++ b/SYCL/Basic/usm/badmalloc.cpp
@@ -0,0 +1,78 @@
+// UNSUPPORTED: windows
+// XFAIL: cuda
+// piextUSM*Alloc functions for CUDA are not behaving as described in
+// https://github.com/intel/llvm/blob/sycl/sycl/doc/extensions/USM/USM.adoc
+// https://github.com/intel/llvm/blob/sycl/sycl/doc/extensions/USM/cl_intel_unified_shared_memory.asciidoc
+//
+// RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple %s -o %t1.out
+// RUN: %HOST_RUN_PLACEHOLDER %t1.out
+// RUN: %CPU_RUN_PLACEHOLDER %t1.out
+// RUN: %GPU_RUN_PLACEHOLDER %t1.out
+
+//==----------------- badmalloc.cpp - Bad Mallocs test ---------------------==//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+// This test verifies that things fail in the proper way when they should.
+
+#include <CL/sycl.hpp>
+#include <iostream>
+
+using namespace cl::sycl;
+
+int main(int argc, char *argv[]) {
+  queue q;
+
+  // Every request below is invalid and must yield nullptr; the exit code
+  // identifies the first request that unexpectedly produced a pointer.
+  // The helper prints the pointer in the "p = <ptr>" form and reports
+  // whether the allocation succeeded.
+  auto succeeded = [](void *ptr) {
+    std::cout << "p = " << ptr << std::endl;
+    return ptr != nullptr;
+  };
+
+  // Good size, bad type (the only result that is not printed)
+  if (malloc(8, q, usm::alloc::unknown) != nullptr) {
+    return 1;
+  }
+
+  // Bad size, every allocation kind
+  if (succeeded(malloc(-1, q, usm::alloc::host))) {
+    return 2;
+  }
+  if (succeeded(malloc(-1, q, usm::alloc::device))) {
+    return 3;
+  }
+  if (succeeded(malloc(-1, q, usm::alloc::shared))) {
+    return 4;
+  }
+  if (succeeded(malloc(-1, q, usm::alloc::unknown))) {
+    return 5;
+  }
+
+  // Bad size, auto aligned
+  if (succeeded(aligned_alloc(0, -1, q, usm::alloc::host))) {
+    return 6;
+  }
+  if (succeeded(aligned_alloc(0, -1, q, usm::alloc::device))) {
+    return 7;
+  }
+  if (succeeded(aligned_alloc(0, -1, q, usm::alloc::shared))) {
+    return 8;
+  }
+  if (succeeded(aligned_alloc(0, -1, q, usm::alloc::unknown))) {
+    return 9;
+  }
+
+  // Allocs of 0 undefined, but bad type
+  if (succeeded(aligned_alloc(4, 0, q, usm::alloc::unknown))) {
+    return 10;
+  }
+
+  return 0;
+}
diff --git a/SYCL/Basic/usm/depends_on.cpp b/SYCL/Basic/usm/depends_on.cpp
new file mode 100644
index 0000000000..0e8602b838
--- /dev/null
+++ b/SYCL/Basic/usm/depends_on.cpp
@@ -0,0 +1,86 @@
+// XFAIL: cuda
+// piextUSM*Alloc functions for CUDA are not behaving as described in
+// https://github.com/intel/llvm/blob/sycl/sycl/doc/extensions/USM/USM.adoc
+// https://github.com/intel/llvm/blob/sycl/sycl/doc/extensions/USM/cl_intel_unified_shared_memory.asciidoc
+//
+// RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple %s -o %t1.out
+// RUN: %HOST_RUN_PLACEHOLDER %t1.out
+// RUN: %CPU_RUN_PLACEHOLDER %t1.out
+// RUN: %GPU_RUN_PLACEHOLDER %t1.out
+
+//==----------------- depends_on.cpp - depends_on test ---------------------==//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include <CL/sycl.hpp>
+
+using namespace cl::sycl;
+
+class foo;
+int main() {
+  const int N = 4;
+  const int MAGIC_NUM = 42;
+
+  queue q;
+  auto dev = q.get_device();
+  auto ctxt = q.get_context();
+  // All three USM kinds are required; otherwise there is nothing to test.
+  if (!(dev.get_info<info::device::usm_device_allocations>() &&
+        dev.get_info<info::device::usm_host_allocations>() &&
+        dev.get_info<info::device::usm_shared_allocations>()))
+    return 0;
+
+  int *darray = (int *)malloc_device(N * sizeof(int), dev, ctxt);
+  if (darray == nullptr)
+    return -1;
+  int *sarray = (int *)malloc_shared(N * sizeof(int), dev, ctxt);
+  if (sarray == nullptr) {
+    free(darray, ctxt); // release what was already allocated
+    return -1;
+  }
+  int *harray = (int *)malloc_host(N * sizeof(int), ctxt);
+  if (harray == nullptr) {
+    free(darray, ctxt);
+    free(sarray, ctxt);
+    return -1;
+  }
+  // eInit depends on a default-constructed event, eKernel on eInit + eMemset.
+  event e;
+  auto eInit = q.submit([&](handler &cgh) {
+    cgh.depends_on(e);
+    cgh.single_task<class init>([=]() {
+      for (int i = 0; i < N; i++) {
+        sarray[i] = MAGIC_NUM - 1;
+        harray[i] = 1;
+      }
+    });
+  });
+
+  auto eMemset = q.memset(darray, 0, N * sizeof(int));
+
+  auto eKernel = q.submit([=](handler &cgh) {
+    cgh.depends_on({eInit, eMemset});
+    cgh.single_task<class foo>([=]() {
+      for (int i = 0; i < N; i++) {
+        sarray[i] += darray[i] + harray[i];
+      }
+    });
+  });
+
+  eKernel.wait();
+  // Expect (MAGIC_NUM - 1) + 0 + 1 everywhere; the USM is freed either way.
+  int status = 0;
+  for (int i = 0; i < N; i++) {
+    if (sarray[i] != MAGIC_NUM)
+      status = -2;
+  }
+  free(darray, ctxt);
+  free(sarray, ctxt);
+  free(harray, ctxt);
+
+  return status;
+}
diff --git a/SYCL/Basic/usm/dmemll.cpp b/SYCL/Basic/usm/dmemll.cpp
new file mode 100644
index 0000000000..bbbbae1213
--- /dev/null
+++ b/SYCL/Basic/usm/dmemll.cpp
@@ -0,0 +1,93 @@
+// XFAIL: cuda
+// piextUSM*Alloc functions for CUDA are not behaving as described in
+// https://github.com/intel/llvm/blob/sycl/sycl/doc/extensions/USM/USM.adoc
+// https://github.com/intel/llvm/blob/sycl/sycl/doc/extensions/USM/cl_intel_unified_shared_memory.asciidoc
+//
+// RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple %s -o %t1.out
+// RUN: %HOST_RUN_PLACEHOLDER %t1.out
+// RUN: %CPU_RUN_PLACEHOLDER %t1.out
+// RUN: %GPU_RUN_PLACEHOLDER %t1.out
+
+//==------------------- dmemll.cpp - Device Memory Linked List test --------==//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include <CL/sycl.hpp>
+
+using namespace cl::sycl;
+
+int numNodes = 4;
+
+struct Node { // element of the singly linked list built in main()
+  Node() : pNext(nullptr), Num(0xDEADBEEF) {} // detached, placeholder Num
+
+  Node *pNext;  // next element; main() stores nullptr in the last one
+  uint32_t Num; // payload that the kernel rewrites as Num * 2 + 1
+};
+
+class foo;
+int main() {
+  queue q;
+  auto dev = q.get_device();
+  auto ctxt = q.get_context();
+
+  if (!dev.get_info<info::device::usm_device_allocations>())
+    return 0; // nothing to test without device USM
+
+  Node h_cur; // host staging copy of the node being written or read
+
+  Node *d_head = (Node *)malloc_device(sizeof(Node), dev, ctxt);
+  if (d_head == nullptr) {
+    return -1;
+  }
+  Node *d_cur = d_head;
+  // Build the list: node i carries 2*i and is copied to the device via h_cur.
+  for (int i = 0; i < numNodes; i++) {
+    h_cur.Num = i * 2;
+
+    if (i != (numNodes - 1)) {
+      h_cur.pNext = (Node *)malloc_device(sizeof(Node), dev, ctxt);
+      if (h_cur.pNext == nullptr) {
+        return -1;
+      }
+    } else {
+      h_cur.pNext = nullptr;
+    }
+
+    event e0 = q.memcpy(d_cur, &h_cur, sizeof(Node));
+    e0.wait();
+
+    d_cur = h_cur.pNext;
+  }
+  // Device side: walk the list and replace every Num with Num * 2 + 1.
+  auto e1 = q.submit([=](handler &cgh) {
+    cgh.single_task<class foo>([=]() {
+      Node *pHead = d_head;
+      while (pHead) {
+        pHead->Num = pHead->Num * 2 + 1;
+        pHead = pHead->pNext;
+      }
+    });
+  });
+
+  e1.wait();
+  // Read every node back, free it, and compare Num against 4*i + 1.
+  int status = 0;
+  d_cur = d_head;
+  for (int i = 0; i < numNodes; i++) {
+    event c = q.memcpy(&h_cur, d_cur, sizeof(Node));
+    c.wait();
+    free(d_cur, ctxt);
+
+    const uint32_t want = i * 4 + 1; // same type as Node::Num
+    if (h_cur.Num != want)
+      status = -2; // keep walking so the remaining nodes are freed too
+    d_cur = h_cur.pNext;
+  }
+
+  return status;
+}
diff --git a/SYCL/Basic/usm/dmemllaligned.cpp b/SYCL/Basic/usm/dmemllaligned.cpp
new file mode 100644
index 0000000000..6daeb8adca
--- /dev/null
+++ b/SYCL/Basic/usm/dmemllaligned.cpp
@@ -0,0 +1,90 @@
+// RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple %s -o %t1.out
+// RUN: %HOST_RUN_PLACEHOLDER %t1.out
+// RUN: %CPU_RUN_PLACEHOLDER %t1.out
+// RUN: %GPU_RUN_PLACEHOLDER %t1.out
+
+//==---- dmemllaligned.cpp - Aligned Device Memory Linked List test --------==//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include <CL/sycl.hpp>
+
+using namespace cl::sycl;
+
+int numNodes = 4;
+
+struct Node { // element of the singly linked list built in main()
+  Node() : pNext(nullptr), Num(0xDEADBEEF) {} // detached, placeholder Num
+
+  Node *pNext;  // next element; main() stores nullptr in the last one
+  uint32_t Num; // payload that the kernel rewrites as Num * 2 + 1
+};
+
+class foo;
+int main() {
+  queue q;
+  auto dev = q.get_device();
+  auto ctxt = q.get_context();
+
+  if (!dev.get_info<info::device::usm_device_allocations>())
+    return 0; // nothing to test without device USM
+
+  Node h_cur; // host staging copy of the node being written or read
+
+  Node *d_head =
+      (Node *)aligned_alloc_device(alignof(Node), sizeof(Node), dev, ctxt);
+  if (d_head == nullptr) {
+    return -1;
+  }
+  Node *d_cur = d_head;
+  // Build the list: node i carries 2*i and is copied to the device via h_cur.
+  for (int i = 0; i < numNodes; i++) {
+    h_cur.Num = i * 2;
+
+    if (i != (numNodes - 1)) {
+      h_cur.pNext =
+          (Node *)aligned_alloc_device(alignof(Node), sizeof(Node), dev, ctxt);
+      if (h_cur.pNext == nullptr) {
+        return -1;
+      }
+    } else {
+      h_cur.pNext = nullptr;
+    }
+
+    event e0 = q.memcpy(d_cur, &h_cur, sizeof(Node));
+    e0.wait();
+
+    d_cur = h_cur.pNext;
+  }
+  // Device side: walk the list and replace every Num with Num * 2 + 1.
+  auto e1 = q.submit([=](handler &cgh) {
+    cgh.single_task<class foo>([=]() {
+      Node *pHead = d_head;
+      while (pHead) {
+        pHead->Num = pHead->Num * 2 + 1;
+        pHead = pHead->pNext;
+      }
+    });
+  });
+
+  e1.wait();
+  // Read every node back, free it, and compare Num against 4*i + 1.
+  int status = 0;
+  d_cur = d_head;
+  for (int i = 0; i < numNodes; i++) {
+    event c = q.memcpy(&h_cur, d_cur, sizeof(Node));
+    c.wait();
+    free(d_cur, ctxt);
+
+    const uint32_t want = i * 4 + 1; // same type as Node::Num
+    if (h_cur.Num != want)
+      status = -2; // keep walking so the remaining nodes are freed too
+    d_cur = h_cur.pNext;
+  }
+
+  return status;
+}
diff --git a/SYCL/Basic/usm/findplatforms.hpp b/SYCL/Basic/usm/findplatforms.hpp
new file mode 100644
index 0000000000..592464385a
--- /dev/null
+++ b/SYCL/Basic/usm/findplatforms.hpp
@@ -0,0 +1,45 @@
+//==------------------- findplatforms.hpp ----------------------------------==//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+// Returns true and fills platformOut/deviceOut with the first platform that
+// has a device of deviceType (and that platform's first such device);
+// returns false on an OpenCL error or when no platform has such a device.
+inline bool findPlatformAndDevice(cl_device_type deviceType,
+                                  cl_platform_id &platformOut,
+                                  cl_device_id &deviceOut) {
+  cl_uint numPlatforms;
+  cl_int errorCode;
+
+  errorCode = clGetPlatformIDs(0, nullptr, &numPlatforms);
+  if (errorCode != CL_SUCCESS) return false;
+
+  std::vector<cl_platform_id> platforms(numPlatforms);
+  errorCode = clGetPlatformIDs(numPlatforms, platforms.data(), nullptr);
+  if (errorCode != CL_SUCCESS) return false;
+
+  for (auto platform : platforms) {
+    cl_uint numDevices = 0;
+    errorCode = clGetDeviceIDs(platform, deviceType, 0, nullptr, &numDevices);
+    // This has to check both codes because if a platform has 0 devices
+    // of deviceType, clGetDeviceIDs returns CL_DEVICE_NOT_FOUND.
+    // We don't want to bail yet as the next platform might have it.
+    // We bail out here if we see something other than those two error codes.
+    if (!(errorCode == CL_SUCCESS || errorCode == CL_DEVICE_NOT_FOUND))
+      return false;
+
+    if (numDevices) {
+      std::vector<cl_device_id> devices(numDevices);
+      errorCode = clGetDeviceIDs(platform, deviceType, numDevices,
+                                 devices.data(), nullptr);
+      if (errorCode != CL_SUCCESS) return false;
+      platformOut = platform;
+      deviceOut = devices[0];
+      return true;
+    }
+  }
+  return false;
+}
diff --git a/SYCL/Basic/usm/hmemll.cpp b/SYCL/Basic/usm/hmemll.cpp
new file mode 100644
index 0000000000..72dd514b74
--- /dev/null
+++ b/SYCL/Basic/usm/hmemll.cpp
@@ -0,0 +1,86 @@
+// XFAIL: cuda
+// piextUSM*Alloc functions for CUDA are not behaving as described in
+// https://github.com/intel/llvm/blob/sycl/sycl/doc/extensions/USM/USM.adoc
+// https://github.com/intel/llvm/blob/sycl/sycl/doc/extensions/USM/cl_intel_unified_shared_memory.asciidoc
+//
+// RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple %s -o %t1.out
+// RUN: %HOST_RUN_PLACEHOLDER %t1.out
+// RUN: %CPU_RUN_PLACEHOLDER %t1.out
+// RUN: %GPU_RUN_PLACEHOLDER %t1.out
+
+//==------------------- hmemll.cpp - Host Memory Linked List test ----------==//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include <CL/sycl.hpp>
+
+using namespace cl::sycl;
+
+int numNodes = 4;
+
+struct Node { // element of the singly linked list built in main()
+  Node() : pNext(nullptr), Num(0xDEADBEEF) {} // detached, placeholder Num
+
+  Node *pNext;  // next element; main() stores nullptr in the last one
+  uint32_t Num; // payload that the kernel rewrites as Num * 2 + 1
+};
+
+class foo;
+int main() {
+  queue q;
+  auto ctxt = q.get_context();
+  auto dev = q.get_device();
+
+  if (!dev.get_info<info::device::usm_host_allocations>())
+    return 0; // nothing to test without host USM
+
+  Node *h_head = (Node *)malloc_host(sizeof(Node), ctxt);
+  if (h_head == nullptr) {
+    return -1;
+  }
+  Node *h_cur = h_head;
+  // Build the list directly in host USM: node i carries 2*i.
+  for (int i = 0; i < numNodes; i++) {
+    h_cur->Num = i * 2;
+
+    if (i != (numNodes - 1)) {
+      h_cur->pNext = (Node *)malloc_host(sizeof(Node), ctxt);
+      if (h_cur->pNext == nullptr) {
+        return -1;
+      }
+    } else {
+      h_cur->pNext = nullptr;
+    }
+
+    h_cur = h_cur->pNext;
+  }
+  // Device side: walk the list and replace every Num with Num * 2 + 1.
+  auto e1 = q.submit([=](handler &cgh) {
+    cgh.single_task<class foo>([=]() {
+      Node *pHead = h_head;
+      while (pHead) {
+        pHead->Num = pHead->Num * 2 + 1;
+        pHead = pHead->pNext;
+      }
+    });
+  });
+
+  e1.wait();
+  // Compare every Num against 4*i + 1 and free each node after reading it.
+  int status = 0;
+  h_cur = h_head;
+  for (int i = 0; i < numNodes; i++) {
+    const uint32_t want = i * 4 + 1; // same type as Node::Num
+    if (h_cur->Num != want)
+      status = -2; // keep walking so every node is still freed
+    Node *old = h_cur;
+    h_cur = h_cur->pNext;
+    free(old, ctxt);
+  }
+
+  return status;
+}
diff --git a/SYCL/Basic/usm/hmemllaligned.cpp b/SYCL/Basic/usm/hmemllaligned.cpp
new file mode 100644
index 0000000000..b08038d068
--- /dev/null
+++ b/SYCL/Basic/usm/hmemllaligned.cpp
@@ -0,0 +1,82 @@
+// RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple %s -o %t1.out
+// RUN: %HOST_RUN_PLACEHOLDER %t1.out
+// RUN: %CPU_RUN_PLACEHOLDER %t1.out
+// RUN: %GPU_RUN_PLACEHOLDER %t1.out
+
+//==---- hmemllaligned.cpp - Aligned Host Memory Linked List test ----------==//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include <CL/sycl.hpp>
+
+using namespace cl::sycl;
+
+int numNodes = 4;
+
+struct Node { // element of the singly linked list built in main()
+  Node() : pNext(nullptr), Num(0xDEADBEEF) {} // detached, placeholder Num
+
+  Node *pNext;  // next element; main() stores nullptr in the last one
+  uint32_t Num; // payload that the kernel rewrites as Num * 2 + 1
+};
+
+class foo;
+int main() {
+  queue q;
+  auto ctxt = q.get_context();
+  auto dev = q.get_device();
+
+  if (!dev.get_info<info::device::usm_host_allocations>())
+    return 0; // nothing to test without host USM
+
+  Node *h_head = (Node *)aligned_alloc_host(alignof(Node), sizeof(Node), ctxt);
+  if (h_head == nullptr) {
+    return -1;
+  }
+  Node *h_cur = h_head;
+  // Build the list directly in host USM: node i carries 2*i.
+  for (int i = 0; i < numNodes; i++) {
+    h_cur->Num = i * 2;
+
+    if (i != (numNodes - 1)) {
+      h_cur->pNext =
+          (Node *)aligned_alloc_host(alignof(Node), sizeof(Node), ctxt);
+      if (h_cur->pNext == nullptr) {
+        return -1;
+      }
+    } else {
+      h_cur->pNext = nullptr;
+    }
+
+    h_cur = h_cur->pNext;
+  }
+  // Device side: walk the list and replace every Num with Num * 2 + 1.
+  auto e1 = q.submit([=](handler &cgh) {
+    cgh.single_task<class foo>([=]() {
+      Node *pHead = h_head;
+      while (pHead) {
+        pHead->Num = pHead->Num * 2 + 1;
+        pHead = pHead->pNext;
+      }
+    });
+  });
+
+  e1.wait();
+  // Compare every Num against 4*i + 1 and free each node after reading it.
+  int status = 0;
+  h_cur = h_head;
+  for (int i = 0; i < numNodes; i++) {
+    const uint32_t want = i * 4 + 1; // same type as Node::Num
+    if (h_cur->Num != want)
+      status = -2; // keep walking so every node is still freed
+    Node *old = h_cur;
+    h_cur = h_cur->pNext;
+    free(old, ctxt);
+  }
+
+  return status;
+}
diff --git a/SYCL/Basic/usm/math.cpp b/SYCL/Basic/usm/math.cpp
new file mode 100644
index 0000000000..583a9fb9cd
--- /dev/null
+++ b/SYCL/Basic/usm/math.cpp
@@ -0,0 +1,134 @@
+// RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple %s -o %t.out
+// RUN: %HOST_RUN_PLACEHOLDER %t.out
+// RUN: %CPU_RUN_PLACEHOLDER %t.out
+
+// REQUIRES: cpu
+// XFAIL: cuda
+// TODO: ptxas fatal   : Unresolved extern function '_Z20__spirv_ocl_lgamma_rfPi'
+
+#include <CL/sycl.hpp>
+
+#include <array>
+#include <cassert>
+#include <cmath>
+
+namespace s = cl::sycl;
+
+int main() {
+  s::queue myQueue;
+  // Each case runs one builtin in a single_task that writes through USM.
+  if (myQueue.get_device()
+          .get_info<s::info::device::usm_shared_allocations>()) {
+    // fract with unified shared memory
+    {
+      s::cl_float r{0};
+      s::cl_float i{999};
+      {
+        s::cl_float *Buf = (s::cl_float *)s::malloc_shared(
+            sizeof(s::cl_float) * 2, myQueue.get_device(),
+            myQueue.get_context());
+        myQueue.submit([&](s::handler &cgh) {
+          cgh.single_task<class fractF1UF1>(
+              [=]() { Buf[0] = s::fract(s::cl_float{1.5f}, &Buf[1]); });
+        });
+        myQueue.wait();
+        r = Buf[0];
+        i = Buf[1];
+        s::free(Buf, myQueue.get_context());
+      }
+      assert(r == 0.5f);
+      assert(i == 1.0f);
+    }
+
+    // vector fract with unified shared memory
+    {
+      s::cl_float2 *Buf = (s::cl_float2 *)s::malloc_shared(
+          sizeof(s::cl_float2) * 2, myQueue.get_device(),
+          myQueue.get_context());
+      myQueue.submit([&](s::handler &cgh) {
+        cgh.single_task<class fractF2UF2>([=]() {
+          Buf[0] = s::fract(s::cl_float2{1.5f, 2.5f}, &Buf[1]);
+        });
+      });
+      myQueue.wait();
+      s::cl_float r1 = Buf[0].x();
+      s::cl_float r2 = Buf[0].y();
+      s::cl_float i1 = Buf[1].x();
+      s::cl_float i2 = Buf[1].y();
+      s::free(Buf, myQueue.get_context()); // values were copied out above
+      assert(r1 == 0.5f);
+      assert(r2 == 0.5f);
+      assert(i1 == 1.0f);
+      assert(i2 == 2.0f);
+    }
+
+    // lgamma_r with unified shared memory
+    {
+      s::cl_float r{0};
+      s::cl_int i{999};
+      {
+        s::buffer<s::cl_float, 1> BufR(&r, s::range<1>(1));
+        s::cl_int *BufI = (s::cl_int *)s::malloc_shared(
+            sizeof(s::cl_int) * 2, myQueue.get_device(), myQueue.get_context());
+        myQueue.submit([&](s::handler &cgh) {
+          auto AccR = BufR.get_access<s::access::mode::read_write>(cgh);
+          cgh.single_task<class lgamma_rF1PI1>(
+              [=]() { AccR[0] = s::lgamma_r(s::cl_float{10.f}, BufI); });
+        });
+        myQueue.wait();
+        i = *BufI;
+        s::free(BufI, myQueue.get_context());
+      }
+      assert(r > 12.8017f && r < 12.8019f); // ~12.8018
+      assert(i == 1);                       // tgamma of 10 is ~362880.0
+    }
+
+    // lgamma_r (negative argument) with unified shared memory
+    {
+      s::cl_float r{0};
+      s::cl_int i{999};
+      {
+        s::buffer<s::cl_float, 1> BufR(&r, s::range<1>(1));
+        s::cl_int *BufI = (s::cl_int *)s::malloc_shared(
+            sizeof(s::cl_int) * 2, myQueue.get_device(), myQueue.get_context());
+        myQueue.submit([&](s::handler &cgh) {
+          auto AccR = BufR.get_access<s::access::mode::read_write>(cgh);
+          cgh.single_task<class lgamma_rF1PI1_neg>(
+              [=]() { AccR[0] = s::lgamma_r(s::cl_float{-2.4f}, BufI); });
+        });
+        myQueue.wait();
+        i = *BufI;
+        s::free(BufI, myQueue.get_context());
+      }
+      assert(r > 0.1024f && r < 0.1026f); // ~0.102583
+      assert(i == -1); // tgamma of -2.4 is ~-1.1080299470333461
+    }
+
+    // vector lgamma_r with unified shared memory
+    {
+      s::cl_float2 r{0, 0};
+      s::cl_int2 *BufI = (s::cl_int2 *)s::malloc_shared(
+          sizeof(s::cl_int2) * 2, myQueue.get_device(), myQueue.get_context());
+      {
+        s::buffer<s::cl_float2, 1> BufR(&r, s::range<1>(1));
+        myQueue.submit([&](s::handler &cgh) {
+          auto AccR = BufR.get_access<s::access::mode::read_write>(cgh);
+          cgh.single_task<class lgamma_rF2PF2>([=]() {
+            AccR[0] = s::lgamma_r(s::cl_float2{10.f, -2.4f}, BufI);
+          });
+        });
+        myQueue.wait();
+      } // BufR is destroyed here, which writes the result back into r
+      s::cl_float r1 = r.x();
+      s::cl_float r2 = r.y();
+      s::cl_int i1 = BufI->x();
+      s::cl_int i2 = BufI->y();
+      s::free(BufI, myQueue.get_context()); // signs were copied out above
+      assert(r1 > 12.8017f && r1 < 12.8019f); // ~12.8018
+      assert(r2 > 0.1024f && r2 < 0.1026f);   // ~0.102583
+      assert(i1 == 1);                        // tgamma of 10 is ~362880.0
+      assert(i2 == -1); // tgamma of -2.4 is ~-1.1080299470333461
+    }
+  }
+  return 0;
+}
diff --git a/SYCL/Basic/usm/memadvise.cpp b/SYCL/Basic/usm/memadvise.cpp
new file mode 100644
index 0000000000..87e4c6f47b
--- /dev/null
+++ b/SYCL/Basic/usm/memadvise.cpp
@@ -0,0 +1,87 @@
+// XFAIL: cuda
+// SYCL runtime and piextUSM*Alloc functions for CUDA not behaving as described
+// in: https://github.com/intel/llvm/blob/sycl/sycl/doc/extensions/USM/USM.adoc
+//
+// RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple %s -o %t1.out
+// RUN: %HOST_RUN_PLACEHOLDER %t1.out
+// RUN: %CPU_RUN_PLACEHOLDER %t1.out
+// RUN: %GPU_RUN_PLACEHOLDER %t1.out
+
+//==---------------- memadvise.cpp - Shared Memory Linked List test --------==//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include <CL/sycl.hpp>
+
+using namespace cl::sycl;
+
+int numNodes = 4;
+
+struct Node { // list element; main() places these in shared USM
+  Node() : pNext(nullptr), Num(0xDEADBEEF) {} // not run for the malloc_shared'ed nodes in main()
+
+  Node *pNext;  // next element, nullptr terminates the list
+  uint32_t Num; // payload: set on the host, rewritten by the kernel
+};
+
+class foo;
+int main() {
+  queue q;
+  auto dev = q.get_device();
+  auto ctxt = q.get_context();
+  if (!dev.get_info<info::device::usm_shared_allocations>())
+    return 0; // no shared USM on this device: nothing to test
+
+  Node *s_head = (Node *)malloc_shared(sizeof(Node), dev, ctxt);
+  if (s_head == nullptr) {
+    return -1;
+  }
+  q.mem_advise(s_head, sizeof(Node), PI_MEM_ADVICE_SET_READ_MOSTLY);
+  Node *s_cur = s_head;
+  // Build a numNodes-long list on the host; node i carries i * 2 and is advised.
+  for (int i = 0; i < numNodes; i++) {
+    s_cur->Num = i * 2;
+
+    if (i != (numNodes - 1)) {
+      s_cur->pNext = (Node *)malloc_shared(sizeof(Node), dev, ctxt);
+      if (s_cur->pNext == nullptr) {
+        return -1;
+      }
+      q.mem_advise(s_cur->pNext, sizeof(Node), PI_MEM_ADVICE_SET_READ_MOSTLY);
+    } else {
+      s_cur->pNext = nullptr;
+    }
+
+    s_cur = s_cur->pNext;
+  }
+  // One work-item walks the list on the device, mapping Num to 2 * Num + 1.
+  auto e1 = q.submit([=](handler &cgh) {
+    cgh.single_task<class foo>([=]() {
+      Node *pHead = s_head;
+      while (pHead) {
+        pHead->Num = pHead->Num * 2 + 1;
+        pHead = pHead->pNext;
+      }
+    });
+  });
+
+  e1.wait();
+
+  s_cur = s_head;
+  // Verify node i holds 2 * (i * 2) + 1 == 4 * i + 1, freeing nodes as we go.
+  for (int i = 0; i < numNodes; i++) {
+    const uint32_t want = i * 4 + 1; // same type as Node::Num
+    if (s_cur->Num != want) {
+      return -2;
+    }
+    Node *old = s_cur;
+    s_cur = s_cur->pNext;
+    free(old, ctxt);
+  }
+
+  return 0;
+}
diff --git a/SYCL/Basic/usm/memcpy.cpp b/SYCL/Basic/usm/memcpy.cpp
new file mode 100644
index 0000000000..0b933d0f00
--- /dev/null
+++ b/SYCL/Basic/usm/memcpy.cpp
@@ -0,0 +1,63 @@
+//==---- memcpy.cpp - USM memcpy test --------------------------------------==//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+// XFAIL: cuda
+// piextUSM*Alloc functions for CUDA are not behaving as described in
+// https://github.com/intel/llvm/blob/sycl/sycl/doc/extensions/USM/USM.adoc
+// https://github.com/intel/llvm/blob/sycl/sycl/doc/extensions/USM/cl_intel_unified_shared_memory.asciidoc
+//
+// RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple  %s -o %t1.out
+// RUN: %CPU_RUN_PLACEHOLDER %t1.out
+// RUN: %GPU_RUN_PLACEHOLDER %t1.out
+
+#include <CL/sycl.hpp>
+
+using namespace cl::sycl;
+
+static constexpr int count = 100;
+
+int main() {
+  queue q([](exception_list el) { // async handler: rethrow the first error
+    for (auto &e : el)
+      std::rethrow_exception(e);
+  });
+  if (q.get_device().get_info<info::device::usm_shared_allocations>()) {
+    float *src = (float *)malloc_shared(sizeof(float) * count, q.get_device(),
+                                        q.get_context());
+    float *dest = (float *)malloc_shared(sizeof(float) * count, q.get_device(),
+                                         q.get_context());
+    for (int i = 0; i < count; i++)
+      src[i] = i;
+
+    event init_copy = q.submit(
+        [&](handler &cgh) { cgh.memcpy(dest, src, sizeof(float) * count); });
+
+    q.submit([&](handler &cgh) {
+      cgh.depends_on(init_copy); // the kernel must see the copied data
+      cgh.single_task<class double_dest>([=]() {
+        for (int i = 0; i < count; i++)
+          dest[i] *= 2;
+      });
+    });
+    q.wait_and_throw();
+
+    for (int i = 0; i < count; i++)
+      assert(dest[i] == i * 2);
+
+    try { // Copying to nullptr should throw.
+      q.submit([&](handler &cgh) {
+        cgh.memcpy(nullptr, src, sizeof(float) * count);
+      });
+      q.wait_and_throw();
+      assert(false && "Expected error from copying to nullptr");
+    } catch (const runtime_error &) { // expected; caught by reference
+    }
+    free(src, q.get_context());
+    free(dest, q.get_context());
+  }
+  return 0;
+}
diff --git a/SYCL/Basic/usm/memset.cpp b/SYCL/Basic/usm/memset.cpp
new file mode 100644
index 0000000000..313fa4cbda
--- /dev/null
+++ b/SYCL/Basic/usm/memset.cpp
@@ -0,0 +1,59 @@
+// XFAIL: cuda
+// piextUSM*Alloc functions for CUDA are not behaving as described in
+// https://github.com/intel/llvm/blob/sycl/sycl/doc/extensions/USM/USM.adoc
+// https://github.com/intel/llvm/blob/sycl/sycl/doc/extensions/USM/cl_intel_unified_shared_memory.asciidoc
+//
+// RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple %s -o %t1.out
+// RUN: %CPU_RUN_PLACEHOLDER %t1.out
+// RUN: %GPU_RUN_PLACEHOLDER %t1.out
+
+//==---- memset.cpp - USM memset test --------------------------------------==//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+#include <CL/sycl.hpp>
+
+using namespace cl::sycl;
+
+static constexpr int count = 100;
+
+int main() {
+  queue q([](exception_list el) { // async handler: rethrow the first error
+    for (auto &e : el)
+      std::rethrow_exception(e);
+  });
+  if (q.get_device().get_info<info::device::usm_shared_allocations>()) {
+    uint32_t *src = (uint32_t *)malloc_shared(sizeof(uint32_t) * count,
+                                              q.get_device(), q.get_context());
+
+    event init_copy = q.submit(
+        [&](handler &cgh) { cgh.memset(src, 0x15, sizeof(uint32_t) * count); });
+
+    q.submit([&](handler &cgh) {
+      cgh.depends_on(init_copy); // the kernel must see the filled data
+      cgh.single_task<class double_dest>([=]() {
+        for (int i = 0; i < count; i++)
+          src[i] *= 2;
+      });
+    });
+    q.wait_and_throw();
+
+    // Every byte was set to 0x15, so each word is 0x15151515; doubled: 0x2a2a2a2a.
+    for (int i = 0; i < count; i++)
+      assert(src[i] == 0x2a2a2a2a);
+
+    try { // Filling to nullptr should throw.
+      q.submit([&](handler &cgh) {
+        cgh.memset(nullptr, 0, sizeof(uint32_t) * count);
+      });
+      q.wait_and_throw();
+      assert(false && "Expected error from writing to nullptr");
+    } catch (const runtime_error &) { // expected; caught by reference
+    }
+    free(src, q.get_context());
+  }
+  return 0;
+}
diff --git a/SYCL/Basic/usm/mixed.cpp b/SYCL/Basic/usm/mixed.cpp
new file mode 100644
index 0000000000..afb06370c5
--- /dev/null
+++ b/SYCL/Basic/usm/mixed.cpp
@@ -0,0 +1,79 @@
+// XFAIL: cuda
+// piextUSM*Alloc functions for CUDA are not behaving as described in
+// https://github.com/intel/llvm/blob/sycl/sycl/doc/extensions/USM/USM.adoc
+// https://github.com/intel/llvm/blob/sycl/sycl/doc/extensions/USM/cl_intel_unified_shared_memory.asciidoc
+//
+// RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple %s -o %t1.out
+// RUN: %HOST_RUN_PLACEHOLDER %t1.out
+// RUN: %CPU_RUN_PLACEHOLDER %t1.out
+// RUN: %GPU_RUN_PLACEHOLDER %t1.out
+
+//==------------------- mixed.cpp - Mixed Memory test ---------------------==//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include <CL/sycl.hpp>
+
+using namespace cl::sycl;
+
+class foo;
+int main() {
+  const int N = 4;
+  const int MAGIC_NUM = 42; // value every sarray element must hold at the end
+
+  queue q;
+  auto dev = q.get_device();
+  auto ctxt = q.get_context();
+
+  if (!(dev.get_info<info::device::usm_device_allocations>() &&
+        dev.get_info<info::device::usm_shared_allocations>() &&
+        dev.get_info<info::device::usm_host_allocations>()))
+    return 0; // at least one USM kind is unsupported: nothing to test
+  // One array per USM kind: device (darray), shared (sarray), host (harray).
+  int *darray = (int *)malloc_device(N * sizeof(int), dev, ctxt);
+  if (darray == nullptr) {
+    return -1;
+  }
+  int *sarray = (int *)malloc_shared(N * sizeof(int), dev, ctxt);
+
+  if (sarray == nullptr) {
+    return -1;
+  }
+
+  int *harray = (int *)malloc_host(N * sizeof(int), ctxt);
+  if (harray == nullptr) {
+    return -1;
+  }
+  for (int i = 0; i < N; i++) {
+    sarray[i] = MAGIC_NUM - 1; // 41; the kernel adds darray (0) + harray (1)
+    harray[i] = 1;
+  }
+
+  auto e0 = q.memset(darray, 0, N * sizeof(int)); // darray is filled via the queue
+  e0.wait();
+
+  auto e1 = q.submit([=](handler &cgh) {
+    cgh.single_task<class foo>([=]() {
+      for (int i = 0; i < N; i++) {
+        sarray[i] += darray[i] + harray[i]; // reads all three allocation kinds
+      }
+    });
+  });
+
+  e1.wait();
+
+  for (int i = 0; i < N; i++) {
+    if (sarray[i] != MAGIC_NUM) {
+      return -2; // wrong result (allocation failures return -1)
+    }
+  }
+  free(darray, ctxt);
+  free(sarray, ctxt);
+  free(harray, ctxt);
+
+  return 0;
+}
diff --git a/SYCL/Basic/usm/mixed2.cpp b/SYCL/Basic/usm/mixed2.cpp
new file mode 100644
index 0000000000..72c15cf055
--- /dev/null
+++ b/SYCL/Basic/usm/mixed2.cpp
@@ -0,0 +1,79 @@
+// XFAIL: cuda
+// piextUSM*Alloc functions for CUDA are not behaving as described in
+// https://github.com/intel/llvm/blob/sycl/sycl/doc/extensions/USM/USM.adoc
+// https://github.com/intel/llvm/blob/sycl/sycl/doc/extensions/USM/cl_intel_unified_shared_memory.asciidoc
+//
+// RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple %s -o %t1.out
+// RUN: %HOST_RUN_PLACEHOLDER %t1.out
+// RUN: %CPU_RUN_PLACEHOLDER %t1.out
+// RUN: %GPU_RUN_PLACEHOLDER %t1.out
+
+//==------------------- mixed2.cpp - Mixed Memory test ---------------------==//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include <CL/sycl.hpp>
+
+using namespace cl::sycl;
+
+class foo;
+int main() {
+  const int N = 4;
+  const int MAGIC_NUM = 42; // value every sarray element must hold at the end
+
+  queue q;
+  auto dev = q.get_device();
+  auto ctxt = q.get_context();
+
+  if (!(dev.get_info<info::device::usm_device_allocations>() &&
+        dev.get_info<info::device::usm_shared_allocations>() &&
+        dev.get_info<info::device::usm_host_allocations>()))
+    return 0; // at least one USM kind is unsupported: nothing to test
+  // One array per USM kind, all through malloc(size, dev, ctxt, usm::alloc kind).
+  int *darray = (int *)malloc(N * sizeof(int), dev, ctxt, usm::alloc::device);
+  if (darray == nullptr) {
+    return -1;
+  }
+  int *sarray = (int *)malloc(N * sizeof(int), dev, ctxt, usm::alloc::shared);
+
+  if (sarray == nullptr) {
+    return -1;
+  }
+
+  int *harray = (int *)malloc(N * sizeof(int), dev, ctxt, usm::alloc::host);
+  if (harray == nullptr) {
+    return -1;
+  }
+  for (int i = 0; i < N; i++) {
+    sarray[i] = MAGIC_NUM - 1; // 41; the kernel adds darray (0) + harray (1)
+    harray[i] = 1;
+  }
+
+  auto e0 = q.memset(darray, 0, N * sizeof(int)); // darray is filled via the queue
+  e0.wait();
+
+  auto e1 = q.submit([=](handler &cgh) {
+    cgh.single_task<class foo>([=]() {
+      for (int i = 0; i < N; i++) {
+        sarray[i] += darray[i] + harray[i]; // reads all three allocation kinds
+      }
+    });
+  });
+
+  e1.wait();
+
+  for (int i = 0; i < N; i++) {
+    if (sarray[i] != MAGIC_NUM) {
+      return -2; // wrong result (allocation failures return -1)
+    }
+  }
+  free(darray, ctxt);
+  free(sarray, ctxt);
+  free(harray, ctxt);
+
+  return 0;
+}
diff --git a/SYCL/Basic/usm/mixed2template.cpp b/SYCL/Basic/usm/mixed2template.cpp
new file mode 100644
index 0000000000..7add1dcb33
--- /dev/null
+++ b/SYCL/Basic/usm/mixed2template.cpp
@@ -0,0 +1,92 @@
+// XFAIL: cuda
+// piextUSM*Alloc functions for CUDA are not behaving as described in
+// https://github.com/intel/llvm/blob/sycl/sycl/doc/extensions/USM/USM.adoc
+// https://github.com/intel/llvm/blob/sycl/sycl/doc/extensions/USM/cl_intel_unified_shared_memory.asciidoc
+//
+// RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple %s -o %t1.out
+// RUN: %HOST_RUN_PLACEHOLDER %t1.out
+// RUN: %CPU_RUN_PLACEHOLDER %t1.out
+// RUN: %GPU_RUN_PLACEHOLDER %t1.out
+
+//==---------- mixed2template.cpp - Mixed Memory with Templates test -------==//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include <CL/sycl.hpp>
+
+using namespace cl::sycl;
+
+class foo;
+int main() {
+  const int N = 4;
+  const int MAGIC_NUM = 42; // value every sarray element must hold at the end
+
+  queue q;
+  auto dev = q.get_device();
+  auto ctxt = q.get_context();
+
+  if (!(dev.get_info<info::device::usm_device_allocations>() &&
+        dev.get_info<info::device::usm_shared_allocations>() &&
+        dev.get_info<info::device::usm_host_allocations>()))
+    return 0; // at least one USM kind is unsupported: nothing to test
+  // Typed malloc<T>: N is passed where the untyped calls take a byte size (presumably an element count).
+  int *darray = malloc<int>(N, dev, ctxt, usm::alloc::device);
+  if (darray == nullptr) {
+    return -1;
+  }
+  int *sarray = malloc<int>(N, dev, ctxt, usm::alloc::shared);
+
+  if (sarray == nullptr) {
+    return -1;
+  }
+
+  int *harray = malloc<int>(N, dev, ctxt, usm::alloc::host);
+  if (harray == nullptr) {
+    return -1;
+  }
+  for (int i = 0; i < N; i++) {
+    sarray[i] = MAGIC_NUM - 1; // 41; the kernel adds darray (0) + harray (1)
+    harray[i] = 1;
+  }
+
+  auto e0 = q.memset(darray, 0, N * sizeof(int)); // darray is filled via the queue
+  e0.wait();
+
+  auto e1 = q.submit([=](handler &cgh) {
+    cgh.single_task<class foo>([=]() {
+      for (int i = 0; i < N; i++) {
+        sarray[i] += darray[i] + harray[i]; // reads all three allocation kinds
+      }
+    });
+  });
+
+  e1.wait();
+
+  for (int i = 0; i < N; i++) {
+    if (sarray[i] != MAGIC_NUM) {
+      return -2; // wrong result (allocation failures return -1)
+    }
+  }
+  free(darray, ctxt);
+  free(sarray, ctxt);
+  free(harray, ctxt);
+
+  float *hfarray = malloc<float>(N, q, usm::alloc::host); // queue-based typed overload
+  if (hfarray == nullptr)
+    return -3;
+
+  free(hfarray, ctxt);
+
+  double *sdarray = // queue-based typed aligned overload; only allocation success is checked
+      aligned_alloc<double>(alignof(double), N, q, usm::alloc::shared);
+  if (sdarray == nullptr)
+    return -4;
+
+  free(sdarray, ctxt);
+  
+  return 0;
+}
diff --git a/SYCL/Basic/usm/mixed_queue.cpp b/SYCL/Basic/usm/mixed_queue.cpp
new file mode 100644
index 0000000000..c4174dd508
--- /dev/null
+++ b/SYCL/Basic/usm/mixed_queue.cpp
@@ -0,0 +1,108 @@
+// XFAIL: cuda
+// piextUSM*Alloc functions for CUDA are not behaving as described in
+// https://github.com/intel/llvm/blob/sycl/sycl/doc/extensions/USM/USM.adoc
+// https://github.com/intel/llvm/blob/sycl/sycl/doc/extensions/USM/cl_intel_unified_shared_memory.asciidoc
+//
+// RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple %s -o %t1.out
+// RUN: %HOST_RUN_PLACEHOLDER %t1.out
+// RUN: %CPU_RUN_PLACEHOLDER %t1.out
+// RUN: %GPU_RUN_PLACEHOLDER %t1.out
+
+//==-------------- mixed_queue.cpp - Mixed Memory test ---------------------==//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include <CL/sycl.hpp>
+
+using namespace cl::sycl;
+
+class foo;
+int main() {
+  const int N = 4;
+  const int MAGIC_NUM = 42; // not referenced anywhere in this function
+  const int SIZE = N * sizeof(int); // byte size passed to every allocation below
+  queue q;
+  auto dev = q.get_device();
+  if (!(dev.get_info<info::device::usm_device_allocations>() &&
+        dev.get_info<info::device::usm_host_allocations>() &&
+        dev.get_info<info::device::usm_shared_allocations>()))
+    return 0; // at least one USM kind is unsupported: nothing to test
+
+  int *ptr = (int *)malloc_device(SIZE, q); // device USM, dedicated entry point
+  if (ptr == nullptr) {
+    return -1;
+  }
+  free(ptr, q);
+
+  ptr = (int *)malloc(SIZE, q, usm::alloc::device); // device USM, generic malloc
+  if (ptr == nullptr) {
+    return -1;
+  }
+  free(ptr, q);
+
+  ptr = (int *)aligned_alloc_device(alignof(int), SIZE, q); // device USM, aligned
+  if (ptr == nullptr) {
+    return -1;
+  }
+  free(ptr, q);
+
+  ptr = (int *)aligned_alloc(alignof(int), SIZE, q, usm::alloc::device); // generic aligned
+  if (ptr == nullptr) {
+    return -1;
+  }
+  free(ptr, q);
+
+  ptr = (int *)malloc_shared(SIZE, q); // shared USM, dedicated entry point
+  if (ptr == nullptr) {
+    return -1;
+  }
+  free(ptr, q);
+
+  ptr = (int *)malloc(SIZE, q, usm::alloc::shared); // shared USM, generic malloc
+  if (ptr == nullptr) {
+    return -1;
+  }
+  free(ptr, q);
+
+  ptr = (int *)aligned_alloc_shared(alignof(int), SIZE, q); // shared USM, aligned
+  if (ptr == nullptr) {
+    return -1;
+  }
+  free(ptr, q);
+
+  ptr = (int *)aligned_alloc(alignof(int), SIZE, q, usm::alloc::shared); // generic aligned
+  if (ptr == nullptr) {
+    return -1;
+  }
+  free(ptr, q);
+
+  ptr = (int *)malloc_host(SIZE, q); // host USM, dedicated entry point
+  if (ptr == nullptr) {
+    return -1;
+  }
+  free(ptr, q);
+
+  ptr = (int *)malloc(SIZE, q, usm::alloc::host); // host USM, generic malloc
+  if (ptr == nullptr) {
+    return -1;
+  }
+  free(ptr, q);
+
+  ptr = (int *)aligned_alloc_host(alignof(int), SIZE, q); // host USM, aligned
+  if (ptr == nullptr) {
+    return -1;
+  }
+  free(ptr, q);
+
+  ptr = (int *)aligned_alloc(alignof(int), SIZE, q, usm::alloc::host); // generic aligned
+  if (ptr == nullptr) {
+    return -1;
+  }
+  free(ptr, q);
+
+  return 0;
+}
diff --git a/SYCL/Basic/usm/multictxt.cpp b/SYCL/Basic/usm/multictxt.cpp
new file mode 100644
index 0000000000..59536945ed
--- /dev/null
+++ b/SYCL/Basic/usm/multictxt.cpp
@@ -0,0 +1,66 @@
+// RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple %s -o %t1.out
+// RUN: %t1.out
+
+// REQUIRES: cpu, gpu
+
+//==----------------- multictxt.cpp - Multi Context USM test ---------------==//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include <CL/sycl.hpp>
+
+using namespace cl::sycl;
+
+// The multictxt test here is a sanity check that USM selects the right
+// implementation when presented with multiple contexts. The extra context
+// only needs to exist for this test to do its job.
+
+void GpuCpuCpu() {
+  // Queue creation order: GPU, then CPU. The allocation targets the CPU queue.
+  queue GpuQueue(gpu_selector{});
+  queue CpuQueue(cpu_selector{});
+  device Dev = CpuQueue.get_device();
+  context Ctx = CpuQueue.get_context();
+  if (!Dev.get_info<info::device::usm_shared_allocations>())
+    return; // no shared USM on the CPU device
+  void *Mem = malloc_shared(128, Dev, Ctx);
+  free(Mem, Ctx);
+}
+
+void CpuGpuGpu() {
+  // Queue creation order: CPU, then GPU. The allocation targets the GPU queue.
+  queue CpuQueue(cpu_selector{});
+  queue GpuQueue(gpu_selector{});
+  device Dev = GpuQueue.get_device();
+  context Ctx = GpuQueue.get_context();
+
+  if (!Dev.get_info<info::device::usm_shared_allocations>())
+    return; // no shared USM on the GPU device
+  void *Mem = malloc_shared(128, Dev, Ctx);
+  free(Mem, Ctx);
+}
+
+void GpuCpuGpu() {
+  // Queue creation order: GPU, then CPU. The allocation targets the GPU queue.
+  queue GpuQueue(gpu_selector{});
+  queue CpuQueue(cpu_selector{});
+  device Dev = GpuQueue.get_device();
+  context Ctx = GpuQueue.get_context();
+
+  if (!Dev.get_info<info::device::usm_shared_allocations>())
+    return; // no shared USM on the GPU device
+  void *Mem = malloc_shared(128, Dev, Ctx);
+  free(Mem, Ctx);
+}
+
+int main() {
+  // Name = order the two queues are created in + device used for the allocation.
+  GpuCpuCpu();
+  CpuGpuGpu();
+  GpuCpuGpu();
+  return 0;
+}
diff --git a/SYCL/Basic/usm/pfor_flatten.cpp b/SYCL/Basic/usm/pfor_flatten.cpp
new file mode 100644
index 0000000000..c629a143b9
--- /dev/null
+++ b/SYCL/Basic/usm/pfor_flatten.cpp
@@ -0,0 +1,71 @@
+// UNSUPPORTED: cuda
+// CUDA does not support the unnamed lambda extension.
+//
+// RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple  -fsycl-unnamed-lambda %s -o %t1.out
+// RUN: %HOST_RUN_PLACEHOLDER %t1.out
+// RUN: %CPU_RUN_PLACEHOLDER %t1.out
+// RUN: %GPU_RUN_PLACEHOLDER %t1.out
+
+//==--------------- pfor_flatten.cpp - Kernel Launch Flattening test -------==//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include <CL/sycl.hpp>
+
+using namespace cl::sycl;
+
+class foo;
+int main() {
+  const int N = 42;
+  const int MAGIC_NUM = 42; // expected final value: (42 - 4) + 2 + 1 + 1
+
+  queue q;
+  auto ctxt = q.get_context();
+  if (!q.get_device().get_info<info::device::usm_host_allocations>())
+    return 0; // no host USM on this device: nothing to test
+
+  int *array = (int *)malloc_host(N * sizeof(int), q);
+  if (array == nullptr) {
+    return -1;
+  }
+
+  range<1> R{N};
+  auto e1 = q.parallel_for(R, [=](id<1> ID) { // no dependency
+    int i = ID[0];
+    array[i] = MAGIC_NUM - 4;
+  });
+
+  auto e2 = q.parallel_for(R, e1, [=](id<1> ID) { // single-event dependency
+    int i = ID[0];
+    array[i] += 2;
+  });
+
+  auto e3 = // nd_range form with an event-list dependency
+      q.parallel_for(nd_range<1>{R, range<1>{1}}, {e1, e2}, [=](nd_item<1> ID) {
+        int i = ID.get_global_id(0);
+        array[i]++;
+      });
+
+  auto e4 = q.single_task({e3}, [=]() { // single_task with an event list
+    for (int i = 0; i < N; i++) {
+      array[i]++;
+    }
+  });
+
+  q.single_task(e4, [=]() { array[0] = array[0]; }); // single-event form, no-op body
+
+  q.wait();
+
+  for (int i = 0; i < N; i++) {
+    if (array[i] != MAGIC_NUM) {
+      return -2; // wrong result; -1 is reserved for allocation failure
+    }
+  }
+  free(array, ctxt);
+
+  return 0;
+}
diff --git a/SYCL/Basic/usm/pointer_query.cpp b/SYCL/Basic/usm/pointer_query.cpp
new file mode 100644
index 0000000000..87ab37dbef
--- /dev/null
+++ b/SYCL/Basic/usm/pointer_query.cpp
@@ -0,0 +1,123 @@
+// RUN: %clangxx -fsycl %s -o %t1.out
+// RUN: %HOST_RUN_PLACEHOLDER %t1.out
+
+//==-------------- pointer_query.cpp - Pointer Query test ------------------==//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include <CL/sycl.hpp>
+
+using namespace cl::sycl;
+
+int main() {
+  int *array = nullptr;
+  const int N = 4;
+  queue q;
+  auto dev = q.get_device();
+  auto ctxt = q.get_context();
+
+  if (!(dev.get_info<info::device::usm_device_allocations>() &&
+        dev.get_info<info::device::usm_shared_allocations>() &&
+        dev.get_info<info::device::usm_host_allocations>()))
+    return 0; // at least one USM kind is unsupported: nothing to test
+
+  usm::alloc Kind;
+  device D;
+
+  // Test device allocs
+  array = (int *)malloc_device(N * sizeof(int), q);
+  if (array == nullptr) {
+    return 1;
+  }
+  Kind = get_pointer_type(array, ctxt);
+  if (ctxt.is_host()) {
+    // for now, host device treats all allocations
+    // as host allocations
+    if (Kind != usm::alloc::host) {
+      return 2;
+    }
+  } else {
+    if (Kind != usm::alloc::device) {
+      return 3;
+    }
+  }
+  D = get_pointer_device(array, ctxt);
+  if (D != dev) {
+    return 4;
+  }
+  free(array, ctxt);
+
+  // Test shared allocs
+  array = (int *)malloc_shared(N * sizeof(int), q);
+  if (array == nullptr) {
+    return 5;
+  }
+  Kind = get_pointer_type(array, ctxt);
+  if (ctxt.is_host()) {
+    // for now, host device treats all allocations
+    // as host allocations
+    if (Kind != usm::alloc::host) {
+      return 6;
+    }
+  } else {
+    if (Kind != usm::alloc::shared) {
+      return 7;
+    }
+  }
+  D = get_pointer_device(array, ctxt);
+  if (D != dev) {
+    return 8;
+  }
+  free(array, ctxt);
+
+  // Test host allocs
+  array = (int *)malloc_host(N * sizeof(int), q);
+  if (array == nullptr) {
+    return 9;
+  }
+  Kind = get_pointer_type(array, ctxt);
+  if (Kind != usm::alloc::host) {
+    return 10;
+  }
+  D = get_pointer_device(array, ctxt);
+  auto Devs = ctxt.get_devices();
+  auto result = std::find(Devs.begin(), Devs.end(), D);
+  if (result == Devs.end()) {
+    // Returned device was not in queried context
+    return 11;
+  }
+  free(array, ctxt);
+
+  // Test invalid ptrs
+  Kind = get_pointer_type(nullptr, ctxt);
+  if (Kind != usm::alloc::unknown) {
+    return 15; // distinct from the host-alloc code 11 so the two failures differ
+  }
+
+  // next checks only valid for non-host contexts
+  array = (int *)malloc(N * sizeof(int));
+  Kind = get_pointer_type(array, ctxt);
+  if (!ctxt.is_host()) {
+    if (Kind != usm::alloc::unknown) {
+      return 12;
+    }
+    try {
+      D = get_pointer_device(array, ctxt);
+      return 13; // no exception: the query accepted a non-USM pointer
+    } catch (const runtime_error &) {
+      // Expected. Fall through so the malloc'ed buffer is released below.
+    }
+  } else {
+    // host ctxts always report host
+    if (Kind != usm::alloc::host) {
+      return 14;
+    }
+  }
+  free(array);
+
+  return 0;
+}
diff --git a/SYCL/Basic/usm/prefetch.cpp b/SYCL/Basic/usm/prefetch.cpp
new file mode 100644
index 0000000000..a92786a055
--- /dev/null
+++ b/SYCL/Basic/usm/prefetch.cpp
@@ -0,0 +1,69 @@
+//==---- prefetch.cpp - USM prefetch test ----------------------------------==//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+// RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple %s -o %t1.out
+// RUN: %HOST_RUN_PLACEHOLDER %t1.out
+// RUN: %CPU_RUN_PLACEHOLDER %t1.out
+
+#include <CL/sycl.hpp>
+
+using namespace cl::sycl;
+
+static constexpr int count = 100;
+
+int main() {
+  queue q([](exception_list el) { // async handler: rethrow the first error
+    for (auto &e : el)
+      std::rethrow_exception(e); // e is an exception_ptr; "throw e" would throw the pointer
+  });
+  if (q.get_device().get_info<info::device::usm_shared_allocations>()) {
+    float *src = (float *)malloc_shared(sizeof(float) * count, q.get_device(),
+                                        q.get_context());
+    float *dest = (float *)malloc_shared(sizeof(float) * count, q.get_device(),
+                                         q.get_context());
+    for (int i = 0; i < count; i++)
+      src[i] = i;
+
+    // Test handler::prefetch
+    {
+      event init_prefetch = q.submit(
+          [&](handler &cgh) { cgh.prefetch(src, sizeof(float) * count); });
+
+      q.submit([&](handler &cgh) {
+        cgh.depends_on(init_prefetch); // kernel is ordered after the prefetch
+        cgh.single_task<class double_dest>([=]() {
+          for (int i = 0; i < count; i++)
+            dest[i] = 2 * src[i];
+        });
+      });
+      q.wait_and_throw();
+
+      for (int i = 0; i < count; i++)
+        assert(dest[i] == i * 2);
+    }
+
+    // Test queue::prefetch
+    {
+      event init_prefetch = q.prefetch(src, sizeof(float) * count);
+
+      q.submit([&](handler &cgh) {
+        cgh.depends_on(init_prefetch); // kernel is ordered after the prefetch
+        cgh.single_task<class double_dest3>([=]() {
+          for (int i = 0; i < count; i++)
+            dest[i] = 3 * src[i];
+        });
+      });
+      q.wait_and_throw();
+
+      for (int i = 0; i < count; i++)
+        assert(dest[i] == i * 3);
+    }
+    free(src, q.get_context());
+    free(dest, q.get_context());
+  }
+  return 0;
+}
diff --git a/SYCL/Basic/usm/queue_wait.cpp b/SYCL/Basic/usm/queue_wait.cpp
new file mode 100644
index 0000000000..692c7f43f5
--- /dev/null
+++ b/SYCL/Basic/usm/queue_wait.cpp
@@ -0,0 +1,48 @@
+// XFAIL: cuda
+// piextUSM*Alloc functions for CUDA are not behaving as described in
+// https://github.com/intel/llvm/blob/sycl/sycl/doc/extensions/USM/USM.adoc
+// https://github.com/intel/llvm/blob/sycl/sycl/doc/extensions/USM/cl_intel_unified_shared_memory.asciidoc
+//
+// RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple %s -o %t.out
+// RUN: %HOST_RUN_PLACEHOLDER %t.out
+// RUN: %CPU_RUN_PLACEHOLDER %t.out
+// RUN: %GPU_RUN_PLACEHOLDER %t.out
+
+#include <CL/sycl.hpp>
+
+#include <cassert>
+#include <cstddef>
+
+using namespace cl::sycl;
+
+// This test checks that queue USM functions are properly waited for during
+// calls to queue::wait().
+
+int main() {
+  constexpr std::size_t NumBytes = 32;
+  constexpr unsigned char Fill = 42;
+  queue Queue;
+  std::cout << Queue.is_host() << std::endl;
+  device TargetDev = Queue.get_device();
+  context Ctxt = Queue.get_context();
+  if (!TargetDev.get_info<info::device::usm_device_allocations>() ||
+      !TargetDev.get_info<info::device::usm_host_allocations>())
+    return 0; // device or host USM missing: nothing to exercise
+
+  auto *OnDevice = (unsigned char *)malloc_device(NumBytes, TargetDev, Ctxt);
+  assert(OnDevice);
+  auto *OnHost = (unsigned char *)malloc_host(NumBytes, Ctxt);
+  assert(OnHost);
+
+  // Each queue shortcut is followed by wait(); the returned events are unused.
+  Queue.memset(OnDevice, Fill, NumBytes);
+  Queue.wait();
+  Queue.memcpy(OnHost, OnDevice, NumBytes);
+  Queue.wait();
+  for (const unsigned char *It = OnHost; It != OnHost + NumBytes; ++It)
+    assert(*It == Fill);
+
+  free(OnDevice, Ctxt);
+  free(OnHost, Ctxt);
+  return 0;
+}
diff --git a/SYCL/Basic/usm/smemll.cpp b/SYCL/Basic/usm/smemll.cpp
new file mode 100644
index 0000000000..eff0429287
--- /dev/null
+++ b/SYCL/Basic/usm/smemll.cpp
@@ -0,0 +1,86 @@
+// XFAIL: cuda
+// piextUSM*Alloc functions for CUDA are not behaving as described in
+// https://github.com/intel/llvm/blob/sycl/sycl/doc/extensions/USM/USM.adoc
+// https://github.com/intel/llvm/blob/sycl/sycl/doc/extensions/USM/cl_intel_unified_shared_memory.asciidoc
+//
+// RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple %s -o %t1.out
+// RUN: %HOST_RUN_PLACEHOLDER %t1.out
+// RUN: %CPU_RUN_PLACEHOLDER %t1.out
+// RUN: %GPU_RUN_PLACEHOLDER %t1.out
+
+//==------------------- smemll.cpp - Shared Memory Linked List test --------==//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include <CL/sycl.hpp>
+
+using namespace cl::sycl;
+
+int numNodes = 4;
+
+struct Node { // list element; main() places these in shared USM
+  Node() : pNext(nullptr), Num(0xDEADBEEF) {} // not run for the malloc_shared'ed nodes in main()
+
+  Node *pNext;  // next element, nullptr terminates the list
+  uint32_t Num; // payload: set on the host, rewritten by the kernel
+};
+
+class foo;
+int main() {
+  queue q;
+  auto dev = q.get_device();
+  auto ctxt = q.get_context();
+
+  if (!dev.get_info<info::device::usm_shared_allocations>())
+    return 0; // no shared USM on this device: nothing to test
+
+  Node *s_head = (Node *)malloc_shared(sizeof(Node), dev, ctxt);
+  if (s_head == nullptr) {
+    return -1;
+  }
+  Node *s_cur = s_head;
+  // Build a numNodes-long list on the host; node i carries i * 2.
+  for (int i = 0; i < numNodes; i++) {
+    s_cur->Num = i * 2;
+
+    if (i != (numNodes - 1)) {
+      s_cur->pNext = (Node *)malloc_shared(sizeof(Node), dev, ctxt);
+      if (s_cur->pNext == nullptr) {
+        return -1;
+      }
+    } else {
+      s_cur->pNext = nullptr;
+    }
+
+    s_cur = s_cur->pNext;
+  }
+  // One work-item walks the list on the device, mapping Num to 2 * Num + 1.
+  auto e1 = q.submit([=](handler &cgh) {
+    cgh.single_task<class foo>([=]() {
+      Node *pHead = s_head;
+      while (pHead) {
+        pHead->Num = pHead->Num * 2 + 1;
+        pHead = pHead->pNext;
+      }
+    });
+  });
+
+  e1.wait();
+  // Verify node i holds 2 * (i * 2) + 1 == 4 * i + 1, freeing nodes as we go.
+  s_cur = s_head;
+  for (int i = 0; i < numNodes; i++) {
+    const uint32_t want = i * 4 + 1; // same type as Node::Num
+    if (s_cur->Num != want) {
+      return -2;
+    }
+    Node *old = s_cur;
+    s_cur = s_cur->pNext;
+    free(old, ctxt);
+  }
+
+  return 0;
+}
diff --git a/SYCL/Basic/usm/smemllaligned.cpp b/SYCL/Basic/usm/smemllaligned.cpp
new file mode 100644
index 0000000000..6e7ec35400
--- /dev/null
+++ b/SYCL/Basic/usm/smemllaligned.cpp
@@ -0,0 +1,83 @@
+// RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple %s -o %t1.out
+// RUN: %HOST_RUN_PLACEHOLDER %t1.out
+// RUN: %CPU_RUN_PLACEHOLDER %t1.out
+// RUN: %GPU_RUN_PLACEHOLDER %t1.out
+
+//==---- smemllaligned.cpp - Aligned Shared Memory Linked List test --------==//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include <CL/sycl.hpp>
+
+using namespace cl::sycl;
+
+int numNodes = 4;
+
+struct Node { // list element; main() places these in aligned shared USM
+  Node() : pNext(nullptr), Num(0xDEADBEEF) {} // not run for the aligned_alloc_shared'ed nodes in main()
+
+  Node *pNext;  // next element, nullptr terminates the list
+  uint32_t Num; // payload: set on the host, rewritten by the kernel
+};
+
+class foo;
+int main() {
+  queue q;
+  auto dev = q.get_device();
+  auto ctxt = q.get_context();
+
+  if (!dev.get_info<info::device::usm_shared_allocations>())
+    return 0; // no shared USM on this device: nothing to test
+
+  Node *s_head =
+      (Node *)aligned_alloc_shared(alignof(Node), sizeof(Node), dev, ctxt);
+  if (s_head == nullptr) {
+    return -1;
+  }
+  Node *s_cur = s_head;
+  // Build a numNodes-long list on the host; node i carries i * 2.
+  for (int i = 0; i < numNodes; i++) {
+    s_cur->Num = i * 2;
+
+    if (i != (numNodes - 1)) {
+      s_cur->pNext =
+          (Node *)aligned_alloc_shared(alignof(Node), sizeof(Node), dev, ctxt);
+      if (s_cur->pNext == nullptr) {
+        return -1;
+      }
+    } else {
+      s_cur->pNext = nullptr;
+    }
+
+    s_cur = s_cur->pNext;
+  }
+  // One work-item walks the list on the device, mapping Num to 2 * Num + 1.
+  auto e1 = q.submit([=](handler &cgh) {
+    cgh.single_task<class foo>([=]() {
+      Node *pHead = s_head;
+      while (pHead) {
+        pHead->Num = pHead->Num * 2 + 1;
+        pHead = pHead->pNext;
+      }
+    });
+  });
+
+  e1.wait();
+  // Verify node i holds 2 * (i * 2) + 1 == 4 * i + 1, freeing nodes as we go.
+  s_cur = s_head;
+  for (int i = 0; i < numNodes; i++) {
+    const uint32_t want = i * 4 + 1; // same type as Node::Num
+    if (s_cur->Num != want) {
+      return -2;
+    }
+    Node *old = s_cur;
+    s_cur = s_cur->pNext;
+    free(old, ctxt);
+  }
+
+  return 0;
+}
diff --git a/SYCL/CMakeLists.txt b/SYCL/CMakeLists.txt
new file mode 100644
index 0000000000..a6694ae1f4
--- /dev/null
+++ b/SYCL/CMakeLists.txt
@@ -0,0 +1,5 @@
+add_subdirectory(Basic)
+#add_subdirectory(External)
+#add_subdirectory(MultiSource)
+#add_subdirectory(Parallel)
+#add_subdirectory(SingleSource)
diff --git a/SYCL/README.md b/SYCL/README.md
new file mode 100644
index 0000000000..cd84a0dc6d
--- /dev/null
+++ b/SYCL/README.md
@@ -0,0 +1,7 @@
+SYCL-related tests directory.
+
+ - Basic - tests used for sanity testing. Building, execution and checks are defined using in-source comments with LIT syntax.
+ - External - contains infrastructure for running tests whose sources are stored outside of this repository.
+ - MultiSource - SYCL-related tests which depend on multiple source files.
+ - SingleSource - SYCL tests with a single source file.
+ - Parallel - tests which produce a highly parallel load on the target device. It is recommended to run such tests in a single thread.
diff --git a/cmake/caches/clang_fsycl.cmake b/cmake/caches/clang_fsycl.cmake
new file mode 100644
index 0000000000..b35fcf023d
--- /dev/null
+++ b/cmake/caches/clang_fsycl.cmake
@@ -0,0 +1,4 @@
+# Default open source clang configuration with SYCL support.
+# Defaults the build type to Release and the C++ compiler flags to "-fsycl".
+set(CMAKE_BUILD_TYPE "Release" CACHE STRING "")
+set(CMAKE_CXX_FLAGS "-fsycl" CACHE STRING "")
diff --git a/cmake/caches/clang_fsycl_cuda.cmake b/cmake/caches/clang_fsycl_cuda.cmake
new file mode 100644
index 0000000000..549f426ab0
--- /dev/null
+++ b/cmake/caches/clang_fsycl_cuda.cmake
@@ -0,0 +1,4 @@
+# Open source clang configuration with SYCL support targeting CUDA devices.
+# Flags select the nvptx64-nvidia-cuda-sycldevice target and GPU arch sm_32.
+set(CMAKE_BUILD_TYPE "Release" CACHE STRING "")
+set(CMAKE_CXX_FLAGS "-fsycl -fsycl-targets=nvptx64-nvidia-cuda-sycldevice -Xsycl-target-backend --cuda-gpu-arch=sm_32" CACHE STRING "")
diff --git a/cmake/caches/dpcpp.cmake b/cmake/caches/dpcpp.cmake
new file mode 100644
index 0000000000..1e31ebbfcc
--- /dev/null
+++ b/cmake/caches/dpcpp.cmake
@@ -0,0 +1,5 @@
+# Default dpcpp compiler configuration.
+# Defaults the build type to Release.
+# No extra command-line arguments are needed to support SYCL, so SYCL_CXX_FLAGS is empty.
+set(CMAKE_BUILD_TYPE "Release" CACHE STRING "")
+set(SYCL_CXX_FLAGS "" CACHE STRING "")