diff --git a/sycl/test/CMakeLists.txt b/sycl/test/CMakeLists.txt
index a152c4b34137e..0fc16451d8fc5 100644
--- a/sycl/test/CMakeLists.txt
+++ b/sycl/test/CMakeLists.txt
@@ -61,6 +61,14 @@ add_lit_testsuites(SYCL-DEPLOY ${CMAKE_CURRENT_SOURCE_DIR}
   EXCLUDE_FROM_CHECK_ALL
   )
 
+add_lit_target(check-sycl-inline-asm
+  "Running lit suite ${CMAKE_CURRENT_SOURCE_DIR}/feature-tests/inline-asm"
+  "feature-tests/inline-asm"
+  ARGS ${RT_TEST_ARGS}
+  PARAMS "SYCL_BE=PI_OPENCL"
+  DEPENDS ${SYCL_TEST_DEPS}
+  )
+
 if(SYCL_BUILD_PI_CUDA)
   add_lit_testsuite(check-sycl-cuda "Running the SYCL regression tests for CUDA"
     ${CMAKE_CURRENT_BINARY_DIR}
diff --git a/sycl/test/feature-tests/inline-asm/asm_16_empty.cpp b/sycl/test/feature-tests/inline-asm/asm_16_empty.cpp
new file mode 100644
index 0000000000000..ad4285e8ecbdb
--- /dev/null
+++ b/sycl/test/feature-tests/inline-asm/asm_16_empty.cpp
@@ -0,0 +1,40 @@
+// UNSUPPORTED: cuda
+// REQUIRES: gpu,linux
+// RUN: %clangxx -fsycl %s -DINLINE_ASM -o %t.out
+// RUN: %t.out
+// RUN: %clangxx -fsycl %s -o %t.ref.out
+// RUN: %t.ref.out
+
+#include "include/asmhelper.h"
+#include <CL/sycl.hpp>
+#include <iostream>
+#include <vector>
+
+using dataType = cl::sycl::cl_int;
+
+template <typename T = dataType>
+struct KernelFunctor : WithOutputBuffer<T> {
+  KernelFunctor(size_t problem_size) : WithOutputBuffer<T>(problem_size) {}
+
+  void operator()(cl::sycl::handler &cgh) {
+    auto C = this->getOutputBuffer().template get_access<cl::sycl::access::mode::write>(cgh);
+    cgh.parallel_for<KernelFunctor<T>>(
+        cl::sycl::range<1>{this->getOutputBufferSize()}, [=](cl::sycl::id<1> wiID) [[cl::intel_reqd_sub_group_size(16)]] {
+          C[wiID] = 43;
+#if defined(INLINE_ASM) && defined(__SYCL_DEVICE_ONLY__)
+          asm volatile("");
+#endif
+        });
+  }
+};
+
+int main() {
+  KernelFunctor<> f(DEFAULT_PROBLEM_SIZE);
+  if (!launchInlineASMTest(f))
+    return 0;
+
+  if (verify_all_the_same(f.getOutputBufferData(), 43))
+    return 0;
+
+  return 1;
+}
diff --git a/sycl/test/feature-tests/inline-asm/asm_16_matrix_mult.cpp b/sycl/test/feature-tests/inline-asm/asm_16_matrix_mult.cpp
new file mode 100644
index 0000000000000..6ae1debb67486
--- /dev/null
+++ b/sycl/test/feature-tests/inline-asm/asm_16_matrix_mult.cpp
@@ -0,0 +1,44 @@
+// UNSUPPORTED: cuda
+// REQUIRES: gpu,linux
+// RUN: %clangxx -fsycl %s -DINLINE_ASM -o %t.out
+// RUN: %t.out
+// RUN: %clangxx -fsycl %s -o %t.ref.out
+// RUN: %t.ref.out
+
+#include "include/asmhelper.h"
+#include <CL/sycl.hpp>
+#include <iostream>
+#include <vector>
+
+using dataType = cl::sycl::cl_int;
+
+template <typename T = dataType>
+struct KernelFunctor : WithOutputBuffer<T> {
+  KernelFunctor(size_t problem_size) : WithOutputBuffer<T>(problem_size) {}
+
+  void operator()(cl::sycl::handler &cgh) {
+    auto C = this->getOutputBuffer().template get_access<cl::sycl::access::mode::write>(cgh);
+    cgh.parallel_for<KernelFunctor<T>>(
+        cl::sycl::range<1>{this->getOutputBufferSize()}, [=](cl::sycl::id<1> wiID) [[cl::intel_reqd_sub_group_size(16)]] {
+          volatile int output = 0;
+#if defined(INLINE_ASM) && defined(__SYCL_DEVICE_ONLY__)
+          asm volatile("mov (M1,16) %0(0,0)<1> 0x7:d"
+                       : "=rw"(output));
+#else
+          output = 7;
+#endif
+          C[wiID] = output;
+        });
+  }
+};
+
+int main() {
+  KernelFunctor<> f(DEFAULT_PROBLEM_SIZE);
+  if (!launchInlineASMTest(f))
+    return 0;
+
+  if (verify_all_the_same(f.getOutputBufferData(), 7))
+    return 0;
+
+  return 1;
+}
diff --git a/sycl/test/feature-tests/inline-asm/asm_16_no_input_int.cpp b/sycl/test/feature-tests/inline-asm/asm_16_no_input_int.cpp
new file mode 100644
index 0000000000000..6ae1debb67486
--- /dev/null
+++ b/sycl/test/feature-tests/inline-asm/asm_16_no_input_int.cpp
@@ -0,0 +1,44 @@
+// UNSUPPORTED: cuda
+// REQUIRES: gpu,linux
+// RUN: %clangxx -fsycl %s -DINLINE_ASM -o %t.out
+// RUN: %t.out
+// RUN: %clangxx -fsycl %s -o %t.ref.out
+// RUN: %t.ref.out
+
+#include "include/asmhelper.h"
+#include <CL/sycl.hpp>
+#include <iostream>
+#include <vector>
+
+using dataType = cl::sycl::cl_int;
+
+template <typename T = dataType>
+struct KernelFunctor : WithOutputBuffer<T> {
+  KernelFunctor(size_t problem_size) : WithOutputBuffer<T>(problem_size) {}
+
+  void operator()(cl::sycl::handler &cgh) {
+    auto C = this->getOutputBuffer().template get_access<cl::sycl::access::mode::write>(cgh);
+    cgh.parallel_for<KernelFunctor<T>>(
+        cl::sycl::range<1>{this->getOutputBufferSize()}, [=](cl::sycl::id<1> wiID) [[cl::intel_reqd_sub_group_size(16)]] {
+          volatile int output = 0;
+#if defined(INLINE_ASM) && defined(__SYCL_DEVICE_ONLY__)
+          asm volatile("mov (M1,16) %0(0,0)<1> 0x7:d"
+                       : "=rw"(output));
+#else
+          output = 7;
+#endif
+          C[wiID] = output;
+        });
+  }
+};
+
+int main() {
+  KernelFunctor<> f(DEFAULT_PROBLEM_SIZE);
+  if (!launchInlineASMTest(f))
+    return 0;
+
+  if (verify_all_the_same(f.getOutputBufferData(), 7))
+    return 0;
+
+  return 1;
+}
diff --git a/sycl/test/feature-tests/inline-asm/asm_16_no_opts.cpp b/sycl/test/feature-tests/inline-asm/asm_16_no_opts.cpp
new file mode 100644
index 0000000000000..4b6d5146fd6b8
--- /dev/null
+++ b/sycl/test/feature-tests/inline-asm/asm_16_no_opts.cpp
@@ -0,0 +1,45 @@
+// UNSUPPORTED: cuda
+// REQUIRES: gpu,linux
+// RUN: %clangxx -fsycl %s -DINLINE_ASM -o %t.out
+// RUN: %t.out
+// RUN: %clangxx -fsycl %s -o %t.ref.out
+// RUN: %t.ref.out
+
+#include "include/asmhelper.h"
+#include <CL/sycl.hpp>
+#include <iostream>
+#include <vector>
+
+using dataType = cl::sycl::cl_int;
+
+template <typename T = dataType>
+struct KernelFunctor : WithOutputBuffer<T> {
+  KernelFunctor(size_t problem_size) : WithOutputBuffer<T>(problem_size) {}
+
+  void operator()(cl::sycl::handler &cgh) {
+    auto C = this->getOutputBuffer().template get_access<cl::sycl::access::mode::write>(cgh);
+    cgh.parallel_for<KernelFunctor<T>>(
+        cl::sycl::range<1>{this->getOutputBufferSize()}, [=](cl::sycl::id<1> wiID) [[cl::intel_reqd_sub_group_size(16)]] {
+          for (int i = 0; i < 10; ++i) {
+#if defined(INLINE_ASM) && defined(__SYCL_DEVICE_ONLY__)
+            asm("fence_sw");
+            C[wiID] += i;
+
+#else
+            C[wiID] += i;
+#endif
+          }
+        });
+  }
+};
+
+int main() {
+  KernelFunctor<> f(DEFAULT_PROBLEM_SIZE);
+  if (!launchInlineASMTest(f))
+    return 0;
+
+  if (verify_all_the_same(f.getOutputBufferData(), 45))
+    return 0;
+
+  return 1;
+}
diff --git a/sycl/test/feature-tests/inline-asm/asm_8_empty.cpp b/sycl/test/feature-tests/inline-asm/asm_8_empty.cpp
new file mode 100644
index 0000000000000..97fae0ed4eb27
--- /dev/null
+++ b/sycl/test/feature-tests/inline-asm/asm_8_empty.cpp
@@ -0,0 +1,40 @@
+// UNSUPPORTED: cuda
+// REQUIRES: gpu,linux
+// RUN: %clangxx -fsycl %s -DINLINE_ASM -o %t.out
+// RUN: %t.out
+// RUN: %clangxx -fsycl %s -o %t.ref.out
+// RUN: %t.ref.out
+
+#include "include/asmhelper.h"
+#include <CL/sycl.hpp>
+#include <iostream>
+#include <vector>
+
+using dataType = cl::sycl::cl_int;
+
+template <typename T = dataType>
+struct KernelFunctor : WithOutputBuffer<T> {
+  KernelFunctor(size_t problem_size) : WithOutputBuffer<T>(problem_size) {}
+
+  void operator()(cl::sycl::handler &cgh) {
+    auto C = this->getOutputBuffer().template get_access<cl::sycl::access::mode::write>(cgh);
+    cgh.parallel_for<KernelFunctor<T>>(
+        cl::sycl::range<1>{this->getOutputBufferSize()}, [=](cl::sycl::id<1> wiID) [[cl::intel_reqd_sub_group_size(8)]] {
+          C[wiID] = 43;
+#if defined(INLINE_ASM) && defined(__SYCL_DEVICE_ONLY__)
+          asm volatile("");
+#endif
+        });
+  }
+};
+
+int main() {
+  KernelFunctor<> f(DEFAULT_PROBLEM_SIZE);
+  if (!launchInlineASMTest(f))
+    return 0;
+
+  if (verify_all_the_same(f.getOutputBufferData(), 43))
+    return 0;
+
+  return 1;
+}
diff --git a/sycl/test/feature-tests/inline-asm/asm_8_no_input_int.cpp b/sycl/test/feature-tests/inline-asm/asm_8_no_input_int.cpp
new file mode 100644
index 0000000000000..6d1dcbb832cf2
--- /dev/null
+++ b/sycl/test/feature-tests/inline-asm/asm_8_no_input_int.cpp
@@ -0,0 +1,44 @@
+// UNSUPPORTED: cuda
+// REQUIRES: gpu,linux
+// RUN: %clangxx -fsycl %s -DINLINE_ASM -o %t.out
+// RUN: %t.out
+// RUN: %clangxx -fsycl %s -o %t.ref.out
+// RUN: %t.ref.out
+
+#include "include/asmhelper.h"
+#include <CL/sycl.hpp>
+#include <iostream>
+#include <vector>
+
+using dataType = cl::sycl::cl_int;
+
+template <typename T = dataType>
+struct KernelFunctor : WithOutputBuffer<T> {
+  KernelFunctor(size_t problem_size) : WithOutputBuffer<T>(problem_size) {}
+
+  void operator()(cl::sycl::handler &cgh) {
+    auto C = this->getOutputBuffer().template get_access<cl::sycl::access::mode::write>(cgh);
+    cgh.parallel_for<KernelFunctor<T>>(
+        cl::sycl::range<1>{this->getOutputBufferSize()}, [=](cl::sycl::id<1> wiID) [[cl::intel_reqd_sub_group_size(8)]] {
+          volatile int output = 0;
+#if defined(INLINE_ASM) && defined(__SYCL_DEVICE_ONLY__)
+          asm volatile("mov (M1,8) %0(0,0)<1> 0x7:d"
+                       : "=rw"(output));
+#else
+          output = 7;
+#endif
+          C[wiID] = output;
+        });
+  }
+};
+
+int main() {
+  KernelFunctor<> f(DEFAULT_PROBLEM_SIZE);
+  if (!launchInlineASMTest(f))
+    return 0;
+
+  if (verify_all_the_same(f.getOutputBufferData(), 7))
+    return 0;
+
+  return 1;
+}
diff --git a/sycl/test/feature-tests/inline-asm/asm_arbitrary_ops_order.cpp b/sycl/test/feature-tests/inline-asm/asm_arbitrary_ops_order.cpp
new file mode 100644
index 0000000000000..28d0af1d455b6
--- /dev/null
+++ b/sycl/test/feature-tests/inline-asm/asm_arbitrary_ops_order.cpp
@@ -0,0 +1,59 @@
+// UNSUPPORTED: cuda
+// REQUIRES: gpu,linux
+// RUN: %clangxx -fsycl %s -DINLINE_ASM -o %t.out
+// RUN: %t.out
+// RUN: %clangxx -fsycl %s -o %t.ref.out
+// RUN: %t.ref.out
+
+#include "include/asmhelper.h"
+#include <CL/sycl.hpp>
+#include <iostream>
+#include <vector>
+
+using dataType = cl::sycl::cl_int;
+
+template <typename T = dataType>
+struct KernelFunctor : WithInputBuffers<T, 3>, WithOutputBuffer<T> {
+  KernelFunctor(const std::vector<T> &input1, const std::vector<T> &input2, const std::vector<T> &input3) : WithInputBuffers<T, 3>(input1, input2, input3), WithOutputBuffer<T>(input1.size()) {}
+
+  void operator()(cl::sycl::handler &cgh) {
+    auto A = this->getInputBuffer(0).template get_access<cl::sycl::access::mode::read>(cgh);
+    auto B = this->getInputBuffer(1).template get_access<cl::sycl::access::mode::read>(cgh);
+    auto C = this->getInputBuffer(2).template get_access<cl::sycl::access::mode::read>(cgh);
+    auto D = this->getOutputBuffer().template get_access<cl::sycl::access::mode::write>(cgh);
+
+    cgh.parallel_for<KernelFunctor<T>>(
+        cl::sycl::range<1>{this->getOutputBufferSize()}, [=](cl::sycl::id<1> wiID) [[cl::intel_reqd_sub_group_size(8)]] {
+#if defined(INLINE_ASM) && defined(__SYCL_DEVICE_ONLY__)
+          asm("mad (M1, 8) %0(0, 0)<1> %3(0, 0)<1;1,0> %1(0, 0)<1;1,0> %2(0, 0)<1;1,0>"
+              : "=rw"(D[wiID])
+              : "rw"(B[wiID]), "rw"(C[wiID]), "rw"(A[wiID]));
+#else
+          D[wiID] = A[wiID] * B[wiID] + C[wiID];
+#endif
+        });
+  }
+};
+
+int main() {
+  std::vector<dataType> inputA(DEFAULT_PROBLEM_SIZE), inputB(DEFAULT_PROBLEM_SIZE), inputC(DEFAULT_PROBLEM_SIZE);
+  for (int i = 0; i < DEFAULT_PROBLEM_SIZE; i++) {
+    inputA[i] = i;
+    inputB[i] = i;
+    inputC[i] = DEFAULT_PROBLEM_SIZE - i * i;
+  }
+
+  KernelFunctor<> f(inputA, inputB, inputC);
+  if (!launchInlineASMTest(f))
+    return 0;
+
+  auto &D = f.getOutputBufferData();
+  for (int i = 0; i < DEFAULT_PROBLEM_SIZE; ++i) {
+    if (D[i] != inputA[i] * inputB[i] + inputC[i]) {
+      std::cerr << "At index: " << i << ". ";
+      std::cerr << D[i] << " != " << inputA[i] * inputB[i] + inputC[i] << "\n";
+      return 1;
+    }
+  }
+  return 0;
+}
diff --git a/sycl/test/feature-tests/inline-asm/asm_decl_in_scope.cpp b/sycl/test/feature-tests/inline-asm/asm_decl_in_scope.cpp
new file mode 100644
index 0000000000000..db30e20f5e9ee
--- /dev/null
+++ b/sycl/test/feature-tests/inline-asm/asm_decl_in_scope.cpp
@@ -0,0 +1,67 @@
+// UNSUPPORTED: cuda
+// REQUIRES: gpu,linux
+// RUN: %clangxx -fsycl %s -DINLINE_ASM -o %t.out
+// RUN: %t.out
+// RUN: %clangxx -fsycl %s -o %t.ref.out
+// RUN: %t.ref.out
+
+#include "include/asmhelper.h"
+#include <CL/sycl.hpp>
+#include <iostream>
+#include <vector>
+
+using dataType = cl::sycl::cl_int;
+
+template <typename T = dataType>
+struct KernelFunctor : WithInputBuffers<T, 2>, WithOutputBuffer<T> {
+  KernelFunctor(const std::vector<T> &input1, const std::vector<T> &input2) : WithInputBuffers<T, 2>(input1, input2), WithOutputBuffer<T>(input1.size()) {}
+
+  void operator()(cl::sycl::handler &cgh) {
+    auto A = this->getInputBuffer(0).template get_access<cl::sycl::access::mode::read>(cgh);
+    auto B = this->getInputBuffer(1).template get_access<cl::sycl::access::mode::read>(cgh);
+    auto C = this->getOutputBuffer().template get_access<cl::sycl::access::mode::write>(cgh);
+
+    cgh.parallel_for<KernelFunctor<T>>(
+        cl::sycl::range<1>{this->getOutputBufferSize()},
+        [=](cl::sycl::id<1> wiID) [[cl::intel_reqd_sub_group_size(16)]] {
+    // declaration of temp within and outside the scope
+#if defined(INLINE_ASM) && defined(__SYCL_DEVICE_ONLY__)
+          asm("{\n"
+              ".decl temp v_type=G type=d num_elts=16 align=GRF\n"
+              "mov (M1, 16) temp(0, 0)<1> %1(0, 0)<1;1,0>\n"
+              "mov (M1, 16) %0(0, 0)<1>  temp(0, 0)<1;1,0>\n"
+              "}\n"
+              ".decl temp v_type=G type=d num_elts=16 align=GRF\n"
+              "mul (M1, 16) temp(0, 0)<1> %2(0, 0)<1;1,0> %0(0, 0)<1;1,0>\n"
+              "mov (M1, 16) %0(0, 0)<1>  temp(0, 0)<1;1,0>\n"
+              : "+rw"(C[wiID])
+              : "rw"(A[wiID]), "rw"(B[wiID]));
+#else
+          C[wiID] = A[wiID];
+          C[wiID] *= B[wiID];
+#endif
+        });
+  }
+};
+
+int main() {
+  std::vector<dataType> inputA(DEFAULT_PROBLEM_SIZE), inputB(DEFAULT_PROBLEM_SIZE);
+  for (int i = 0; i < DEFAULT_PROBLEM_SIZE; i++) {
+    inputA[i] = i;
+    inputB[i] = 2;
+  }
+
+  KernelFunctor<> f(inputA, inputB);
+  if (!launchInlineASMTest(f))
+    return 0;
+
+  auto &C = f.getOutputBufferData();
+  for (int i = 0; i < DEFAULT_PROBLEM_SIZE; ++i) {
+    if (C[i] != inputA[i] * inputB[i]) {
+      std::cerr << "At index: " << i << ". ";
+      std::cerr << C[i] << " != " << inputA[i] * inputB[i] << "\n";
+      return 1;
+    }
+  }
+  return 0;
+}
diff --git a/sycl/test/feature-tests/inline-asm/asm_float_add.cpp b/sycl/test/feature-tests/inline-asm/asm_float_add.cpp
new file mode 100644
index 0000000000000..c23b084317c5b
--- /dev/null
+++ b/sycl/test/feature-tests/inline-asm/asm_float_add.cpp
@@ -0,0 +1,59 @@
+// UNSUPPORTED: cuda
+// REQUIRES: gpu,linux
+// RUN: %clangxx -fsycl %s -DINLINE_ASM -o %t.out
+// RUN: %t.out
+// RUN: %clangxx -fsycl %s -o %t.ref.out
+// RUN: %t.ref.out
+
+#include "include/asmhelper.h"
+#include <CL/sycl.hpp>
+#include <cmath>
+#include <iostream>
+#include <vector>
+
+using dataType = cl::sycl::cl_double;
+
+template <typename T = dataType>
+struct KernelFunctor : WithInputBuffers<T, 2>, WithOutputBuffer<T> {
+  KernelFunctor(const std::vector<T> &input1, const std::vector<T> &input2) : WithInputBuffers<T, 2>(input1, input2), WithOutputBuffer<T>(input1.size()) {}
+
+  void operator()(cl::sycl::handler &cgh) {
+    auto A = this->getInputBuffer(0).template get_access<cl::sycl::access::mode::read>(cgh);
+    auto B = this->getInputBuffer(1).template get_access<cl::sycl::access::mode::read>(cgh);
+    auto C = this->getOutputBuffer().template get_access<cl::sycl::access::mode::write>(cgh);
+
+    cgh.parallel_for<KernelFunctor<T>>(
+        cl::sycl::range<1>{this->getOutputBufferSize()}, [=](cl::sycl::id<1> wiID) [[cl::intel_reqd_sub_group_size(8)]] {
+#if defined(INLINE_ASM) && defined(__SYCL_DEVICE_ONLY__)
+          asm("add (M1, 8) %0(0, 0)<1> %1(0, 0)<1;1,0> %2(0, 0)<1;1,0>"
+              : "=rw"(C[wiID])
+              : "rw"(A[wiID]), "rw"(B[wiID]));
+#else
+          C[wiID] = A[wiID] + B[wiID];
+#endif
+        });
+  }
+};
+
+int main() {
+  std::vector<dataType> inputA(DEFAULT_PROBLEM_SIZE), inputB(DEFAULT_PROBLEM_SIZE);
+  for (int i = 0; i < DEFAULT_PROBLEM_SIZE; i++) {
+    inputA[i] = (double)1 / std::pow(2, i);
+    inputB[i] = (double)2 / std::pow(2, i);
+  }
+
+  KernelFunctor<> f(inputA, inputB);
+  if (!launchInlineASMTest(f))
+    return 0;
+
+  auto &C = f.getOutputBufferData();
+  for (int i = 0; i < DEFAULT_PROBLEM_SIZE; i++) {
+    if (C[i] != inputA[i] + inputB[i]) {
+      std::cerr << "At index: " << i << ". ";
+      std::cerr << C[i] << " != " << inputA[i] + inputB[i] << "\n";
+      return 1;
+    }
+  }
+
+  return 0;
+}
diff --git a/sycl/test/feature-tests/inline-asm/asm_float_imm_arg.cpp b/sycl/test/feature-tests/inline-asm/asm_float_imm_arg.cpp
new file mode 100644
index 0000000000000..c9683cf020f7e
--- /dev/null
+++ b/sycl/test/feature-tests/inline-asm/asm_float_imm_arg.cpp
@@ -0,0 +1,56 @@
+// UNSUPPORTED: cuda
+// REQUIRES: gpu,linux
+// RUN: %clangxx -fsycl %s -DINLINE_ASM -o %t.out
+// RUN: %t.out
+// RUN: %clangxx -fsycl %s -o %t.ref.out
+// RUN: %t.ref.out
+
+#include "include/asmhelper.h"
+#include <CL/sycl.hpp>
+#include <cmath>
+#include <iostream>
+#include <vector>
+
+constexpr double IMM_ARGUMENT = 0.5;
+using dataType = cl::sycl::cl_double;
+
+template <typename T = dataType>
+struct KernelFunctor : WithInputBuffers<T, 1>, WithOutputBuffer<T> {
+  KernelFunctor(const std::vector<T> &input) : WithInputBuffers<T, 1>(input), WithOutputBuffer<T>(input.size()) {}
+
+  void operator()(cl::sycl::handler &cgh) {
+    auto A = this->getInputBuffer(0).template get_access<cl::sycl::access::mode::read>(cgh);
+    auto B = this->getOutputBuffer().template get_access<cl::sycl::access::mode::write>(cgh);
+
+    cgh.parallel_for<KernelFunctor<T>>(
+        cl::sycl::range<1>{this->getOutputBufferSize()}, [=](cl::sycl::id<1> wiID) [[cl::intel_reqd_sub_group_size(8)]] {
+#if defined(INLINE_ASM) && defined(__SYCL_DEVICE_ONLY__)
+          asm("mul (M1, 8) %0(0, 0)<1> %1(0, 0)<1;1,0> %2"
+              : "=rw"(B[wiID])
+              : "rw"(A[wiID]), "rw"(IMM_ARGUMENT));
+#else
+          B[wiID] = A[wiID] * IMM_ARGUMENT;
+#endif
+        });
+  }
+};
+
+int main() {
+  std::vector<dataType> input(DEFAULT_PROBLEM_SIZE);
+  for (int i = 0; i < DEFAULT_PROBLEM_SIZE; i++)
+    input[i] = (double)1 / std::pow(2, i);
+
+  KernelFunctor<> f(input);
+  if (!launchInlineASMTest(f))
+    return 0;
+
+  auto &B = f.getOutputBufferData();
+  for (int i = 0; i < DEFAULT_PROBLEM_SIZE; ++i) {
+    if (B[i] != input[i] * IMM_ARGUMENT) {
+      std::cerr << "At index: " << i << ". ";
+      std::cerr << B[i] << " != " << input[i] * IMM_ARGUMENT << "\n";
+      return 1;
+    }
+  }
+  return 0;
+}
diff --git a/sycl/test/feature-tests/inline-asm/asm_float_neg.cpp b/sycl/test/feature-tests/inline-asm/asm_float_neg.cpp
new file mode 100644
index 0000000000000..290b0898903ef
--- /dev/null
+++ b/sycl/test/feature-tests/inline-asm/asm_float_neg.cpp
@@ -0,0 +1,57 @@
+// UNSUPPORTED: cuda
+// REQUIRES: gpu,linux
+// RUN: %clangxx -fsycl %s -DINLINE_ASM -o %t.out
+// RUN: %t.out
+// RUN: %clangxx -fsycl %s -o %t.ref.out
+// RUN: %t.ref.out
+
+#include "include/asmhelper.h"
+#include <CL/sycl.hpp>
+#include <iostream>
+#include <vector>
+
+using dataType = cl::sycl::cl_float;
+
+template <typename T = dataType>
+struct KernelFunctor : WithInputBuffers<T, 1>, WithOutputBuffer<T> {
+  KernelFunctor(const std::vector<T> &input) : WithInputBuffers<T, 1>(input), WithOutputBuffer<T>(input.size()) {}
+
+  void operator()(cl::sycl::handler &cgh) {
+    auto A = this->getInputBuffer().template get_access<cl::sycl::access::mode::read>(cgh);
+    auto B = this->getOutputBuffer().template get_access<cl::sycl::access::mode::write>(cgh);
+
+    cgh.parallel_for<KernelFunctor<T>>(
+        cl::sycl::range<1>{this->getOutputBufferSize()}, [=](cl::sycl::id<1> wiID) [[cl::intel_reqd_sub_group_size(8)]] {
+#if defined(INLINE_ASM) && defined(__SYCL_DEVICE_ONLY__)
+          asm("mov (M1, 8) %0(0, 0)<1> (-)%1(0, 0)<1;1,0>"
+              : "=rw"(B[wiID])
+              : "rw"(A[wiID]));
+#else
+          B[wiID] = -A[wiID];
+#endif
+        });
+  }
+
+  size_t problem_size = 0;
+};
+
+int main() {
+  std::vector<dataType> input(DEFAULT_PROBLEM_SIZE);
+  for (int i = 0; i < DEFAULT_PROBLEM_SIZE; i++)
+    input[i] = 1.0 / i;
+
+  KernelFunctor<> f(input);
+  if (!launchInlineASMTest(f))
+    return 0;
+
+  auto &R = f.getOutputBufferData();
+  for (int i = 0; i < DEFAULT_PROBLEM_SIZE; ++i) {
+    if (R[i] != -input[i]) {
+      std::cerr << "At index: " << i << ". ";
+      std::cerr << R[i] << " != " << -input[i] << "\n";
+      return 1;
+    }
+  }
+
+  return 0;
+}
diff --git a/sycl/test/feature-tests/inline-asm/asm_imm_arg.cpp b/sycl/test/feature-tests/inline-asm/asm_imm_arg.cpp
new file mode 100644
index 0000000000000..2dba04d1179bb
--- /dev/null
+++ b/sycl/test/feature-tests/inline-asm/asm_imm_arg.cpp
@@ -0,0 +1,55 @@
+// UNSUPPORTED: cuda
+// REQUIRES: gpu,linux
+// RUN: %clangxx -fsycl %s -DINLINE_ASM -o %t.out
+// RUN: %t.out
+// RUN: %clangxx -fsycl %s -o %t.ref.out
+// RUN: %t.ref.out
+
+#include "include/asmhelper.h"
+#include <CL/sycl.hpp>
+#include <iostream>
+#include <vector>
+
+constexpr int CONST_ARGUMENT = 0xabc;
+using dataType = cl::sycl::cl_int;
+
+template <typename T = dataType>
+struct KernelFunctor : WithInputBuffers<T, 1>, WithOutputBuffer<T> {
+  KernelFunctor(const std::vector<T> &input) : WithInputBuffers<T, 1>(input), WithOutputBuffer<T>(input.size()) {}
+
+  void operator()(cl::sycl::handler &cgh) {
+    auto A = this->getInputBuffer(0).template get_access<cl::sycl::access::mode::read>(cgh);
+    auto B = this->getOutputBuffer().template get_access<cl::sycl::access::mode::write>(cgh);
+
+    cgh.parallel_for<KernelFunctor<T>>(
+        cl::sycl::range<1>{this->getOutputBufferSize()}, [=](cl::sycl::id<1> wiID) [[cl::intel_reqd_sub_group_size(8)]] {
+#if defined(INLINE_ASM) && defined(__SYCL_DEVICE_ONLY__)
+          asm("add (M1, 8) %0(0, 0)<1> %1(0, 0)<1;1,0> %2"
+              : "=rw"(B[wiID])
+              : "rw"(A[wiID]), "rw"(CONST_ARGUMENT));
+#else
+          B[wiID] = A[wiID] + CONST_ARGUMENT;
+#endif
+        });
+  }
+};
+
+int main() {
+  std::vector<dataType> input(DEFAULT_PROBLEM_SIZE);
+  for (int i = 0; i < DEFAULT_PROBLEM_SIZE; i++)
+    input[i] = i;
+
+  KernelFunctor<> f(input);
+  if (!launchInlineASMTest(f))
+    return 0;
+
+  auto &B = f.getOutputBufferData();
+  for (int i = 0; i < DEFAULT_PROBLEM_SIZE; ++i) {
+    if (B[i] != input[i] + CONST_ARGUMENT) {
+      std::cerr << "At index: " << i << ". ";
+      std::cerr << B[i] << " != " << input[i] + CONST_ARGUMENT << "\n";
+      return 1;
+    }
+  }
+  return 0;
+}
diff --git a/sycl/test/feature-tests/inline-asm/asm_mul.cpp b/sycl/test/feature-tests/inline-asm/asm_mul.cpp
new file mode 100644
index 0000000000000..726abcf787f21
--- /dev/null
+++ b/sycl/test/feature-tests/inline-asm/asm_mul.cpp
@@ -0,0 +1,57 @@
+// UNSUPPORTED: cuda
+// REQUIRES: gpu,linux
+// RUN: %clangxx -fsycl %s -DINLINE_ASM -o %t.out
+// RUN: %t.out
+// RUN: %clangxx -fsycl %s -o %t.ref.out
+// RUN: %t.ref.out
+
+#include "include/asmhelper.h"
+#include <CL/sycl.hpp>
+#include <iostream>
+#include <vector>
+
+using dataType = cl::sycl::cl_int;
+
+template <typename T = dataType>
+struct KernelFunctor : WithInputBuffers<T, 2>, WithOutputBuffer<T> {
+  KernelFunctor(const std::vector<T> &input1, const std::vector<T> &input2) : WithInputBuffers<T, 2>(input1, input2), WithOutputBuffer<T>(input1.size()) {}
+  void operator()(cl::sycl::handler &cgh) {
+    auto A = this->getInputBuffer(0).template get_access<cl::sycl::access::mode::read>(cgh);
+    auto B = this->getInputBuffer(1).template get_access<cl::sycl::access::mode::read>(cgh);
+    auto C = this->getOutputBuffer().template get_access<cl::sycl::access::mode::write>(cgh);
+
+    cgh.parallel_for<KernelFunctor<T>>(
+        cl::sycl::range<1>{this->getOutputBufferSize()}, [=](cl::sycl::id<1> wiID) [[cl::intel_reqd_sub_group_size(8)]] {
+#if defined(INLINE_ASM) && defined(__SYCL_DEVICE_ONLY__)
+          asm("mul (M1, 8) %0(0, 0)<1> %1(0, 0)<1;1,0> %2(0, 0)<1;1,0>"
+              : "=rw"(C[wiID])
+              : "rw"(A[wiID]), "rw"(B[wiID]));
+#else
+          C[wiID] = A[wiID] * B[wiID];
+#endif
+        });
+  }
+};
+
+int main() {
+  std::vector<dataType> inputA(DEFAULT_PROBLEM_SIZE), inputB(DEFAULT_PROBLEM_SIZE);
+  for (int i = 0; i < DEFAULT_PROBLEM_SIZE; i++) {
+    inputA[i] = i;
+    inputB[i] = DEFAULT_PROBLEM_SIZE - i;
+  }
+
+  KernelFunctor<> f(inputA, inputB);
+  if (!launchInlineASMTest(f))
+    return 0;
+
+  auto &C = f.getOutputBufferData();
+  for (int i = 0; i < DEFAULT_PROBLEM_SIZE; ++i) {
+    if (C[i] != inputA[i] * inputB[i]) {
+      std::cerr << "At index: " << i << ". ";
+      std::cerr << C[i] << " != " << inputA[i] * inputB[i] << "\n";
+      return 1;
+    }
+  }
+
+  return 0;
+}
diff --git a/sycl/test/feature-tests/inline-asm/asm_multiple_instructions.cpp b/sycl/test/feature-tests/inline-asm/asm_multiple_instructions.cpp
new file mode 100644
index 0000000000000..e8cf02a5292e4
--- /dev/null
+++ b/sycl/test/feature-tests/inline-asm/asm_multiple_instructions.cpp
@@ -0,0 +1,59 @@
+// UNSUPPORTED: cuda
+// REQUIRES: gpu,linux
+// RUN: %clangxx -fsycl %s -DINLINE_ASM -o %t.out
+// RUN: %t.out
+// RUN: %clangxx -fsycl %s -o %t.ref.out
+// RUN: %t.ref.out
+
+#include "include/asmhelper.h"
+#include <CL/sycl.hpp>
+#include <iostream>
+#include <vector>
+
+using dataType = cl::sycl::cl_int;
+
+template <typename T = dataType>
+struct KernelFunctor : WithInputBuffers<T, 3>, WithOutputBuffer<T> {
+  KernelFunctor(const std::vector<T> &input1, const std::vector<T> &input2, const std::vector<T> &input3) : WithInputBuffers<T, 3>(input1, input2, input3), WithOutputBuffer<T>(input1.size()) {}
+
+  void operator()(cl::sycl::handler &cgh) {
+    auto A = this->getInputBuffer(0).template get_access<cl::sycl::access::mode::read_write>(cgh);
+    auto B = this->getInputBuffer(1).template get_access<cl::sycl::access::mode::read>(cgh);
+    auto C = this->getInputBuffer(2).template get_access<cl::sycl::access::mode::read>(cgh);
+    auto D = this->getOutputBuffer().template get_access<cl::sycl::access::mode::write>(cgh);
+
+    cgh.parallel_for<KernelFunctor<T>>(
+        cl::sycl::range<1>{this->getOutputBufferSize()}, [=](cl::sycl::id<1> wiID) [[cl::intel_reqd_sub_group_size(8)]] {
+#if defined(INLINE_ASM) && defined(__SYCL_DEVICE_ONLY__)
+          asm("{\n"
+              "add (M1, 8) %1(0, 0)<1> %1(0, 0)<1;1,0> %2(0, 0)<1;1,0>\n"
+              "add (M1, 8) %1(0, 0)<1> %1(0, 0)<1;1,0> %3(0, 0)<1;1,0>\n"
+              "mov (M1, 8) %0(0, 0)<1> %1(0, 0)<1;1,0>\n"
+              "}\n"
+              : "=rw"(D[wiID]), "+rw"(A[wiID])
+              : "rw"(B[wiID]), "rw"(C[wiID]));
+#else
+          A[wiID] += B[wiID];
+          A[wiID] += C[wiID];
+          D[wiID] = A[wiID];
+#endif
+        });
+  }
+};
+
+int main() {
+  std::vector<dataType> inputA(DEFAULT_PROBLEM_SIZE), inputB(DEFAULT_PROBLEM_SIZE), inputC(DEFAULT_PROBLEM_SIZE);
+  for (int i = 0; i < DEFAULT_PROBLEM_SIZE; i++) {
+    inputA[i] = inputB[i] = i;
+    inputC[i] = DEFAULT_PROBLEM_SIZE - 2 * i; // A[i] + B[i] + C[i] = LIST_SIZE
+  }
+
+  KernelFunctor<> f(inputA, inputB, inputC);
+  if (!launchInlineASMTest(f))
+    return 0;
+
+  if (verify_all_the_same(f.getOutputBufferData(), (dataType)DEFAULT_PROBLEM_SIZE))
+    return 0;
+
+  return 1;
+}
diff --git a/sycl/test/feature-tests/inline-asm/asm_no_operands.cpp b/sycl/test/feature-tests/inline-asm/asm_no_operands.cpp
new file mode 100644
index 0000000000000..3a3a919caa982
--- /dev/null
+++ b/sycl/test/feature-tests/inline-asm/asm_no_operands.cpp
@@ -0,0 +1,34 @@
+// UNSUPPORTED: cuda
+// REQUIRES: gpu,linux
+// RUN: %clangxx -fsycl %s -DINLINE_ASM -o %t.out
+// RUN: %t.out
+// RUN: %clangxx -fsycl %s -o %t.ref.out
+// RUN: %t.ref.out
+
+#include "include/asmhelper.h"
+#include <CL/sycl.hpp>
+class no_operands_kernel;
+
+int main() {
+  // Creating SYCL queue
+  cl::sycl::queue Queue;
+  cl::sycl::device Device = Queue.get_device();
+
+  if (!isInlineASMSupported(Device) || !Device.has_extension("cl_intel_required_subgroup_size")) {
+    std::cout << "Skipping test\n";
+    return 0;
+  }
+  // Size of index space for kernel
+  cl::sycl::range<1> NumOfWorkItems{16};
+
+  // Submitting command group(work) to queue
+  Queue.submit([&](cl::sycl::handler &cgh) {
+    // Executing kernel
+    cgh.parallel_for<no_operands_kernel>(
+        NumOfWorkItems, [=](cl::sycl::id<1> WIid) [[cl::intel_reqd_sub_group_size(8)]] {
+#if defined(INLINE_ASM) && defined(__SYCL_DEVICE_ONLY__)
+          asm("barrier");
+#endif
+        });
+  });
+}
diff --git a/sycl/test/feature-tests/inline-asm/asm_no_output.cpp b/sycl/test/feature-tests/inline-asm/asm_no_output.cpp
new file mode 100644
index 0000000000000..ff6c65d48b31d
--- /dev/null
+++ b/sycl/test/feature-tests/inline-asm/asm_no_output.cpp
@@ -0,0 +1,47 @@
+// UNSUPPORTED: cuda
+// REQUIRES: gpu,linux
+// RUN: %clangxx -fsycl %s -DINLINE_ASM -o %t.out
+// RUN: %t.out
+// RUN: %clangxx -fsycl %s -o %t.ref.out
+// RUN: %t.ref.out
+
+#include "include/asmhelper.h"
+#include <CL/sycl.hpp>
+#include <iostream>
+#include <vector>
+
+using dataType = cl::sycl::cl_int;
+
+template <typename T = dataType>
+struct KernelFunctor : WithOutputBuffer<T> {
+  KernelFunctor(size_t problem_size) : WithOutputBuffer<T>(problem_size) {}
+
+  void operator()(cl::sycl::handler &cgh) {
+    auto C = this->getOutputBuffer().template get_access<cl::sycl::access::mode::write>(cgh);
+    cgh.parallel_for<KernelFunctor<T>>(
+        cl::sycl::range<1>{this->getOutputBufferSize()}, [=](cl::sycl::id<1> wiID) [[cl::intel_reqd_sub_group_size(8)]] {
+          volatile int local_var = 47;
+          local_var += C[0];
+#if defined(INLINE_ASM) && defined(__SYCL_DEVICE_ONLY__)
+          asm volatile("{\n"
+                       ".decl temp v_type=G type=w num_elts=8 align=GRF\n"
+                       "mov (M1,16) temp(0, 0)<1> %0(0,0)<1;1,0>\n"
+                       "}\n" ::"rw"(local_var));
+#else
+          volatile int temp = 0;
+          temp = local_var;
+#endif
+        });
+  }
+};
+
+int main() {
+  KernelFunctor<> f(DEFAULT_PROBLEM_SIZE);
+  if (!launchInlineASMTest(f))
+    return 0;
+
+  if (verify_all_the_same(f.getOutputBufferData(), 0))
+    return 0;
+
+  return 1;
+}
diff --git a/sycl/test/feature-tests/inline-asm/asm_plus_mod.cpp b/sycl/test/feature-tests/inline-asm/asm_plus_mod.cpp
new file mode 100644
index 0000000000000..f65cda777ef9f
--- /dev/null
+++ b/sycl/test/feature-tests/inline-asm/asm_plus_mod.cpp
@@ -0,0 +1,58 @@
+// UNSUPPORTED: cuda
+// REQUIRES: gpu,linux
+// RUN: %clangxx -fsycl %s -DINLINE_ASM -o %t.out
+// RUN: %t.out
+// RUN: %clangxx -fsycl %s -o %t.ref.out
+// RUN: %t.ref.out
+
+#include "include/asmhelper.h"
+#include <CL/sycl.hpp>
+#include <iostream>
+#include <vector>
+
+using dataType = cl::sycl::cl_int;
+
+template <typename T = dataType>
+struct KernelFunctor : WithInputBuffers<T, 1>, WithOutputBuffer<T> {
+  KernelFunctor(const std::vector<T> &input1, const std::vector<T> &input2) : WithInputBuffers<T, 1>(input1), WithOutputBuffer<T>(input2) {}
+
+  void operator()(cl::sycl::handler &cgh) {
+    auto A = this->getInputBuffer(0).template get_access<cl::sycl::access::mode::read>(cgh);
+    auto B = this->getOutputBuffer().template get_access<cl::sycl::access::mode::write>(cgh);
+
+    cgh.parallel_for<KernelFunctor<T>>(
+        cl::sycl::range<1>{this->getOutputBufferSize()}, [=](cl::sycl::id<1> wiID) [[cl::intel_reqd_sub_group_size(16)]] {
+#if defined(INLINE_ASM) && defined(__SYCL_DEVICE_ONLY__)
+          asm("add (M1, 16) %0(0, 0)<1> %0(0, 0)<1;1,0> %1(0, 0)<1;1,0>"
+              : "+rw"(B[wiID])
+              : "rw"(A[wiID]));
+#else
+          B[wiID] += A[wiID];
+#endif
+        });
+  }
+};
+
+int main() {
+  std::vector<dataType> inputA(DEFAULT_PROBLEM_SIZE), inputB(DEFAULT_PROBLEM_SIZE), R(DEFAULT_PROBLEM_SIZE);
+  for (int i = 0; i < DEFAULT_PROBLEM_SIZE; i++) {
+    inputA[i] = i;
+    inputB[i] = DEFAULT_PROBLEM_SIZE - i;
+    R[i] = inputA[i] + inputB[i];
+  }
+
+  KernelFunctor<> f(inputA, inputB);
+  if (!launchInlineASMTest(f))
+    return 0;
+
+  auto &B = f.getOutputBufferData();
+  for (int i = 0; i < DEFAULT_PROBLEM_SIZE; ++i) {
+    if (B[i] != R[i]) {
+      std::cerr << "At index: " << i << ". ";
+      std::cerr << B[i] << " != " << R[i] << "\n";
+      return 1;
+    }
+  }
+
+  return 0;
+}
diff --git a/sycl/test/feature-tests/inline-asm/include/asmhelper.h b/sycl/test/feature-tests/inline-asm/include/asmhelper.h
new file mode 100644
index 0000000000000..75585e1611331
--- /dev/null
+++ b/sycl/test/feature-tests/inline-asm/include/asmhelper.h
@@ -0,0 +1,128 @@
+#include <CL/sycl.hpp>
+
+#include <iostream>
+#include <memory>
+#include <vector>
+
+constexpr const size_t DEFAULT_PROBLEM_SIZE = 16;
+
+template <typename T>
+struct WithOutputBuffer {
+  WithOutputBuffer(size_t size) {
+    _output_buffer_data.resize(size);
+    _output_buffer.reset(new cl::sycl::buffer<T>(_output_buffer_data.data(), _output_buffer_data.size()));
+  }
+
+  WithOutputBuffer(const std::vector<T> &data) {
+    _output_buffer_data = data;
+    _output_buffer.reset(new cl::sycl::buffer<T>(_output_buffer_data.data(), _output_buffer_data.size()));
+  }
+
+  const std::vector<T> &getOutputBufferData() {
+    // We cannoe access the data until the buffer is still alive
+    _output_buffer.reset();
+    return _output_buffer_data;
+  }
+
+  size_t getOutputBufferSize() const {
+    return _output_buffer_data.size();
+  }
+
+protected:
+  cl::sycl::buffer<T> &getOutputBuffer() {
+    return *_output_buffer;
+  }
+
+  // Functor is being passed by-copy into cl::sycl::queue::submit and destroyed
+  // one more time in there. We need to make sure that buffer is only released
+  // once.
+  std::shared_ptr<cl::sycl::buffer<T>> _output_buffer = nullptr;
+  std::vector<T> _output_buffer_data;
+};
+
+template <typename T, size_t N>
+struct WithInputBuffers {
+
+  template <typename... Args>
+  WithInputBuffers(Args... inputs) {
+    static_assert(sizeof...(Args) == N, "All input buffers must be initialized");
+    constructorHelper<0>(inputs...);
+  }
+
+  cl::sycl::buffer<T> &getInputBuffer(size_t i = 0) {
+    return *_input_buffers[i];
+  }
+
+protected:
+  std::shared_ptr<cl::sycl::buffer<T>> _input_buffers[N] = {nullptr};
+  std::vector<T> _input_buffers_data[N];
+
+private:
+  template <int Index, typename... Args>
+  void constructorHelper(const std::vector<T> &data, Args... rest) {
+    _input_buffers_data[Index] = data;
+    _input_buffers[Index].reset(new cl::sycl::buffer<T>(_input_buffers_data[Index].data(), _input_buffers_data[Index].size()));
+    constructorHelper<Index + 1>(rest...);
+  }
+
+  template <int Index>
+  void constructorHelper() {
+    // nothing to do, recursion stop
+  }
+};
+
+bool isInlineASMSupported(sycl::device Device) {
+
+  sycl::string_class DriverVersion = Device.get_info<sycl::info::device::driver_version>();
+  sycl::string_class DeviceVendorName = Device.get_info<sycl::info::device::vendor>();
+  // TODO: query for some extension/capability/whatever once interface is
+  // defined
+  if (DeviceVendorName.find("Intel") == sycl::string_class::npos)
+    return false;
+  if (DriverVersion.length() < 5)
+    return false;
+  if (DriverVersion[2] != '.')
+    return false;
+  if (std::stoi(DriverVersion.substr(0, 2), nullptr, 10) < 20 || std::stoi(DriverVersion.substr(3, 2), nullptr, 10) < 12)
+    return false;
+  return true;
+}
+
+/// checks if device suppots inline asm feature and launches a test
+///
+/// \returns false if test wasn't launched (i.e.was skipped) and true otherwise
+template <typename F>
+bool launchInlineASMTest(F &f, bool requires_particular_sg_size = true) {
+  try {
+    cl::sycl::queue deviceQueue(cl::sycl::gpu_selector{});
+    cl::sycl::device device = deviceQueue.get_device();
+
+#if defined(INLINE_ASM)
+    if (!isInlineASMSupported(device)) {
+      std::cout << "Skipping test\n";
+      return false;
+    }
+#endif
+
+    if (requires_particular_sg_size && !device.has_extension("cl_intel_required_subgroup_size")) {
+      std::cout << "Skipping test\n";
+      return false;
+    }
+
+    deviceQueue.submit(f).wait();
+  } catch (cl::sycl::exception &e) {
+    std::cerr << "Caught exception: " << e.what() << std::endl;
+  }
+  return true;
+}
+
+template <typename T>
+bool verify_all_the_same(const std::vector<T> &input, T reference_value) {
+  for (int i = 0; i < input.size(); ++i)
+    if (input[i] != reference_value) {
+      std::cerr << "At index: " << i << " ";
+      std::cerr << input[i] << " != " << reference_value << "\n";
+      return false;
+    }
+  return true;
+}
diff --git a/sycl/test/feature-tests/inline-asm/letter_example.cpp b/sycl/test/feature-tests/inline-asm/letter_example.cpp
new file mode 100644
index 0000000000000..22bf26648e78b
--- /dev/null
+++ b/sycl/test/feature-tests/inline-asm/letter_example.cpp
@@ -0,0 +1,66 @@
+// UNSUPPORTED: cuda
+// REQUIRES: gpu,linux
+// RUN: %clangxx -fsycl %s -DINLINE_ASM -o %t.out
+// RUN: %t.out
+// RUN: %clangxx -fsycl %s -o %t.ref.out
+// RUN: %t.ref.out
+
+#include "include/asmhelper.h"
+#include <CL/sycl.hpp>
+#include <iostream>
+
+constexpr size_t problem_size = 16;
+
+class kernel_name;
+
+int main() {
+  cl::sycl::queue q;
+  cl::sycl::device Device = q.get_device();
+
+  if (!isInlineASMSupported(Device) || !Device.has_extension("cl_intel_required_subgroup_size")) {
+    std::cout << "Skipping test\n";
+    return 0;
+  }
+  auto ctx = q.get_context();
+  int *a = (int *)malloc_shared(sizeof(int) * problem_size, q.get_device(), ctx);
+  for (int i = 0; i < problem_size; i++) {
+    a[i] = i;
+  }
+  q.submit([&](cl::sycl::handler &cgh) {
+     cgh.parallel_for<kernel_name>(
+         cl::sycl::range<1>(problem_size), [=](cl::sycl::id<1> idx)
+                                               [[cl::intel_reqd_sub_group_size(16)]] {
+#if defined(INLINE_ASM) && defined(__SYCL_DEVICE_ONLY__)
+                                                 int i = idx[0];
+                                                 asm volatile("{\n.decl V52 v_type=G type=d num_elts=16 align=GRF\n"
+                                                              "svm_gather.4.1 (M1, 16) %0.0 V52.0\n"
+                                                              "add(M1, 16) V52(0, 0)<1> V52(0, 0)<1; 1, 0> 0x1:w\n"
+                                                              "svm_scatter.4.1 (M1, 16) %0.0 V52.0\n}"
+                                                              :
+                                                              : "rw"(&a[i]));
+#else
+                                                 a[idx[0]]++;
+#endif
+                                               });
+   }).wait();
+
+  bool currect = true;
+  for (int i = 0; i < problem_size; i++) {
+    if (a[i] != (i + 1)) {
+      currect = false;
+      std::cerr << "error in a[" << i << "]="
+                << a[i] << "!=" << (i + 1) << std::endl;
+      break;
+    }
+  }
+
+  if (!currect) {
+    std::cerr << "Error" << std::endl;
+    cl::sycl::free(a, ctx);
+    return 1;
+  }
+
+  std::cerr << "Pass" << std::endl;
+  cl::sycl::free(a, ctx);
+  return 0;
+}
diff --git a/sycl/test/feature-tests/inline-asm/malloc_shared_32.cpp b/sycl/test/feature-tests/inline-asm/malloc_shared_32.cpp
new file mode 100644
index 0000000000000..8f058851c2681
--- /dev/null
+++ b/sycl/test/feature-tests/inline-asm/malloc_shared_32.cpp
@@ -0,0 +1,92 @@
+// UNSUPPORTED: cuda
+// REQUIRES: gpu,linux
+// RUN: %clangxx -fsycl %s -DINLINE_ASM -o %t.out
+// RUN: %t.out
+// RUN: %clangxx -fsycl %s -o %t.ref.out
+// RUN: %t.ref.out
+
+#include "include/asmhelper.h"
+#include <CL/sycl.hpp>
+#include <iostream>
+
+constexpr size_t problem_size = 32;
+
+class kernel_name;
+
+int main() {
+  cl::sycl::queue q;
+
+  cl::sycl::device Device = q.get_device();
+
+  if (!isInlineASMSupported(Device) || !Device.has_extension("cl_intel_required_subgroup_size")) {
+    std::cout << "Skipping test\n";
+    return 0;
+  }
+
+  auto ctx = q.get_context();
+  int *a = (int *)malloc_shared(sizeof(int) * problem_size, q.get_device(), ctx);
+  int *b = (int *)malloc_shared(sizeof(int) * problem_size, q.get_device(), ctx);
+  int *c = (int *)malloc_shared(sizeof(int) * problem_size, q.get_device(), ctx);
+  for (int i = 0; i < problem_size; i++) {
+    b[i] = -10;
+    a[i] = i;
+    c[i] = i;
+  }
+
+  q.submit([&](cl::sycl::handler &cgh) {
+     cgh.parallel_for<kernel_name>(
+         cl::sycl::range<1>(problem_size),
+         [=](cl::sycl::id<1> idx)
+             [[cl::intel_reqd_sub_group_size(32)]] {
+               int i = idx[0];
+#if defined(INLINE_ASM) && defined(__SYCL_DEVICE_ONLY__)
+               asm volatile(R"a(
+    {
+        .decl V52 v_type=G type=d num_elts=16 align=GRF
+        .decl V53 v_type=G type=d num_elts=16 align=GRF
+        .decl V54 v_type=G type=d num_elts=16 align=GRF
+        .decl V55 v_type=G type=d num_elts=16 align=GRF
+        .decl V56 v_type=G type=d num_elts=16 align=GRF
+        .decl V57 v_type=G type=d num_elts=16 align=GRF
+        svm_gather.4.1 (M1, 16) %2.0 V54.0
+        svm_gather.4.1 (M1, 16) %3.0 V55.0
+        svm_gather.4.1 (M1, 16) %4.0 V56.0
+        svm_gather.4.1 (M1, 16) %5.0 V57.0
+        mul (M1, 16) V52(0,0)<1> V54(0,0)<1;1,0> V56(0,0)<1;1,0>
+        mul (M1, 16) V53(0,0)<1> V55(0,0)<1;1,0> V57(0,0)<1;1,0>
+        svm_scatter.4.1 (M1, 16) %0.0 V52.0
+        svm_scatter.4.1 (M1, 16) %1.0 V53.0
+    }
+    )a" ::"rw"(&b[i]),
+                            "rw"(&b[i] + 16), "rw"(&a[i]), "rw"(&a[i] + 16), "rw"(&c[i]),
+                            "rw"(&c[i] + 16));
+#else
+               b[i] = a[i] * c[i];
+#endif
+             });
+   }).wait();
+
+  bool currect = true;
+  for (int i = 0; i < problem_size; i++) {
+    if (b[i] != a[i] * b[i]) {
+      currect = false;
+      std::cerr << "error in a[" << i << "]="
+                << b[i] << "!=" << a[i] * b[i] << std::endl;
+      break;
+    }
+  }
+
+  if (!currect) {
+    std::cerr << "Error" << std::endl;
+    cl::sycl::free(a, ctx);
+    cl::sycl::free(b, ctx);
+    cl::sycl::free(c, ctx);
+    return 1;
+  }
+
+  std::cerr << "Pass" << std::endl;
+  cl::sycl::free(a, ctx);
+  cl::sycl::free(b, ctx);
+  cl::sycl::free(c, ctx);
+  return 0;
+}
diff --git a/sycl/test/feature-tests/inline-asm/malloc_shared_in_out_dif.cpp b/sycl/test/feature-tests/inline-asm/malloc_shared_in_out_dif.cpp
new file mode 100644
index 0000000000000..a6994bd379199
--- /dev/null
+++ b/sycl/test/feature-tests/inline-asm/malloc_shared_in_out_dif.cpp
@@ -0,0 +1,69 @@
+// UNSUPPORTED: cuda
+// REQUIRES: gpu,linux
+// RUN: %clangxx -fsycl %s -DINLINE_ASM -o %t.out
+// RUN: %t.out
+
+#include "include/asmhelper.h"
+#include <CL/sycl.hpp>
+#include <iostream>
+
+constexpr size_t problem_size = 100;
+
+class kernel_name;
+
+int main() {
+  cl::sycl::queue q;
+
+  cl::sycl::device Device = q.get_device();
+
+  if (!isInlineASMSupported(Device) || !Device.has_extension("cl_intel_required_subgroup_size")) {
+    std::cout << "Skipping test\n";
+    return 0;
+  }
+
+  auto ctx = q.get_context();
+  int *a = (int *)malloc_shared(sizeof(int) * problem_size, q.get_device(), ctx);
+  int *b = (int *)malloc_shared(sizeof(int) * problem_size, q.get_device(), ctx);
+  for (int i = 0; i < problem_size; i++) {
+    b[i] = -1;
+    a[i] = i;
+  }
+
+  q.submit([&](cl::sycl::handler &cgh) {
+     cgh.parallel_for<kernel_name>(
+         cl::sycl::range<1>(problem_size), [=](cl::sycl::id<1> idx) [[cl::intel_reqd_sub_group_size(16)]] {
+           int i = idx[0];
+           volatile int tmp = a[i];
+           tmp += 1;
+#if defined(INLINE_ASM) && defined(__SYCL_DEVICE_ONLY__)
+           asm volatile(" add (M1, 16) %0(0,0)<1> %0(0,0)<1;1,0> %1(0,0)<1;1,0>"
+                        : "+rw"(b[i])
+                        : "rw"(tmp));
+#else
+           b[i] += tmp;
+#endif
+         });
+   }).wait();
+
+  bool currect = true;
+  for (int i = 0; i < problem_size; i++) {
+    if (b[i] != a[i]) {
+      currect = false;
+      std::cerr << "error in a[" << i << "]="
+                << b[i] << "!=" << a[i] << std::endl;
+      break;
+    }
+  }
+
+  if (!currect) {
+    std::cerr << "Error" << std::endl;
+    cl::sycl::free(a, ctx);
+    cl::sycl::free(b, ctx);
+    return 1;
+  }
+
+  std::cerr << "Pass" << std::endl;
+  cl::sycl::free(a, ctx);
+  cl::sycl::free(b, ctx);
+  return 0;
+}
diff --git a/sycl/test/feature-tests/inline-asm/malloc_shared_no_input.cpp b/sycl/test/feature-tests/inline-asm/malloc_shared_no_input.cpp
new file mode 100644
index 0000000000000..22cd47abd64ab
--- /dev/null
+++ b/sycl/test/feature-tests/inline-asm/malloc_shared_no_input.cpp
@@ -0,0 +1,61 @@
+// UNSUPPORTED: cuda
+// REQUIRES: gpu,linux
+// RUN: %clangxx -fsycl %s -DINLINE_ASM -o %t.out
+// RUN: %t.out
+// RUN: %clangxx -fsycl %s -o %t.ref.out
+// RUN: %t.ref.out
+
+#include "include/asmhelper.h"
+#include <CL/sycl.hpp>
+#include <iostream>
+
+constexpr size_t problem_size = 16;
+
+class kernel_name;
+
+int main() {
+  cl::sycl::queue q;
+  cl::sycl::device Device = q.get_device();
+
+  if (!isInlineASMSupported(Device) || !Device.has_extension("cl_intel_required_subgroup_size")) {
+    std::cout << "Skipping test\n";
+    return 0;
+  }
+  auto ctx = q.get_context();
+  int *a = (int *)malloc_shared(sizeof(int) * problem_size, q.get_device(), ctx);
+  for (int i = 0; i < problem_size; i++)
+    a[i] = i;
+
+  q.submit([&](cl::sycl::handler &cgh) {
+     cgh.parallel_for<kernel_name>(
+         cl::sycl::range<1>(problem_size), [=](cl::sycl::id<1> idx) [[cl::intel_reqd_sub_group_size(16)]] {
+           int i = idx[0];
+#if defined(INLINE_ASM) && defined(__SYCL_DEVICE_ONLY__)
+           asm volatile("mov (M1, 16) %0(0,0)<1> 0x7:d"
+                        : "=rw"(a[i]));
+#else
+           a[i] = 7;
+#endif
+         });
+   }).wait();
+
+  bool currect = true;
+  for (int i = 0; i < problem_size; i++) {
+    if (a[i] != 7) {
+      currect = false;
+      std::cerr << "error in a[" << i << "]="
+                << a[i] << "!=" << 7 << std::endl;
+      break;
+    }
+  }
+
+  if (!currect) {
+    std::cerr << "Error" << std::endl;
+    cl::sycl::free(a, ctx);
+    return 1;
+  }
+
+  std::cerr << "Pass" << std::endl;
+  cl::sycl::free(a, ctx);
+  return 0;
+}
diff --git a/sycl/test/lit.cfg.py b/sycl/test/lit.cfg.py
index e6ae419047b9d..6285c44d27632 100644
--- a/sycl/test/lit.cfg.py
+++ b/sycl/test/lit.cfg.py
@@ -26,7 +26,8 @@
 # suffixes: A list of file extensions to treat as test files.
 config.suffixes = ['.c', '.cpp'] #add .spv. Currently not clear what to do with those
 
-config.excludes = ['Inputs']
+# feature tests are considered not so lightweight, so, they are excluded by default
+config.excludes = ['Inputs', 'feature-tests']
 
 # test_source_root: The root path where tests are located.
 config.test_source_root = os.path.dirname(__file__)