intel · bader · Apr 20, 2020 · Mar 26, 2020 · Apr 15, 2020 · Apr 15, 2020
@@ -61,6 +61,14 @@ add_lit_testsuites(SYCL-DEPLOY ${CMAKE_CURRENT_SOURCE_DIR}
   EXCLUDE_FROM_CHECK_ALL
   )
 
+add_lit_target(check-sycl-inline-asm  # dedicated lit target covering only the inline-asm feature tests
+  "Running lit suite ${CMAKE_CURRENT_SOURCE_DIR}/feature-tests/inline-asm"
+  "feature-tests/inline-asm"  # test directory handed to lit (relative path)
+  ARGS ${RT_TEST_ARGS}
+  PARAMS "SYCL_BE=PI_OPENCL"  # presumably selects the OpenCL backend for this run - confirm against lit.cfg
+  DEPENDS ${SYCL_TEST_DEPS}
+  )
+
 if(SYCL_BUILD_PI_CUDA)
   add_lit_testsuite(check-sycl-cuda "Running the SYCL regression tests for CUDA"
     ${CMAKE_CURRENT_BINARY_DIR}

@@ -0,0 +1,40 @@
+// UNSUPPORTED: cuda
+// REQUIRES: gpu,linux
+// RUN: %clangxx -fsycl %s -DINLINE_ASM -o %t.out
+// RUN: %t.out
+// RUN: %clangxx -fsycl %s -o %t.ref.out
+// RUN: %t.ref.out
+
+#include "include/asmhelper.h"
+#include <CL/sycl.hpp>
+#include <iostream>
+#include <vector>
+
+using dataType = cl::sycl::cl_int;
+
+// Command-group functor for the "empty inline asm" test with a required
+// sub-group size of 16. Every work-item stores 43 into its output element;
+// the INLINE_ASM device build additionally emits an empty asm statement.
+template <typename T = dataType>
+struct KernelFunctor : WithOutputBuffer<T> {
+  KernelFunctor(size_t problem_size) : WithOutputBuffer<T>(problem_size) {}
+
+  // Requests write access to the output buffer and enqueues one work-item
+  // per output element.
+  void operator()(cl::sycl::handler &cgh) {
+    auto outAcc =
+        this->getOutputBuffer()
+            .template get_access<cl::sycl::access::mode::write>(cgh);
+    const cl::sycl::range<1> globalRange{this->getOutputBufferSize()};
+
+    cgh.parallel_for<KernelFunctor<T>>(
+        globalRange,
+        [=](cl::sycl::id<1> item) [[cl::intel_reqd_sub_group_size(16)]] {
+          outAcc[item] = 43;
+#if defined(INLINE_ASM) && defined(__SYCL_DEVICE_ONLY__)
+          // Empty asm body: only checks that the statement is accepted.
+          asm volatile("");
+#endif
+        });
+  }
+};
+
+// Launches the kernel and checks that every output element equals 43.
+// Returns 0 on success, and also when launchInlineASMTest reports false
+// (presumably "test could not be run here" - confirm in asmhelper.h);
+// returns 1 when the output does not match.
+int main() {
+  KernelFunctor<> kernel(DEFAULT_PROBLEM_SIZE);
+
+  if (!launchInlineASMTest(kernel))
+    return 0;
+
+  return verify_all_the_same(kernel.getOutputBufferData(), 43) ? 0 : 1;
+}
@@ -0,0 +1,44 @@
+// UNSUPPORTED: cuda
+// REQUIRES: gpu,linux
+// RUN: %clangxx -fsycl %s -DINLINE_ASM -o %t.out
+// RUN: %t.out
+// RUN: %clangxx -fsycl %s -o %t.ref.out
+// RUN: %t.ref.out
+
+#include "include/asmhelper.h"
+#include <CL/sycl.hpp>
+#include <iostream>
+#include <vector>
+
+using dataType = cl::sycl::cl_int;
+
+// Command-group functor for the "mov immediate" test with a required
+// sub-group size of 16. Each work-item produces the value 7 - through
+// inline asm in the INLINE_ASM device build, through a plain assignment
+// otherwise - and stores it into its output element.
+template <typename T = dataType>
+struct KernelFunctor : WithOutputBuffer<T> {
+  KernelFunctor(size_t problem_size) : WithOutputBuffer<T>(problem_size) {}
+
+  // Requests write access to the output buffer and enqueues one work-item
+  // per output element.
+  void operator()(cl::sycl::handler &cgh) {
+    auto outAcc =
+        this->getOutputBuffer()
+            .template get_access<cl::sycl::access::mode::write>(cgh);
+    const cl::sycl::range<1> globalRange{this->getOutputBufferSize()};
+
+    cgh.parallel_for<KernelFunctor<T>>(
+        globalRange,
+        [=](cl::sycl::id<1> item) [[cl::intel_reqd_sub_group_size(16)]] {
+          volatile int result = 0;
+#if defined(INLINE_ASM) && defined(__SYCL_DEVICE_ONLY__)
+          // Operand %0 is bound to `result`; the asm moves 0x7 into it.
+          asm volatile("mov (M1,16) %0(0,0)<1> 0x7:d"
+                       : "=rw"(result));
+#else
+          result = 7;
+#endif
+          outAcc[item] = result;
+        });
+  }
+};
+
+// Launches the kernel and checks that every output element equals 7.
+// Returns 0 on success, and also when launchInlineASMTest reports false
+// (presumably "test could not be run here" - confirm in asmhelper.h);
+// returns 1 when the output does not match.
+int main() {
+  KernelFunctor<> kernel(DEFAULT_PROBLEM_SIZE);
+
+  if (!launchInlineASMTest(kernel))
+    return 0;
+
+  return verify_all_the_same(kernel.getOutputBufferData(), 7) ? 0 : 1;
+}
@@ -0,0 +1,44 @@
+// UNSUPPORTED: cuda
+// REQUIRES: gpu,linux
+// RUN: %clangxx -fsycl %s -DINLINE_ASM -o %t.out
+// RUN: %t.out
+// RUN: %clangxx -fsycl %s -o %t.ref.out
+// RUN: %t.ref.out
+
+#include "include/asmhelper.h"
+#include <CL/sycl.hpp>
+#include <iostream>
+#include <vector>
+
+using dataType = cl::sycl::cl_int;
+
+// Command-group functor: with a required sub-group size of 16, every
+// work-item writes 7 into its slot of the output buffer. The value comes
+// from an inline-asm `mov` when INLINE_ASM is defined for the device
+// compilation, and from ordinary C++ otherwise.
+template <typename T = dataType>
+struct KernelFunctor : WithOutputBuffer<T> {
+  KernelFunctor(size_t problem_size) : WithOutputBuffer<T>(problem_size) {}
+
+  // Builds the command group: write accessor plus a 1-D parallel_for over
+  // the whole output buffer.
+  void operator()(cl::sycl::handler &cgh) {
+    auto dst =
+        this->getOutputBuffer()
+            .template get_access<cl::sycl::access::mode::write>(cgh);
+    const cl::sycl::range<1> workItems{this->getOutputBufferSize()};
+
+    cgh.parallel_for<KernelFunctor<T>>(
+        workItems,
+        [=](cl::sycl::id<1> idx) [[cl::intel_reqd_sub_group_size(16)]] {
+          volatile int value = 0;
+#if defined(INLINE_ASM) && defined(__SYCL_DEVICE_ONLY__)
+          // Output operand %0 maps to `value`.
+          asm volatile("mov (M1,16) %0(0,0)<1> 0x7:d"
+                       : "=rw"(value));
+#else
+          value = 7;
+#endif
+          dst[idx] = value;
+        });
+  }
+};
+
+// Test driver: exit code 0 when all outputs are 7 or when
+// launchInlineASMTest returns false (presumably meaning the test was
+// skipped - confirm in asmhelper.h); exit code 1 on a mismatch.
+int main() {
+  KernelFunctor<> functor(DEFAULT_PROBLEM_SIZE);
+
+  if (!launchInlineASMTest(functor))
+    return 0;
+
+  return verify_all_the_same(functor.getOutputBufferData(), 7) ? 0 : 1;
+}
@@ -0,0 +1,45 @@
+// UNSUPPORTED: cuda
+// REQUIRES: gpu,linux
+// RUN: %clangxx -fsycl %s -DINLINE_ASM -o %t.out
+// RUN: %t.out
+// RUN: %clangxx -fsycl %s -o %t.ref.out
+// RUN: %t.ref.out
+
+#include "include/asmhelper.h"
+#include <CL/sycl.hpp>
+#include <iostream>
+#include <vector>
+
+using dataType = cl::sycl::cl_int;
+
+// Command-group functor for the "inline asm inside a loop" test with a
+// required sub-group size of 16. Each work-item adds 0..9 to its output
+// element; the INLINE_ASM device build issues a `fence_sw` asm statement
+// before every addition.
+template <typename T = dataType>
+struct KernelFunctor : WithOutputBuffer<T> {
+  KernelFunctor(size_t problem_size) : WithOutputBuffer<T>(problem_size) {}
+
+  // Requests access to the output buffer and enqueues one work-item per
+  // output element.
+  void operator()(cl::sycl::handler &cgh) {
+    // The kernel accumulates with `+=`, i.e. it reads the element before
+    // writing it, so the accessor must be read_write rather than write-only.
+    auto C = this->getOutputBuffer().template get_access<cl::sycl::access::mode::read_write>(cgh);
+    cgh.parallel_for<KernelFunctor<T>>(
+        cl::sycl::range<1>{this->getOutputBufferSize()}, [=](cl::sycl::id<1> wiID) [[cl::intel_reqd_sub_group_size(16)]] {
+          for (int i = 0; i < 10; ++i) {
+#if defined(INLINE_ASM) && defined(__SYCL_DEVICE_ONLY__)
+            asm("fence_sw");
+#endif
+            // Same accumulation in both build variants.
+            C[wiID] += i;
+          }
+        });
+  }
+};
+
+// Launches the kernel and checks that every output element equals 45
+// (0 + 1 + ... + 9). Returns 0 on success, and also when
+// launchInlineASMTest reports false (presumably "test could not be run
+// here" - confirm in asmhelper.h); returns 1 when the output does not match.
+int main() {
+  KernelFunctor<> kernel(DEFAULT_PROBLEM_SIZE);
+
+  if (!launchInlineASMTest(kernel))
+    return 0;
+
+  return verify_all_the_same(kernel.getOutputBufferData(), 45) ? 0 : 1;
+}
@@ -0,0 +1,40 @@
+// UNSUPPORTED: cuda
+// REQUIRES: gpu,linux
+// RUN: %clangxx -fsycl %s -DINLINE_ASM -o %t.out
+// RUN: %t.out
+// RUN: %clangxx -fsycl %s -o %t.ref.out
+// RUN: %t.ref.out
+
+#include "include/asmhelper.h"
+#include <CL/sycl.hpp>
+#include <iostream>
+#include <vector>
+
+using dataType = cl::sycl::cl_int;
+
+// Command-group functor for the "empty inline asm" test with a required
+// sub-group size of 8. Every work-item stores 43 into its output element;
+// the INLINE_ASM device build additionally emits an empty asm statement.
+template <typename T = dataType>
+struct KernelFunctor : WithOutputBuffer<T> {
+  KernelFunctor(size_t problem_size) : WithOutputBuffer<T>(problem_size) {}
+
+  // Requests write access to the output buffer and enqueues one work-item
+  // per output element.
+  void operator()(cl::sycl::handler &cgh) {
+    auto outAcc =
+        this->getOutputBuffer()
+            .template get_access<cl::sycl::access::mode::write>(cgh);
+    const cl::sycl::range<1> globalRange{this->getOutputBufferSize()};
+
+    cgh.parallel_for<KernelFunctor<T>>(
+        globalRange,
+        [=](cl::sycl::id<1> item) [[cl::intel_reqd_sub_group_size(8)]] {
+          outAcc[item] = 43;
+#if defined(INLINE_ASM) && defined(__SYCL_DEVICE_ONLY__)
+          // Empty asm body: only checks that the statement is accepted.
+          asm volatile("");
+#endif
+        });
+  }
+};
+
+// Test driver: exit code 0 when all outputs are 43 or when
+// launchInlineASMTest returns false (presumably meaning the test was
+// skipped - confirm in asmhelper.h); exit code 1 on a mismatch.
+int main() {
+  KernelFunctor<> functor(DEFAULT_PROBLEM_SIZE);
+
+  if (!launchInlineASMTest(functor))
+    return 0;
+
+  return verify_all_the_same(functor.getOutputBufferData(), 43) ? 0 : 1;
+}
@@ -0,0 +1,44 @@
+// UNSUPPORTED: cuda
+// REQUIRES: gpu,linux
+// RUN: %clangxx -fsycl %s -DINLINE_ASM -o %t.out
+// RUN: %t.out
+// RUN: %clangxx -fsycl %s -o %t.ref.out
+// RUN: %t.ref.out
+
+#include "include/asmhelper.h"
+#include <CL/sycl.hpp>
+#include <iostream>
+#include <vector>
+
+using dataType = cl::sycl::cl_int;
+
+// Command-group functor for the "mov immediate" test with a required
+// sub-group size of 8. Each work-item produces the value 7 - through
+// inline asm in the INLINE_ASM device build, through a plain assignment
+// otherwise - and stores it into its output element.
+template <typename T = dataType>
+struct KernelFunctor : WithOutputBuffer<T> {
+  KernelFunctor(size_t problem_size) : WithOutputBuffer<T>(problem_size) {}
+
+  // Requests write access to the output buffer and enqueues one work-item
+  // per output element.
+  void operator()(cl::sycl::handler &cgh) {
+    auto outAcc =
+        this->getOutputBuffer()
+            .template get_access<cl::sycl::access::mode::write>(cgh);
+    const cl::sycl::range<1> globalRange{this->getOutputBufferSize()};
+
+    cgh.parallel_for<KernelFunctor<T>>(
+        globalRange,
+        [=](cl::sycl::id<1> item) [[cl::intel_reqd_sub_group_size(8)]] {
+          volatile int result = 0;
+#if defined(INLINE_ASM) && defined(__SYCL_DEVICE_ONLY__)
+          // Operand %0 is bound to `result`; the asm moves 0x7 into it.
+          asm volatile("mov (M1,8) %0(0,0)<1> 0x7:d"
+                       : "=rw"(result));
+#else
+          result = 7;
+#endif
+          outAcc[item] = result;
+        });
+  }
+};
+
+// Launches the kernel and checks that every output element equals 7.
+// Returns 0 on success, and also when launchInlineASMTest reports false
+// (presumably "test could not be run here" - confirm in asmhelper.h);
+// returns 1 when the output does not match.
+int main() {
+  KernelFunctor<> kernel(DEFAULT_PROBLEM_SIZE);
+
+  if (!launchInlineASMTest(kernel))
+    return 0;
+
+  return verify_all_the_same(kernel.getOutputBufferData(), 7) ? 0 : 1;
+}
@@ -0,0 +1,59 @@
+// UNSUPPORTED: cuda
+// REQUIRES: gpu,linux
+// RUN: %clangxx -fsycl %s -DINLINE_ASM -o %t.out
+// RUN: %t.out
+// RUN: %clangxx -fsycl %s -o %t.ref.out
+// RUN: %t.ref.out
+
+#include "include/asmhelper.h"
+#include <CL/sycl.hpp>
+#include <iostream>
+#include <vector>
+
+using dataType = cl::sycl::cl_int;
+
+// Command-group functor for the multiply-add test with a required
+// sub-group size of 8. For every index it computes
+// input0 * input1 + input2 into the output buffer - via an inline-asm
+// `mad` in the INLINE_ASM device build, via plain C++ otherwise.
+template <typename T = dataType>
+struct KernelFunctor : WithInputBuffers<T, 3>, WithOutputBuffer<T> {
+  // The output buffer is sized like the first input vector.
+  KernelFunctor(const std::vector<T> &input1, const std::vector<T> &input2,
+                const std::vector<T> &input3)
+      : WithInputBuffers<T, 3>(input1, input2, input3),
+        WithOutputBuffer<T>(input1.size()) {}
+
+  // Requests read access to the three inputs and write access to the
+  // output, then enqueues one work-item per output element.
+  void operator()(cl::sycl::handler &cgh) {
+    auto lhs = this->getInputBuffer(0)
+                   .template get_access<cl::sycl::access::mode::read>(cgh);
+    auto rhs = this->getInputBuffer(1)
+                   .template get_access<cl::sycl::access::mode::read>(cgh);
+    auto addend = this->getInputBuffer(2)
+                      .template get_access<cl::sycl::access::mode::read>(cgh);
+    auto result = this->getOutputBuffer()
+                      .template get_access<cl::sycl::access::mode::write>(cgh);
+
+    const cl::sycl::range<1> globalRange{this->getOutputBufferSize()};
+
+    cgh.parallel_for<KernelFunctor<T>>(
+        globalRange,
+        [=](cl::sycl::id<1> item) [[cl::intel_reqd_sub_group_size(8)]] {
+#if defined(INLINE_ASM) && defined(__SYCL_DEVICE_ONLY__)
+          // Operand binding: %0 = result, %1 = rhs, %2 = addend, %3 = lhs.
+          asm("mad (M1, 8) %0(0, 0)<1> %3(0, 0)<1;1,0> %1(0, 0)<1;1,0> %2(0, 0)<1;1,0>"
+              : "=rw"(result[item])
+              : "rw"(rhs[item]), "rw"(addend[item]), "rw"(lhs[item]));
+#else
+          result[item] = lhs[item] * rhs[item] + addend[item];
+#endif
+        });
+  }
+};
+
+// Fills three input vectors, runs the multiply-add kernel and compares
+// every output element against the host-computed A[i] * B[i] + C[i].
+// Returns 0 on success, and also when launchInlineASMTest reports false
+// (presumably "test could not be run here" - confirm in asmhelper.h);
+// returns 1 after reporting the first mismatching index on stderr.
+int main() {
+  std::vector<dataType> inputA(DEFAULT_PROBLEM_SIZE);
+  std::vector<dataType> inputB(DEFAULT_PROBLEM_SIZE);
+  std::vector<dataType> inputC(DEFAULT_PROBLEM_SIZE);
+  for (int i = 0; i < DEFAULT_PROBLEM_SIZE; i++) {
+    inputA[i] = i;
+    inputB[i] = i;
+    inputC[i] = DEFAULT_PROBLEM_SIZE - i * i;
+  }
+
+  KernelFunctor<> kernel(inputA, inputB, inputC);
+  if (!launchInlineASMTest(kernel))
+    return 0;
+
+  auto &actual = kernel.getOutputBufferData();
+  for (int i = 0; i < DEFAULT_PROBLEM_SIZE; ++i) {
+    const auto expected = inputA[i] * inputB[i] + inputC[i];
+    if (actual[i] == expected)
+      continue;
+
+    std::cerr << "At index: " << i << ". "
+              << actual[i] << " != " << expected << "\n";
+    return 1;
+  }
+  return 0;
+}