diff --git a/sycl/test/CMakeLists.txt b/sycl/test/CMakeLists.txt index a152c4b34137e..0fc16451d8fc5 100644 --- a/sycl/test/CMakeLists.txt +++ b/sycl/test/CMakeLists.txt @@ -61,6 +61,14 @@ add_lit_testsuites(SYCL-DEPLOY ${CMAKE_CURRENT_SOURCE_DIR} EXCLUDE_FROM_CHECK_ALL ) +add_lit_target(check-sycl-inline-asm + "Running lit suite ${CMAKE_CURRENT_SOURCE_DIR}/feature-tests/inline-asm" + "feature-tests/inline-asm" + ARGS ${RT_TEST_ARGS} + PARAMS "SYCL_BE=PI_OPENCL" + DEPENDS ${SYCL_TEST_DEPS} + ) + if(SYCL_BUILD_PI_CUDA) add_lit_testsuite(check-sycl-cuda "Running the SYCL regression tests for CUDA" ${CMAKE_CURRENT_BINARY_DIR} diff --git a/sycl/test/feature-tests/inline-asm/asm_16_empty.cpp b/sycl/test/feature-tests/inline-asm/asm_16_empty.cpp new file mode 100644 index 0000000000000..ad4285e8ecbdb --- /dev/null +++ b/sycl/test/feature-tests/inline-asm/asm_16_empty.cpp @@ -0,0 +1,40 @@ +// UNSUPPORTED: cuda +// REQUIRES: gpu,linux +// RUN: %clangxx -fsycl %s -DINLINE_ASM -o %t.out +// RUN: %t.out +// RUN: %clangxx -fsycl %s -o %t.ref.out +// RUN: %t.ref.out + +#include "include/asmhelper.h" +#include +#include +#include + +using dataType = cl::sycl::cl_int; + +template +struct KernelFunctor : WithOutputBuffer { + KernelFunctor(size_t problem_size) : WithOutputBuffer(problem_size) {} + + void operator()(cl::sycl::handler &cgh) { + auto C = this->getOutputBuffer().template get_access(cgh); + cgh.parallel_for>( + cl::sycl::range<1>{this->getOutputBufferSize()}, [=](cl::sycl::id<1> wiID) [[cl::intel_reqd_sub_group_size(16)]] { + C[wiID] = 43; +#if defined(INLINE_ASM) && defined(__SYCL_DEVICE_ONLY__) + asm volatile(""); +#endif + }); + } +}; + +int main() { + KernelFunctor<> f(DEFAULT_PROBLEM_SIZE); + if (!launchInlineASMTest(f)) + return 0; + + if (verify_all_the_same(f.getOutputBufferData(), 43)) + return 0; + + return 1; +} diff --git a/sycl/test/feature-tests/inline-asm/asm_16_matrix_mult.cpp b/sycl/test/feature-tests/inline-asm/asm_16_matrix_mult.cpp new file mode 100644 index 0000000000000..6ae1debb67486 --- /dev/null +++ b/sycl/test/feature-tests/inline-asm/asm_16_matrix_mult.cpp @@ -0,0 +1,44 @@ +// UNSUPPORTED: cuda +// REQUIRES: gpu,linux +// RUN: %clangxx -fsycl %s -DINLINE_ASM -o %t.out +// RUN: %t.out +// RUN: %clangxx -fsycl %s -o %t.ref.out +// RUN: %t.ref.out + +#include "include/asmhelper.h" +#include +#include +#include + +using dataType = cl::sycl::cl_int; + +template +struct KernelFunctor : WithOutputBuffer { + KernelFunctor(size_t problem_size) : WithOutputBuffer(problem_size) {} + + void operator()(cl::sycl::handler &cgh) { + auto C = this->getOutputBuffer().template get_access(cgh); + cgh.parallel_for>( + cl::sycl::range<1>{this->getOutputBufferSize()}, [=](cl::sycl::id<1> wiID) [[cl::intel_reqd_sub_group_size(16)]] { + volatile int output = 0; +#if defined(INLINE_ASM) && defined(__SYCL_DEVICE_ONLY__) + asm volatile("mov (M1,16) %0(0,0)<1> 0x7:d" + : "=rw"(output)); +#else + output = 7; +#endif + C[wiID] = output; + }); + } +}; + +int main() { + KernelFunctor<> f(DEFAULT_PROBLEM_SIZE); + if (!launchInlineASMTest(f)) + return 0; + + if (verify_all_the_same(f.getOutputBufferData(), 7)) + return 0; + + return 1; +} diff --git a/sycl/test/feature-tests/inline-asm/asm_16_no_input_int.cpp b/sycl/test/feature-tests/inline-asm/asm_16_no_input_int.cpp new file mode 100644 index 0000000000000..6ae1debb67486 --- /dev/null +++ b/sycl/test/feature-tests/inline-asm/asm_16_no_input_int.cpp @@ -0,0 +1,44 @@ +// UNSUPPORTED: cuda +// REQUIRES: gpu,linux +// RUN: %clangxx -fsycl %s -DINLINE_ASM -o %t.out +// RUN: %t.out +// RUN: %clangxx -fsycl %s -o %t.ref.out +// RUN: %t.ref.out + +#include "include/asmhelper.h" +#include +#include +#include + +using dataType = cl::sycl::cl_int; + +template +struct KernelFunctor : WithOutputBuffer { + KernelFunctor(size_t problem_size) : WithOutputBuffer(problem_size) {} + + void operator()(cl::sycl::handler &cgh) { + auto C = this->getOutputBuffer().template get_access(cgh); + cgh.parallel_for>( + cl::sycl::range<1>{this->getOutputBufferSize()}, [=](cl::sycl::id<1> wiID) [[cl::intel_reqd_sub_group_size(16)]] { + volatile int output = 0; +#if defined(INLINE_ASM) && defined(__SYCL_DEVICE_ONLY__) + asm volatile("mov (M1,16) %0(0,0)<1> 0x7:d" + : "=rw"(output)); +#else + output = 7; +#endif + C[wiID] = output; + }); + } +}; + +int main() { + KernelFunctor<> f(DEFAULT_PROBLEM_SIZE); + if (!launchInlineASMTest(f)) + return 0; + + if (verify_all_the_same(f.getOutputBufferData(), 7)) + return 0; + + return 1; +} diff --git a/sycl/test/feature-tests/inline-asm/asm_16_no_opts.cpp b/sycl/test/feature-tests/inline-asm/asm_16_no_opts.cpp new file mode 100644 index 0000000000000..4b6d5146fd6b8 --- /dev/null +++ b/sycl/test/feature-tests/inline-asm/asm_16_no_opts.cpp @@ -0,0 +1,45 @@ +// UNSUPPORTED: cuda +// REQUIRES: gpu,linux +// RUN: %clangxx -fsycl %s -DINLINE_ASM -o %t.out +// RUN: %t.out +// RUN: %clangxx -fsycl %s -o %t.ref.out +// RUN: %t.ref.out + +#include "include/asmhelper.h" +#include +#include +#include + +using dataType = cl::sycl::cl_int; + +template +struct KernelFunctor : WithOutputBuffer { + KernelFunctor(size_t problem_size) : WithOutputBuffer(problem_size) {} + + void operator()(cl::sycl::handler &cgh) { + auto C = this->getOutputBuffer().template get_access(cgh); + cgh.parallel_for>( + cl::sycl::range<1>{this->getOutputBufferSize()}, [=](cl::sycl::id<1> wiID) [[cl::intel_reqd_sub_group_size(16)]] { + for (int i = 0; i < 10; ++i) { +#if defined(INLINE_ASM) && defined(__SYCL_DEVICE_ONLY__) + asm("fence_sw"); + C[wiID] += i; + +#else + C[wiID] += i; +#endif + } + }); + } +}; + +int main() { + KernelFunctor<> f(DEFAULT_PROBLEM_SIZE); + if (!launchInlineASMTest(f)) + return 0; + + if (verify_all_the_same(f.getOutputBufferData(), 45)) + return 0; + + return 1; +} diff --git a/sycl/test/feature-tests/inline-asm/asm_8_empty.cpp b/sycl/test/feature-tests/inline-asm/asm_8_empty.cpp new file mode 100644 index 0000000000000..97fae0ed4eb27 --- /dev/null +++ b/sycl/test/feature-tests/inline-asm/asm_8_empty.cpp @@ -0,0 +1,40 @@ +// UNSUPPORTED: cuda +// REQUIRES: gpu,linux +// RUN: %clangxx -fsycl %s -DINLINE_ASM -o %t.out +// RUN: %t.out +// RUN: %clangxx -fsycl %s -o %t.ref.out +// RUN: %t.ref.out + +#include "include/asmhelper.h" +#include +#include +#include + +using dataType = cl::sycl::cl_int; + +template +struct KernelFunctor : WithOutputBuffer { + KernelFunctor(size_t problem_size) : WithOutputBuffer(problem_size) {} + + void operator()(cl::sycl::handler &cgh) { + auto C = this->getOutputBuffer().template get_access(cgh); + cgh.parallel_for>( + cl::sycl::range<1>{this->getOutputBufferSize()}, [=](cl::sycl::id<1> wiID) [[cl::intel_reqd_sub_group_size(8)]] { + C[wiID] = 43; +#if defined(INLINE_ASM) && defined(__SYCL_DEVICE_ONLY__) + asm volatile(""); +#endif + }); + } +}; + +int main() { + KernelFunctor<> f(DEFAULT_PROBLEM_SIZE); + if (!launchInlineASMTest(f)) + return 0; + + if (verify_all_the_same(f.getOutputBufferData(), 43)) + return 0; + + return 1; +} diff --git a/sycl/test/feature-tests/inline-asm/asm_8_no_input_int.cpp b/sycl/test/feature-tests/inline-asm/asm_8_no_input_int.cpp new file mode 100644 index 0000000000000..6d1dcbb832cf2 --- /dev/null +++ b/sycl/test/feature-tests/inline-asm/asm_8_no_input_int.cpp @@ -0,0 +1,44 @@ +// UNSUPPORTED: cuda +// REQUIRES: gpu,linux +// RUN: %clangxx -fsycl %s -DINLINE_ASM -o %t.out +// RUN: %t.out +// RUN: %clangxx -fsycl %s -o %t.ref.out +// RUN: %t.ref.out + +#include "include/asmhelper.h" +#include +#include +#include + +using dataType = cl::sycl::cl_int; + +template +struct KernelFunctor : WithOutputBuffer { + KernelFunctor(size_t problem_size) : WithOutputBuffer(problem_size) {} + + void operator()(cl::sycl::handler &cgh) { + auto C = this->getOutputBuffer().template get_access(cgh); + cgh.parallel_for>( + cl::sycl::range<1>{this->getOutputBufferSize()}, [=](cl::sycl::id<1> wiID) [[cl::intel_reqd_sub_group_size(8)]] { + volatile int output = 0; +#if defined(INLINE_ASM) && defined(__SYCL_DEVICE_ONLY__) + asm volatile("mov (M1,8) %0(0,0)<1> 0x7:d" + : "=rw"(output)); +#else + output = 7; +#endif + C[wiID] = output; + }); + } +}; + +int main() { + KernelFunctor<> f(DEFAULT_PROBLEM_SIZE); + if (!launchInlineASMTest(f)) + return 0; + + if (verify_all_the_same(f.getOutputBufferData(), 7)) + return 0; + + return 1; +} diff --git a/sycl/test/feature-tests/inline-asm/asm_arbitrary_ops_order.cpp b/sycl/test/feature-tests/inline-asm/asm_arbitrary_ops_order.cpp new file mode 100644 index 0000000000000..28d0af1d455b6 --- /dev/null +++ b/sycl/test/feature-tests/inline-asm/asm_arbitrary_ops_order.cpp @@ -0,0 +1,59 @@ +// UNSUPPORTED: cuda +// REQUIRES: gpu,linux +// RUN: %clangxx -fsycl %s -DINLINE_ASM -o %t.out +// RUN: %t.out +// RUN: %clangxx -fsycl %s -o %t.ref.out +// RUN: %t.ref.out + +#include "include/asmhelper.h" +#include +#include +#include + +using dataType = cl::sycl::cl_int; + +template +struct KernelFunctor : WithInputBuffers, WithOutputBuffer { + KernelFunctor(const std::vector &input1, const std::vector &input2, const std::vector &input3) : WithInputBuffers(input1, input2, input3), WithOutputBuffer(input1.size()) {} + + void operator()(cl::sycl::handler &cgh) { + auto A = this->getInputBuffer(0).template get_access(cgh); + auto B = this->getInputBuffer(1).template get_access(cgh); + auto C = this->getInputBuffer(2).template get_access(cgh); + auto D = this->getOutputBuffer().template get_access(cgh); + + cgh.parallel_for>( + cl::sycl::range<1>{this->getOutputBufferSize()}, [=](cl::sycl::id<1> wiID) [[cl::intel_reqd_sub_group_size(8)]] { +#if defined(INLINE_ASM) && defined(__SYCL_DEVICE_ONLY__) + asm("mad (M1, 8) %0(0, 0)<1> %3(0, 0)<1;1,0> %1(0, 0)<1;1,0> %2(0, 0)<1;1,0>" + : "=rw"(D[wiID]) + : "rw"(B[wiID]), "rw"(C[wiID]), "rw"(A[wiID])); +#else + D[wiID] = A[wiID] * B[wiID] + C[wiID]; +#endif + }); + } +}; + +int main() { + std::vector inputA(DEFAULT_PROBLEM_SIZE), inputB(DEFAULT_PROBLEM_SIZE), inputC(DEFAULT_PROBLEM_SIZE); + for (int i = 0; i < DEFAULT_PROBLEM_SIZE; i++) { + inputA[i] = i; + inputB[i] = i; + inputC[i] = DEFAULT_PROBLEM_SIZE - i * i; + } + + KernelFunctor<> f(inputA, inputB, inputC); + if (!launchInlineASMTest(f)) + return 0; + + auto &D = f.getOutputBufferData(); + for (int i = 0; i < DEFAULT_PROBLEM_SIZE; ++i) { + if (D[i] != inputA[i] * inputB[i] + inputC[i]) { + std::cerr << "At index: " << i << ". "; + std::cerr << D[i] << " != " << inputA[i] * inputB[i] + inputC[i] << "\n"; + return 1; + } + } + return 0; +} diff --git a/sycl/test/feature-tests/inline-asm/asm_decl_in_scope.cpp b/sycl/test/feature-tests/inline-asm/asm_decl_in_scope.cpp new file mode 100644 index 0000000000000..db30e20f5e9ee --- /dev/null +++ b/sycl/test/feature-tests/inline-asm/asm_decl_in_scope.cpp @@ -0,0 +1,67 @@ +// UNSUPPORTED: cuda +// REQUIRES: gpu,linux +// RUN: %clangxx -fsycl %s -DINLINE_ASM -o %t.out +// RUN: %t.out +// RUN: %clangxx -fsycl %s -o %t.ref.out +// RUN: %t.ref.out + +#include "include/asmhelper.h" +#include +#include +#include + +using dataType = cl::sycl::cl_int; + +template +struct KernelFunctor : WithInputBuffers, WithOutputBuffer { + KernelFunctor(const std::vector &input1, const std::vector &input2) : WithInputBuffers(input1, input2), WithOutputBuffer(input1.size()) {} + + void operator()(cl::sycl::handler &cgh) { + auto A = this->getInputBuffer(0).template get_access(cgh); + auto B = this->getInputBuffer(1).template get_access(cgh); + auto C = this->getOutputBuffer().template get_access(cgh); + + cgh.parallel_for>( + cl::sycl::range<1>{this->getOutputBufferSize()}, + [=](cl::sycl::id<1> wiID) [[cl::intel_reqd_sub_group_size(16)]] { + // declaration of temp within and outside the scope +#if defined(INLINE_ASM) && defined(__SYCL_DEVICE_ONLY__) + asm("{\n" + ".decl temp v_type=G type=d num_elts=16 align=GRF\n" + "mov (M1, 16) temp(0, 0)<1> %1(0, 0)<1;1,0>\n" + "mov (M1, 16) %0(0, 0)<1> temp(0, 0)<1;1,0>\n" + "}\n" + ".decl temp v_type=G type=d num_elts=16 align=GRF\n" + "mul (M1, 16) temp(0, 0)<1> %2(0, 0)<1;1,0> %0(0, 0)<1;1,0>\n" + "mov (M1, 16) %0(0, 0)<1> temp(0, 0)<1;1,0>\n" + : "+rw"(C[wiID]) + : "rw"(A[wiID]), "rw"(B[wiID])); +#else + C[wiID] = A[wiID]; + C[wiID] *= B[wiID]; +#endif + }); + } +}; + +int main() { + std::vector inputA(DEFAULT_PROBLEM_SIZE), inputB(DEFAULT_PROBLEM_SIZE); + for (int i = 0; i < DEFAULT_PROBLEM_SIZE; i++) { + inputA[i] = i; + inputB[i] = 2; + } + + KernelFunctor<> f(inputA, inputB); + if (!launchInlineASMTest(f)) + return 0; + + auto &C = f.getOutputBufferData(); + for (int i = 0; i < DEFAULT_PROBLEM_SIZE; ++i) { + if (C[i] != inputA[i] * inputB[i]) { + std::cerr << "At index: " << i << ". "; + std::cerr << C[i] << " != " << inputA[i] * inputB[i] << "\n"; + return 1; + } + } + return 0; +} diff --git a/sycl/test/feature-tests/inline-asm/asm_float_add.cpp b/sycl/test/feature-tests/inline-asm/asm_float_add.cpp new file mode 100644 index 0000000000000..c23b084317c5b --- /dev/null +++ b/sycl/test/feature-tests/inline-asm/asm_float_add.cpp @@ -0,0 +1,59 @@ +// UNSUPPORTED: cuda +// REQUIRES: gpu,linux +// RUN: %clangxx -fsycl %s -DINLINE_ASM -o %t.out +// RUN: %t.out +// RUN: %clangxx -fsycl %s -o %t.ref.out +// RUN: %t.ref.out + +#include "include/asmhelper.h" +#include +#include +#include +#include + +using dataType = cl::sycl::cl_double; + +template +struct KernelFunctor : WithInputBuffers, WithOutputBuffer { + KernelFunctor(const std::vector &input1, const std::vector &input2) : WithInputBuffers(input1, input2), WithOutputBuffer(input1.size()) {} + + void operator()(cl::sycl::handler &cgh) { + auto A = this->getInputBuffer(0).template get_access(cgh); + auto B = this->getInputBuffer(1).template get_access(cgh); + auto C = this->getOutputBuffer().template get_access(cgh); + + cgh.parallel_for>( + cl::sycl::range<1>{this->getOutputBufferSize()}, [=](cl::sycl::id<1> wiID) [[cl::intel_reqd_sub_group_size(8)]] { +#if defined(INLINE_ASM) && defined(__SYCL_DEVICE_ONLY__) + asm("add (M1, 8) %0(0, 0)<1> %1(0, 0)<1;1,0> %2(0, 0)<1;1,0>" + : "=rw"(C[wiID]) + : "rw"(A[wiID]), "rw"(B[wiID])); +#else + C[wiID] = A[wiID] + B[wiID]; +#endif + }); + } +}; + +int main() { + std::vector inputA(DEFAULT_PROBLEM_SIZE), inputB(DEFAULT_PROBLEM_SIZE); + for (int i = 0; i < DEFAULT_PROBLEM_SIZE; i++) { + inputA[i] = (double)1 / std::pow(2, i); + inputB[i] = (double)2 / std::pow(2, i); + } + + KernelFunctor<> f(inputA, inputB); + if (!launchInlineASMTest(f)) + return 0; + + auto &C = f.getOutputBufferData(); + for (int i = 0; i < DEFAULT_PROBLEM_SIZE; i++) { + if (C[i] != inputA[i] + inputB[i]) { + std::cerr << "At index: " << i << ". "; + std::cerr << C[i] << " != " << inputA[i] + inputB[i] << "\n"; + return 1; + } + } + + return 0; +} diff --git a/sycl/test/feature-tests/inline-asm/asm_float_imm_arg.cpp b/sycl/test/feature-tests/inline-asm/asm_float_imm_arg.cpp new file mode 100644 index 0000000000000..c9683cf020f7e --- /dev/null +++ b/sycl/test/feature-tests/inline-asm/asm_float_imm_arg.cpp @@ -0,0 +1,56 @@ +// UNSUPPORTED: cuda +// REQUIRES: gpu,linux +// RUN: %clangxx -fsycl %s -DINLINE_ASM -o %t.out +// RUN: %t.out +// RUN: %clangxx -fsycl %s -o %t.ref.out +// RUN: %t.ref.out + +#include "include/asmhelper.h" +#include +#include +#include +#include + +constexpr double IMM_ARGUMENT = 0.5; +using dataType = cl::sycl::cl_double; + +template +struct KernelFunctor : WithInputBuffers, WithOutputBuffer { + KernelFunctor(const std::vector &input) : WithInputBuffers(input), WithOutputBuffer(input.size()) {} + + void operator()(cl::sycl::handler &cgh) { + auto A = this->getInputBuffer(0).template get_access(cgh); + auto B = this->getOutputBuffer().template get_access(cgh); + + cgh.parallel_for>( + cl::sycl::range<1>{this->getOutputBufferSize()}, [=](cl::sycl::id<1> wiID) [[cl::intel_reqd_sub_group_size(8)]] { +#if defined(INLINE_ASM) && defined(__SYCL_DEVICE_ONLY__) + asm("mul (M1, 8) %0(0, 0)<1> %1(0, 0)<1;1,0> %2" + : "=rw"(B[wiID]) + : "rw"(A[wiID]), "rw"(IMM_ARGUMENT)); +#else + B[wiID] = A[wiID] * IMM_ARGUMENT; +#endif + }); + } +}; + +int main() { + std::vector input(DEFAULT_PROBLEM_SIZE); + for (int i = 0; i < DEFAULT_PROBLEM_SIZE; i++) + input[i] = (double)1 / std::pow(2, i); + + KernelFunctor<> f(input); + if (!launchInlineASMTest(f)) + return 0; + + auto &B = f.getOutputBufferData(); + for (int i = 0; i < DEFAULT_PROBLEM_SIZE; ++i) { + if (B[i] != input[i] * IMM_ARGUMENT) { + std::cerr << "At index: " << i << ". "; + std::cerr << B[i] << " != " << input[i] * IMM_ARGUMENT << "\n"; + return 1; + } + } + return 0; +} diff --git a/sycl/test/feature-tests/inline-asm/asm_float_neg.cpp b/sycl/test/feature-tests/inline-asm/asm_float_neg.cpp new file mode 100644 index 0000000000000..290b0898903ef --- /dev/null +++ b/sycl/test/feature-tests/inline-asm/asm_float_neg.cpp @@ -0,0 +1,57 @@ +// UNSUPPORTED: cuda +// REQUIRES: gpu,linux +// RUN: %clangxx -fsycl %s -DINLINE_ASM -o %t.out +// RUN: %t.out +// RUN: %clangxx -fsycl %s -o %t.ref.out +// RUN: %t.ref.out + +#include "include/asmhelper.h" +#include +#include +#include + +using dataType = cl::sycl::cl_float; + +template +struct KernelFunctor : WithInputBuffers, WithOutputBuffer { + KernelFunctor(const std::vector &input) : WithInputBuffers(input), WithOutputBuffer(input.size()) {} + + void operator()(cl::sycl::handler &cgh) { + auto A = this->getInputBuffer().template get_access(cgh); + auto B = this->getOutputBuffer().template get_access(cgh); + + cgh.parallel_for>( + cl::sycl::range<1>{this->getOutputBufferSize()}, [=](cl::sycl::id<1> wiID) [[cl::intel_reqd_sub_group_size(8)]] { +#if defined(INLINE_ASM) && defined(__SYCL_DEVICE_ONLY__) + asm("mov (M1, 8) %0(0, 0)<1> (-)%1(0, 0)<1;1,0>" + : "=rw"(B[wiID]) + : "rw"(A[wiID])); +#else + B[wiID] = -A[wiID]; +#endif + }); + } + + size_t problem_size = 0; +}; + +int main() { + std::vector input(DEFAULT_PROBLEM_SIZE); + for (int i = 0; i < DEFAULT_PROBLEM_SIZE; i++) + input[i] = 1.0 / i; + + KernelFunctor<> f(input); + if (!launchInlineASMTest(f)) + return 0; + + auto &R = f.getOutputBufferData(); + for (int i = 0; i < DEFAULT_PROBLEM_SIZE; ++i) { + if (R[i] != -input[i]) { + std::cerr << "At index: " << i << ". "; + std::cerr << R[i] << " != " << -input[i] << "\n"; + return 1; + } + } + + return 0; +} diff --git a/sycl/test/feature-tests/inline-asm/asm_imm_arg.cpp b/sycl/test/feature-tests/inline-asm/asm_imm_arg.cpp new file mode 100644 index 0000000000000..2dba04d1179bb --- /dev/null +++ b/sycl/test/feature-tests/inline-asm/asm_imm_arg.cpp @@ -0,0 +1,55 @@ +// UNSUPPORTED: cuda +// REQUIRES: gpu,linux +// RUN: %clangxx -fsycl %s -DINLINE_ASM -o %t.out +// RUN: %t.out +// RUN: %clangxx -fsycl %s -o %t.ref.out +// RUN: %t.ref.out + +#include "include/asmhelper.h" +#include +#include +#include + +constexpr int CONST_ARGUMENT = 0xabc; +using dataType = cl::sycl::cl_int; + +template +struct KernelFunctor : WithInputBuffers, WithOutputBuffer { + KernelFunctor(const std::vector &input) : WithInputBuffers(input), WithOutputBuffer(input.size()) {} + + void operator()(cl::sycl::handler &cgh) { + auto A = this->getInputBuffer(0).template get_access(cgh); + auto B = this->getOutputBuffer().template get_access(cgh); + + cgh.parallel_for>( + cl::sycl::range<1>{this->getOutputBufferSize()}, [=](cl::sycl::id<1> wiID) [[cl::intel_reqd_sub_group_size(8)]] { +#if defined(INLINE_ASM) && defined(__SYCL_DEVICE_ONLY__) + asm("add (M1, 8) %0(0, 0)<1> %1(0, 0)<1;1,0> %2" + : "=rw"(B[wiID]) + : "rw"(A[wiID]), "rw"(CONST_ARGUMENT)); +#else + B[wiID] = A[wiID] + CONST_ARGUMENT; +#endif + }); + } +}; + +int main() { + std::vector input(DEFAULT_PROBLEM_SIZE); + for (int i = 0; i < DEFAULT_PROBLEM_SIZE; i++) + input[i] = i; + + KernelFunctor<> f(input); + if (!launchInlineASMTest(f)) + return 0; + + auto &B = f.getOutputBufferData(); + for (int i = 0; i < DEFAULT_PROBLEM_SIZE; ++i) { + if (B[i] != input[i] + CONST_ARGUMENT) { + std::cerr << "At index: " << i << ". "; + std::cerr << B[i] << " != " << input[i] + CONST_ARGUMENT << "\n"; + return 1; + } + } + return 0; +} diff --git a/sycl/test/feature-tests/inline-asm/asm_mul.cpp b/sycl/test/feature-tests/inline-asm/asm_mul.cpp new file mode 100644 index 0000000000000..726abcf787f21 --- /dev/null +++ b/sycl/test/feature-tests/inline-asm/asm_mul.cpp @@ -0,0 +1,57 @@ +// UNSUPPORTED: cuda +// REQUIRES: gpu,linux +// RUN: %clangxx -fsycl %s -DINLINE_ASM -o %t.out +// RUN: %t.out +// RUN: %clangxx -fsycl %s -o %t.ref.out +// RUN: %t.ref.out + +#include "include/asmhelper.h" +#include +#include +#include + +using dataType = cl::sycl::cl_int; + +template +struct KernelFunctor : WithInputBuffers, WithOutputBuffer { + KernelFunctor(const std::vector &input1, const std::vector &input2) : WithInputBuffers(input1, input2), WithOutputBuffer(input1.size()) {} + void operator()(cl::sycl::handler &cgh) { + auto A = this->getInputBuffer(0).template get_access(cgh); + auto B = this->getInputBuffer(1).template get_access(cgh); + auto C = this->getOutputBuffer().template get_access(cgh); + + cgh.parallel_for>( + cl::sycl::range<1>{this->getOutputBufferSize()}, [=](cl::sycl::id<1> wiID) [[cl::intel_reqd_sub_group_size(8)]] { +#if defined(INLINE_ASM) && defined(__SYCL_DEVICE_ONLY__) + asm("mul (M1, 8) %0(0, 0)<1> %1(0, 0)<1;1,0> %2(0, 0)<1;1,0>" + : "=rw"(C[wiID]) + : "rw"(A[wiID]), "rw"(B[wiID])); +#else + C[wiID] = A[wiID] * B[wiID]; +#endif + }); + } +}; + +int main() { + std::vector inputA(DEFAULT_PROBLEM_SIZE), inputB(DEFAULT_PROBLEM_SIZE); + for (int i = 0; i < DEFAULT_PROBLEM_SIZE; i++) { + inputA[i] = i; + inputB[i] = DEFAULT_PROBLEM_SIZE - i; + } + + KernelFunctor<> f(inputA, inputB); + if (!launchInlineASMTest(f)) + return 0; + + auto &C = f.getOutputBufferData(); + for (int i = 0; i < DEFAULT_PROBLEM_SIZE; ++i) { + if (C[i] != inputA[i] * inputB[i]) { + std::cerr << "At index: " << i << ". "; + std::cerr << C[i] << " != " << inputA[i] * inputB[i] << "\n"; + return 1; + } + } + + return 0; +} diff --git a/sycl/test/feature-tests/inline-asm/asm_multiple_instructions.cpp b/sycl/test/feature-tests/inline-asm/asm_multiple_instructions.cpp new file mode 100644 index 0000000000000..e8cf02a5292e4 --- /dev/null +++ b/sycl/test/feature-tests/inline-asm/asm_multiple_instructions.cpp @@ -0,0 +1,59 @@ +// UNSUPPORTED: cuda +// REQUIRES: gpu,linux +// RUN: %clangxx -fsycl %s -DINLINE_ASM -o %t.out +// RUN: %t.out +// RUN: %clangxx -fsycl %s -o %t.ref.out +// RUN: %t.ref.out + +#include "include/asmhelper.h" +#include +#include +#include + +using dataType = cl::sycl::cl_int; + +template +struct KernelFunctor : WithInputBuffers, WithOutputBuffer { + KernelFunctor(const std::vector &input1, const std::vector &input2, const std::vector &input3) : WithInputBuffers(input1, input2, input3), WithOutputBuffer(input1.size()) {} + + void operator()(cl::sycl::handler &cgh) { + auto A = this->getInputBuffer(0).template get_access(cgh); + auto B = this->getInputBuffer(1).template get_access(cgh); + auto C = this->getInputBuffer(2).template get_access(cgh); + auto D = this->getOutputBuffer().template get_access(cgh); + + cgh.parallel_for>( + cl::sycl::range<1>{this->getOutputBufferSize()}, [=](cl::sycl::id<1> wiID) [[cl::intel_reqd_sub_group_size(8)]] { +#if defined(INLINE_ASM) && defined(__SYCL_DEVICE_ONLY__) + asm("{\n" + "add (M1, 8) %1(0, 0)<1> %1(0, 0)<1;1,0> %2(0, 0)<1;1,0>\n" + "add (M1, 8) %1(0, 0)<1> %1(0, 0)<1;1,0> %3(0, 0)<1;1,0>\n" + "mov (M1, 8) %0(0, 0)<1> %1(0, 0)<1;1,0>\n" + "}\n" + : "=rw"(D[wiID]), "+rw"(A[wiID]) + : "rw"(B[wiID]), "rw"(C[wiID])); +#else + A[wiID] += B[wiID]; + A[wiID] += C[wiID]; + D[wiID] = A[wiID]; +#endif + }); + } +}; + +int main() { + std::vector inputA(DEFAULT_PROBLEM_SIZE), inputB(DEFAULT_PROBLEM_SIZE), inputC(DEFAULT_PROBLEM_SIZE); + for (int i = 0; i < DEFAULT_PROBLEM_SIZE; i++) { + inputA[i] = inputB[i] = i; + inputC[i] = DEFAULT_PROBLEM_SIZE - 2 * i; // A[i] + B[i] + C[i] = LIST_SIZE + } + + KernelFunctor<> f(inputA, inputB, inputC); + if (!launchInlineASMTest(f)) + return 0; + + if (verify_all_the_same(f.getOutputBufferData(), (dataType)DEFAULT_PROBLEM_SIZE)) + return 0; + + return 1; +} diff --git a/sycl/test/feature-tests/inline-asm/asm_no_operands.cpp b/sycl/test/feature-tests/inline-asm/asm_no_operands.cpp new file mode 100644 index 0000000000000..3a3a919caa982 --- /dev/null +++ b/sycl/test/feature-tests/inline-asm/asm_no_operands.cpp @@ -0,0 +1,34 @@ +// UNSUPPORTED: cuda +// REQUIRES: gpu,linux +// RUN: %clangxx -fsycl %s -DINLINE_ASM -o %t.out +// RUN: %t.out +// RUN: %clangxx -fsycl %s -o %t.ref.out +// RUN: %t.ref.out + +#include "include/asmhelper.h" +#include +class no_operands_kernel; + +int main() { + // Creating SYCL queue + cl::sycl::queue Queue; + cl::sycl::device Device = Queue.get_device(); + + if (!isInlineASMSupported(Device) || !Device.has_extension("cl_intel_required_subgroup_size")) { + std::cout << "Skipping test\n"; + return 0; + } + // Size of index space for kernel + cl::sycl::range<1> NumOfWorkItems{16}; + + // Submitting command group(work) to queue + Queue.submit([&](cl::sycl::handler &cgh) { + // Executing kernel + cgh.parallel_for( + NumOfWorkItems, [=](cl::sycl::id<1> WIid) [[cl::intel_reqd_sub_group_size(8)]] { +#if defined(INLINE_ASM) && defined(__SYCL_DEVICE_ONLY__) + asm("barrier"); +#endif + }); + }); +} diff --git a/sycl/test/feature-tests/inline-asm/asm_no_output.cpp b/sycl/test/feature-tests/inline-asm/asm_no_output.cpp new file mode 100644 index 0000000000000..ff6c65d48b31d --- /dev/null +++ b/sycl/test/feature-tests/inline-asm/asm_no_output.cpp @@ -0,0 +1,47 @@ +// UNSUPPORTED: cuda +// REQUIRES: gpu,linux +// RUN: %clangxx -fsycl %s -DINLINE_ASM -o %t.out +// RUN: %t.out +// RUN: %clangxx -fsycl %s -o %t.ref.out +// RUN: %t.ref.out + +#include "include/asmhelper.h" +#include +#include +#include + +using dataType = cl::sycl::cl_int; + +template +struct KernelFunctor : WithOutputBuffer { + KernelFunctor(size_t problem_size) : WithOutputBuffer(problem_size) {} + + void operator()(cl::sycl::handler &cgh) { + auto C = this->getOutputBuffer().template get_access(cgh); + cgh.parallel_for>( + cl::sycl::range<1>{this->getOutputBufferSize()}, [=](cl::sycl::id<1> wiID) [[cl::intel_reqd_sub_group_size(8)]] { + volatile int local_var = 47; + local_var += C[0]; +#if defined(INLINE_ASM) && defined(__SYCL_DEVICE_ONLY__) + asm volatile("{\n" + ".decl temp v_type=G type=w num_elts=8 align=GRF\n" + "mov (M1,16) temp(0, 0)<1> %0(0,0)<1;1,0>\n" + "}\n" ::"rw"(local_var)); +#else + volatile int temp = 0; + temp = local_var; +#endif + }); + } +}; + +int main() { + KernelFunctor<> f(DEFAULT_PROBLEM_SIZE); + if (!launchInlineASMTest(f)) + return 0; + + if (verify_all_the_same(f.getOutputBufferData(), 0)) + return 0; + + return 1; +} diff --git a/sycl/test/feature-tests/inline-asm/asm_plus_mod.cpp b/sycl/test/feature-tests/inline-asm/asm_plus_mod.cpp new file mode 100644 index 0000000000000..f65cda777ef9f --- /dev/null +++ b/sycl/test/feature-tests/inline-asm/asm_plus_mod.cpp @@ -0,0 +1,58 @@ +// UNSUPPORTED: cuda +// REQUIRES: gpu,linux +// RUN: %clangxx -fsycl %s -DINLINE_ASM -o %t.out +// RUN: %t.out +// RUN: %clangxx -fsycl %s -o %t.ref.out +// RUN: %t.ref.out + +#include "include/asmhelper.h" +#include +#include +#include + +using dataType = cl::sycl::cl_int; + +template +struct KernelFunctor : WithInputBuffers, WithOutputBuffer { + KernelFunctor(const std::vector &input1, const std::vector &input2) : WithInputBuffers(input1), WithOutputBuffer(input2) {} + + void operator()(cl::sycl::handler &cgh) { + auto A = this->getInputBuffer(0).template get_access(cgh); + auto B = this->getOutputBuffer().template get_access(cgh); + + cgh.parallel_for>( + cl::sycl::range<1>{this->getOutputBufferSize()}, [=](cl::sycl::id<1> wiID) [[cl::intel_reqd_sub_group_size(16)]] { +#if defined(INLINE_ASM) && defined(__SYCL_DEVICE_ONLY__) + asm("add (M1, 16) %0(0, 0)<1> %0(0, 0)<1;1,0> %1(0, 0)<1;1,0>" + : "+rw"(B[wiID]) + : "rw"(A[wiID])); +#else + B[wiID] += A[wiID]; +#endif + }); + } +}; + +int main() { + std::vector inputA(DEFAULT_PROBLEM_SIZE), inputB(DEFAULT_PROBLEM_SIZE), R(DEFAULT_PROBLEM_SIZE); + for (int i = 0; i < DEFAULT_PROBLEM_SIZE; i++) { + inputA[i] = i; + inputB[i] = DEFAULT_PROBLEM_SIZE - i; + R[i] = inputA[i] + inputB[i]; + } + + KernelFunctor<> f(inputA, inputB); + if (!launchInlineASMTest(f)) + return 0; + + auto &B = f.getOutputBufferData(); + for (int i = 0; i < DEFAULT_PROBLEM_SIZE; ++i) { + if (B[i] != R[i]) { + std::cerr << "At index: " << i << ". "; + std::cerr << B[i] << " != " << R[i] << "\n"; + return 1; + } + } + + return 0; +} diff --git a/sycl/test/feature-tests/inline-asm/include/asmhelper.h b/sycl/test/feature-tests/inline-asm/include/asmhelper.h new file mode 100644 index 0000000000000..75585e1611331 --- /dev/null +++ b/sycl/test/feature-tests/inline-asm/include/asmhelper.h @@ -0,0 +1,128 @@ +#include + +#include +#include +#include + +constexpr const size_t DEFAULT_PROBLEM_SIZE = 16; + +template +struct WithOutputBuffer { + WithOutputBuffer(size_t size) { + _output_buffer_data.resize(size); + _output_buffer.reset(new cl::sycl::buffer(_output_buffer_data.data(), _output_buffer_data.size())); + } + + WithOutputBuffer(const std::vector &data) { + _output_buffer_data = data; + _output_buffer.reset(new cl::sycl::buffer(_output_buffer_data.data(), _output_buffer_data.size())); + } + + const std::vector &getOutputBufferData() { + // We cannoe access the data until the buffer is still alive + _output_buffer.reset(); + return _output_buffer_data; + } + + size_t getOutputBufferSize() const { + return _output_buffer_data.size(); + } + +protected: + cl::sycl::buffer &getOutputBuffer() { + return *_output_buffer; + } + + // Functor is being passed by-copy into cl::sycl::queue::submit and destroyed + // one more time in there. We need to make sure that buffer is only released + // once. + std::shared_ptr> _output_buffer = nullptr; + std::vector _output_buffer_data; +}; + +template +struct WithInputBuffers { + + template + WithInputBuffers(Args... inputs) { + static_assert(sizeof...(Args) == N, "All input buffers must be initialized"); + constructorHelper<0>(inputs...); + } + + cl::sycl::buffer &getInputBuffer(size_t i = 0) { + return *_input_buffers[i]; + } + +protected: + std::shared_ptr> _input_buffers[N] = {nullptr}; + std::vector _input_buffers_data[N]; + +private: + template + void constructorHelper(const std::vector &data, Args... rest) { + _input_buffers_data[Index] = data; + _input_buffers[Index].reset(new cl::sycl::buffer(_input_buffers_data[Index].data(), _input_buffers_data[Index].size())); + constructorHelper(rest...); + } + + template + void constructorHelper() { + // nothing to do, recursion stop + } +}; + +bool isInlineASMSupported(sycl::device Device) { + + sycl::string_class DriverVersion = Device.get_info(); + sycl::string_class DeviceVendorName = Device.get_info(); + // TODO: query for some extension/capability/whatever once interface is + // defined + if (DeviceVendorName.find("Intel") == sycl::string_class::npos) + return false; + if (DriverVersion.length() < 5) + return false; + if (DriverVersion[2] != '.') + return false; + if (std::stoi(DriverVersion.substr(0, 2), nullptr, 10) < 20 || std::stoi(DriverVersion.substr(3, 2), nullptr, 10) < 12) + return false; + return true; +} + +/// checks if device suppots inline asm feature and launches a test +/// +/// \returns false if test wasn't launched (i.e.was skipped) and true otherwise +template +bool launchInlineASMTest(F &f, bool requires_particular_sg_size = true) { + try { + cl::sycl::queue deviceQueue(cl::sycl::gpu_selector{}); + cl::sycl::device device = deviceQueue.get_device(); + +#if defined(INLINE_ASM) + if (!isInlineASMSupported(device)) { + std::cout << "Skipping test\n"; + return false; + } +#endif + + if (requires_particular_sg_size && !device.has_extension("cl_intel_required_subgroup_size")) { + std::cout << "Skipping test\n"; + return false; + } + + deviceQueue.submit(f).wait(); + } catch (cl::sycl::exception &e) { + std::cerr << "Caught exception: " << e.what() << std::endl; + } + return true; +} + +template +bool verify_all_the_same(const std::vector &input, T reference_value) { + for (int i = 0; i < input.size(); ++i) + if (input[i] != reference_value) { + std::cerr << "At index: " << i << " "; + std::cerr << input[i] << " != " << reference_value << "\n"; + return false; + } + return true; +} diff --git a/sycl/test/feature-tests/inline-asm/letter_example.cpp b/sycl/test/feature-tests/inline-asm/letter_example.cpp new file mode 100644 index 0000000000000..22bf26648e78b --- /dev/null +++ b/sycl/test/feature-tests/inline-asm/letter_example.cpp @@ -0,0 +1,66 @@ +// UNSUPPORTED: cuda +// REQUIRES: gpu,linux +// RUN: %clangxx -fsycl %s -DINLINE_ASM -o %t.out +// RUN: %t.out +// RUN: %clangxx -fsycl %s -o %t.ref.out +// RUN: %t.ref.out + +#include "include/asmhelper.h" +#include +#include + +constexpr size_t problem_size = 16; + +class kernel_name; + +int main() { + cl::sycl::queue q; + cl::sycl::device Device = q.get_device(); + + if (!isInlineASMSupported(Device) || !Device.has_extension("cl_intel_required_subgroup_size")) { + std::cout << "Skipping test\n"; + return 0; + } + auto ctx = q.get_context(); + int *a = (int *)malloc_shared(sizeof(int) * problem_size, q.get_device(), ctx); + for (int i = 0; i < problem_size; i++) { + a[i] = i; + } + q.submit([&](cl::sycl::handler &cgh) { + cgh.parallel_for( + cl::sycl::range<1>(problem_size), [=](cl::sycl::id<1> idx) + [[cl::intel_reqd_sub_group_size(16)]] { +#if defined(INLINE_ASM) && defined(__SYCL_DEVICE_ONLY__) + int i = idx[0]; + asm volatile("{\n.decl V52 v_type=G type=d num_elts=16 align=GRF\n" + "svm_gather.4.1 (M1, 16) %0.0 V52.0\n" + "add(M1, 16) V52(0, 0)<1> V52(0, 0)<1; 1, 0> 0x1:w\n" + "svm_scatter.4.1 (M1, 16) %0.0 V52.0\n}" + : + : "rw"(&a[i])); +#else + a[idx[0]]++; +#endif + }); + }).wait(); + + bool currect = true; + for (int i = 0; i < problem_size; i++) { + if (a[i] != (i + 1)) { + currect = false; + std::cerr << "error in a[" << i << "]=" + << a[i] << "!=" << (i + 1) << std::endl; + break; + } + } + + if (!currect) { + std::cerr << "Error" << std::endl; + cl::sycl::free(a, ctx); + return 1; + } + + std::cerr << "Pass" << std::endl; + cl::sycl::free(a, ctx); + return 0; +} diff --git a/sycl/test/feature-tests/inline-asm/malloc_shared_32.cpp b/sycl/test/feature-tests/inline-asm/malloc_shared_32.cpp new file mode 100644 index 0000000000000..8f058851c2681 --- /dev/null +++ b/sycl/test/feature-tests/inline-asm/malloc_shared_32.cpp @@ -0,0 +1,92 @@ +// UNSUPPORTED: cuda +// REQUIRES: gpu,linux +// RUN: %clangxx -fsycl %s -DINLINE_ASM -o %t.out +// RUN: %t.out +// RUN: %clangxx -fsycl %s -o %t.ref.out +// RUN: %t.ref.out + +#include "include/asmhelper.h" +#include +#include + +constexpr size_t problem_size = 32; + +class kernel_name; + +int main() { + cl::sycl::queue q; + + cl::sycl::device Device = q.get_device(); + + if (!isInlineASMSupported(Device) || !Device.has_extension("cl_intel_required_subgroup_size")) { + std::cout << "Skipping test\n"; + return 0; + } + + auto ctx = q.get_context(); + int *a = (int *)malloc_shared(sizeof(int) * problem_size, q.get_device(), ctx); + int *b = (int *)malloc_shared(sizeof(int) * problem_size, q.get_device(), ctx); + int *c = (int *)malloc_shared(sizeof(int) * problem_size, q.get_device(), ctx); + for (int i = 0; i < problem_size; i++) { + b[i] = -10; + a[i] = i; + c[i] = i; + } + + q.submit([&](cl::sycl::handler &cgh) { + cgh.parallel_for( + cl::sycl::range<1>(problem_size), + [=](cl::sycl::id<1> idx) + [[cl::intel_reqd_sub_group_size(32)]] { + int i = idx[0]; +#if defined(INLINE_ASM) && defined(__SYCL_DEVICE_ONLY__) + asm volatile(R"a( + { + .decl V52 v_type=G type=d num_elts=16 align=GRF + .decl V53 v_type=G type=d num_elts=16 align=GRF + .decl V54 v_type=G type=d num_elts=16 align=GRF + .decl V55 v_type=G type=d num_elts=16 align=GRF + .decl V56 v_type=G type=d num_elts=16 align=GRF + .decl V57 v_type=G type=d num_elts=16 align=GRF + svm_gather.4.1 (M1, 16) %2.0 V54.0 + svm_gather.4.1 (M1, 16) %3.0 V55.0 + svm_gather.4.1 (M1, 16) %4.0 V56.0 + svm_gather.4.1 (M1, 16) %5.0 V57.0 + mul (M1, 16) V52(0,0)<1> V54(0,0)<1;1,0> V56(0,0)<1;1,0> + mul (M1, 16) V53(0,0)<1> V55(0,0)<1;1,0> V57(0,0)<1;1,0> + svm_scatter.4.1 (M1, 16) %0.0 V52.0 + svm_scatter.4.1 (M1, 16) %1.0 V53.0 + } + )a" ::"rw"(&b[i]), + "rw"(&b[i] + 16), "rw"(&a[i]), "rw"(&a[i] + 16), "rw"(&c[i]), + "rw"(&c[i] + 16)); +#else + b[i] = a[i] * c[i]; +#endif + }); + }).wait(); + + bool currect = true; + for (int i = 0; i < problem_size; i++) { + if (b[i] != a[i] * b[i]) { + currect = false; + std::cerr << "error in a[" << i << "]=" + << b[i] << "!=" << a[i] * b[i] << std::endl; + break; + } + } + + if (!currect) { + std::cerr << "Error" << std::endl; + cl::sycl::free(a, ctx); + cl::sycl::free(b, ctx); + cl::sycl::free(c, ctx); + return 1; + } + + std::cerr << "Pass" << std::endl; + cl::sycl::free(a, ctx); + cl::sycl::free(b, ctx); + cl::sycl::free(c, ctx); + return 0; +} diff --git a/sycl/test/feature-tests/inline-asm/malloc_shared_in_out_dif.cpp b/sycl/test/feature-tests/inline-asm/malloc_shared_in_out_dif.cpp new file mode 100644 index 0000000000000..a6994bd379199 --- /dev/null +++ b/sycl/test/feature-tests/inline-asm/malloc_shared_in_out_dif.cpp @@ -0,0 +1,69 @@ +// UNSUPPORTED: cuda +// REQUIRES: gpu,linux +// RUN: %clangxx -fsycl %s -DINLINE_ASM -o %t.out +// RUN: %t.out + +#include "include/asmhelper.h" +#include +#include + +constexpr size_t problem_size = 100; + +class kernel_name; + +int main() { + cl::sycl::queue q; + + cl::sycl::device Device = q.get_device(); + + if (!isInlineASMSupported(Device) || !Device.has_extension("cl_intel_required_subgroup_size")) { + std::cout << "Skipping test\n"; + return 0; + } + + auto ctx = q.get_context(); + int *a = (int *)malloc_shared(sizeof(int) * problem_size, q.get_device(), ctx); + int *b = (int *)malloc_shared(sizeof(int) * problem_size, q.get_device(), ctx); + for (int i = 0; i < problem_size; i++) { + b[i] = -1; + a[i] = i; + } + + q.submit([&](cl::sycl::handler &cgh) { + cgh.parallel_for( + cl::sycl::range<1>(problem_size), [=](cl::sycl::id<1> idx) [[cl::intel_reqd_sub_group_size(16)]] { + int i = idx[0]; + volatile int tmp = a[i]; + tmp += 1; +#if defined(INLINE_ASM) && defined(__SYCL_DEVICE_ONLY__) + asm volatile(" add (M1, 16) %0(0,0)<1> %0(0,0)<1;1,0> %1(0,0)<1;1,0>" + : "+rw"(b[i]) + : "rw"(tmp)); +#else + b[i] += tmp; +#endif + }); + }).wait(); + + bool currect = true; + for (int i = 0; i < problem_size; i++) { + if (b[i] != a[i]) { + currect = false; + std::cerr << "error in a[" << i << "]=" + << b[i] << "!=" << a[i] << std::endl; + break; + } + } + + if (!currect) { + std::cerr << "Error" << std::endl; + cl::sycl::free(a, ctx); + cl::sycl::free(b, ctx); + return 1; + } + + std::cerr << "Pass" << std::endl; + cl::sycl::free(a, ctx); + cl::sycl::free(b, ctx); + return 0; +} diff --git a/sycl/test/feature-tests/inline-asm/malloc_shared_no_input.cpp b/sycl/test/feature-tests/inline-asm/malloc_shared_no_input.cpp new file mode 100644 index 0000000000000..22cd47abd64ab --- /dev/null +++ b/sycl/test/feature-tests/inline-asm/malloc_shared_no_input.cpp @@ -0,0 +1,61 @@ +// UNSUPPORTED: cuda +// REQUIRES: gpu,linux +// RUN: %clangxx -fsycl %s -DINLINE_ASM -o %t.out +// RUN: %t.out +// RUN: %clangxx -fsycl %s -o %t.ref.out +// RUN: %t.ref.out + +#include "include/asmhelper.h" +#include +#include + +constexpr size_t problem_size = 16; + +class kernel_name; + +int main() { + cl::sycl::queue q; + cl::sycl::device Device = q.get_device(); + + if (!isInlineASMSupported(Device) || !Device.has_extension("cl_intel_required_subgroup_size")) { + std::cout << "Skipping test\n"; + return 0; + } + auto ctx = q.get_context(); + int *a = (int *)malloc_shared(sizeof(int) * problem_size, q.get_device(), ctx); + for (int i = 0; i < problem_size; i++) + a[i] = i; + + q.submit([&](cl::sycl::handler &cgh) { + cgh.parallel_for( + cl::sycl::range<1>(problem_size), [=](cl::sycl::id<1> idx) [[cl::intel_reqd_sub_group_size(16)]] { + int i = idx[0]; +#if defined(INLINE_ASM) && defined(__SYCL_DEVICE_ONLY__) + asm volatile("mov (M1, 16) %0(0,0)<1> 0x7:d" + : "=rw"(a[i])); +#else + a[i] = 7; +#endif + }); + }).wait(); + + bool currect = true; + for (int i = 0; i < problem_size; i++) { + if (a[i] != 7) { + currect = false; + std::cerr << "error in a[" << i << "]=" + << a[i] << "!=" << 7 << std::endl; + break; + } + } + + if (!currect) { + std::cerr << "Error" << std::endl; + cl::sycl::free(a, ctx); + return 1; + } + + std::cerr << "Pass" << std::endl; + cl::sycl::free(a, ctx); + return 0; +} diff --git a/sycl/test/lit.cfg.py b/sycl/test/lit.cfg.py index e6ae419047b9d..6285c44d27632 100644 --- a/sycl/test/lit.cfg.py +++ b/sycl/test/lit.cfg.py @@ -26,7 +26,8 @@ # suffixes: A list of file extensions to treat as test files. config.suffixes = ['.c', '.cpp'] #add .spv. Currently not clear what to do with those -config.excludes = ['Inputs'] +# feature tests are considered not so lightweight, so, they are excluded by default +config.excludes = ['Inputs', 'feature-tests'] # test_source_root: The root path where tests are located. config.test_source_root = os.path.dirname(__file__)