From 475cb79bb86ec5accd80333d99f65411ce92c822 Mon Sep 17 00:00:00 2001 From: Vladimir Lazarev Date: Thu, 21 May 2020 11:39:19 +0300 Subject: [PATCH] [SYCL] dpc++ tests --- SYCL/Basic/CMakeLists.txt | 16 + SYCL/Basic/README.md | 90 +++ SYCL/Basic/aot/Inputs/aot.cpp | 76 ++ SYCL/Basic/aot/accelerator.cpp | 13 + SYCL/Basic/aot/cpu.cpp | 12 + SYCL/Basic/aot/gpu.cpp | 14 + SYCL/Basic/aot/spec_const_aot.cpp | 66 ++ SYCL/Basic/aot/with-llvm-bc.cpp | 17 + SYCL/Basic/bit_cast/bit_cast.cpp | 84 +++ SYCL/Basic/built-ins/nan.cpp | 72 ++ SYCL/Basic/built-ins/printf.cpp | 134 ++++ SYCL/Basic/built-ins/scalar_common.cpp | 34 + SYCL/Basic/built-ins/scalar_geometric.cpp | 131 ++++ SYCL/Basic/built-ins/scalar_integer.cpp | 571 ++++++++++++++ SYCL/Basic/built-ins/scalar_math.cpp | 401 ++++++++++ SYCL/Basic/built-ins/scalar_math_2.cpp | 244 ++++++ SYCL/Basic/built-ins/scalar_relational.cpp | 422 +++++++++++ SYCL/Basic/built-ins/vector_common.cpp | 57 ++ SYCL/Basic/built-ins/vector_geometric.cpp | 171 +++++ SYCL/Basic/built-ins/vector_integer.cpp | 701 ++++++++++++++++++ SYCL/Basic/built-ins/vector_math.cpp | 210 ++++++ SYCL/Basic/built-ins/vector_relational.cpp | 608 +++++++++++++++ SYCL/Basic/config/allowlist.cpp | 90 +++ SYCL/Basic/config/config.cpp | 26 + .../Inputs/split-per-source-second-file.cpp | 21 + .../Inputs/split-per-source.h | 7 + .../device-code-split/aot-accelerator.cpp | 5 + SYCL/Basic/device-code-split/aot-cpu.cpp | 4 + SYCL/Basic/device-code-split/aot-gpu.cpp | 11 + .../device-code-split/split-per-kernel.cpp | 68 ++ .../split-per-source-main.cpp | 54 ++ SYCL/Basic/devicelib/assert-windows.cpp | 75 ++ SYCL/Basic/devicelib/assert.cpp | 215 ++++++ .../devicelib/c99_complex_math_fp64_test.cpp | 256 +++++++ .../Basic/devicelib/c99_complex_math_test.cpp | 258 +++++++ SYCL/Basic/devicelib/cmath_fp64_test.cpp | 118 +++ SYCL/Basic/devicelib/cmath_test.cpp | 115 +++ SYCL/Basic/devicelib/math_fp64_test.cpp | 115 +++ .../devicelib/math_fp64_windows_test.cpp | 132 ++++ 
SYCL/Basic/devicelib/math_override_test.cpp | 49 ++ SYCL/Basic/devicelib/math_test.cpp | 113 +++ SYCL/Basic/devicelib/math_utils.hpp | 29 + SYCL/Basic/devicelib/math_windows_test.cpp | 121 +++ .../devicelib/std_complex_math_fp64_test.cpp | 206 +++++ .../Basic/devicelib/std_complex_math_test.cpp | 204 +++++ .../Basic/enqueue_barrier/enqueue_barrier.cpp | 78 ++ .../feature-tests/inline-asm/asm_16_empty.cpp | 40 + .../inline-asm/asm_16_matrix_mult.cpp | 44 ++ .../inline-asm/asm_16_no_input_int.cpp | 44 ++ .../inline-asm/asm_16_no_opts.cpp | 45 ++ .../feature-tests/inline-asm/asm_8_empty.cpp | 40 + .../inline-asm/asm_8_no_input_int.cpp | 44 ++ .../inline-asm/asm_arbitrary_ops_order.cpp | 59 ++ .../inline-asm/asm_decl_in_scope.cpp | 67 ++ .../inline-asm/asm_float_add.cpp | 59 ++ .../inline-asm/asm_float_imm_arg.cpp | 56 ++ .../inline-asm/asm_float_neg.cpp | 57 ++ .../feature-tests/inline-asm/asm_imm_arg.cpp | 55 ++ .../feature-tests/inline-asm/asm_mul.cpp | 57 ++ .../inline-asm/asm_multiple_instructions.cpp | 59 ++ .../inline-asm/asm_no_operands.cpp | 34 + .../inline-asm/asm_no_output.cpp | 47 ++ .../feature-tests/inline-asm/asm_plus_mod.cpp | 58 ++ .../inline-asm/include/asmhelper.h | 128 ++++ .../inline-asm/letter_example.cpp | 66 ++ .../inline-asm/malloc_shared_32.cpp | 92 +++ .../inline-asm/malloc_shared_in_out_dif.cpp | 69 ++ .../inline-asm/malloc_shared_no_input.cpp | 61 ++ SYCL/Basic/fpga_tests/Inputs/fpga_device.cpp | 24 + SYCL/Basic/fpga_tests/Inputs/fpga_host.cpp | 23 + SYCL/Basic/fpga_tests/fpga_aocx.cpp | 24 + SYCL/Basic/fpga_tests/fpga_aocx_win.cpp | 24 + SYCL/Basic/fpga_tests/fpga_io_pipes.cpp | 134 ++++ SYCL/Basic/fpga_tests/fpga_pipes.cpp | 326 ++++++++ .../Basic/fpga_tests/fpga_pipes_legacy_ns.cpp | 63 ++ SYCL/Basic/fpga_tests/fpga_queue.cpp | 168 +++++ .../global_fpga_device_selector.cpp | 18 + SYCL/Basic/fpga_tests/io_pipe_def.h | 12 + SYCL/Basic/fpga_tests/pipes_info.cpp | 36 + SYCL/Basic/functor/kernel_functor.cpp | 180 +++++ 
SYCL/Basic/group-algorithm/all_of.cpp | 77 ++ SYCL/Basic/group-algorithm/any_of.cpp | 79 ++ SYCL/Basic/group-algorithm/broadcast.cpp | 65 ++ SYCL/Basic/group-algorithm/exclusive_scan.cpp | 147 ++++ SYCL/Basic/group-algorithm/inclusive_scan.cpp | 147 ++++ SYCL/Basic/group-algorithm/leader.cpp | 50 ++ SYCL/Basic/group-algorithm/none_of.cpp | 77 ++ SYCL/Basic/group-algorithm/reduce.cpp | 85 +++ SYCL/Basic/helpers.hpp | 76 ++ .../host-task-dependency.cpp | 200 +++++ .../host-task-two-queues.cpp | 82 ++ SYCL/Basic/lit.cfg.py | 210 ++++++ SYCL/Basic/lit.site.cfg.py.in | 29 + SYCL/Basic/spec_const/spec_const_hw.cpp | 121 +++ SYCL/Basic/spec_const/spec_const_redefine.cpp | 112 +++ .../struct_param/non-standard-layout.cpp | 45 ++ .../struct_param/struct_kernel_param.cpp | 137 ++++ SYCL/Basic/sub_group/attributes.cpp | 125 ++++ SYCL/Basic/sub_group/barrier.cpp | 90 +++ SYCL/Basic/sub_group/broadcast.cpp | 87 +++ SYCL/Basic/sub_group/common.cpp | 93 +++ SYCL/Basic/sub_group/common_ocl.cpp | 111 +++ SYCL/Basic/sub_group/helper.hpp | 157 ++++ SYCL/Basic/sub_group/info.cpp | 93 +++ SYCL/Basic/sub_group/load_store.cpp | 205 +++++ SYCL/Basic/sub_group/reduce.cpp | 125 ++++ SYCL/Basic/sub_group/scan.cpp | 160 ++++ SYCL/Basic/sub_group/sg.cl | 25 + SYCL/Basic/sub_group/shuffle.cpp | 265 +++++++ SYCL/Basic/sub_group/vote.cpp | 89 +++ SYCL/Basic/usm/allocator_vector.cpp | 130 ++++ SYCL/Basic/usm/allocator_vector_fail.cpp | 48 ++ SYCL/Basic/usm/allocatorll.cpp | 88 +++ SYCL/Basic/usm/badmalloc.cpp | 78 ++ SYCL/Basic/usm/depends_on.cpp | 86 +++ SYCL/Basic/usm/dmemll.cpp | 93 +++ SYCL/Basic/usm/dmemllaligned.cpp | 90 +++ SYCL/Basic/usm/findplatforms.hpp | 45 ++ SYCL/Basic/usm/hmemll.cpp | 86 +++ SYCL/Basic/usm/hmemllaligned.cpp | 82 ++ SYCL/Basic/usm/math.cpp | 134 ++++ SYCL/Basic/usm/memadvise.cpp | 87 +++ SYCL/Basic/usm/memcpy.cpp | 63 ++ SYCL/Basic/usm/memset.cpp | 59 ++ SYCL/Basic/usm/mixed.cpp | 79 ++ SYCL/Basic/usm/mixed2.cpp | 79 ++ SYCL/Basic/usm/mixed2template.cpp | 92 +++ 
SYCL/Basic/usm/mixed_queue.cpp | 108 +++ SYCL/Basic/usm/multictxt.cpp | 66 ++ SYCL/Basic/usm/pfor_flatten.cpp | 71 ++ SYCL/Basic/usm/pointer_query.cpp | 123 +++ SYCL/Basic/usm/prefetch.cpp | 69 ++ SYCL/Basic/usm/queue_wait.cpp | 48 ++ SYCL/Basic/usm/smemll.cpp | 86 +++ SYCL/Basic/usm/smemllaligned.cpp | 83 +++ SYCL/CMakeLists.txt | 5 + SYCL/README.md | 7 + cmake/caches/clang_fsycl.cmake | 4 + cmake/caches/clang_fsycl_cuda.cmake | 4 + cmake/caches/dpcpp.cmake | 5 + 140 files changed, 14289 insertions(+) create mode 100644 SYCL/Basic/CMakeLists.txt create mode 100644 SYCL/Basic/README.md create mode 100644 SYCL/Basic/aot/Inputs/aot.cpp create mode 100644 SYCL/Basic/aot/accelerator.cpp create mode 100644 SYCL/Basic/aot/cpu.cpp create mode 100644 SYCL/Basic/aot/gpu.cpp create mode 100644 SYCL/Basic/aot/spec_const_aot.cpp create mode 100644 SYCL/Basic/aot/with-llvm-bc.cpp create mode 100644 SYCL/Basic/bit_cast/bit_cast.cpp create mode 100644 SYCL/Basic/built-ins/nan.cpp create mode 100644 SYCL/Basic/built-ins/printf.cpp create mode 100644 SYCL/Basic/built-ins/scalar_common.cpp create mode 100644 SYCL/Basic/built-ins/scalar_geometric.cpp create mode 100644 SYCL/Basic/built-ins/scalar_integer.cpp create mode 100644 SYCL/Basic/built-ins/scalar_math.cpp create mode 100644 SYCL/Basic/built-ins/scalar_math_2.cpp create mode 100644 SYCL/Basic/built-ins/scalar_relational.cpp create mode 100644 SYCL/Basic/built-ins/vector_common.cpp create mode 100644 SYCL/Basic/built-ins/vector_geometric.cpp create mode 100644 SYCL/Basic/built-ins/vector_integer.cpp create mode 100644 SYCL/Basic/built-ins/vector_math.cpp create mode 100644 SYCL/Basic/built-ins/vector_relational.cpp create mode 100644 SYCL/Basic/config/allowlist.cpp create mode 100644 SYCL/Basic/config/config.cpp create mode 100644 SYCL/Basic/device-code-split/Inputs/split-per-source-second-file.cpp create mode 100644 SYCL/Basic/device-code-split/Inputs/split-per-source.h create mode 100644 
SYCL/Basic/device-code-split/aot-accelerator.cpp create mode 100644 SYCL/Basic/device-code-split/aot-cpu.cpp create mode 100644 SYCL/Basic/device-code-split/aot-gpu.cpp create mode 100644 SYCL/Basic/device-code-split/split-per-kernel.cpp create mode 100644 SYCL/Basic/device-code-split/split-per-source-main.cpp create mode 100644 SYCL/Basic/devicelib/assert-windows.cpp create mode 100644 SYCL/Basic/devicelib/assert.cpp create mode 100644 SYCL/Basic/devicelib/c99_complex_math_fp64_test.cpp create mode 100644 SYCL/Basic/devicelib/c99_complex_math_test.cpp create mode 100644 SYCL/Basic/devicelib/cmath_fp64_test.cpp create mode 100644 SYCL/Basic/devicelib/cmath_test.cpp create mode 100644 SYCL/Basic/devicelib/math_fp64_test.cpp create mode 100644 SYCL/Basic/devicelib/math_fp64_windows_test.cpp create mode 100644 SYCL/Basic/devicelib/math_override_test.cpp create mode 100644 SYCL/Basic/devicelib/math_test.cpp create mode 100644 SYCL/Basic/devicelib/math_utils.hpp create mode 100644 SYCL/Basic/devicelib/math_windows_test.cpp create mode 100644 SYCL/Basic/devicelib/std_complex_math_fp64_test.cpp create mode 100644 SYCL/Basic/devicelib/std_complex_math_test.cpp create mode 100644 SYCL/Basic/enqueue_barrier/enqueue_barrier.cpp create mode 100644 SYCL/Basic/feature-tests/inline-asm/asm_16_empty.cpp create mode 100644 SYCL/Basic/feature-tests/inline-asm/asm_16_matrix_mult.cpp create mode 100644 SYCL/Basic/feature-tests/inline-asm/asm_16_no_input_int.cpp create mode 100644 SYCL/Basic/feature-tests/inline-asm/asm_16_no_opts.cpp create mode 100644 SYCL/Basic/feature-tests/inline-asm/asm_8_empty.cpp create mode 100644 SYCL/Basic/feature-tests/inline-asm/asm_8_no_input_int.cpp create mode 100644 SYCL/Basic/feature-tests/inline-asm/asm_arbitrary_ops_order.cpp create mode 100644 SYCL/Basic/feature-tests/inline-asm/asm_decl_in_scope.cpp create mode 100644 SYCL/Basic/feature-tests/inline-asm/asm_float_add.cpp create mode 100644 SYCL/Basic/feature-tests/inline-asm/asm_float_imm_arg.cpp 
create mode 100644 SYCL/Basic/feature-tests/inline-asm/asm_float_neg.cpp create mode 100644 SYCL/Basic/feature-tests/inline-asm/asm_imm_arg.cpp create mode 100644 SYCL/Basic/feature-tests/inline-asm/asm_mul.cpp create mode 100644 SYCL/Basic/feature-tests/inline-asm/asm_multiple_instructions.cpp create mode 100644 SYCL/Basic/feature-tests/inline-asm/asm_no_operands.cpp create mode 100644 SYCL/Basic/feature-tests/inline-asm/asm_no_output.cpp create mode 100644 SYCL/Basic/feature-tests/inline-asm/asm_plus_mod.cpp create mode 100644 SYCL/Basic/feature-tests/inline-asm/include/asmhelper.h create mode 100644 SYCL/Basic/feature-tests/inline-asm/letter_example.cpp create mode 100644 SYCL/Basic/feature-tests/inline-asm/malloc_shared_32.cpp create mode 100644 SYCL/Basic/feature-tests/inline-asm/malloc_shared_in_out_dif.cpp create mode 100644 SYCL/Basic/feature-tests/inline-asm/malloc_shared_no_input.cpp create mode 100644 SYCL/Basic/fpga_tests/Inputs/fpga_device.cpp create mode 100644 SYCL/Basic/fpga_tests/Inputs/fpga_host.cpp create mode 100644 SYCL/Basic/fpga_tests/fpga_aocx.cpp create mode 100644 SYCL/Basic/fpga_tests/fpga_aocx_win.cpp create mode 100644 SYCL/Basic/fpga_tests/fpga_io_pipes.cpp create mode 100644 SYCL/Basic/fpga_tests/fpga_pipes.cpp create mode 100644 SYCL/Basic/fpga_tests/fpga_pipes_legacy_ns.cpp create mode 100644 SYCL/Basic/fpga_tests/fpga_queue.cpp create mode 100644 SYCL/Basic/fpga_tests/global_fpga_device_selector.cpp create mode 100644 SYCL/Basic/fpga_tests/io_pipe_def.h create mode 100644 SYCL/Basic/fpga_tests/pipes_info.cpp create mode 100644 SYCL/Basic/functor/kernel_functor.cpp create mode 100644 SYCL/Basic/group-algorithm/all_of.cpp create mode 100644 SYCL/Basic/group-algorithm/any_of.cpp create mode 100644 SYCL/Basic/group-algorithm/broadcast.cpp create mode 100644 SYCL/Basic/group-algorithm/exclusive_scan.cpp create mode 100644 SYCL/Basic/group-algorithm/inclusive_scan.cpp create mode 100644 SYCL/Basic/group-algorithm/leader.cpp create mode 
100644 SYCL/Basic/group-algorithm/none_of.cpp create mode 100644 SYCL/Basic/group-algorithm/reduce.cpp create mode 100644 SYCL/Basic/helpers.hpp create mode 100644 SYCL/Basic/host-interop-task/host-task-dependency.cpp create mode 100644 SYCL/Basic/host-interop-task/host-task-two-queues.cpp create mode 100644 SYCL/Basic/lit.cfg.py create mode 100644 SYCL/Basic/lit.site.cfg.py.in create mode 100644 SYCL/Basic/spec_const/spec_const_hw.cpp create mode 100644 SYCL/Basic/spec_const/spec_const_redefine.cpp create mode 100644 SYCL/Basic/struct_param/non-standard-layout.cpp create mode 100644 SYCL/Basic/struct_param/struct_kernel_param.cpp create mode 100644 SYCL/Basic/sub_group/attributes.cpp create mode 100644 SYCL/Basic/sub_group/barrier.cpp create mode 100644 SYCL/Basic/sub_group/broadcast.cpp create mode 100644 SYCL/Basic/sub_group/common.cpp create mode 100644 SYCL/Basic/sub_group/common_ocl.cpp create mode 100644 SYCL/Basic/sub_group/helper.hpp create mode 100644 SYCL/Basic/sub_group/info.cpp create mode 100644 SYCL/Basic/sub_group/load_store.cpp create mode 100644 SYCL/Basic/sub_group/reduce.cpp create mode 100644 SYCL/Basic/sub_group/scan.cpp create mode 100644 SYCL/Basic/sub_group/sg.cl create mode 100644 SYCL/Basic/sub_group/shuffle.cpp create mode 100644 SYCL/Basic/sub_group/vote.cpp create mode 100644 SYCL/Basic/usm/allocator_vector.cpp create mode 100644 SYCL/Basic/usm/allocator_vector_fail.cpp create mode 100644 SYCL/Basic/usm/allocatorll.cpp create mode 100644 SYCL/Basic/usm/badmalloc.cpp create mode 100644 SYCL/Basic/usm/depends_on.cpp create mode 100644 SYCL/Basic/usm/dmemll.cpp create mode 100644 SYCL/Basic/usm/dmemllaligned.cpp create mode 100644 SYCL/Basic/usm/findplatforms.hpp create mode 100644 SYCL/Basic/usm/hmemll.cpp create mode 100644 SYCL/Basic/usm/hmemllaligned.cpp create mode 100644 SYCL/Basic/usm/math.cpp create mode 100644 SYCL/Basic/usm/memadvise.cpp create mode 100644 SYCL/Basic/usm/memcpy.cpp create mode 100644 SYCL/Basic/usm/memset.cpp 
create mode 100644 SYCL/Basic/usm/mixed.cpp create mode 100644 SYCL/Basic/usm/mixed2.cpp create mode 100644 SYCL/Basic/usm/mixed2template.cpp create mode 100644 SYCL/Basic/usm/mixed_queue.cpp create mode 100644 SYCL/Basic/usm/multictxt.cpp create mode 100644 SYCL/Basic/usm/pfor_flatten.cpp create mode 100644 SYCL/Basic/usm/pointer_query.cpp create mode 100644 SYCL/Basic/usm/prefetch.cpp create mode 100644 SYCL/Basic/usm/queue_wait.cpp create mode 100644 SYCL/Basic/usm/smemll.cpp create mode 100644 SYCL/Basic/usm/smemllaligned.cpp create mode 100644 SYCL/CMakeLists.txt create mode 100644 SYCL/README.md create mode 100644 cmake/caches/clang_fsycl.cmake create mode 100644 cmake/caches/clang_fsycl_cuda.cmake create mode 100644 cmake/caches/dpcpp.cmake diff --git a/SYCL/Basic/CMakeLists.txt b/SYCL/Basic/CMakeLists.txt new file mode 100644 index 0000000000..be67381e43 --- /dev/null +++ b/SYCL/Basic/CMakeLists.txt @@ -0,0 +1,16 @@ +set(LLVM_TOOLS_DIR "${LLVM_BINARY_DIR}/bin/") + +#get_target_property(SYCL_BINARY_DIR sycl-toolchain BINARY_DIR) + +set(SYCL_INCLUDE "${SYCL_INCLUDE_BUILD_DIR}") +set(SYCL_TOOLS_SRC_DIR "${PROJECT_SOURCE_DIR}/tools/") +set(LLVM_BUILD_BINARY_DIRS "${LLVM_BINARY_DIR}/bin/") +set(LLVM_BUILD_LIBRARY_DIRS "${LLVM_BINARY_DIR}/lib/") + +set(RT_TEST_ARGS ${RT_TEST_ARGS} "-v") +set(DEPLOY_RT_TEST_ARGS ${DEPLOY_RT_TEST_ARGS} "-v -D SYCL_TOOLS_DIR=${CMAKE_INSTALL_PREFIX}/bin -D SYCL_LIBS_DIR=${CMAKE_INSTALL_PREFIX}/lib${LLVM_LIBDIR_SUFFIX} -D SYCL_INCLUDE=${SYCL_INCLUDE_DEPLOY_DIR}") + +find_package(Threads REQUIRED) +set(SYCL_THREADS_LIB ${CMAKE_THREAD_LIBS_INIT}) + +configure_file("${CMAKE_CURRENT_SOURCE_DIR}/lit.site.cfg.py.in" "${CMAKE_CURRENT_BINARY_DIR}/lit.site.cfg") diff --git a/SYCL/Basic/README.md b/SYCL/Basic/README.md new file mode 100644 index 0000000000..135973162c --- /dev/null +++ b/SYCL/Basic/README.md @@ -0,0 +1,90 @@ +# Overview +SYCL related test based on SYCL-LIT. 
These tests support +execution on all supported devices and SYCL backends. + +# Table of contents + * [Execution](#execution) + * [Main parameters](#main-parameters) + * [LIT features which can be used to configure test execution](#lit-features-which-can-be-used-to-configure-test-execution) + +# Execution +``` +git clone # e.g. https://github.com/vladimirlaz/llvm-test-suite +cd llvm-test-suite +mkdir build +cd build +# configuring test execution (selecting compiler version, target BE and target device) +cmake -G Ninja -DTEST_SUITE_SUBDIRS=SYCL -DTEST_SUITE_LIT= -DSYCL_BE= -DSYCL_TARGET_DEVICES= -C .. +# Building full list of tests in subdir +ninja check +# or +llvm-lit . +# Get list of available tests +llvm-lit . --show-tests +# Run specific test +llvm-lit +``` + +Notes: + - it is assumed that the LIT framework, FileCheck and other LIT dependencies are available in the same directory as llvm-lit. + - the compiler variant as well as compile/link options are defined in cached cmake configurations: + - [dpcpp.cmake](../../cmake/caches/dpcpp.cmake) + - [clang_fsycl.cmake](../../cmake/caches/clang_fsycl.cmake) + - [clang_fsycl_cuda.cmake](../../cmake/caches/clang_fsycl_cuda.cmake) + - the compiler is taken from the environment. + +# Main parameters +It is possible to change the test scope by specifying a test directory/file as the first +argument to the lit-runner.py script. + +***SYCL_TARGET_DEVICES*** should point to the directory containing DPCPP compiler + +***SYCL_TARGET_DEVICES*** defines comma separated target device types (default value is + cpu,gpu,acc,host). Supported target_devices values are: + - **cpu** - CPU device available in OpenCL backend only; + - **gpu** - GPU device available in OpenCL, Level Zero and CUDA backends; + - **acc** - FPGA emulator device available in OpenCL backend only; + - **host** - SYCL Host device available with all backends. + +***SYCL_BE*** defines the SYCL backend to be used for testing (default is PI_OPENCL). 
+Supported sycl_be values: + - PI_OPENCL - for OpenCL backend; + - PI_CUDA - for CUDA backend; + - PI_LEVEL0 - Level Zero backend. + +It is assumed that all dependencies (OpenCL runtimes, +CUDA SDK, AOT compilers, etc) are available in the system. + +See examples below for configuring tests targeting different devices: + - SYCL host: +``` +cmake -G Ninja -DTEST_SUITE_COLLECT_CODE_SIZE=OFF -DTEST_SUITE_COLLECT_COMPILE_TIME=OFF -DTEST_SUITE_SUBDIRS=SYCL -DTEST_SUITE_LIT= -DSYCL_BE=PI_OPENCL -DSYCL_TARGET_DEVICES="host" -C../cmake/caches/clang_fsycl.cmake .. +``` + - OpenCL GPU +``` +cmake -G Ninja -DTEST_SUITE_COLLECT_CODE_SIZE=OFF -DTEST_SUITE_COLLECT_COMPILE_TIME=OFF -DTEST_SUITE_SUBDIRS=SYCL -DTEST_SUITE_LIT= -DSYCL_BE=PI_OPENCL -DSYCL_TARGET_DEVICES="gpu" -C../cmake/caches/clang_fsycl.cmake .. +``` + - OpenCL CPU +``` +cmake -G Ninja -DTEST_SUITE_COLLECT_CODE_SIZE=OFF -DTEST_SUITE_COLLECT_COMPILE_TIME=OFF -DTEST_SUITE_SUBDIRS=SYCL -DTEST_SUITE_LIT= -DSYCL_BE=PI_OPENCL -DSYCL_TARGET_DEVICES="cpu" -C../cmake/caches/clang_fsycl.cmake .. +``` + - OpenCL FPGA emulator +``` +cmake -G Ninja -DTEST_SUITE_COLLECT_CODE_SIZE=OFF -DTEST_SUITE_COLLECT_COMPILE_TIME=OFF -DTEST_SUITE_SUBDIRS=SYCL -DTEST_SUITE_LIT= -DSYCL_BE=PI_OPENCL -DSYCL_TARGET_DEVICES="acc" -C../cmake/caches/clang_fsycl.cmake .. +``` + - CUDA GPU +``` +cmake -G Ninja -DTEST_SUITE_COLLECT_CODE_SIZE=OFF -DTEST_SUITE_COLLECT_COMPILE_TIME=OFF -DTEST_SUITE_SUBDIRS=SYCL -DTEST_SUITE_LIT= -DSYCL_BE=PI_CUDA -DSYCL_TARGET_DEVICES="gpu" -C../cmake/caches/clang_fsycl_cuda.cmake .. +``` + - Level Zero GPU +``` +cmake -G Ninja -DTEST_SUITE_COLLECT_CODE_SIZE=OFF -DTEST_SUITE_COLLECT_COMPILE_TIME=OFF -DTEST_SUITE_SUBDIRS=SYCL -DTEST_SUITE_LIT= -DSYCL_BE=PI_LEVEL0 -DSYCL_TARGET_DEVICES="gpu" -C../cmake/caches/clang_fsycl.cmake .. 
+``` + +# LIT features which can be used to configure test execution: + - **windows**, **linux** - host OS; + - **cpu**, **gpu**, **host**, **acc** - target devices; + - **cuda**, **opencl**, **level0** - target backend; + - **sycl-ls** - sycl-ls tool is available; + - **dump_ir**: is set to true if compiler supports dumiping IR. Can be set by setting DUMP_IR_SUPPORTED in cmake. Default is false. + diff --git a/SYCL/Basic/aot/Inputs/aot.cpp b/SYCL/Basic/aot/Inputs/aot.cpp new file mode 100644 index 0000000000..46f768dfa5 --- /dev/null +++ b/SYCL/Basic/aot/Inputs/aot.cpp @@ -0,0 +1,76 @@ +//==----- aot.cpp - Simple vector addition (AOT compilation example) --------==// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===------------------------------------------------------------------------===// + +#include + +#include +#include + +constexpr cl::sycl::access::mode sycl_read = cl::sycl::access::mode::read; +constexpr cl::sycl::access::mode sycl_write = cl::sycl::access::mode::write; + +template +class SimpleVadd; + +template +void simple_vadd(const std::array &VA, const std::array &VB, + std::array &VC) { + cl::sycl::queue deviceQueue([](cl::sycl::exception_list ExceptionList) { + for (cl::sycl::exception_ptr_class ExceptionPtr : ExceptionList) { + try { + std::rethrow_exception(ExceptionPtr); + } catch (cl::sycl::exception &E) { + std::cerr << E.what(); + } catch (...) { + std::cerr << "Unknown async exception was caught." 
<< std::endl; + } + } + }); + + cl::sycl::range<1> numOfItems{N}; + cl::sycl::buffer bufferA(VA.data(), numOfItems); + cl::sycl::buffer bufferB(VB.data(), numOfItems); + cl::sycl::buffer bufferC(VC.data(), numOfItems); + + deviceQueue.submit([&](cl::sycl::handler &cgh) { + auto accessorA = bufferA.template get_access(cgh); + auto accessorB = bufferB.template get_access(cgh); + auto accessorC = bufferC.template get_access(cgh); + + cgh.parallel_for>(numOfItems, + [=](cl::sycl::id<1> wiID) { + accessorC[wiID] = accessorA[wiID] + accessorB[wiID]; + }); + }); + + deviceQueue.wait_and_throw(); +} + +int main() { + const size_t array_size = 4; + std::array A = {{1, 2, 3, 4}}, + B = {{1, 2, 3, 4}}, C; + std::array D = {{1.f, 2.f, 3.f, 4.f}}, + E = {{1.f, 2.f, 3.f, 4.f}}, F; + simple_vadd(A, B, C); + simple_vadd(D, E, F); + for (unsigned int i = 0; i < array_size; i++) { + if (C[i] != A[i] + B[i]) { + std::cout << "The results are incorrect (element " << i << " is " << C[i] + << "!\n"; + return 1; + } + if (F[i] != D[i] + E[i]) { + std::cout << "The results are incorrect (element " << i << " is " << F[i] + << "!\n"; + return 1; + } + } + std::cout << "The results are correct!\n"; + return 0; +} diff --git a/SYCL/Basic/aot/accelerator.cpp b/SYCL/Basic/aot/accelerator.cpp new file mode 100644 index 0000000000..8ebb75ac36 --- /dev/null +++ b/SYCL/Basic/aot/accelerator.cpp @@ -0,0 +1,13 @@ +//==----- accelerator.cpp - AOT compilation for fpga devices using aoc ------==// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===------------------------------------------------------------------------===// + +// REQUIRES: aoc, accelerator + +// RUN: %clangxx -fsycl -fsycl-targets=spir64_fpga-unknown-unknown-sycldevice %S/Inputs/aot.cpp -o %t.out +// RUN: %ACC_RUN_PLACEHOLDER %t.out +// XFAIL: * diff --git a/SYCL/Basic/aot/cpu.cpp b/SYCL/Basic/aot/cpu.cpp new file mode 100644 index 0000000000..42ded976ff --- /dev/null +++ b/SYCL/Basic/aot/cpu.cpp @@ -0,0 +1,12 @@ +//==----- cpu.cpp - AOT compilation for cpu devices using opencl-aot --------==// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===------------------------------------------------------------------------===// + +// REQUIRES: opencl-aot, cpu + +// RUN: %clangxx -fsycl -fsycl-targets=spir64_x86_64-unknown-unknown-sycldevice %S/Inputs/aot.cpp -o %t.out +// RUN: %CPU_RUN_PLACEHOLDER %t.out diff --git a/SYCL/Basic/aot/gpu.cpp b/SYCL/Basic/aot/gpu.cpp new file mode 100644 index 0000000000..482a14eade --- /dev/null +++ b/SYCL/Basic/aot/gpu.cpp @@ -0,0 +1,14 @@ +//==----- gpu.cpp - AOT compilation for gen devices using GEN compiler ------==// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===------------------------------------------------------------------------===// + +// REQUIRES: ocloc, gpu +// UNSUPPORTED: cuda +// CUDA is not compatible with SPIR. 
+ +// RUN: %clangxx -fsycl -fsycl-targets=spir64_gen-unknown-unknown-sycldevice -Xsycl-target-backend=spir64_gen-unknown-unknown-sycldevice "-device skl" %S/Inputs/aot.cpp -o %t.out +// RUN: %GPU_RUN_PLACEHOLDER %t.out diff --git a/SYCL/Basic/aot/spec_const_aot.cpp b/SYCL/Basic/aot/spec_const_aot.cpp new file mode 100644 index 0000000000..99b451fe6d --- /dev/null +++ b/SYCL/Basic/aot/spec_const_aot.cpp @@ -0,0 +1,66 @@ +// REQUIRES: opencl-aot, cpu +// +// RUN: %clangxx -fsycl -fsycl-targets=spir64_x86_64-unknown-unknown-sycldevice %s -o %t.out +// RUN: %CPU_RUN_PLACEHOLDER %t.out +// +// The test checks that the specialization constant feature works with ahead +// of time compilation. + +#include + +#include +#include + +class MyInt32Const; + +using namespace sycl; + +class Kernel; + +int main(int argc, char **argv) { + cl::sycl::queue q(default_selector{}, [](exception_list l) { + for (auto ep : l) { + try { + std::rethrow_exception(ep); + } catch (cl::sycl::exception &e0) { + std::cout << e0.what(); + } catch (std::exception &e1) { + std::cout << e1.what(); + } catch (...) { + std::cout << "*** catch (...)\n"; + } + } + }); + + std::cout << "Running on " << q.get_device().get_info() << "\n"; + cl::sycl::program prog(q.get_context()); + + cl::sycl::experimental::spec_constant i32 = + prog.set_spec_constant(10); + + prog.build_with_kernel_type(); + + std::vector vec(1); + { + cl::sycl::buffer buf(vec.data(), vec.size()); + + q.submit([&](cl::sycl::handler &cgh) { + auto acc = buf.get_access(cgh); + cgh.single_task( + prog.get_kernel(), + [=]() { + acc[0] = i32.get(); + }); + }); + } + bool passed = true; + int val = vec[0]; + int gold = 0; // with AOT, spec constant is set to C++ default for the type + + if (val != gold) { + std::cout << "*** ERROR: " << val << " != " << gold << "(gold)\n"; + passed = false; + } + std::cout << (passed ? "passed\n" : "FAILED\n"); + return passed ? 
0 : 1; +} diff --git a/SYCL/Basic/aot/with-llvm-bc.cpp b/SYCL/Basic/aot/with-llvm-bc.cpp new file mode 100644 index 0000000000..79af5d5836 --- /dev/null +++ b/SYCL/Basic/aot/with-llvm-bc.cpp @@ -0,0 +1,17 @@ +//==----- with-llvm-bc.cpp - SYCL kernel with LLVM IR bitcode as binary ----==// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +// REQUIRES: cpu, dump_ir + +// RUN: %clangxx -fsycl -fsycl-targets=spir64-unknown-unknown-sycldevice -c %S/Inputs/aot.cpp -o %t.o +// RUN: %clangxx -fsycl -fsycl-link-targets=spir64-unknown-unknown-sycldevice %t.o -o %t.spv +// RUN: llvm-spirv -r %t.spv -o %t.bc +// RUN: %clangxx -fsycl -fsycl-add-targets=spir64:%t.bc %t.o -o %t.out +// +// Only CPU supports LLVM IR bitcode as a binary +// RUN: %CPU_RUN_PLACEHOLDER %t.out diff --git a/SYCL/Basic/bit_cast/bit_cast.cpp b/SYCL/Basic/bit_cast/bit_cast.cpp new file mode 100644 index 0000000000..e1fe40b793 --- /dev/null +++ b/SYCL/Basic/bit_cast/bit_cast.cpp @@ -0,0 +1,84 @@ +// RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple %s -o %t.out +// RUN: %HOST_RUN_PLACEHOLDER %t.out +// RUN: %CPU_RUN_PLACEHOLDER %t.out +// RUN: %GPU_RUN_PLACEHOLDER %t.out +// RUN: %ACC_RUN_PLACEHOLDER %t.out + +#include + +#include +#include +#include + +constexpr cl::sycl::access::mode sycl_write = cl::sycl::access::mode::write; + +template +class BitCastKernel; + +template +To doBitCast(const From &ValueToConvert) { + std::vector Vec(1); + { + sycl::buffer Buf(Vec.data(), 1); + sycl::queue Queue; + Queue.submit([&](sycl::handler &cgh) { + auto acc = Buf.template get_access(cgh); + cgh.single_task>([=]() { + // TODO: change to sycl::bit_cast in the future + acc[0] = sycl::detail::bit_cast(ValueToConvert); + }); + }); + } + return Vec[0]; +} + +template +int 
test(const From &Value) { + auto ValueConvertedTwoTimes = doBitCast(doBitCast(Value)); + bool isOriginalValueEqualsToConvertedTwoTimes = false; + if (std::is_integral::value) { + isOriginalValueEqualsToConvertedTwoTimes = Value == ValueConvertedTwoTimes; + } else if ((std::is_floating_point::value) || std::is_same::value) { + static const float Epsilon = 0.0000001f; + isOriginalValueEqualsToConvertedTwoTimes = fabs(Value - ValueConvertedTwoTimes) < Epsilon; + } else { + std::cerr << "Type " << typeid(From).name() << " neither integral nor floating point nor cl::sycl::half\n"; + return 1; + } + if (!isOriginalValueEqualsToConvertedTwoTimes) { + std::cerr << "FAIL: Original value which is " << Value << " != value converted two times which is " << ValueConvertedTwoTimes << "\n"; + return 1; + } + std::cout << "PASS\n"; + return 0; +} + +int main() { + int ReturnCode = 0; + + std::cout << "cl::sycl::half to unsigned short ...\n"; + ReturnCode += test(cl::sycl::half(1.0f)); + + std::cout << "unsigned short to cl::sycl::half ...\n"; + ReturnCode += test(static_cast(16384)); + + std::cout << "cl::sycl::half to short ...\n"; + ReturnCode += test(cl::sycl::half(1.0f)); + + std::cout << "short to cl::sycl::half ...\n"; + ReturnCode += test(static_cast(16384)); + + std::cout << "int to float ...\n"; + ReturnCode += test(static_cast(2)); + + std::cout << "float to int ...\n"; + ReturnCode += test(static_cast(-2.4f)); + + std::cout << "unsigned int to float ...\n"; + ReturnCode += test(static_cast(6)); + + std::cout << "float to unsigned int ...\n"; + ReturnCode += test(static_cast(-2.4f)); + + return ReturnCode; +} diff --git a/SYCL/Basic/built-ins/nan.cpp b/SYCL/Basic/built-ins/nan.cpp new file mode 100644 index 0000000000..5c0b2c3233 --- /dev/null +++ b/SYCL/Basic/built-ins/nan.cpp @@ -0,0 +1,72 @@ +// RUN: %clangxx -fsycl %s -o %t.out +// RUN: %clangxx -fsycl -D HALF_IS_SUPPORTED %s -o %t_gpu.out +// RUN: %HOST_RUN_PLACEHOLDER %t.out +// RUN: %CPU_RUN_PLACEHOLDER %t.out +// 
RUN: %GPU_RUN_PLACEHOLDER %t_gpu.out +// RUN: %ACC_RUN_PLACEHOLDER %t.out +// XFAIL: cuda +#include + +#include + +namespace s = cl::sycl; +using namespace std; + +template void test_nan_call() { + static_assert(is_same::value == Expected, ""); +} + +template struct test; + +template void check_nan(s::queue &Queue) { + R Data{0}; + s::vec VData{0}; + { + s::buffer Buf(&Data, s::range<1>(1)); + s::buffer, 1> VBuf(&VData, s::range<1>(1)); + Queue.submit([&](s::handler &Cgh) { + auto Acc = Buf.template get_access(Cgh); + auto VAcc = VBuf.template get_access(Cgh); + Cgh.single_task>([=]() { + Acc[0] = s::nan(T{0}); + VAcc[0] = s::nan(s::vec{0}); + }); + }); + Queue.wait_and_throw(); + } + assert(s::isnan(Data)); + assert(s::all(s::isnan(VData))); +} + +int main() { + test_nan_call(); + test_nan_call(); + test_nan_call(); + test_nan_call(); + test_nan_call(); + test_nan_call(); + test_nan_call(); + test_nan_call(); + + s::queue Queue([](cl::sycl::exception_list ExceptionList) { + for (cl::sycl::exception_ptr_class ExceptionPtr : ExceptionList) { + try { + std::rethrow_exception(ExceptionPtr); + } catch (cl::sycl::exception &E) { + std::cerr << E.what() << std::endl; + } catch (...) { + std::cerr << "Unknown async exception was caught." << std::endl; + } + } + }); +#ifdef HALF_IS_SUPPORTED + if (Queue.get_device().has_extension("cl_khr_fp16")) + check_nan(Queue); +#endif + check_nan(Queue); + if (Queue.get_device().has_extension("cl_khr_fp64")) { + check_nan(Queue); + check_nan(Queue); + } + return 0; +} diff --git a/SYCL/Basic/built-ins/printf.cpp b/SYCL/Basic/built-ins/printf.cpp new file mode 100644 index 0000000000..88d4e36b02 --- /dev/null +++ b/SYCL/Basic/built-ins/printf.cpp @@ -0,0 +1,134 @@ +// UNSUPPORTED: cuda +// CUDA does not support printf. 
+// +// RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple %s -o %t.out +// RUN: %HOST_RUN_PLACEHOLDER %t.out | FileCheck %s +// RUN: %CPU_RUN_PLACEHOLDER %t.out %CPU_CHECK_PLACEHOLDER +// RUN: %GPU_RUN_PLACEHOLDER %t.out %GPU_CHECK_PLACEHOLDER +// RUN: %ACC_RUN_PLACEHOLDER %t.out %ACC_CHECK_PLACEHOLDER +// XFAIL: cpu, accelerator +#include + +#include +#include + +using namespace cl::sycl; + +// According to OpenCL C spec, the format string must be in constant address +// space +#ifdef __SYCL_DEVICE_ONLY__ +#define CONSTANT __attribute__((opencl_constant)) +#else +#define CONSTANT +#endif + +// This is one of the possible ways to define a format string in a correct +// address space +static const CONSTANT char format_hello_world[] = "Hello, World!\n"; + +// Static isn't really needed if you define it in global scope +const CONSTANT char format_int[] = "%d\n"; + +static const CONSTANT char format_vec[] = "%d,%d,%d,%d\n"; + +const CONSTANT char format_hello_world_2[] = "%lu: Hello, World!\n"; + +int main() { + { + default_selector Selector; + queue Queue(Selector); + + Queue.submit([&](handler &CGH) { + CGH.single_task([=]() { + // String + intel::experimental::printf(format_hello_world); + // Due to a bug in Intel CPU Runtime for OpenCL on Windows, information + // printed using such format strings (without %-specifiers) might + // appear in different order if output is redirected to a file or + // another app + // FIXME: strictly check output order once the bug is fixed + // CHECK: {{(Hello, World!)?}} + + // Integral types + intel::experimental::printf(format_int, (int32_t)123); + intel::experimental::printf(format_int, (int32_t)-123); + // CHECK: 123 + // CHECK-NEXT: -123 + + // Floating point types + { + // You can declare format string in non-global scope, but in this case + // static keyword is required + static const CONSTANT char format[] = "%f\n"; + intel::experimental::printf(format, 33.4f); + intel::experimental::printf(format, -33.4f); + } + // 
CHECK-NEXT: 33.4 + // CHECK-NEXT: -33.4 + + // Vectors + cl::sycl::vec v4{5, 6, 7, 8}; +#ifdef __SYCL_DEVICE_ONLY__ + // On device side, vectors can be printed via native OpenCL types: + using ocl_int4 = cl::sycl::vec::vector_t; + { + static const CONSTANT char format[] = "%v4d\n"; + intel::experimental::printf(format, (ocl_int4)v4); + } + + // However, you are still able to print them by-element: + { + intel::experimental::printf(format_vec, (int32_t)v4.w(), + (int32_t)v4.z(), (int32_t)v4.y(), + (int32_t)v4.x()); + } +#else + // On host side you always have to print them by-element: + intel::experimental::printf(format_vec, (int32_t)v4.x(), + (int32_t)v4.y(), (int32_t)v4.z(), + (int32_t)v4.w()); + intel::experimental::printf(format_vec, (int32_t)v4.w(), + (int32_t)v4.z(), (int32_t)v4.y(), + (int32_t)v4.x()); +#endif // __SYCL_DEVICE_ONLY__ + // CHECK-NEXT: 5,6,7,8 + // CHECK-NEXT: 8,7,6,5 + + // Pointers + int a = 5; + int *Ptr = &a; + // According to OpenCL spec, argument should be a void pointer + { + static const CONSTANT char format[] = "%p\n"; + intel::experimental::printf(format, (void *)Ptr); + } + // CHECK-NEXT: {{(0x)?[0-9a-fA-F]+$}} + }); + }); + Queue.wait(); + + // printf in parallel_for + Queue.submit([&](handler &CGH) { + CGH.parallel_for(range<1>(10), [=](id<1> i) { + // cast to uint64_t to be sure that we pass 64-bit unsigned value + intel::experimental::printf(format_hello_world_2, (uint64_t)i.get(0)); + }); + }); + Queue.wait(); + // CHECK-NEXT: {{[0-9]+}}: Hello, World! + // CHECK-NEXT: {{[0-9]+}}: Hello, World! + // CHECK-NEXT: {{[0-9]+}}: Hello, World! + // CHECK-NEXT: {{[0-9]+}}: Hello, World! + // CHECK-NEXT: {{[0-9]+}}: Hello, World! + // CHECK-NEXT: {{[0-9]+}}: Hello, World! + // CHECK-NEXT: {{[0-9]+}}: Hello, World! + // CHECK-NEXT: {{[0-9]+}}: Hello, World! + // CHECK-NEXT: {{[0-9]+}}: Hello, World! + // CHECK-NEXT: {{[0-9]+}}: Hello, World! 
+ } + +// FIXME: strictly check output order once the bug mentioned above is fixed +// CHECK: {{(Hello, World!)?}} + + return 0; +} diff --git a/SYCL/Basic/built-ins/scalar_common.cpp b/SYCL/Basic/built-ins/scalar_common.cpp new file mode 100644 index 0000000000..89abf11c1a --- /dev/null +++ b/SYCL/Basic/built-ins/scalar_common.cpp @@ -0,0 +1,34 @@ +// RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple %s -o %t.out +// RUN: %HOST_RUN_PLACEHOLDER %t.out +// RUN: %CPU_RUN_PLACEHOLDER %t.out +// RUN: %GPU_RUN_PLACEHOLDER %t.out +// RUN: %ACC_RUN_PLACEHOLDER %t.out + +// TODO: ptxas fatal : Unresolved extern function '_Z23__spirv_ocl_fmax_commonff' +// XFAIL: cuda + +#include + +#include + +namespace s = cl::sycl; + +int main() { + // max + { + s::cl_float r{ 0 }; + { + s::buffer BufR(&r, s::range<1>(1)); + s::queue myQueue; + myQueue.submit([&](s::handler &cgh) { + auto AccR = BufR.get_access(cgh); + cgh.single_task([=]() { + AccR[0] = s::max(s::cl_float{ 0.5f }, s::cl_float{ 2.3f }); + }); + }); + } + assert(r == 2.3f); + } + + return 0; +} diff --git a/SYCL/Basic/built-ins/scalar_geometric.cpp b/SYCL/Basic/built-ins/scalar_geometric.cpp new file mode 100644 index 0000000000..c63dcbbfc6 --- /dev/null +++ b/SYCL/Basic/built-ins/scalar_geometric.cpp @@ -0,0 +1,131 @@ +// RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple %s -o %t.out +// RUN: %HOST_RUN_PLACEHOLDER %t.out +// RUN: %CPU_RUN_PLACEHOLDER %t.out +// RUN: %GPU_RUN_PLACEHOLDER %t.out +// RUN: %ACC_RUN_PLACEHOLDER %t.out + +// TODO: ptxas fatal : Unresolved extern function '_Z12__spirv_FMulff' +// XFAIL: cuda + +#include + +#include + +namespace s = cl::sycl; + +int main() { + // dot + { + s::cl_float r{ 0 }; + { + s::buffer BufR(&r, s::range<1>(1)); + s::queue myQueue; + myQueue.submit([&](s::handler &cgh) { + auto AccR = BufR.get_access(cgh); + cgh.single_task([=]() { + AccR[0] = s::dot(s::cl_float{ 0.5 }, s::cl_float{ 1.6 }); + }); + }); + } + assert(r == 0.8f); + } + + // distance + { + s::cl_float r{ 0 }; + { 
+ s::buffer BufR(&r, s::range<1>(1)); + s::queue myQueue; + myQueue.submit([&](s::handler &cgh) { + auto AccR = BufR.get_access(cgh); + cgh.single_task([=]() { + AccR[0] = s::distance(s::cl_float{ 1.f }, s::cl_float{ 3.f }); + }); + }); + } + assert(r == 2.f); + } + + // length + { + s::cl_float r{ 0 }; + { + s::buffer BufR(&r, s::range<1>(1)); + s::queue myQueue; + myQueue.submit([&](s::handler &cgh) { + auto AccR = BufR.get_access(cgh); + cgh.single_task([=]() { + AccR[0] = s::length(s::cl_float{ 1.f }); + }); + }); + } + assert(r == 1.f); + } + + // normalize + { + s::cl_float r{ 0 }; + { + s::buffer BufR(&r, s::range<1>(1)); + s::queue myQueue; + myQueue.submit([&](s::handler &cgh) { + auto AccR = BufR.get_access(cgh); + cgh.single_task([=]() { + AccR[0] = s::normalize(s::cl_float{ 2.f }); + }); + }); + } + assert(r == 1.f); + } + + // fast_distance + { + s::cl_float r{ 0 }; + { + s::buffer BufR(&r, s::range<1>(1)); + s::queue myQueue; + myQueue.submit([&](s::handler &cgh) { + auto AccR = BufR.get_access(cgh); + cgh.single_task([=]() { + AccR[0] = s::fast_distance(s::cl_float{ 1.f }, s::cl_float{ 3.f }); + }); + }); + } + assert(r == 2.f); + } + + // fast_length + { + s::cl_float r{ 0 }; + { + s::buffer BufR(&r, s::range<1>(1)); + s::queue myQueue; + myQueue.submit([&](s::handler &cgh) { + auto AccR = BufR.get_access(cgh); + cgh.single_task([=]() { + AccR[0] = s::fast_length(s::cl_float{ 2.f }); + }); + }); + } + assert(r == 2.f); + } + + // fast_normalize + { + s::cl_float r{ 0 }; + { + s::buffer BufR(&r, s::range<1>(1)); + s::queue myQueue; + myQueue.submit([&](s::handler &cgh) { + auto AccR = BufR.get_access(cgh); + cgh.single_task([=]() { + AccR[0] = s::fast_normalize(s::cl_float{ 2.f }); + }); + }); + } + + assert(r == 1.f); + } + + return 0; +} diff --git a/SYCL/Basic/built-ins/scalar_integer.cpp b/SYCL/Basic/built-ins/scalar_integer.cpp new file mode 100644 index 0000000000..6a53654fb4 --- /dev/null +++ b/SYCL/Basic/built-ins/scalar_integer.cpp @@ -0,0 
+1,571 @@ +// RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple %s -o %t.out +// RUN: %HOST_RUN_PLACEHOLDER %t.out +// RUN: %CPU_RUN_PLACEHOLDER %t.out +// RUN: %GPU_RUN_PLACEHOLDER %t.out +// RUN: %ACC_RUN_PLACEHOLDER %t.out + +// TODO: ptxas fatal : Unresolved extern function '_Z17__spirv_ocl_s_maxii' +// XFAIL: cuda + +#include + +#include +#include + +namespace s = cl::sycl; + +int main() { + // max + { + s::cl_int r{ 0 }; + { + s::buffer BufR(&r, s::range<1>(1)); + s::queue myQueue; + myQueue.submit([&](s::handler &cgh) { + auto AccR = BufR.get_access(cgh); + cgh.single_task([=]() { + AccR[0] = s::max(s::cl_int{ 5 }, s::cl_int{ 2 }); + }); + }); + } + assert(r == 5); + } + + // max + { + s::cl_uint r{ 0 }; + { + s::buffer BufR(&r, s::range<1>(1)); + s::queue myQueue; + myQueue.submit([&](s::handler &cgh) { + auto AccR = BufR.get_access(cgh); + cgh.single_task([=]() { + AccR[0] = s::max(s::cl_uint{ 5 }, s::cl_uint{ 2 }); + }); + }); + } + assert(r == 5); + } + + // min + { + s::cl_int r{ 0 }; + { + s::buffer BufR(&r, s::range<1>(1)); + s::queue myQueue; + myQueue.submit([&](s::handler &cgh) { + auto AccR = BufR.get_access(cgh); + cgh.single_task([=]() { + AccR[0] = s::min(s::cl_int{ 5 }, s::cl_int{ 2 }); + }); + }); + } + assert(r == 2); + } + + // min (longlong) + { + s::longlong r{ 0 }; + { + s::buffer BufR(&r, s::range<1>(1)); + s::queue myQueue; + myQueue.submit([&](s::handler &cgh) { + auto AccR = BufR.get_access(cgh); + cgh.single_task([=]() { + AccR[0] = s::min(s::longlong{ 5 }, s::longlong{ 2 }); + }); + }); + } + assert(r == 2); + } + + // min + { + s::cl_uint r{ 0 }; + { + s::buffer BufR(&r, s::range<1>(1)); + s::queue myQueue; + myQueue.submit([&](s::handler &cgh) { + auto AccR = BufR.get_access(cgh); + cgh.single_task([=]() { + AccR[0] = s::min(s::cl_uint{ 5 }, s::cl_uint{ 2 }); + }); + }); + } + assert(r == 2); + } + + // min (ulonglong) + { + s::ulonglong r{ 0 }; + { + s::buffer BufR(&r, s::range<1>(1)); + s::queue myQueue; + 
myQueue.submit([&](s::handler &cgh) { + auto AccR = BufR.get_access(cgh); + cgh.single_task([=]() { + AccR[0] = s::min(s::ulonglong{ 5 }, s::ulonglong{ 2 }); + }); + }); + } + assert(r == 2); + } + + // abs + { + s::cl_uint r{ 0 }; + { + s::buffer BufR(&r, s::range<1>(1)); + s::queue myQueue; + myQueue.submit([&](s::handler &cgh) { + auto AccR = BufR.get_access(cgh); + cgh.single_task([=]() { + AccR[0] = s::abs(s::cl_int{ -5 }); + }); + }); + } + assert(r == 5); + } + + // abs_diff + { + s::cl_uint r{ 0 }; + { + s::buffer BufR(&r, s::range<1>(1)); + s::queue myQueue; + myQueue.submit([&](s::handler &cgh) { + auto AccR = BufR.get_access(cgh); + cgh.single_task([=]() { + AccR[0] = s::abs_diff(s::cl_int{ -5 }, s::cl_int{ -1 }); + }); + }); + } + assert(r == 4); + } + + // abs_diff(uchar) + { + s::cl_uchar r{ 0 }; + { + s::buffer BufR(&r, s::range<1>(1)); + s::queue myQueue; + myQueue.submit([&](s::handler &cgh) { + auto AccR = BufR.get_access(cgh); + cgh.single_task([=]() { + AccR[0] = s::abs_diff(s::uchar{ 3 }, s::uchar{ 250 }); + }); + }); + } + assert(r == 247); + } + + // add_sat + { + s::cl_int r{ 0 }; + { + s::buffer BufR(&r, s::range<1>(1)); + s::queue myQueue; + myQueue.submit([&](s::handler &cgh) { + auto AccR = BufR.get_access(cgh); + cgh.single_task([=]() { + AccR[0] = s::add_sat(s::cl_int{ 0x7FFFFFFF }, s::cl_int{ 100 }); + }); + }); + } + assert(r == 0x7FFFFFFF); + } + + // hadd + { + s::cl_int r{ 0 }; + { + s::buffer BufR(&r, s::range<1>(1)); + s::queue myQueue; + myQueue.submit([&](s::handler &cgh) { + auto AccR = BufR.get_access(cgh); + cgh.single_task([=]() { + AccR[0] = s::hadd(s::cl_int{ 0x0000007F }, s::cl_int{ 0x00000020 }); + }); + }); + } + assert(r == 0x0000004F); + } + + // rhadd + { + s::cl_int r{ 0 }; + { + s::buffer BufR(&r, s::range<1>(1)); + s::queue myQueue; + myQueue.submit([&](s::handler &cgh) { + auto AccR = BufR.get_access(cgh); + cgh.single_task([=]() { + AccR[0] = s::rhadd(s::cl_int{ 0x0000007F }, s::cl_int{ 0x00000020 }); + }); + 
}); + } + assert(r == 0x50); + } + + // clamp + { + s::cl_int r{ 0 }; + { + s::buffer BufR(&r, s::range<1>(1)); + s::queue myQueue; + myQueue.submit([&](s::handler &cgh) { + auto AccR = BufR.get_access(cgh); + cgh.single_task([=]() { + AccR[0] = s::clamp(s::cl_int{ 5 }, s::cl_int{ 10 }, s::cl_int{ 30 }); + }); + }); + } + assert(r == 10); + } + + // clz + { + s::cl_int r{ 0 }; + { + s::buffer BufR(&r, s::range<1>(1)); + s::queue myQueue; + myQueue.submit([&](s::handler &cgh) { + auto AccR = BufR.get_access(cgh); + cgh.single_task([=]() { + AccR[0] = s::clz(s::cl_int{ 0x0FFFFFFF }); + }); + }); + } + assert(r == 4); + } + + // ctz + { + s::cl_int r{ 0 }; + { + s::buffer BufR(&r, s::range<1>(1)); + s::queue myQueue; + myQueue.submit([&](s::handler &cgh) { + auto AccR = BufR.get_access(cgh); + cgh.single_task([=]() { + AccR[0] = s::intel::ctz(s::cl_int{ 0x7FFFFFF0 }); + }); + }); + } + assert(r == 4); + } + + // mad_hi + { + s::cl_int r{ 0 }; + { + s::buffer BufR(&r, s::range<1>(1)); + s::queue myQueue; + myQueue.submit([&](s::handler &cgh) { + auto AccR = BufR.get_access(cgh); + cgh.single_task([=]() { + AccR[0] = s::mad_hi(s::cl_int{ 0x10000000 }, s::cl_int{ 0x00000100 }, + s::cl_int{ 0x00000001 }); + }); // 2^28 * 2^8 = 2^36 -> 0x10 00000000. + }); + } + assert(r == 0x11); + } + + // mad_sat + { + s::cl_int r{ 0 }; + { + s::buffer BufR(&r, s::range<1>(1)); + s::queue myQueue; + myQueue.submit([&](s::handler &cgh) { + auto AccR = BufR.get_access(cgh); + cgh.single_task([=]() { + AccR[0] = s::mad_sat(s::cl_int{ 0x10000000 }, s::cl_int{ 0x00000100 }, + s::cl_int{ 0x00000001 }); + }); // 2^28 * 2^8 = 2^36 -> 0x10 00000000 -> result is saturated in the + // product. 
+ }); + } + assert(r == 0x7FFFFFFF); + } + + // mad_sat test two + { + char r(0); + char exp(120); + { + cl::sycl::buffer buf(&r, cl::sycl::range<1>(1)); + cl::sycl::queue q; + q.submit([&](cl::sycl::handler &cgh) { + auto acc = buf.get_access(cgh); + cgh.single_task([=]() { + signed char inputData_0(-17); + signed char inputData_1(-10); + signed char inputData_2(-50); + acc[0] = cl::sycl::mad_sat(inputData_0, inputData_1, inputData_2); + }); + }); + } + assert(r == exp); // Should return the real number of i0*i1+i2 in CPU + // Only fails in vector, but passes in scalar. + + } + + // mul_hi + { + s::cl_int r{ 0 }; + { + s::buffer BufR(&r, s::range<1>(1)); + s::queue myQueue; + myQueue.submit([&](s::handler &cgh) { + auto AccR = BufR.get_access(cgh); + cgh.single_task([=]() { + AccR[0] = s::mul_hi(s::cl_int{ 0x10000000 }, s::cl_int{ 0x00000100 }); + }); // 2^28 * 2^8 = 2^36 -> 0x10 00000000. + }); + } + assert(r == 0x10); + } + + // mul_hi with negative result w/ carry + { + s::cl_int r{0}; + { + s::buffer BufR(&r, s::range<1>(1)); + s::queue myQueue; + myQueue.submit([&](s::handler &cgh) { + auto AccR = BufR.get_access(cgh); + cgh.single_task([=]() { + AccR[0] = s::mul_hi(s::cl_int{-0x10000000}, s::cl_int{0x00000100}); + }); // -2^28 * 2^8 = -2^36 -> -0x10 (FFFFFFF0) 00000000. + }); + } + assert(r == -0x10); + } + + // mul_hi with negative result w/o carry + { + s::cl_int r{0}; + { + s::buffer BufR(&r, s::range<1>(1)); + s::queue myQueue; + myQueue.submit([&](s::handler &cgh) { + auto AccR = BufR.get_access(cgh); + cgh.single_task([=]() { + AccR[0] = s::mul_hi(s::cl_int{-0x10000000}, s::cl_int{0x00000101}); + }); // -2^28 * (2^8 + 1) = -2^36 - 2^28 -> -0x11 (FFFFFFEF) -0x10000000 + // (F0000000). 
+ }); + } + assert(r == -0x11); + } + + // rotate + { + s::cl_int r{ 0 }; + { + s::buffer BufR(&r, s::range<1>(1)); + s::queue myQueue; + myQueue.submit([&](s::handler &cgh) { + auto AccR = BufR.get_access(cgh); + cgh.single_task([=]() { + AccR[0] = s::rotate(s::cl_int{ 0x11100000 }, s::cl_int{ 12 }); + }); + }); + } + assert(r == 0x00000111); + } + + // rotate (with large rotate size) + { + s::cl_char r{ 0 }; + { + s::buffer BufR(&r, s::range<1>(1)); + s::queue myQueue; + myQueue.submit([&](s::handler &cgh) { + auto AccR = BufR.get_access(cgh); + cgh.single_task([=]() { + AccR[0] = s::rotate(static_cast((unsigned char)0xe0), + s::cl_char{ 50 }); + }); + }); + } + assert((unsigned char)r == 0x83); + } + // sub_sat + { + auto TestSubSat = [](s::cl_int x, s::cl_int y) { + s::cl_int r{ 0 }; + { + s::buffer BufR(&r, s::range<1>(1)); + s::queue myQueue; + myQueue.submit([&](s::handler &cgh) { + auto AccR = BufR.get_access(cgh); + cgh.single_task([=]() { + AccR[0] = s::sub_sat(x, y); + }); + }); + } + return r; + }; + // 10 - (-2^31(minimum value)) = saturates on Maximum value + s::cl_int r1 = TestSubSat(10, 0x80000000); + assert(r1 == 0x7FFFFFFF); + s::cl_int r2 = TestSubSat(0x7FFFFFFF, 0xFFFFFFFF); + assert(r2 == 0x7FFFFFFF); + s::cl_int r3 = TestSubSat(0x80000000, 0x00000001); + assert(r3 == 0x80000000); + s::cl_int r4 = TestSubSat(10499, 30678); + assert(r4 == -20179); + } + + // upsample - 1 + { + s::cl_ushort r{ 0 }; + { + s::buffer BufR(&r, s::range<1>(1)); + s::queue myQueue; + myQueue.submit([&](s::handler &cgh) { + auto AccR = BufR.get_access(cgh); + cgh.single_task([=]() { + AccR[0] = s::upsample(s::cl_uchar{ 0x10 }, s::cl_uchar{ 0x10 }); + }); + }); + } + assert(r == 0x1010); + } + + // upsample - 2 + { + s::cl_short r{ 0 }; + { + s::buffer BufR(&r, s::range<1>(1)); + s::queue myQueue; + myQueue.submit([&](s::handler &cgh) { + auto AccR = BufR.get_access(cgh); + cgh.single_task([=]() { + AccR[0] = s::upsample(s::cl_char{ 0x10 }, s::cl_uchar{ 0x10 }); + }); + 
}); + } + assert(r == 0x1010); + } + + // upsample - 3 + { + s::cl_uint r{ 0 }; + { + s::buffer BufR(&r, s::range<1>(1)); + s::queue myQueue; + myQueue.submit([&](s::handler &cgh) { + auto AccR = BufR.get_access(cgh); + cgh.single_task([=]() { + AccR[0] = s::upsample(s::cl_ushort{ 0x0010 }, s::cl_ushort{ 0x0010 }); + }); + }); + } + assert(r == 0x00100010); + } + + // upsample - 4 + { + s::cl_int r{ 0 }; + { + s::buffer BufR(&r, s::range<1>(1)); + s::queue myQueue; + myQueue.submit([&](s::handler &cgh) { + auto AccR = BufR.get_access(cgh); + cgh.single_task([=]() { + AccR[0] = s::upsample(s::cl_short{ 0x0010 }, s::cl_ushort{ 0x0010 }); + }); + }); + } + assert(r == 0x00100010); + } + + // upsample - 5 + { + s::cl_ulong r{ 0 }; + { + s::buffer BufR(&r, s::range<1>(1)); + s::queue myQueue; + myQueue.submit([&](s::handler &cgh) { + auto AccR = BufR.get_access(cgh); + cgh.single_task([=]() { + AccR[0] = + s::upsample(s::cl_uint{ 0x00000010 }, s::cl_uint{ 0x00000010 }); + }); + }); + } + assert(r == 0x0000001000000010); + } + + // upsample - 6 + { + s::cl_long r{ 0 }; + { + s::buffer BufR(&r, s::range<1>(1)); + s::queue myQueue; + myQueue.submit([&](s::handler &cgh) { + auto AccR = BufR.get_access(cgh); + cgh.single_task([=]() { + AccR[0] = + s::upsample(s::cl_int{ 0x00000010 }, s::cl_uint{ 0x00000010 }); + }); + }); + } + assert(r == 0x0000001000000010); + } + + // popcount + { + s::cl_int r{ 0 }; + { + s::buffer BufR(&r, s::range<1>(1)); + s::queue myQueue; + myQueue.submit([&](s::handler &cgh) { + auto AccR = BufR.get_access(cgh); + cgh.single_task([=]() { + AccR[0] = s::popcount(s::cl_int{ 0x000000FF }); + }); + }); + } + assert(r == 8); + } + + // mad24 + { + s::cl_int r{ 0 }; + { + s::buffer BufR(&r, s::range<1>(1)); + s::queue myQueue; + myQueue.submit([&](s::handler &cgh) { + auto AccR = BufR.get_access(cgh); + cgh.single_task([=]() { + AccR[0] = + s::mad24(s::cl_int(0xFFFFFFFF), s::cl_int{ 20 }, s::cl_int{ 20 }); + }); + }); + } + assert(r == 0); + } + + // 
mul24 + { + s::cl_int r{ 0 }; + { + s::buffer BufR(&r, s::range<1>(1)); + s::queue myQueue; + myQueue.submit([&](s::handler &cgh) { + auto AccR = BufR.get_access(cgh); + cgh.single_task([=]() { + AccR[0] = s::mul24(s::cl_int(0xFFFFFFFF), s::cl_int{ 20 }); + }); + }); + } + assert(r == -20); + } + + return 0; +} diff --git a/SYCL/Basic/built-ins/scalar_math.cpp b/SYCL/Basic/built-ins/scalar_math.cpp new file mode 100644 index 0000000000..3be5be3d12 --- /dev/null +++ b/SYCL/Basic/built-ins/scalar_math.cpp @@ -0,0 +1,401 @@ +// RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple %s -o %t.out +// RUN: %HOST_RUN_PLACEHOLDER %t.out +// RUN: %CPU_RUN_PLACEHOLDER %t.out +// RUN: %GPU_RUN_PLACEHOLDER %t.out +// RUN: %ACC_RUN_PLACEHOLDER %t.out + +#include + +#include +#include +#include + +namespace s = cl::sycl; + +int main() { + // acos + { + s::cl_float r{ 0 }; + { + s::buffer BufR(&r, s::range<1>(1)); + s::queue myQueue; + myQueue.submit([&](s::handler &cgh) { + auto AccR = BufR.get_access(cgh); + cgh.single_task([=]() { + AccR[0] = s::acos(s::cl_float{ 0.5 }); + }); + }); + } + assert(r > 1.047f && r < 1.048f); // ~1.0471975511965979 + } + + // acosh + { + s::cl_float r{ 0 }; + { + s::buffer BufR(&r, s::range<1>(1)); + s::queue myQueue; + myQueue.submit([&](s::handler &cgh) { + auto AccR = BufR.get_access(cgh); + cgh.single_task([=]() { + AccR[0] = s::acosh(s::cl_float{ 2.4 }); + }); + }); + } + assert(r > 1.522f && r < 1.523f); // ~1.5220793674636532 + } + + // asin + { + s::cl_float r{ 0 }; + { + s::buffer BufR(&r, s::range<1>(1)); + s::queue myQueue; + myQueue.submit([&](s::handler &cgh) { + auto AccR = BufR.get_access(cgh); + cgh.single_task([=]() { + AccR[0] = s::asin(s::cl_float{ 0.5 }); + }); + }); + } + assert(r > 0.523f && r < 0.524f); // ~0.5235987755982989 + } + + // asinh + { + s::cl_float r{ 0 }; + { + s::buffer BufR(&r, s::range<1>(1)); + s::queue myQueue; + myQueue.submit([&](s::handler &cgh) { + auto AccR = BufR.get_access(cgh); + cgh.single_task([=]() { + 
AccR[0] = s::asinh(s::cl_float{ 0.5 }); + }); + }); + } + assert(r > 0.481f && r < 0.482f); // ~0.48121182505960347 + } + + // atan + { + s::cl_float r{ 0 }; + { + s::buffer BufR(&r, s::range<1>(1)); + s::queue myQueue; + myQueue.submit([&](s::handler &cgh) { + auto AccR = BufR.get_access(cgh); + cgh.single_task([=]() { + AccR[0] = s::atan(s::cl_float{ 0.5 }); + }); + }); + } + assert(r > 0.463f && r < 0.464f); // ~0.4636476090008061 + } + + // atanh + { + s::cl_float r{ 0 }; + { + s::buffer BufR(&r, s::range<1>(1)); + s::queue myQueue; + myQueue.submit([&](s::handler &cgh) { + auto AccR = BufR.get_access(cgh); + cgh.single_task([=]() { + AccR[0] = s::atanh(s::cl_float{ 0.5 }); + }); + }); + } + assert(r > 0.549f && r < 0.550f); // ~0.5493061443340549 + } + + // cbrt + { + s::cl_float r{ 0 }; + { + s::buffer BufR(&r, s::range<1>(1)); + s::queue myQueue; + myQueue.submit([&](s::handler &cgh) { + auto AccR = BufR.get_access(cgh); + cgh.single_task([=]() { + AccR[0] = s::cbrt(s::cl_float{ 27.0 }); + }); + }); + } + assert(r == 3.f); + } + + // ceil + { + s::cl_float r{ 0 }; + { + s::buffer BufR(&r, s::range<1>(1)); + s::queue myQueue; + myQueue.submit([&](s::handler &cgh) { + auto AccR = BufR.get_access(cgh); + cgh.single_task([=]() { + AccR[0] = s::ceil(s::cl_float{ 0.5 }); + }); + }); + } + assert(r == 1.f); + } + + // cos + { + s::cl_float r{ 0 }; + { + s::buffer BufR(&r, s::range<1>(1)); + s::queue myQueue; + myQueue.submit([&](s::handler &cgh) { + auto AccR = BufR.get_access(cgh); + cgh.single_task([=]() { + AccR[0] = s::cos(s::cl_float{ 0.5 }); + }); + }); + } + assert(r > 0.877f && r < 0.878f); // ~0.8775825618903728 + } + + // cosh + { + s::cl_float r{ 0 }; + { + s::buffer BufR(&r, s::range<1>(1)); + s::queue myQueue; + myQueue.submit([&](s::handler &cgh) { + auto AccR = BufR.get_access(cgh); + cgh.single_task([=]() { + AccR[0] = s::cosh(s::cl_float{ 0.5 }); + }); + }); + } + assert(r > 1.127f && r < 1.128f); // ~1.1276259652063807 + } + + // cospi + { + 
s::cl_float r{ 0 }; + { + s::buffer BufR(&r, s::range<1>(1)); + s::queue myQueue; + myQueue.submit([&](s::handler &cgh) { + auto AccR = BufR.get_access(cgh); + cgh.single_task([=]() { + AccR[0] = s::cospi(s::cl_float{ 0.1 }); + }); + }); + } + assert(r > 0.951f && r < 0.952f); // ~0.9510565162951535 + } + + // erfc + { + s::cl_float r{ 0 }; + { + s::buffer BufR(&r, s::range<1>(1)); + s::queue myQueue; + myQueue.submit([&](s::handler &cgh) { + auto AccR = BufR.get_access(cgh); + cgh.single_task([=]() { + AccR[0] = s::erfc(s::cl_float{ 0.5 }); + }); + }); + } + assert(r > 0.479f && r < 0.480f); // ~0.4795001221869535 + } + + // erf + { + s::cl_float r{ 0 }; + { + s::buffer BufR(&r, s::range<1>(1)); + s::queue myQueue; + myQueue.submit([&](s::handler &cgh) { + auto AccR = BufR.get_access(cgh); + cgh.single_task([=]() { + AccR[0] = s::erf(s::cl_float{ 0.5 }); + }); + }); + } + assert(r > 0.520f && r < 0.521f); // ~0.5204998778130465 + } + + // exp + { + s::cl_float r{ 0 }; + { + s::buffer BufR(&r, s::range<1>(1)); + s::queue myQueue; + myQueue.submit([&](s::handler &cgh) { + auto AccR = BufR.get_access(cgh); + cgh.single_task([=]() { + AccR[0] = s::exp(s::cl_float{ 0.5 }); + }); + }); + } + assert(r > 1.648f && r < 1.649f); // ~1.6487212707001282 + } + + // exp2 + { + s::cl_float r{ 0 }; + { + s::buffer BufR(&r, s::range<1>(1)); + s::queue myQueue; + myQueue.submit([&](s::handler &cgh) { + auto AccR = BufR.get_access(cgh); + cgh.single_task([=]() { + AccR[0] = s::exp2(s::cl_float{ 8.0 }); + }); + }); + } + assert(r == 256.0f); + } + + // exp10 + { + s::cl_float r{ 0 }; + { + s::buffer BufR(&r, s::range<1>(1)); + s::queue myQueue; + myQueue.submit([&](s::handler &cgh) { + auto AccR = BufR.get_access(cgh); + cgh.single_task([=]() { + AccR[0] = s::exp10(s::cl_float{ 2 }); + }); + }); + } + assert(r == 100.0f); + } + + // expm1 + { + s::cl_float r{ 0 }; + { + s::buffer BufR(&r, s::range<1>(1)); + s::queue myQueue; + myQueue.submit([&](s::handler &cgh) { + auto AccR = 
BufR.get_access(cgh); + cgh.single_task([=]() { + AccR[0] = s::expm1(s::cl_float{ 0.5 }); + }); + }); + } + assert(r > 0.648f && r < 0.649f); // ~0.6487212707001282 + } + + // fabs + { + s::cl_float r{ 0 }; + { + s::buffer BufR(&r, s::range<1>(1)); + s::queue myQueue; + myQueue.submit([&](s::handler &cgh) { + auto AccR = BufR.get_access(cgh); + cgh.single_task([=]() { + AccR[0] = s::fabs(s::cl_float{ -0.5 }); + }); + }); + } + assert(r == 0.5f); + } + + // floor + { + s::cl_float r{ 0 }; + { + s::buffer BufR(&r, s::range<1>(1)); + s::queue myQueue; + myQueue.submit([&](s::handler &cgh) { + auto AccR = BufR.get_access(cgh); + cgh.single_task([=]() { + AccR[0] = s::floor(s::cl_float{ 0.5 }); + }); + }); + } + assert(r == 0.f); + } + + // fmax + { + s::cl_float r{ 0 }; + { + s::buffer BufR(&r, s::range<1>(1)); + s::queue myQueue; + myQueue.submit([&](s::handler &cgh) { + auto AccR = BufR.get_access(cgh); + cgh.single_task([=]() { + AccR[0] = s::fmax(s::cl_float{ 0.5 }, s::cl_float{ 0.8 }); + }); + }); + } + assert(r == 0.8f); + } + + // fmin + { + s::cl_float r{ 0 }; + { + s::buffer BufR(&r, s::range<1>(1)); + s::queue myQueue; + myQueue.submit([&](s::handler &cgh) { + auto AccR = BufR.get_access(cgh); + cgh.single_task([=]() { + AccR[0] = s::fmin(s::cl_float{ 0.5 }, s::cl_float{ 0.8 }); + }); + }); + } + assert(r == 0.5f); + } + + // fmod + { + s::cl_float r{ 0 }; + { + s::buffer BufR(&r, s::range<1>(1)); + s::queue myQueue; + myQueue.submit([&](s::handler &cgh) { + auto AccR = BufR.get_access(cgh); + cgh.single_task([=]() { + AccR[0] = s::fmod(s::cl_float{ 5.1 }, s::cl_float{ 3.0 }); + }); + }); + } + assert(r == 2.1f); + } + + // lgamma with private memory + { + s::cl_float r{ 0 }; + { + s::buffer BufR(&r, s::range<1>(1)); + s::queue myQueue; + myQueue.submit([&](s::handler &cgh) { + auto AccR = BufR.get_access(cgh); + cgh.single_task([=]() { + AccR[0] = s::lgamma(s::cl_float{ 10.f }); + }); + }); + } + assert(r > 12.8017f && r < 12.8019f); // ~12.8018 + } + + // 
lgamma with private memory + { + s::cl_float r{ 0 }; + { + s::buffer BufR(&r, s::range<1>(1)); + s::queue myQueue; + myQueue.submit([&](s::handler &cgh) { + auto AccR = BufR.get_access(cgh); + cgh.single_task([=]() { + AccR[0] = s::lgamma(s::cl_float{ -2.4f }); + }); + }); + } + assert(r > 0.1024f && r < 0.1026f); // ~0.102583 + } + + return 0; +} diff --git a/SYCL/Basic/built-ins/scalar_math_2.cpp b/SYCL/Basic/built-ins/scalar_math_2.cpp new file mode 100644 index 0000000000..7273842486 --- /dev/null +++ b/SYCL/Basic/built-ins/scalar_math_2.cpp @@ -0,0 +1,244 @@ +// RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple %s -o %t.out +// RUN: %HOST_RUN_PLACEHOLDER %t.out +// RUN: %CPU_RUN_PLACEHOLDER %t.out +// RUN: %GPU_RUN_PLACEHOLDER %t.out +// RUN: %ACC_RUN_PLACEHOLDER %t.out + +// TODO: ptxas fatal : Unresolved extern function '_Z18__spirv_ocl_acospif' +// XFAIL: cuda + +#include + +#include +#include +#include + +namespace s = cl::sycl; + +int main() { + + // acospi + { + s::cl_float r{0}; + { + s::buffer BufR(&r, s::range<1>(1)); + s::queue myQueue; + myQueue.submit([&](s::handler &cgh) { + auto AccR = BufR.get_access(cgh); + cgh.single_task([=]() { + AccR[0] = s::acospi(s::cl_float{0.5}); + }); + }); + } + assert(r > 0.333f && r < 0.334f); // ~0.33333333333333337 + } + + // asinpi + { + s::cl_float r{0}; + { + s::buffer BufR(&r, s::range<1>(1)); + s::queue myQueue; + myQueue.submit([&](s::handler &cgh) { + auto AccR = BufR.get_access(cgh); + cgh.single_task([=]() { + AccR[0] = s::asinpi(s::cl_float{0.5}); + }); + }); + } + assert(r > 0.166f && r < 0.167f); // ~0.16666666666666669 + } + + // atan2 + { + s::cl_float r{0}; + { + s::buffer BufR(&r, s::range<1>(1)); + s::queue myQueue; + myQueue.submit([&](s::handler &cgh) { + auto AccR = BufR.get_access(cgh); + cgh.single_task([=]() { + AccR[0] = s::atan2(s::cl_float{0.5}, s::cl_float{0.5}); + }); + }); + } + assert(r > 0.785f && r < 0.786f); // ~0.7853981633974483 + } + + // atanpi + { + s::cl_float r{0}; + { + 
s::buffer BufR(&r, s::range<1>(1)); + s::queue myQueue; + myQueue.submit([&](s::handler &cgh) { + auto AccR = BufR.get_access(cgh); + cgh.single_task([=]() { + AccR[0] = s::atanpi(s::cl_float{0.5}); + }); + }); + } + assert(r > 0.147f && r < 0.148f); // ~0.14758361765043326 + } + + // atan2pi + { + s::cl_float r{0}; + { + s::buffer BufR(&r, s::range<1>(1)); + s::queue myQueue; + myQueue.submit([&](s::handler &cgh) { + auto AccR = BufR.get_access(cgh); + cgh.single_task([=]() { + AccR[0] = s::atan2pi(s::cl_float{0.5}, s::cl_float{0.5}); + }); + }); + } + assert(r > 0.249f && r < 0.251f); // ~0.25 + } + + // copysign + { + s::cl_float r{0}; + { + s::buffer BufR(&r, s::range<1>(1)); + s::queue myQueue; + myQueue.submit([&](s::handler &cgh) { + auto AccR = BufR.get_access(cgh); + cgh.single_task([=]() { + AccR[0] = s::copysign(s::cl_float{1}, s::cl_float{-0.5}); + }); + }); + } + assert(r == -1.f); + } + + // fdim + { + s::cl_float r{0}; + { + s::buffer BufR(&r, s::range<1>(1)); + s::queue myQueue; + myQueue.submit([&](s::handler &cgh) { + auto AccR = BufR.get_access(cgh); + cgh.single_task([=]() { + AccR[0] = s::fdim(s::cl_float{1.6}, s::cl_float{0.6}); + }); + }); + } + assert(r == 1.0f); + } + + // fma + { + s::cl_float r{0}; + { + s::buffer BufR(&r, s::range<1>(1)); + s::queue myQueue; + myQueue.submit([&](s::handler &cgh) { + auto AccR = BufR.get_access(cgh); + cgh.single_task([=]() { + AccR[0] = s::fma(s::cl_float{0.5}, s::cl_float{10.0}, + s::cl_float{3.0}); + }); + }); + } + assert(r == 8.0f); + } + + // fract with global memory + { + s::cl_float r{0}; + s::cl_float i{999}; + { + s::buffer BufR(&r, s::range<1>(1)); + s::buffer BufI(&i, s::range<1>(1), + {s::property::buffer::use_host_ptr()}); + s::queue myQueue; + myQueue.submit([&](s::handler &cgh) { + auto AccR = BufR.get_access(cgh); + auto AccI = BufI.get_access(cgh); + cgh.single_task([=]() { + s::global_ptr Iptr(AccI); + AccR[0] = s::fract(s::cl_float{1.5}, Iptr); + }); + }); + } + assert(r == 0.5f); + 
assert(i == 1.0f); + } + + // fract with private memory + { + s::cl_float r{0}; + s::cl_float i{999}; + { + s::buffer BufR(&r, s::range<1>(1)); + s::buffer BufI(&i, s::range<1>(1), + {s::property::buffer::use_host_ptr()}); + s::queue myQueue; + myQueue.submit([&](s::handler &cgh) { + auto AccR = BufR.get_access(cgh); + auto AccI = BufI.get_access(cgh); + cgh.single_task([=]() { + s::cl_float temp(0.0); + s::private_ptr Iptr(&temp); + AccR[0] = s::fract(s::cl_float{1.5f}, Iptr); + AccI[0] = *Iptr; + }); + }); + } + assert(r == 0.5f); + assert(i == 1.0f); + } + + // lgamma_r with private memory + { + s::cl_float r{0}; + s::cl_int i{999}; + { + s::buffer BufR(&r, s::range<1>(1)); + s::buffer BufI(&i, s::range<1>(1), + {s::property::buffer::use_host_ptr()}); + s::queue myQueue; + myQueue.submit([&](s::handler &cgh) { + auto AccR = BufR.get_access(cgh); + auto AccI = BufI.get_access(cgh); + cgh.single_task([=]() { + s::cl_int temp(0.0); + s::private_ptr Iptr(&temp); + AccR[0] = s::lgamma_r(s::cl_float{10.f}, Iptr); + AccI[0] = *Iptr; + }); + }); + } + assert(r > 12.8017f && r < 12.8019f); // ~12.8018 + assert(i == 1); // tgamma of 10 is ~362880.0 + } + + // lgamma_r with private memory + { + s::cl_float r{0}; + s::cl_int i{999}; + { + s::buffer BufR(&r, s::range<1>(1)); + s::buffer BufI(&i, s::range<1>(1), + {s::property::buffer::use_host_ptr()}); + s::queue myQueue; + myQueue.submit([&](s::handler &cgh) { + auto AccR = BufR.get_access(cgh); + auto AccI = BufI.get_access(cgh); + cgh.single_task([=]() { + s::cl_int temp(0.0); + s::private_ptr Iptr(&temp); + AccR[0] = s::lgamma_r(s::cl_float{-2.4f}, Iptr); + AccI[0] = *Iptr; + }); + }); + } + assert(r > 0.1024f && r < 0.1026f); // ~0.102583 + assert(i == -1); // tgamma of -2.4 is ~-1.1080299470333461 + } + + return 0; +} diff --git a/SYCL/Basic/built-ins/scalar_relational.cpp b/SYCL/Basic/built-ins/scalar_relational.cpp new file mode 100644 index 0000000000..a3c7b1d7df --- /dev/null +++ 
b/SYCL/Basic/built-ins/scalar_relational.cpp @@ -0,0 +1,422 @@ +// RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple %s -o %t.out +// RUN: %HOST_RUN_PLACEHOLDER %t.out +// RUN: %CPU_RUN_PLACEHOLDER %t.out +// RUN: %GPU_RUN_PLACEHOLDER %t.out +// RUN: %ACC_RUN_PLACEHOLDER %t.out + +// TODO: ptxas fatal : Unresolved extern function '_Z17__spirv_FOrdEqualff' +// XFAIL: cuda + +#include + +#include +#include + +namespace s = cl::sycl; + +int main() { + // isequal-float + { + s::cl_int r{ 0 }; + { + s::buffer BufR(&r, s::range<1>(1)); + s::queue myQueue; + myQueue.submit([&](s::handler &cgh) { + auto AccR = BufR.get_access(cgh); + cgh.single_task([=]() { + AccR[0] = s::isequal(s::cl_float{ 10.5f }, s::cl_float{ 10.5f }); + }); + }); + } + assert(r == 1); + } + + // isnotequal-float + { + s::cl_int r{ 0 }; + { + s::buffer BufR(&r, s::range<1>(1)); + s::queue myQueue; + myQueue.submit([&](s::handler &cgh) { + auto AccR = BufR.get_access(cgh); + cgh.single_task([=]() { + AccR[0] = s::isnotequal(s::cl_float{ 0.4f }, s::cl_float{ 0.5f }); + }); + }); + } + assert(r == 1); + } + + // isgreater-float + { + s::cl_int r{ 0 }; + { + s::buffer BufR(&r, s::range<1>(1)); + s::queue myQueue; + myQueue.submit([&](s::handler &cgh) { + auto AccR = BufR.get_access(cgh); + cgh.single_task([=]() { + AccR[0] = s::isgreater(s::cl_float{ 0.6f }, s::cl_float{ 0.5f }); + }); + }); + } + assert(r == 1); + } + + // isgreaterequal-float + { + s::cl_int r{ 0 }; + { + s::buffer BufR(&r, s::range<1>(1)); + s::queue myQueue; + myQueue.submit([&](s::handler &cgh) { + auto AccR = BufR.get_access(cgh); + cgh.single_task([=]() { + AccR[0] = s::isgreaterequal(s::cl_float{ 0.5f }, s::cl_float{ 0.5f }); + }); + }); + } + assert(r == 1); + } + + // isless-float + { + s::cl_int r{ 0 }; + { + s::buffer BufR(&r, s::range<1>(1)); + s::queue myQueue; + myQueue.submit([&](s::handler &cgh) { + auto AccR = BufR.get_access(cgh); + cgh.single_task([=]() { + AccR[0] = s::isless(s::cl_float{ 0.4f }, s::cl_float{ 0.5f }); 
+ }); + }); + } + assert(r == 1); + } + + // islessequal-float + { + s::cl_int r{ 0 }; + { + s::buffer BufR(&r, s::range<1>(1)); + s::queue myQueue; + myQueue.submit([&](s::handler &cgh) { + auto AccR = BufR.get_access(cgh); + cgh.single_task([=]() { + AccR[0] = s::islessequal(s::cl_float{ 0.5f }, s::cl_float{ 0.5f }); + }); + }); + } + assert(r == 1); + } + + // islessgreater-float + { + s::cl_int r{ 1 }; + { + s::buffer BufR(&r, s::range<1>(1)); + s::queue myQueue; + myQueue.submit([&](s::handler &cgh) { + auto AccR = BufR.get_access(cgh); + cgh.single_task([=]() { + AccR[0] = s::islessgreater(s::cl_float{ 0.5f }, s::cl_float{ 0.5f }); + }); + }); + } + assert(r == 0); + } + + // isfinite-float + { + s::cl_int r{ 1 }; + { + s::buffer BufR(&r, s::range<1>(1)); + s::queue myQueue; + myQueue.submit([&](s::handler &cgh) { + auto AccR = BufR.get_access(cgh); + cgh.single_task([=]() { + AccR[0] = s::isfinite(s::cl_float{ NAN }); + }); + }); + } + assert(r == 0); + } + + // isinf-float + { + s::cl_int r{ 0 }; + { + s::buffer BufR(&r, s::range<1>(1)); + s::queue myQueue; + myQueue.submit([&](s::handler &cgh) { + auto AccR = BufR.get_access(cgh); + cgh.single_task([=]() { + AccR[0] = s::isinf(s::cl_float{ INFINITY }); + }); + }); + } + assert(r == 1); + } + + // isnan-float + { + s::cl_int r{ 0 }; + { + s::buffer BufR(&r, s::range<1>(1)); + s::queue myQueue; + myQueue.submit([&](s::handler &cgh) { + auto AccR = BufR.get_access(cgh); + cgh.single_task([=]() { + AccR[0] = s::isnan(s::cl_float{ NAN }); + }); + }); + } + assert(r == 1); + } + + // isnormal-float + { + s::cl_int r{ 1 }; + { + s::buffer BufR(&r, s::range<1>(1)); + s::queue myQueue; + myQueue.submit([&](s::handler &cgh) { + auto AccR = BufR.get_access(cgh); + cgh.single_task([=]() { + AccR[0] = s::isnormal(s::cl_float{ INFINITY }); + }); + }); + } + assert(r == 0); + } + + // isnormal-double + { + s::cl_int r{ 1 }; + { + s::buffer BufR(&r, s::range<1>(1)); + s::queue myQueue; + myQueue.submit([&](s::handler 
&cgh) { + auto AccR = BufR.get_access(cgh); + cgh.single_task([=]() { + AccR[0] = s::isnormal(s::cl_double{ INFINITY }); + }); + }); + } + assert(r == 0); + } + + // isordered-float + { + s::cl_int r{ 1 }; + { + s::buffer BufR(&r, s::range<1>(1)); + s::queue myQueue; + myQueue.submit([&](s::handler &cgh) { + auto AccR = BufR.get_access(cgh); + cgh.single_task([=]() { + AccR[0] = s::isordered(s::cl_float{ 4.0f }, s::cl_float{ NAN }); + }); + }); + } + assert(r == 0); + } + + // isunordered-float + { + s::cl_int r{ 0 }; + { + s::buffer BufR(&r, s::range<1>(1)); + s::queue myQueue; + myQueue.submit([&](s::handler &cgh) { + auto AccR = BufR.get_access(cgh); + cgh.single_task([=]() { + AccR[0] = s::isunordered(s::cl_float{ 4.0f }, s::cl_float{ NAN }); + }); + }); + } + assert(r == 1); + } + + // signbit-float + { + s::cl_int r{ 0 }; + { + s::buffer BufR(&r, s::range<1>(1)); + s::queue myQueue; + myQueue.submit([&](s::handler &cgh) { + auto AccR = BufR.get_access(cgh); + cgh.single_task([=]() { + AccR[0] = s::signbit(s::cl_float{ -12.0f }); + }); + }); + } + assert(r == 1); + } + + // any-integer + { + s::cl_int r{ 0 }; + { + s::buffer BufR(&r, s::range<1>(1)); + s::queue myQueue; + myQueue.submit([&](s::handler &cgh) { + auto AccR = BufR.get_access(cgh); + cgh.single_task([=]() { + AccR[0] = s::any(s::cl_int{ 12 }); + }); + }); + } + assert(r == 0); + } + // any-integer + { + s::cl_int r{ 0 }; + { + s::buffer BufR(&r, s::range<1>(1)); + s::queue myQueue; + myQueue.submit([&](s::handler &cgh) { + auto AccR = BufR.get_access(cgh); + cgh.single_task([=]() { + AccR[0] = s::any(s::cl_int{ 0 }); + }); + }); + } + assert(r == 0); + } + + // any-integer + { + s::cl_int r{ 0 }; + { + s::buffer BufR(&r, s::range<1>(1)); + s::queue myQueue; + myQueue.submit([&](s::handler &cgh) { + auto AccR = BufR.get_access(cgh); + cgh.single_task([=]() { + AccR[0] = s::any(s::cl_int{ -12 }); + }); + }); + } + assert(r == 1); + } + + // all-integer + { + s::cl_int r{ 0 }; + { + s::buffer 
BufR(&r, s::range<1>(1)); + s::queue myQueue; + myQueue.submit([&](s::handler &cgh) { + auto AccR = BufR.get_access(cgh); + cgh.single_task([=]() { + AccR[0] = s::all(s::cl_int{ 12 }); + }); + }); + } + assert(r == 0); + } + + // all-integer + { + s::cl_int r{ 0 }; + { + s::buffer BufR(&r, s::range<1>(1)); + s::queue myQueue; + myQueue.submit([&](s::handler &cgh) { + auto AccR = BufR.get_access(cgh); + cgh.single_task([=]() { + AccR[0] = s::all(s::cl_int{ 0 }); + }); + }); + } + assert(r == 0); + } + + // all-integer + { + s::cl_int r{ 0 }; + { + s::buffer BufR(&r, s::range<1>(1)); + s::queue myQueue; + myQueue.submit([&](s::handler &cgh) { + auto AccR = BufR.get_access(cgh); + cgh.single_task([=]() { + AccR[0] = s::all(s::cl_int{ -12 }); + }); + }); + } + assert(r == 1); + } + + // bitselect-float + { + s::cl_float r{ 0.0f }; + { + s::buffer BufR(&r, s::range<1>(1)); + s::queue myQueue; + myQueue.submit([&](s::handler &cgh) { + auto AccR = BufR.get_access(cgh); + cgh.single_task([=]() { + AccR[0] = s::bitselect(s::cl_float{ 112.112 }, s::cl_float{ 34.34 }, + s::cl_float{ 3.3 }); + }); + }); + } + assert(r <= 80.5478 && r >= 80.5476); // r = 80.5477 + } + + // select-float,int + { + s::cl_float r{ 0 }; + { + s::buffer BufR(&r, s::range<1>(1)); + s::queue myQueue; + myQueue.submit([&](s::handler &cgh) { + auto AccR = BufR.get_access(cgh); + cgh.single_task([=]() { + AccR[0] = s::select(s::cl_float{ 34.34 }, s::cl_float{ 123.123 }, + s::cl_int{ 1 }); + }); + }); + } + assert(r <= 123.124 && r >= 123.122); // r = 123.123 + } + + // select-float,int + { + s::cl_float r{ 0 }; + { + s::buffer BufR(&r, s::range<1>(1)); + s::queue myQueue; + myQueue.submit([&](s::handler &cgh) { + auto AccR = BufR.get_access(cgh); + cgh.single_task([=]() { + AccR[0] = s::select(s::cl_float{ 34.34 }, s::cl_float{ 123.123 }, + s::cl_int{ 0 }); + }); + }); + } + assert(r <= 34.35 && r >= 34.33); // r = 34.34 + } + + // select-float,int + { + s::cl_float r{ 0 }; + { + s::buffer BufR(&r, 
s::range<1>(1)); + s::queue myQueue; + myQueue.submit([&](s::handler &cgh) { + auto AccR = BufR.get_access(cgh); + cgh.single_task([=]() { + AccR[0] = s::select(s::cl_float{ 34.34 }, s::cl_float{ 123.123 }, + s::cl_int{ -1 }); + }); + }); + } + assert(r <= 123.124 && r >= 123.122); // r = 123.123 + } + + return 0; +} diff --git a/SYCL/Basic/built-ins/vector_common.cpp b/SYCL/Basic/built-ins/vector_common.cpp new file mode 100644 index 0000000000..646d7a3ef5 --- /dev/null +++ b/SYCL/Basic/built-ins/vector_common.cpp @@ -0,0 +1,57 @@ +// RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple %s -o %t.out +// RUN: %HOST_RUN_PLACEHOLDER %t.out +// RUN: %CPU_RUN_PLACEHOLDER %t.out +// RUN: %GPU_RUN_PLACEHOLDER %t.out +// RUN: %ACC_RUN_PLACEHOLDER %t.out + +// TODO: ptxas fatal : Unresolved extern function '_Z23__spirv_ocl_fmax_commonDv2_fS_' +// XFAIL: cuda + +#include + +#include + +namespace s = cl::sycl; + +int main() { + // max + { + s::cl_float2 r{ 0 }; + { + s::buffer BufR(&r, s::range<1>(1)); + s::queue myQueue; + myQueue.submit([&](s::handler &cgh) { + auto AccR = BufR.get_access(cgh); + cgh.single_task([=]() { + AccR[0] = + s::max(s::cl_float2{ 0.5f, 3.4f }, s::cl_float2{ 2.3f, 0.4f }); + }); + }); + } + s::cl_float r1 = r.x(); + s::cl_float r2 = r.y(); + assert(r1 == 2.3f); + assert(r2 == 3.4f); + } + + // max + { + s::cl_float2 r{ 0 }; + { + s::buffer BufR(&r, s::range<1>(1)); + s::queue myQueue; + myQueue.submit([&](s::handler &cgh) { + auto AccR = BufR.get_access(cgh); + cgh.single_task([=]() { + AccR[0] = s::max(s::cl_float2{ 0.5f, 3.4f }, s::cl_float{ 3.0f }); + }); + }); + } + s::cl_float r1 = r.x(); + s::cl_float r2 = r.y(); + assert(r1 == 3.0f); + assert(r2 == 3.4f); + } + + return 0; +} diff --git a/SYCL/Basic/built-ins/vector_geometric.cpp b/SYCL/Basic/built-ins/vector_geometric.cpp new file mode 100644 index 0000000000..deb3048019 --- /dev/null +++ b/SYCL/Basic/built-ins/vector_geometric.cpp @@ -0,0 +1,171 @@ +// RUN: %clangxx -fsycl 
-fsycl-targets=%sycl_triple %s -o %t.out +// RUN: %HOST_RUN_PLACEHOLDER %t.out +// RUN: %CPU_RUN_PLACEHOLDER %t.out +// RUN: %GPU_RUN_PLACEHOLDER %t.out +// RUN: %ACC_RUN_PLACEHOLDER %t.out + +// TODO: ptxas fatal : Unresolved extern function '_Z11__spirv_DotDv2_fS_' +// XFAIL: cuda + +#include + +#include +#include + +namespace s = cl::sycl; + +bool isFloatEqualTo(float x, float y, float epsilon = 0.005f) { + return std::fabs(x - y) <= epsilon; +} + +int main() { + // dot + { + s::cl_float r{ 0 }; + { + s::buffer BufR(&r, s::range<1>(1)); + s::queue myQueue; + myQueue.submit([&](s::handler &cgh) { + auto AccR = BufR.get_access(cgh); + cgh.single_task([=]() { + AccR[0] = s::dot(s::cl_float2{ 1.f, 2.f, }, s::cl_float2{ 4.f, 6.f }); + }); + }); + } + assert(r == 16.f); + } + + // cross + { + s::cl_float4 r{ 0 }; + { + s::buffer BufR(&r, s::range<1>(1)); + s::queue myQueue; + myQueue.submit([&](s::handler &cgh) { + auto AccR = BufR.get_access(cgh); + cgh.single_task([=]() { + AccR[0] = s::cross(s::cl_float4{ 2.f, 3.f, 4.f, 0.f, }, + s::cl_float4{ 5.f, 6.f, 7.f, 0.f, }); + }); + }); + } + + s::cl_float r1 = r.x(); + s::cl_float r2 = r.y(); + s::cl_float r3 = r.z(); + s::cl_float r4 = r.w(); + + assert(r1 == -3.f); + assert(r2 == 6.f); + assert(r3 == -3.f); + assert(r4 == 0.0f); + } + + // distance + { + s::cl_float r{ 0 }; + { + s::buffer BufR(&r, s::range<1>(1)); + s::queue myQueue; + myQueue.submit([&](s::handler &cgh) { + auto AccR = BufR.get_access(cgh); + cgh.single_task([=]() { + AccR[0] = + s::distance(s::cl_float2{ 1.f, 2.f, }, s::cl_float2{ 3.f, 4.f, }); + }); + }); + } + assert(isFloatEqualTo(r, 2.82843f)); + } + + // length + { + s::cl_float r{ 0 }; + { + s::buffer BufR(&r, s::range<1>(1)); + s::queue myQueue; + myQueue.submit([&](s::handler &cgh) { + auto AccR = BufR.get_access(cgh); + cgh.single_task([=]() { + AccR[0] = s::length(s::cl_float2{ 1.f, 2.f, }); + }); + }); + } + assert(isFloatEqualTo(r, 2.23607f)); + } + + // normalize + { + s::cl_float2 r{ 0 
}; + { + s::buffer BufR(&r, s::range<1>(1)); + s::queue myQueue; + myQueue.submit([&](s::handler &cgh) { + auto AccR = BufR.get_access(cgh); + cgh.single_task([=]() { + AccR[0] = s::normalize(s::cl_float2{ 1.f, 2.f, }); + }); + }); + } + s::cl_float r1 = r.x(); + s::cl_float r2 = r.y(); + + assert(isFloatEqualTo(r1, 0.447214f)); + assert(isFloatEqualTo(r2, 0.894427f)); + } + + // fast_distance + { + s::cl_float r{ 0 }; + { + s::buffer BufR(&r, s::range<1>(1)); + s::queue myQueue; + myQueue.submit([&](s::handler &cgh) { + auto AccR = BufR.get_access(cgh); + cgh.single_task([=]() { + AccR[0] = s::fast_distance(s::cl_float2{ 1.f, 2.f, }, + s::cl_float2{ 3.f, 4.f, }); + }); + }); + } + assert(isFloatEqualTo(r, 2.82843f)); + } + + // fast_length + { + s::cl_float r{ 0 }; + { + s::buffer BufR(&r, s::range<1>(1)); + s::queue myQueue; + myQueue.submit([&](s::handler &cgh) { + auto AccR = BufR.get_access(cgh); + cgh.single_task([=]() { + AccR[0] = s::fast_length(s::cl_float2{ 1.f, 2.f, }); + }); + }); + } + assert(isFloatEqualTo(r, 2.23607f)); + } + + // fast_normalize + { + s::cl_float2 r{ 0 }; + { + s::buffer BufR(&r, s::range<1>(1)); + s::queue myQueue; + myQueue.submit([&](s::handler &cgh) { + auto AccR = BufR.get_access(cgh); + cgh.single_task([=]() { + AccR[0] = s::fast_normalize(s::cl_float2{ 1.f, 2.f, }); + }); + }); + } + s::cl_float r1 = r.x(); + s::cl_float r2 = r.y(); + + assert(isFloatEqualTo(r1, 0.447144)); + assert(isFloatEqualTo(r2, 0.894287)); + } + + return 0; +} diff --git a/SYCL/Basic/built-ins/vector_integer.cpp b/SYCL/Basic/built-ins/vector_integer.cpp new file mode 100644 index 0000000000..3ce8bf49d0 --- /dev/null +++ b/SYCL/Basic/built-ins/vector_integer.cpp @@ -0,0 +1,701 @@ +// RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple %s -o %t.out +// RUN: %HOST_RUN_PLACEHOLDER %t.out +// RUN: %CPU_RUN_PLACEHOLDER %t.out +// RUN: %GPU_RUN_PLACEHOLDER %t.out +// RUN: %ACC_RUN_PLACEHOLDER %t.out + +// TODO: ptxas fatal : Unresolved extern function 
'_Z17__spirv_ocl_s_maxDv2_iS_' +// XFAIL: cuda + +#include + +#include +#include + +namespace s = cl::sycl; + +int main() { + // max + { + s::cl_int2 r{ 0 }; + { + s::buffer BufR(&r, s::range<1>(1)); + s::queue myQueue; + myQueue.submit([&](s::handler &cgh) { + auto AccR = BufR.get_access(cgh); + cgh.single_task([=]() { + AccR[0] = s::max(s::cl_int2{ 5, 3 }, s::cl_int2{ 2, 7 }); + }); + }); + } + s::cl_int r1 = r.x(); + s::cl_int r2 = r.y(); + assert(r1 == 5); + assert(r2 == 7); + } + + // max + { + s::cl_uint2 r{ 0 }; + { + s::buffer BufR(&r, s::range<1>(1)); + s::queue myQueue; + myQueue.submit([&](s::handler &cgh) { + auto AccR = BufR.get_access(cgh); + cgh.single_task([=]() { + AccR[0] = s::max(s::cl_uint2{ 5, 3 }, s::cl_uint2{ 2, 7 }); + }); + }); + } + s::cl_uint r1 = r.x(); + s::cl_uint r2 = r.y(); + assert(r1 == 5); + assert(r2 == 7); + } + + // max + { + s::cl_int2 r{ 0 }; + { + s::buffer BufR(&r, s::range<1>(1)); + s::queue myQueue; + myQueue.submit([&](s::handler &cgh) { + auto AccR = BufR.get_access(cgh); + cgh.single_task([=]() { + AccR[0] = s::max(s::cl_int2{ 5, 3 }, s::cl_int{ 2 }); + }); + }); + } + s::cl_int r1 = r.x(); + s::cl_int r2 = r.y(); + assert(r1 == 5); + assert(r2 == 3); + } + + // max (longlong2) + { + s::longlong2 r{ 0 }; + { + s::buffer BufR(&r, s::range<1>(1)); + s::queue myQueue; + myQueue.submit([&](s::handler &cgh) { + auto AccR = BufR.get_access(cgh); + cgh.single_task([=]() { + AccR[0] = s::max(s::longlong2{ 5, 3 }, s::longlong{ 2 }); + }); + }); + } + s::longlong r1 = r.x(); + s::longlong r2 = r.y(); + assert(r1 == 5); + assert(r2 == 3); + } + + // max + { + s::cl_uint2 r{ 0 }; + { + s::buffer BufR(&r, s::range<1>(1)); + s::queue myQueue; + myQueue.submit([&](s::handler &cgh) { + auto AccR = BufR.get_access(cgh); + cgh.single_task([=]() { + AccR[0] = s::max(s::cl_uint2{ 5, 3 }, s::cl_uint{ 2 }); + }); + }); + } + s::cl_uint r1 = r.x(); + s::cl_uint r2 = r.y(); + assert(r1 == 5); + assert(r2 == 3); + } + + // max (ulonglong2) + { 
+ s::ulonglong2 r{ 0 }; + { + s::buffer BufR(&r, s::range<1>(1)); + s::queue myQueue; + myQueue.submit([&](s::handler &cgh) { + auto AccR = BufR.get_access(cgh); + cgh.single_task([=]() { + AccR[0] = s::max(s::ulonglong2{ 5, 3 }, s::ulonglong{ 2 }); + }); + }); + } + s::ulonglong r1 = r.x(); + s::ulonglong r2 = r.y(); + assert(r1 == 5); + assert(r2 == 3); + } + + // min + { + s::cl_int2 r{ 0 }; + { + s::buffer BufR(&r, s::range<1>(1)); + s::queue myQueue; + myQueue.submit([&](s::handler &cgh) { + auto AccR = BufR.get_access(cgh); + cgh.single_task([=]() { + AccR[0] = s::min(s::cl_int2{ 5, 3 }, s::cl_int2{ 2, 7 }); + }); + }); + } + s::cl_int r1 = r.x(); + s::cl_int r2 = r.y(); + assert(r1 == 2); + assert(r2 == 3); + } + + // min + { + s::cl_uint2 r{ 0 }; + { + s::buffer BufR(&r, s::range<1>(1)); + s::queue myQueue; + myQueue.submit([&](s::handler &cgh) { + auto AccR = BufR.get_access(cgh); + cgh.single_task([=]() { + AccR[0] = s::min(s::cl_uint2{ 5, 3 }, s::cl_uint2{ 2, 7 }); + }); + }); + } + s::cl_uint r1 = r.x(); + s::cl_uint r2 = r.y(); + assert(r1 == 2); + assert(r2 == 3); + } + + // min + { + s::cl_int2 r{ 0 }; + { + s::buffer BufR(&r, s::range<1>(1)); + s::queue myQueue; + myQueue.submit([&](s::handler &cgh) { + auto AccR = BufR.get_access(cgh); + cgh.single_task([=]() { + AccR[0] = s::min(s::cl_int2{ 5, 3 }, s::cl_int{ 2 }); + }); + }); + } + s::cl_int r1 = r.x(); + s::cl_int r2 = r.y(); + assert(r1 == 2); + assert(r2 == 2); + } + + // min + { + s::cl_uint2 r{ 0 }; + { + s::buffer BufR(&r, s::range<1>(1)); + s::queue myQueue; + myQueue.submit([&](s::handler &cgh) { + auto AccR = BufR.get_access(cgh); + cgh.single_task([=]() { + AccR[0] = s::min(s::cl_uint2{ 5, 3 }, s::cl_uint{ 2 }); + }); + }); + } + s::cl_uint r1 = r.x(); + s::cl_uint r2 = r.y(); + assert(r1 == 2); + assert(r2 == 2); + } + + // abs + { + s::cl_uint2 r{ 0 }; + { + s::buffer BufR(&r, s::range<1>(1)); + s::queue myQueue; + myQueue.submit([&](s::handler &cgh) { + auto AccR = 
BufR.get_access(cgh); + cgh.single_task([=]() { + AccR[0] = s::abs(s::cl_int2{ -5, -2 }); + }); + }); + } + s::cl_uint r1 = r.x(); + s::cl_uint r2 = r.y(); + assert(r1 == 5); + assert(r2 == 2); + } + + // abs (longlong) + { + s::ulonglong2 r{ 0 }; + { + s::buffer BufR(&r, s::range<1>(1)); + s::queue myQueue; + myQueue.submit([&](s::handler &cgh) { + auto AccR = BufR.get_access(cgh); + cgh.single_task([=]() { + AccR[0] = s::abs(s::longlong2{ -5, -2 }); + }); + }); + } + s::ulonglong r1 = r.x(); + s::ulonglong r2 = r.y(); + assert(r1 == 5); + assert(r2 == 2); + } + + // abs_diff + { + s::cl_uint2 r{ 0 }; + { + s::buffer BufR(&r, s::range<1>(1)); + s::queue myQueue; + myQueue.submit([&](s::handler &cgh) { + auto AccR = BufR.get_access(cgh); + cgh.single_task([=]() { + AccR[0] = s::abs_diff(s::cl_int2{ -5, -2 }, s::cl_int2{ -1, -1 }); + }); + }); + } + s::cl_uint r1 = r.x(); + s::cl_uint r2 = r.y(); + assert(r1 == 4); + assert(r2 == 1); + } + + // add_sat + { + s::cl_int2 r{ 0 }; + { + s::buffer BufR(&r, s::range<1>(1)); + s::queue myQueue; + myQueue.submit([&](s::handler &cgh) { + auto AccR = BufR.get_access(cgh); + cgh.single_task([=]() { + AccR[0] = s::add_sat(s::cl_int2{ 0x7FFFFFFF, 0x7FFFFFFF }, + s::cl_int2{ 100, 90 }); + }); + }); + } + s::cl_int r1 = r.x(); + s::cl_int r2 = r.y(); + assert(r1 == 0x7FFFFFFF); + assert(r2 == 0x7FFFFFFF); + } + + // hadd + { + s::cl_int2 r{ 0 }; + { + s::buffer BufR(&r, s::range<1>(1)); + s::queue myQueue; + myQueue.submit([&](s::handler &cgh) { + auto AccR = BufR.get_access(cgh); + cgh.single_task([=]() { + AccR[0] = s::hadd(s::cl_int2{ 0x0000007F, 0x0000007F }, + s::cl_int2{ 0x00000020, 0x00000020 }); + }); + }); + } + s::cl_int r1 = r.x(); + s::cl_int r2 = r.y(); + assert(r1 == 0x0000004F); + assert(r2 == 0x0000004F); + } + + // rhadd + { + s::cl_int2 r{ 0 }; + { + s::buffer BufR(&r, s::range<1>(1)); + s::queue myQueue; + myQueue.submit([&](s::handler &cgh) { + auto AccR = BufR.get_access(cgh); + cgh.single_task([=]() { + 
AccR[0] = s::rhadd(s::cl_int2{ 0x0000007F, 0x0000007F }, + s::cl_int2{ 0x00000020, 0x00000020 }); + }); + }); + } + s::cl_int r1 = r.x(); + s::cl_int r2 = r.y(); + assert(r1 == 0x00000050); + assert(r2 == 0x00000050); + } + + // clamp - 1 + { + s::cl_int2 r{ 0 }; + { + s::buffer BufR(&r, s::range<1>(1)); + s::queue myQueue; + myQueue.submit([&](s::handler &cgh) { + auto AccR = BufR.get_access(cgh); + cgh.single_task([=]() { + AccR[0] = s::clamp(s::cl_int2{ 5, 5 }, s::cl_int2{ 10, 10 }, + s::cl_int2{ 30, 30 }); + }); + }); + } + s::cl_int r1 = r.x(); + s::cl_int r2 = r.y(); + assert(r1 == 10); + assert(r2 == 10); + } + + // clamp - 2 + { + s::cl_int2 r{ 0 }; + { + s::buffer BufR(&r, s::range<1>(1)); + s::queue myQueue; + myQueue.submit([&](s::handler &cgh) { + auto AccR = BufR.get_access(cgh); + cgh.single_task([=]() { + AccR[0] = + s::clamp(s::cl_int2{ 5, 5 }, s::cl_int{ 10 }, s::cl_int{ 30 }); + }); + }); + } + s::cl_int r1 = r.x(); + s::cl_int r2 = r.y(); + assert(r1 == 10); + assert(r2 == 10); + } + + // clz + { + s::cl_int2 r{ 0 }; + { + s::buffer BufR(&r, s::range<1>(1)); + s::queue myQueue; + myQueue.submit([&](s::handler &cgh) { + auto AccR = BufR.get_access(cgh); + cgh.single_task([=]() { + AccR[0] = s::clz(s::cl_int2{ 0x0FFFFFFF, 0x0FFFFFFF }); + }); + }); + } + s::cl_int r1 = r.x(); + s::cl_int r2 = r.y(); + assert(r1 == 4); + assert(r2 == 4); + } + + // ctz + { + s::cl_int2 r{ 0 }; + { + s::buffer BufR(&r, s::range<1>(1)); + s::queue myQueue; + myQueue.submit([&](s::handler &cgh) { + auto AccR = BufR.get_access(cgh); + cgh.single_task([=]() { + AccR[0] = s::intel::ctz(s::cl_int2{ 0x7FFFFFF0, 0x7FFFFFF0 }); + }); + }); + } + s::cl_int r1 = r.x(); + s::cl_int r2 = r.y(); + assert(r1 == 4); + assert(r2 == 4); + } + + // mad_hi + { + s::cl_int2 r{ 0 }; + { + s::buffer BufR(&r, s::range<1>(1)); + s::queue myQueue; + myQueue.submit([&](s::handler &cgh) { + auto AccR = BufR.get_access(cgh); + cgh.single_task([=]() { + AccR[0] = s::mad_hi(s::cl_int2{ 0x10000000, 
0x10000000 }, + s::cl_int2{ 0x00000100, 0x00000100 }, + s::cl_int2{ 1, 1 }); + }); + }); + } + s::cl_int r1 = r.x(); + s::cl_int r2 = r.y(); + assert(r1 == 0x11); + assert(r2 == 0x11); + } + + // mad_sat + { + s::cl_int2 r{ 0 }; + { + s::buffer BufR(&r, s::range<1>(1)); + s::queue myQueue; + myQueue.submit([&](s::handler &cgh) { + auto AccR = BufR.get_access(cgh); + cgh.single_task([=]() { + AccR[0] = s::mad_sat(s::cl_int2{ 0x10000000, 0x10000000 }, + s::cl_int2{ 0x00000100, 0x00000100 }, + s::cl_int2{ 1, 1 }); + }); + }); + } + s::cl_int r1 = r.x(); + s::cl_int r2 = r.y(); + assert(r1 == 0x7FFFFFFF); + assert(r2 == 0x7FFFFFFF); + } + + // mul_hi + { + s::cl_int2 r{ 0 }; + { + s::buffer BufR(&r, s::range<1>(1)); + s::queue myQueue; + myQueue.submit([&](s::handler &cgh) { + auto AccR = BufR.get_access(cgh); + cgh.single_task([=]() { + AccR[0] = s::mul_hi(s::cl_int2{ 0x10000000, 0x10000000 }, + s::cl_int2{ 0x00000100, 0x00000100 }); + }); + }); + } + s::cl_int r1 = r.x(); + s::cl_int r2 = r.y(); + assert(r1 == 0x10); + assert(r2 == 0x10); + } + + // rotate + { + s::cl_int2 r{ 0 }; + { + s::buffer BufR(&r, s::range<1>(1)); + s::queue myQueue; + myQueue.submit([&](s::handler &cgh) { + auto AccR = BufR.get_access(cgh); + cgh.single_task([=]() { + AccR[0] = s::rotate(s::cl_int2{ 0x11100000, 0x11100000 }, + s::cl_int2{ 12, 12 }); + }); + }); + } + s::cl_int r1 = r.x(); + s::cl_int r2 = r.y(); + assert(r1 == 0x00000111); + assert(r2 == 0x00000111); + } + + // sub_sat + { + auto TestSubSat = [](s::cl_int2 x, s::cl_int2 y) { + s::cl_int2 r{ 0 }; + { + s::buffer BufR(&r, s::range<1>(1)); + s::queue myQueue; + myQueue.submit([&](s::handler &cgh) { + auto AccR = BufR.get_access(cgh); + cgh.single_task([=]() { + AccR[0] = s::sub_sat(x, y); + }); + }); + } + return r; + }; + s::cl_int2 r1 = TestSubSat(s::cl_int2{ 10, 10 }, + s::cl_int2{ 0x80000000, 0x80000000 }); + s::cl_int r1x = r1.x(); + s::cl_int r1y = r1.y(); + assert(r1x == 0x7FFFFFFF); + assert(r1y == 0x7FFFFFFF); + 
s::cl_int2 r2 = TestSubSat(s::cl_int2{ 0x7FFFFFFF, 0x80000000 }, + s::cl_int2{ 0xFFFFFFFF, 0x00000001 }); + s::cl_int r2x = r2.x(); + s::cl_int r2y = r2.y(); + assert(r2x == 0x7FFFFFFF); + assert(r2y == 0x80000000); + s::cl_int2 r3 = TestSubSat(s::cl_int2{ 10499, 30678 }, + s::cl_int2{ 30678, 10499 }); + s::cl_int r3x = r3.x(); + s::cl_int r3y = r3.y(); + assert(r3x == -20179); + assert(r3y == 20179); + } + + // upsample - 1 + { + s::cl_ushort2 r{ 0 }; + { + s::buffer BufR(&r, s::range<1>(1)); + s::queue myQueue; + myQueue.submit([&](s::handler &cgh) { + auto AccR = BufR.get_access(cgh); + cgh.single_task([=]() { + AccR[0] = s::upsample(s::cl_uchar2{ 0x10, 0x10 }, + s::cl_uchar2{ 0x10, 0x10 }); + }); + }); + } + s::cl_ushort r1 = r.x(); + s::cl_ushort r2 = r.y(); + assert(r1 == 0x1010); + assert(r2 == 0x1010); + } + + // upsample - 2 + { + s::cl_short2 r{ 0 }; + { + s::buffer BufR(&r, s::range<1>(1)); + s::queue myQueue; + myQueue.submit([&](s::handler &cgh) { + auto AccR = BufR.get_access(cgh); + cgh.single_task([=]() { + AccR[0] = s::upsample(s::cl_char2{ 0x10, 0x10 }, + s::cl_uchar2{ 0x10, 0x10 }); + }); + }); + } + s::cl_short r1 = r.x(); + s::cl_short r2 = r.y(); + assert(r1 == 0x1010); + assert(r2 == 0x1010); + } + + // upsample - 3 + { + s::cl_uint2 r{ 0 }; + { + s::buffer BufR(&r, s::range<1>(1)); + s::queue myQueue; + myQueue.submit([&](s::handler &cgh) { + auto AccR = BufR.get_access(cgh); + cgh.single_task([=]() { + AccR[0] = s::upsample(s::cl_ushort2{ 0x0010, 0x0010 }, + s::cl_ushort2{ 0x0010, 0x0010 }); + }); + }); + } + s::cl_uint r1 = r.x(); + s::cl_uint r2 = r.y(); + assert(r1 == 0x00100010); + assert(r2 == 0x00100010); + } + + // upsample - 4 + { + s::cl_int2 r{ 0 }; + { + s::buffer BufR(&r, s::range<1>(1)); + s::queue myQueue; + myQueue.submit([&](s::handler &cgh) { + auto AccR = BufR.get_access(cgh); + cgh.single_task([=]() { + AccR[0] = s::upsample(s::cl_short2{ 0x0010, 0x0010 }, + s::cl_ushort2{ 0x0010, 0x0010 }); + }); + }); + } + s::cl_int r1 
= r.x(); + s::cl_int r2 = r.y(); + assert(r1 == 0x00100010); + assert(r2 == 0x00100010); + } + + // upsample - 5 + { + s::cl_ulong2 r{ 0 }; + { + s::buffer BufR(&r, s::range<1>(1)); + s::queue myQueue; + myQueue.submit([&](s::handler &cgh) { + auto AccR = BufR.get_access(cgh); + cgh.single_task([=]() { + AccR[0] = s::upsample(s::cl_uint2{ 0x00000010, 0x00000010 }, + s::cl_uint2{ 0x00000010, 0x00000010 }); + }); + }); + } + s::cl_ulong r1 = r.x(); + s::cl_ulong r2 = r.y(); + assert(r1 == 0x0000001000000010); + assert(r2 == 0x0000001000000010); + } + + // upsample - 6 + { + s::cl_long2 r{ 0 }; + { + s::buffer BufR(&r, s::range<1>(1)); + s::queue myQueue; + myQueue.submit([&](s::handler &cgh) { + auto AccR = BufR.get_access(cgh); + cgh.single_task([=]() { + AccR[0] = s::upsample(s::cl_int2{ 0x00000010, 0x00000010 }, + s::cl_uint2{ 0x00000010, 0x00000010 }); + }); + }); + } + s::cl_long r1 = r.x(); + s::cl_long r2 = r.y(); + assert(r1 == 0x0000001000000010); + assert(r2 == 0x0000001000000010); + } + + // popcount + { + s::cl_int2 r{ 0 }; + { + s::buffer BufR(&r, s::range<1>(1)); + s::queue myQueue; + myQueue.submit([&](s::handler &cgh) { + auto AccR = BufR.get_access(cgh); + cgh.single_task([=]() { + AccR[0] = s::popcount(s::cl_int2{ 0x000000FF, 0x000000FF }); + }); + }); + } + s::cl_int r1 = r.x(); + s::cl_int r2 = r.y(); + assert(r1 == 8); + assert(r2 == 8); + } + + // mad24 + { + s::cl_int2 r{ 0 }; + { + s::buffer BufR(&r, s::range<1>(1)); + s::queue myQueue; + myQueue.submit([&](s::handler &cgh) { + auto AccR = BufR.get_access(cgh); + cgh.single_task([=]() { + AccR[0] = s::mad24(s::cl_int2{ 0xFFFFFFFF, 0xFFFFFFFF }, + s::cl_int2{ 20, 20 }, s::cl_int2{ 20, 20 }); + }); + }); + } + s::cl_int r1 = r.x(); + s::cl_int r2 = r.y(); + assert(r1 == 0); + assert(r2 == 0); + } + + // mul24 + { + s::cl_int2 r{ 0 }; + { + s::buffer BufR(&r, s::range<1>(1)); + s::queue myQueue; + myQueue.submit([&](s::handler &cgh) { + auto AccR = BufR.get_access(cgh); + cgh.single_task([=]() { 
+ AccR[0] = s::mul24(s::cl_int2{ 0xFFFFFFFF, 0xFFFFFFFF }, + s::cl_int2{ 20, 20 }); + }); + }); + } + s::cl_int r1 = r.x(); + s::cl_int r2 = r.y(); + assert(r1 == -20); + assert(r2 == -20); + } + + return 0; +} diff --git a/SYCL/Basic/built-ins/vector_math.cpp b/SYCL/Basic/built-ins/vector_math.cpp new file mode 100644 index 0000000000..951f2c9070 --- /dev/null +++ b/SYCL/Basic/built-ins/vector_math.cpp @@ -0,0 +1,210 @@ +// RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple %s -o %t.out +// RUN: %HOST_RUN_PLACEHOLDER %t.out +// RUN: %CPU_RUN_PLACEHOLDER %t.out +// RUN: %GPU_RUN_PLACEHOLDER %t.out +// RUN: %ACC_RUN_PLACEHOLDER %t.out + +// TODO: ptxas fatal : Unresolved extern function '_Z17__spirv_ocl_fractDv2_fPU3AS0S_' +// XFAIL: cuda + +#include + +#include +#include +#include + +namespace s = cl::sycl; + +int main() { + // fmin + { + s::cl_float2 r{ 0 }; + { + s::buffer BufR(&r, s::range<1>(1)); + s::queue myQueue; + myQueue.submit([&](s::handler &cgh) { + auto AccR = BufR.get_access(cgh); + cgh.single_task([=]() { + AccR[0] = + s::fmin(s::cl_float2{ 0.5f, 3.4f }, s::cl_float2{ 2.3f, 0.4f }); + }); + }); + } + s::cl_float r1 = r.x(); + s::cl_float r2 = r.y(); + assert(r1 == 0.5f); + assert(r2 == 0.4f); + } + + // fabs + { + s::cl_float2 r{ 0 }; + { + s::buffer BufR(&r, s::range<1>(1)); + s::queue myQueue; + myQueue.submit([&](s::handler &cgh) { + auto AccR = BufR.get_access(cgh); + cgh.single_task([=]() { + AccR[0] = s::fabs(s::cl_float2{ -1.0f, 2.0f }); + }); + }); + } + s::cl_float r1 = r.x(); + s::cl_float r2 = r.y(); + assert(r1 == 1.0f); + assert(r2 == 2.0f); + } + + // floor + { + s::cl_float2 r{ 0 }; + { + s::buffer BufR(&r, s::range<1>(1)); + s::queue myQueue; + myQueue.submit([&](s::handler &cgh) { + auto AccR = BufR.get_access(cgh); + cgh.single_task([=]() { + AccR[0] = s::floor(s::cl_float2{ 1.4f, 2.8f }); + }); + }); + } + s::cl_float r1 = r.x(); + s::cl_float r2 = r.y(); + assert(r1 == 1.0f); + assert(r2 == 2.0f); + } + + // ceil + { + s::cl_float2 
r{ 0 }; + { + s::buffer BufR(&r, s::range<1>(1)); + s::queue myQueue; + myQueue.submit([&](s::handler &cgh) { + auto AccR = BufR.get_access(cgh); + cgh.single_task([=]() { + AccR[0] = s::ceil(s::cl_float2{ 1.4f, 2.8f }); + }); + }); + } + s::cl_float r1 = r.x(); + s::cl_float r2 = r.y(); + assert(r1 == 2); + assert(r2 == 3); + } + + // fract with global memory + { + s::cl_float2 r{ 0, 0 }; + s::cl_float2 i{ 0, 0 }; + { + s::buffer BufR(&r, s::range<1>(1)); + s::buffer BufI(&i, s::range<1>(1)); + + s::queue myQueue; + myQueue.submit([&](s::handler &cgh) { + auto AccR = BufR.get_access(cgh); + auto AccI = BufI.get_access(cgh); + cgh.single_task([=]() { + s::global_ptr Iptr(AccI); + AccR[0] = s::fract(s::cl_float2{ 1.5f, 2.5f }, Iptr); + }); + }); + } + + s::cl_float r1 = r.x(); + s::cl_float r2 = r.y(); + s::cl_float i1 = i.x(); + s::cl_float i2 = i.y(); + + assert(r1 == 0.5f); + assert(r2 == 0.5f); + assert(i1 == 1.0f); + assert(i2 == 2.0f); + } + + // fract with private memory + { + s::cl_float2 r{ 0, 0 }; + s::cl_float2 i{ 0, 0 }; + { + s::buffer BufR(&r, s::range<1>(1)); + s::buffer BufI(&i, s::range<1>(1)); + s::queue myQueue; + myQueue.submit([&](s::handler &cgh) { + auto AccR = BufR.get_access(cgh); + auto AccI = BufI.get_access(cgh); + cgh.single_task([=]() { + s::cl_float2 temp(0.0); + s::private_ptr Iptr(&temp); + AccR[0] = s::fract(s::cl_float2{ 1.5f, 2.5f }, Iptr); + AccI[0] = *Iptr; + }); + }); + } + + s::cl_float r1 = r.x(); + s::cl_float r2 = r.y(); + s::cl_float i1 = i.x(); + s::cl_float i2 = i.y(); + + assert(r1 == 0.5f); + assert(r2 == 0.5f); + assert(i1 == 1.0f); + assert(i2 == 2.0f); + } + + // lgamma with private memory + { + s::cl_float2 r{ 0, 0 }; + { + s::buffer BufR(&r, s::range<1>(1)); + s::queue myQueue; + myQueue.submit([&](s::handler &cgh) { + auto AccR = BufR.get_access(cgh); + cgh.single_task([=]() { + AccR[0] = s::lgamma(s::cl_float2{ 10.f, -2.4f }); + }); + }); + } + + s::cl_float r1 = r.x(); + s::cl_float r2 = r.y(); + + assert(r1 > 
12.8017f && r1 < 12.8019f); // ~12.8018 + assert(r2 > 0.1024f && r2 < 0.1026f); // ~0.102583 + } + + // lgamma_r with private memory + { + s::cl_float2 r{ 0, 0 }; + s::cl_int2 i{ 0, 0 }; + { + s::buffer BufR(&r, s::range<1>(1)); + s::buffer BufI(&i, s::range<1>(1)); + s::queue myQueue; + myQueue.submit([&](s::handler &cgh) { + auto AccR = BufR.get_access(cgh); + auto AccI = BufI.get_access(cgh); + cgh.single_task([=]() { + s::cl_int2 temp(0.0); + s::private_ptr Iptr(&temp); + AccR[0] = s::lgamma_r(s::cl_float2{ 10.f, -2.4f }, Iptr); + AccI[0] = *Iptr; + }); + }); + } + + s::cl_float r1 = r.x(); + s::cl_float r2 = r.y(); + s::cl_int i1 = i.x(); + s::cl_int i2 = i.y(); + + assert(r1 > 12.8017f && r1 < 12.8019f); // ~12.8018 + assert(r2 > 0.1024f && r2 < 0.1026f); // ~0.102583 + assert(i1 == 1); // tgamma of 10 is ~362880.0 + assert(i2 == -1); // tgamma of -2.4 is ~-1.1080299470333461 + } + + return 0; +} diff --git a/SYCL/Basic/built-ins/vector_relational.cpp b/SYCL/Basic/built-ins/vector_relational.cpp new file mode 100644 index 0000000000..de87a0e67a --- /dev/null +++ b/SYCL/Basic/built-ins/vector_relational.cpp @@ -0,0 +1,608 @@ +// RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple %s -o %t.out +// RUN: %HOST_RUN_PLACEHOLDER %t.out +// RUN: %CPU_RUN_PLACEHOLDER %t.out +// RUN: %GPU_RUN_PLACEHOLDER %t.out +// RUN: %ACC_RUN_PLACEHOLDER %t.out + +// TODO: ptxas fatal : Ptx assembly aborted due to errors +// XFAIL: cuda + +#include + +#include +#include +#include + +namespace s = cl::sycl; + +int main() { + // isequal + { + s::cl_int4 r{ 0 }; + { + s::buffer BufR(&r, s::range<1>(1)); + s::queue myQueue; + myQueue.submit([&](s::handler &cgh) { + auto AccR = BufR.get_access(cgh); + cgh.single_task([=]() { + AccR[0] = s::isequal(s::cl_float4{ 0.5f, 0.6f, NAN, INFINITY }, + s::cl_float4{ 0.5f, 0.5f, 0.5f, 0.5f }); + }); + }); + } + s::cl_int r1 = r.x(); + s::cl_int r2 = r.y(); + s::cl_int r3 = r.z(); + s::cl_int r4 = r.w(); + + assert(r1 == -1); + assert(r2 == 0); + 
assert(r3 == 0); + assert(r4 == 0); + } + + // isnotequal + { + s::cl_int4 r{ 0 }; + { + s::buffer BufR(&r, s::range<1>(1)); + s::queue myQueue; + myQueue.submit([&](s::handler &cgh) { + auto AccR = BufR.get_access(cgh); + cgh.single_task([=]() { + AccR[0] = s::isnotequal(s::cl_float4{ 0.5f, 0.6f, NAN, INFINITY }, + s::cl_float4{ 0.5f, 0.5f, 0.5f, 0.5f }); + }); + }); + } + s::cl_int r1 = r.x(); + s::cl_int r2 = r.y(); + s::cl_int r3 = r.z(); + s::cl_int r4 = r.w(); + + assert(r1 == 0); + assert(r2 == -1); + assert(r3 == -1); + assert(r4 == -1); + } + + // isgreater + { + s::cl_int4 r{ 0 }; + { + s::buffer BufR(&r, s::range<1>(1)); + s::queue myQueue; + myQueue.submit([&](s::handler &cgh) { + auto AccR = BufR.get_access(cgh); + cgh.single_task([=]() { + AccR[0] = s::isgreater(s::cl_float4{ 0.5f, 0.6f, NAN, INFINITY }, + s::cl_float4{ 0.5f, 0.5f, 0.5f, 0.5f }); + }); + }); + } + s::cl_int r1 = r.x(); + s::cl_int r2 = r.y(); + s::cl_int r3 = r.z(); + s::cl_int r4 = r.w(); + + assert(r1 == 0); + assert(r2 == -1); + assert(r3 == 0); + assert(r4 == -1); + } + + // isgreaterequal + { + s::cl_int4 r{ 0 }; + { + s::buffer BufR(&r, s::range<1>(1)); + s::queue myQueue; + myQueue.submit([&](s::handler &cgh) { + auto AccR = BufR.get_access(cgh); + cgh.single_task([=]() { + AccR[0] = s::isgreaterequal(s::cl_float4{ 0.5f, 0.6f, NAN, INFINITY }, + s::cl_float4{ 0.5f, 0.5f, 0.5f, 0.5f }); + }); + }); + } + s::cl_int r1 = r.x(); + s::cl_int r2 = r.y(); + s::cl_int r3 = r.z(); + s::cl_int r4 = r.w(); + + assert(r1 == -1); + assert(r2 == -1); + assert(r3 == 0); + assert(r4 == -1); + } + + // isless + { + s::cl_int4 r{ 0 }; + { + s::buffer BufR(&r, s::range<1>(1)); + s::queue myQueue; + myQueue.submit([&](s::handler &cgh) { + auto AccR = BufR.get_access(cgh); + cgh.single_task([=]() { + AccR[0] = s::isless(s::cl_float4{ 0.5f, 0.4f, NAN, INFINITY }, + s::cl_float4{ 0.5f, 0.5f, 0.5f, 0.5f }); + }); + }); + } + s::cl_int r1 = r.x(); + s::cl_int r2 = r.y(); + s::cl_int r3 = r.z(); + 
s::cl_int r4 = r.w(); + + assert(r1 == 0); + assert(r2 == -1); + assert(r3 == 0); + assert(r4 == 0); + } + + // islessequal + { + s::cl_int4 r{ 0 }; + { + s::buffer BufR(&r, s::range<1>(1)); + s::queue myQueue; + myQueue.submit([&](s::handler &cgh) { + auto AccR = BufR.get_access(cgh); + cgh.single_task([=]() { + AccR[0] = s::islessequal(s::cl_float4{ 0.5f, 0.4f, NAN, INFINITY }, + s::cl_float4{ 0.5f, 0.5f, 0.5f, 0.5f }); + }); + }); + } + s::cl_int r1 = r.x(); + s::cl_int r2 = r.y(); + s::cl_int r3 = r.z(); + s::cl_int r4 = r.w(); + + assert(r1 == -1); + assert(r2 == -1); + assert(r3 == 0); + assert(r4 == 0); + } + + // islessgreater + { + s::cl_int4 r{ 0 }; + { + s::buffer BufR(&r, s::range<1>(1)); + s::queue myQueue; + myQueue.submit([&](s::handler &cgh) { + auto AccR = BufR.get_access(cgh); + cgh.single_task([=]() { + AccR[0] = + s::islessgreater(s::cl_float4{ 0.5f, 0.4f, NAN, INFINITY }, + s::cl_float4{ 0.5f, 0.5f, 0.5f, INFINITY }); + }); + }); + } + s::cl_int r1 = r.x(); + s::cl_int r2 = r.y(); + s::cl_int r3 = r.z(); + s::cl_int r4 = r.w(); + + assert(r1 == 0); + assert(r2 == -1); + assert(r3 == 0); + assert(r4 == 0); // Infinity is considered as greater than any + // other value except Infinity. 
+ } + + // isfinite + { + s::cl_int4 r{ 0 }; + { + s::buffer BufR(&r, s::range<1>(1)); + s::queue myQueue; + myQueue.submit([&](s::handler &cgh) { + auto AccR = BufR.get_access(cgh); + cgh.single_task([=]() { + AccR[0] = s::isfinite(s::cl_float4{ 0.5f, 0.4f, NAN, INFINITY }); + }); + }); + } + s::cl_int r1 = r.x(); + s::cl_int r2 = r.y(); + s::cl_int r3 = r.z(); + s::cl_int r4 = r.w(); + + assert(r1 == -1); + assert(r2 == -1); + assert(r3 == 0); + assert(r4 == 0); + } + + // isinf + { + s::cl_int4 r{ 0 }; + { + s::buffer BufR(&r, s::range<1>(1)); + s::queue myQueue; + myQueue.submit([&](s::handler &cgh) { + auto AccR = BufR.get_access(cgh); + cgh.single_task([=]() { + AccR[0] = s::isinf(s::cl_float4{ 0.5f, 0.4f, NAN, INFINITY }); + }); + }); + } + s::cl_int r1 = r.x(); + s::cl_int r2 = r.y(); + s::cl_int r3 = r.z(); + s::cl_int r4 = r.w(); + + assert(r1 == 0); + assert(r2 == 0); + assert(r3 == 0); + assert(r4 == -1); + } + + // isnan + { + s::cl_int4 r{ 0 }; + { + s::buffer BufR(&r, s::range<1>(1)); + s::queue myQueue; + myQueue.submit([&](s::handler &cgh) { + auto AccR = BufR.get_access(cgh); + cgh.single_task([=]() { + AccR[0] = s::isnan(s::cl_float4{ 0.5f, 0.4f, NAN, INFINITY }); + }); + }); + } + s::cl_int r1 = r.x(); + s::cl_int r2 = r.y(); + s::cl_int r3 = r.z(); + s::cl_int r4 = r.w(); + + assert(r1 == 0); + assert(r2 == 0); + assert(r3 == -1); + assert(r4 == 0); + } + + // isnormal + { + s::cl_int4 r{ 0 }; + { + s::buffer BufR(&r, s::range<1>(1)); + s::queue myQueue; + myQueue.submit([&](s::handler &cgh) { + auto AccR = BufR.get_access(cgh); + cgh.single_task([=]() { + AccR[0] = s::isnormal(s::cl_float4{ 0.5f, 0.4f, NAN, INFINITY }); + }); + }); + } + s::cl_int r1 = r.x(); + s::cl_int r2 = r.y(); + s::cl_int r3 = r.z(); + s::cl_int r4 = r.w(); + + assert(r1 == -1); + assert(r2 == -1); + assert(r3 == 0); + assert(r4 == 0); + } + + // isordered + { + s::cl_int4 r{ 0 }; + { + s::buffer BufR(&r, s::range<1>(1)); + s::queue myQueue; + 
myQueue.submit([&](s::handler &cgh) { + auto AccR = BufR.get_access(cgh); + cgh.single_task([=]() { + AccR[0] = s::isordered(s::cl_float4{ 0.5f, 0.6f, NAN, INFINITY }, + s::cl_float4{ 0.5f, 0.5f, 0.5f, 0.5f }); + }); + }); + } + s::cl_int r1 = r.x(); + s::cl_int r2 = r.y(); + s::cl_int r3 = r.z(); + s::cl_int r4 = r.w(); + + assert(r1 == -1); + assert(r2 == -1); + assert(r3 == 0); + assert(r4 == -1); // infinity is ordered. + } + + // isunordered + { + s::cl_int4 r{ 0 }; + { + s::buffer BufR(&r, s::range<1>(1)); + s::queue myQueue; + myQueue.submit([&](s::handler &cgh) { + auto AccR = BufR.get_access(cgh); + cgh.single_task([=]() { + AccR[0] = s::isunordered(s::cl_float4{ 0.5f, 0.6f, NAN, INFINITY }, + s::cl_float4{ 0.5f, 0.5f, 0.5f, 0.5f }); + }); + }); + } + s::cl_int r1 = r.x(); + s::cl_int r2 = r.y(); + s::cl_int r3 = r.z(); + s::cl_int r4 = r.w(); + + assert(r1 == 0); + assert(r2 == 0); + assert(r3 == -1); + assert(r4 == 0); + } + + // signbit + { + s::cl_int4 r{ 0 }; + { + s::buffer BufR(&r, s::range<1>(1)); + s::queue myQueue; + myQueue.submit([&](s::handler &cgh) { + auto AccR = BufR.get_access(cgh); + cgh.single_task([=]() { + AccR[0] = s::signbit(s::cl_float4{ 0.5f, -12.0f, NAN, INFINITY }); + }); + }); + } + s::cl_int r1 = r.x(); + s::cl_int r2 = r.y(); + s::cl_int r3 = r.z(); + s::cl_int r4 = r.w(); + + assert(r1 == 0); + assert(r2 == -1); + assert(r3 == 0); + assert(r4 == 0); + } + + // any. + // Call to the device function with vector parameters work. Scalars do not. + { + s::cl_int r{ 0 }; + { + s::buffer BufR(&r, s::range<1>(1)); + s::queue myQueue; + myQueue.submit([&](s::handler &cgh) { + auto AccR = BufR.get_access(cgh); + cgh.single_task([=]() { + AccR[0] = s::any(s::cl_int4{ -12, -12, 0, 1 }); + }); + }); + } + s::cl_int r1 = r; + + assert(r1 == 1); + } + + // any. + // Call to the device function with vector parameters work. Scalars do not. 
+ { + s::cl_int r{ 0 }; + { + s::buffer BufR(&r, s::range<1>(1)); + s::queue myQueue; + myQueue.submit([&](s::handler &cgh) { + auto AccR = BufR.get_access(cgh); + cgh.single_task([=]() { + AccR[0] = s::any(s::cl_int4{ -12, -12, -12, -12 }); + }); + }); + } + s::cl_int r1 = r; + + assert(r1 == 1); + } + + // any. + // Call to the device function with vector parameters work. Scalars do not. + { + s::cl_int r{ 0 }; + { + s::buffer BufR(&r, s::range<1>(1)); + s::queue myQueue; + myQueue.submit([&](s::handler &cgh) { + auto AccR = BufR.get_access(cgh); + cgh.single_task([=]() { + AccR[0] = s::any(s::cl_int4{ 0, 0, 0, 0 }); + }); + }); + } + s::cl_int r1 = r; + + assert(r1 == 0); + } + + // any. + // Call to the device function with vector parameters work. Scalars do not. + { + s::cl_int r{ 0 }; + { + s::buffer BufR(&r, s::range<1>(1)); + s::queue myQueue; + myQueue.submit([&](s::handler &cgh) { + auto AccR = BufR.get_access(cgh); + cgh.single_task([=]() { + AccR[0] = s::any(s::cl_int4{ 12, 12, 12, 12 }); + }); + }); + } + s::cl_int r1 = r; + + assert(r1 == 0); + } + + // all. + // Call to the device function with vector parameters work. Scalars do not. + { + s::cl_int r{ 0 }; + { + s::buffer BufR(&r, s::range<1>(1)); + s::queue myQueue; + myQueue.submit([&](s::handler &cgh) { + auto AccR = BufR.get_access(cgh); + cgh.single_task([=]() { + AccR[0] = s::all(s::cl_int4{ -12, -12, -12, -12 }); + // Infinity (positive or negative) or Nan are not integers. + // Passing them creates inconsistent results between host and device + // execution. + }); + }); + } + s::cl_int r1 = r; + + assert(r1 == 1); + } + + // all. + // Call to the device function with vector parameters work. Scalars do not. 
+ { + s::cl_int r{ 0 }; + { + s::buffer BufR(&r, s::range<1>(1)); + s::queue myQueue; + myQueue.submit([&](s::handler &cgh) { + auto AccR = BufR.get_access(cgh); + cgh.single_task([=]() { + AccR[0] = s::all(s::cl_int4{ -12, -12, -12, -12 }); + }); + }); + } + s::cl_int r1 = r; + + assert(r1 == 1); + } + + // all. + // Call to the device function with vector parameters work. Scalars do not. + { + s::cl_int r{ 0 }; + { + s::buffer BufR(&r, s::range<1>(1)); + s::queue myQueue; + myQueue.submit([&](s::handler &cgh) { + auto AccR = BufR.get_access(cgh); + cgh.single_task([=]() { + AccR[0] = s::all(s::cl_int4{ 0, 0, 0, 0 }); + }); + }); + } + s::cl_int r1 = r; + + assert(r1 == 0); + } + + // all. + // Call to the device function with vector parameters work. Scalars do not. + { + s::cl_int r{ 0 }; + { + s::buffer BufR(&r, s::range<1>(1)); + s::queue myQueue; + myQueue.submit([&](s::handler &cgh) { + auto AccR = BufR.get_access(cgh); + cgh.single_task([=]() { + AccR[0] = s::all(s::cl_int4{ 12, 12, 12, 12 }); + }); + }); + } + s::cl_int r1 = r; + + assert(r1 == 0); + } + + // bitselect + { + s::cl_float4 r{ 0 }; + { + s::buffer BufR(&r, s::range<1>(1)); + s::queue myQueue; + myQueue.submit([&](s::handler &cgh) { + auto AccR = BufR.get_access(cgh); + cgh.single_task([=]() { + AccR[0] = s::bitselect(s::cl_float4{ 112.112, 12.12, 0, 0.0 }, + s::cl_float4{ 34.34, 23.23, 1, 0.0 }, + s::cl_float4{ 3.3, 6.6, 1, 0.0 }); + }); // Using NAN/INFINITY as any float produced consistent results + // between host and device. 
+ }); + } + s::cl_float r1 = r.x(); + s::cl_float r2 = r.y(); + s::cl_float r3 = r.z(); + s::cl_float r4 = r.w(); + + assert(abs(r1 - 80.5477f) < 0.0001); + assert(abs(r2 - 18.2322f) < 0.0001); + assert(abs(r3 - 1.0f) < 0.01); + assert(abs(r4 - 0.0f) < 0.01); + } + + // select + { + s::cl_float4 r{ 0 }; + { + s::buffer BufR(&r, s::range<1>(1)); + s::queue myQueue; + myQueue.submit([&](s::handler &cgh) { + auto AccR = BufR.get_access(cgh); + cgh.single_task([=]() { + AccR[0] = + s::select(s::cl_float4{ 112.112f, 34.34f, 112.112f, 34.34f }, + s::cl_float4{ 34.34f, 112.112f, 34.34f, 112.112f }, + s::cl_int4{ 0, -1, 0, 1 }); + // Using NAN/infinity as an input, which gets + // selected by -1, produces a NAN/infinity as expected. + }); + }); + } + s::cl_float r1 = r.x(); + s::cl_float r2 = r.y(); + s::cl_float r3 = r.z(); + s::cl_float r4 = r.w(); + + assert(r1 == 112.112f); + assert(r2 == 112.112f); + assert(r3 == 112.112f); + assert(r4 == 34.34f); + } + + { + s::vec r(0); + { + s::vec a(1, 2, 3, 4); + s::vec b(5, 6, 7, 8); + s::vec m(1u, 0x80000000u, 42u, 0x80001000u); + s::buffer> A(&a, s::range<1>(1)); + s::buffer> B(&b, s::range<1>(1)); + s::buffer> M(&m, s::range<1>(1)); + s::buffer> R(&r, s::range<1>(1)); + s::queue myQueue; + myQueue.submit([&](s::handler &cgh) { + auto AccA = A.get_access(cgh); + auto AccB = B.get_access(cgh); + auto AccM = M.get_access(cgh); + auto AccR = R.get_access(cgh); + cgh.single_task([=]() { + AccR[0] = s::select(AccA[0], AccB[0], AccM[0]); + }); + }); + } + if (r.x() != 1 || r.y() != 6 || r.z() != 3 || r.w() != 8) { + std::cerr << "selectI4I4U4 test case failed!\n"; + std::cerr << "Expected result: 1 6 3 8\n"; + std::cerr << "Got: " << r.x() << " " << r.y() << " " << r.z() << " " + << r.w() << "\n"; + return 1; + } + } + + return 0; +} diff --git a/SYCL/Basic/config/allowlist.cpp b/SYCL/Basic/config/allowlist.cpp new file mode 100644 index 0000000000..a56185c25a --- /dev/null +++ b/SYCL/Basic/config/allowlist.cpp @@ -0,0 +1,90 @@ +// 
REQUIRES: cpu +// RUN: %clangxx -fsycl %s -o %t.out +// +// RUN: env PRINT_DEVICE_INFO=1 %t.out > %t1.conf +// RUN: env TEST_DEVICE_AVAILABLE=1 env SYCL_CONFIG_FILE_NAME=%t1.conf %t.out +// +// RUN: env PRINT_PLATFORM_INFO=1 %t.out > %t2.conf +// RUN: env TEST_DEVICE_AVAILABLE=1 env SYCL_CONFIG_FILE_NAME=%t2.conf %t.out +// +// RUN: env TEST_DEVICE_IS_NOT_AVAILABLE=1 env SYCL_DEVICE_ALLOWLIST="PlatformName:{{SUCH NAME DOESN'T EXIST}}" %t.out + +#include +#include +#include +#include +#include + +using namespace cl; + +static void replaceSpecialCharacters(std::string &Str) { + // Replace common special symbols with '.' which matches to any character + std::replace_if(Str.begin(), Str.end(), + [](const char Sym) { return '(' == Sym || ')' == Sym; }, '.'); +} + +int main() { + + // Expected that the allowlist filter is not set + if (getenv("PRINT_PLATFORM_INFO")) { + for (const sycl::platform &Platform : sycl::platform::get_platforms()) + if (!Platform.is_host()) { + + std::string Name = Platform.get_info(); + std::string Ver = Platform.get_info(); + // As a string will be used as regexp pattern, we need to get rid of + // symbols that can be treated in a special way. + replaceSpecialCharacters(Name); + replaceSpecialCharacters(Ver); + + std::cout << "SYCL_DEVICE_ALLOWLIST=PlatformName:{{" << Name + << "}},PlatformVersion:{{" << Ver << "}}"; + + return 0; + } + throw std::runtime_error("Non host device is not found"); + } + + // Expected that the allowlist filter is not set + if (getenv("PRINT_DEVICE_INFO")) { + for (const sycl::platform &Platform : sycl::platform::get_platforms()) + if (!Platform.is_host()) { + const sycl::device Dev = Platform.get_devices().at(0); + std::string Name = Dev.get_info(); + std::string Ver = Dev.get_info(); + + // As a string will be used as regexp pattern, we need to get rid of + // symbols that can be treated in a special way. 
+ replaceSpecialCharacters(Name); + replaceSpecialCharacters(Ver); + + std::cout << "SYCL_DEVICE_ALLOWLIST=DeviceName:{{" << Name + << "}},DriverVersion:{{" << Ver << "}}"; + + return 0; + } + throw std::runtime_error("Non host device is not found"); + } + + // Expected the allowlist to be set with the "PRINT_DEVICE_INFO" run result + if (getenv("TEST_DEVICE_AVAILABLE")) { + for (const sycl::platform &Platform : sycl::platform::get_platforms()) + if (!Platform.is_host()) { + if (Platform.get_devices().size() != 1) + throw std::runtime_error("Expected only one non host device."); + + return 0; + } + throw std::runtime_error("Non host device is not found"); + } + + // Expected the allowlist to be set but empty + if (getenv("TEST_DEVICE_IS_NOT_AVAILABLE")) { + for (const sycl::platform &Platform : sycl::platform::get_platforms()) + if (!Platform.is_host()) + throw std::runtime_error("Expected no non host device is available"); + return 0; + } + + throw std::runtime_error("Unhandled situation"); +} diff --git a/SYCL/Basic/config/config.cpp b/SYCL/Basic/config/config.cpp new file mode 100644 index 0000000000..d66e0392c6 --- /dev/null +++ b/SYCL/Basic/config/config.cpp @@ -0,0 +1,26 @@ +//==---- config.cpp --------------------------------------------------------==// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// RUN: %clangxx -g -O0 -fsycl %s -o %t.out +// RUN: echo "SYCL_PRINT_EXECUTION_GRAPH=always" > %t.cfg +// RUN: env SYCL_CONFIG_FILE_NAME=%t.cfg %t.out +// RUN: ls | grep dot +// RUN: rm *.dot +// RUN: env SYCL_PRINT_EXECUTION_GRAPH=always %t.out +// RUN: ls | grep dot +// RUN: rm *.dot +// RUN: %t.out +// RUN: ls | not grep dot + +#include + +using namespace cl; + +int main() { + sycl::buffer Buf(sycl::range<1>{1}); + auto Acc = Buf.get_access(); +} diff --git a/SYCL/Basic/device-code-split/Inputs/split-per-source-second-file.cpp b/SYCL/Basic/device-code-split/Inputs/split-per-source-second-file.cpp new file mode 100644 index 0000000000..daa2258763 --- /dev/null +++ b/SYCL/Basic/device-code-split/Inputs/split-per-source-second-file.cpp @@ -0,0 +1,21 @@ +#include "split-per-source.h" + +void runKernelsFromFile2() { + cl::sycl::queue Q; + int Data = 0; + { + cl::sycl::program Prg(Q.get_context()); + cl::sycl::buffer Buf(&Data, cl::sycl::range<1>(1)); + Prg.build_with_kernel_type(); + cl::sycl::kernel Krn = Prg.get_kernel(); + + assert(!Prg.has_kernel()); + assert(!Prg.has_kernel()); + + Q.submit([&](cl::sycl::handler &Cgh) { + auto Acc = Buf.get_access(Cgh); + Cgh.single_task(Krn, [=]() { Acc[0] = 3; }); + }); + } + assert(Data == 3); +} diff --git a/SYCL/Basic/device-code-split/Inputs/split-per-source.h b/SYCL/Basic/device-code-split/Inputs/split-per-source.h new file mode 100644 index 0000000000..fdb2dd4045 --- /dev/null +++ b/SYCL/Basic/device-code-split/Inputs/split-per-source.h @@ -0,0 +1,7 @@ +#include + +class File1Kern1; +class File1Kern2; +class File2Kern1; + +void runKernelsFromFile2(); diff --git a/SYCL/Basic/device-code-split/aot-accelerator.cpp b/SYCL/Basic/device-code-split/aot-accelerator.cpp new file mode 100644 index 0000000000..823c647ad1 --- /dev/null +++ 
b/SYCL/Basic/device-code-split/aot-accelerator.cpp @@ -0,0 +1,5 @@ +// REQUIRES: aoc, accelerator + +// RUN: %clangxx -fsycl -fsycl-device-code-split=per_source -fsycl-targets=spir64_fpga-unknown-unknown-sycldevice -I %S/Inputs -o %t.out %S/split-per-source-main.cpp %S/Inputs/split-per-source-second-file.cpp +// RUN: %ACC_RUN_PLACEHOLDER %t.out +// XFAIL: * diff --git a/SYCL/Basic/device-code-split/aot-cpu.cpp b/SYCL/Basic/device-code-split/aot-cpu.cpp new file mode 100644 index 0000000000..78cd5df05d --- /dev/null +++ b/SYCL/Basic/device-code-split/aot-cpu.cpp @@ -0,0 +1,4 @@ +// REQUIRES: opencl-aot, cpu + +// RUN: %clangxx -fsycl -fsycl-device-code-split=per_source -fsycl-targets=spir64_x86_64-unknown-unknown-sycldevice -I %S/Inputs -o %t.out %S/split-per-source-main.cpp %S/Inputs/split-per-source-second-file.cpp +// RUN: %CPU_RUN_PLACEHOLDER %t.out diff --git a/SYCL/Basic/device-code-split/aot-gpu.cpp b/SYCL/Basic/device-code-split/aot-gpu.cpp new file mode 100644 index 0000000000..75c8aa15f6 --- /dev/null +++ b/SYCL/Basic/device-code-split/aot-gpu.cpp @@ -0,0 +1,11 @@ +// REQUIRES: ocloc, gpu +// UNSUPPORTED: cuda +// CUDA supports neither device code splitting nor SPIR. +// +// RUN: %clangxx -fsycl -fsycl-device-code-split=per_source \ +// RUN: -fsycl-targets=spir64_gen-unknown-unknown-sycldevice \ +// RUN: -Xsycl-target-backend=spir64_gen-unknown-unknown-sycldevice \ +// RUN: "-device skl" -I %S/Inputs -o %t.out \ +// RUN: %S/split-per-source-main.cpp \ +// RUN: %S/Inputs/split-per-source-second-file.cpp +// RUN: %GPU_RUN_PLACEHOLDER %t.out diff --git a/SYCL/Basic/device-code-split/split-per-kernel.cpp b/SYCL/Basic/device-code-split/split-per-kernel.cpp new file mode 100644 index 0000000000..f63e521b87 --- /dev/null +++ b/SYCL/Basic/device-code-split/split-per-kernel.cpp @@ -0,0 +1,68 @@ +// UNSUPPORTED: cuda +// CUDA does not support device code splitting.
+// +// RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple -fsycl-device-code-split=per_kernel -o %t.out %s +// RUN: %CPU_RUN_PLACEHOLDER %t.out +// RUN: %GPU_RUN_PLACEHOLDER %t.out +// RUN: %ACC_RUN_PLACEHOLDER %t.out +// REQUIRES: cpu, gpu, accelerator + +#include + +class Kern1; +class Kern2; +class Kern3; + +int main() { + cl::sycl::queue Q; + int Data = 0; + { + cl::sycl::buffer Buf(&Data, cl::sycl::range<1>(1)); + cl::sycl::program Prg(Q.get_context()); + Prg.build_with_kernel_type(); + cl::sycl::kernel Krn = Prg.get_kernel(); + + assert(!Prg.has_kernel()); + assert(!Prg.has_kernel()); + + Q.submit([&](cl::sycl::handler &Cgh) { + auto Acc = Buf.get_access(Cgh); + Cgh.single_task(Krn, [=]() { Acc[0] = 1; }); + }); + } + assert(Data == 1); + + { + cl::sycl::buffer Buf(&Data, cl::sycl::range<1>(1)); + cl::sycl::program Prg(Q.get_context()); + Prg.build_with_kernel_type(); + cl::sycl::kernel Krn = Prg.get_kernel(); + + assert(!Prg.has_kernel()); + assert(!Prg.has_kernel()); + + Q.submit([&](cl::sycl::handler &Cgh) { + auto Acc = Buf.get_access(Cgh); + Cgh.single_task(Krn, [=]() { Acc[0] = 2; }); + }); + } + assert(Data == 2); + + { + cl::sycl::buffer Buf(&Data, cl::sycl::range<1>(1)); + cl::sycl::program Prg(Q.get_context()); + Prg.build_with_kernel_type(); + cl::sycl::kernel Krn = Prg.get_kernel(); + + assert(!Prg.has_kernel()); + assert(!Prg.has_kernel()); + + Q.submit([&](cl::sycl::handler &Cgh) { + auto Acc = Buf.get_access(Cgh); + Cgh.single_task(Krn, [=]() { Acc[0] = 3; }); + }); + } + assert(Data == 3); + + return 0; +} diff --git a/SYCL/Basic/device-code-split/split-per-source-main.cpp b/SYCL/Basic/device-code-split/split-per-source-main.cpp new file mode 100644 index 0000000000..e418451550 --- /dev/null +++ b/SYCL/Basic/device-code-split/split-per-source-main.cpp @@ -0,0 +1,54 @@ +// UNSUPPORTED: cuda +// CUDA does not support device code splitting. 
+// +// RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple -fsycl-device-code-split=per_source -I %S/Inputs -o %t.out %s %S/Inputs/split-per-source-second-file.cpp +// RUN: %CPU_RUN_PLACEHOLDER %t.out +// RUN: %GPU_RUN_PLACEHOLDER %t.out +// RUN: %ACC_RUN_PLACEHOLDER %t.out +// REQUIRES: cpu, gpu, accelerator + +#include "Inputs/split-per-source.h" + +int main () { + cl::sycl::queue Q; + int Data = 0; + { + cl::sycl::buffer Buf(&Data, cl::sycl::range<1>(1)); + cl::sycl::program Prg(Q.get_context()); + Prg.build_with_kernel_type(); + cl::sycl::kernel Krn = Prg.get_kernel(); + + assert(Prg.has_kernel()); + // TODO uncomment once the KernelInfo in multiple translation units + // bug is fixed. + // assert(!Prg.has_kernel()); + + Q.submit([&](cl::sycl::handler &Cgh) { + auto Acc = Buf.get_access(Cgh); + Cgh.single_task(/*Krn,*/ [=]() { Acc[0] = 1; }); + }); + } + assert(Data == 1); + + { + cl::sycl::buffer Buf(&Data, cl::sycl::range<1>(1)); + cl::sycl::program Prg(Q.get_context()); + Prg.build_with_kernel_type(); + cl::sycl::kernel Krn = Prg.get_kernel(); + + assert(Prg.has_kernel()); + // TODO uncomment once the KernelInfo in multiple translation units + // bug is fixed. + // assert(!Prg.has_kernel()); + + Q.submit([&](cl::sycl::handler &Cgh) { + auto Acc = Buf.get_access(Cgh); + Cgh.single_task(/*Krn,*/ [=]() { Acc[0] = 2; }); + }); + } + assert(Data == 2); + + runKernelsFromFile2(); + + return 0; +} diff --git a/SYCL/Basic/devicelib/assert-windows.cpp b/SYCL/Basic/devicelib/assert-windows.cpp new file mode 100644 index 0000000000..67f8830523 --- /dev/null +++ b/SYCL/Basic/devicelib/assert-windows.cpp @@ -0,0 +1,75 @@ +// REQUIRES: cpu,windows +// +// RUN: %clangxx -fsycl -c %s -o %t.o +// RUN: %clangxx -fsycl %t.o %sycl_libs_dir/../bin/libsycl-msvc.o -o %t.out +// +// MSVC implementation of assert does not call an unreachable built-in, so the +// program doesn't terminate when fallback is used. 
+// +// FIXME: SPIR-V Unreachable should be called from the fallback +// explicitly. Since the test is going to crash, we'll have to follow a similar +// approach as on Linux - call the test in a subprocess. +// +// RUN: env SYCL_PI_TRACE=1 SYCL_DEVICELIB_INHIBIT_NATIVE=1 CL_CONFIG_USE_VECTORIZER=False SYCL_DEVICE_TYPE=CPU %t.out >%t.stdout.pi.fallback +// RUN: env SHOULD_CRASH=1 SYCL_DEVICELIB_INHIBIT_NATIVE=1 CL_CONFIG_USE_VECTORIZER=False SYCL_DEVICE_TYPE=CPU %t.out >%t.stdout.msg.fallback +// +// RUN: FileCheck %s --check-prefix=CHECK-MESSAGE --input-file %t.stdout.msg.fallback +// CHECK-MESSAGE: {{.*}}assert-windows.cpp:{{[0-9]+}}: (null): global id: [{{[0-3]}},0,0], local id: [{{[0-3]}},0,0] Assertion `accessorC[wiID] == 0 && "Invalid value"` failed. +// +// RUN: FileCheck %s --input-file %t.stdout.pi.fallback --check-prefix=CHECK-FALLBACK +// CHECK-FALLBACK: ---> piProgramLink + +#include +#include +#include + +using namespace cl::sycl; + +constexpr auto sycl_read = cl::sycl::access::mode::read; +constexpr auto sycl_write = cl::sycl::access::mode::write; + +template +void simple_vadd(const std::array &VA, const std::array &VB, + std::array &VC) { + queue deviceQueue([](cl::sycl::exception_list ExceptionList) { + for (cl::sycl::exception_ptr_class ExceptionPtr : ExceptionList) { + try { + std::rethrow_exception(ExceptionPtr); + } catch (cl::sycl::exception &E) { + std::cerr << E.what() << std::endl; + } catch (...) { + std::cerr << "Unknown async exception was caught." << std::endl; + } + } + }); + + int shouldCrash = getenv("SHOULD_CRASH") ? 
1 : 0; + + cl::sycl::range<1> numOfItems{N}; + cl::sycl::buffer bufferA(VA.data(), numOfItems); + cl::sycl::buffer bufferB(VB.data(), numOfItems); + cl::sycl::buffer bufferC(VC.data(), numOfItems); + + deviceQueue.submit([&](cl::sycl::handler &cgh) { + auto accessorA = bufferA.template get_access(cgh); + auto accessorB = bufferB.template get_access(cgh); + auto accessorC = bufferC.template get_access(cgh); + + cgh.parallel_for(numOfItems, [=](cl::sycl::id<1> wiID) { + accessorC[wiID] = accessorA[wiID] + accessorB[wiID]; + if (shouldCrash) { + assert(accessorC[wiID] == 0 && "Invalid value"); + } + }); + }); + deviceQueue.wait_and_throw(); +} + +int main() { + std::array A = {1, 2, 3}; + std::array B = {1, 2, 3}; + std::array C = {0, 0, 0}; + + simple_vadd(A, B, C); + return EXIT_SUCCESS; +} diff --git a/SYCL/Basic/devicelib/assert.cpp b/SYCL/Basic/devicelib/assert.cpp new file mode 100644 index 0000000000..343d949b74 --- /dev/null +++ b/SYCL/Basic/devicelib/assert.cpp @@ -0,0 +1,215 @@ +// REQUIRES: cpu,linux +// RUN: %clangxx -fsycl -c %s -o %t.o +// RUN: %clangxx -fsycl %t.o %sycl_libs_dir/libsycl-glibc.o -o %t.out +// (see the other RUN lines below; it is a bit complicated) +// +// assert() call in device code guarantees nothing: on some devices it behaves +// in the usual way and terminates the program. On other devices it can print an +// error message and *continue* execution. Less capable devices can even ignore +// an assert! +// +// This makes testing an assert() a bit difficult task, and we have to rely on +// the implementation details to make sure that both "native" and "fallback" +// implementations work as expected. +// +// This test works only on Intel OpenCL CPU implementation, which is known to +// behave as follows: +// +// Fallback mode (aka the best we can do by following the OpenCL spec): +// 1. Assertion condition is printed to *stdout* by the OpenCL printf(). +// 2. Process (both host and device) is terminated by a SIGSEGV. 
+// +// Native mode (same behavior as libc assert on CPU): +// 1. Assertion condition is printed to *stderr*. +// 2. Process (both host and device) is terminated by a SIGABRT. +// +// Other devices are "covered" by the assert-dummy.cpp test, which doesn't +// verify anything except a successful compilation for a device. +// +// FIXME: assert-dummy.cpp is not implemented yet, so other devices are not +// covered. +// +// How the test works: +// ------------------- +// +// 1. First we verify that a call sequence in SYCL Runtime is correct: +// +// - in the fallback mode we have to link an additional library that +// provides a generic implementation of assert(). +// +// - in the native mode we don't link anything, and call clBuildProgram for +// a user program alone. +// +// 2. Then we test that there is actually a difference between the two +// modes. Since the CPU device is the only device that supports this +// extension natively, we catch the difference between the fallback and the +// native modes: SIGSEGV should occur in the fallback mode, SIGABRT in the +// native mode. +// +// In order to check the signal we fork() and let the child die. Then we +// verify how it was terminated. EXPECTED_SIGNAL environment variable +// controls the expected result. +// +// 3. We also test that a message is printed to the corresponding fd: stdout +// for the fallback mode and stderr for the native mode. In the fallback +// mode the test process dies right after a call to the OpenCL printf(), so +// the message can still be buffered by stdio. We turn the bufferization +// off explicitly. +// +// 4. We want to check both compilation flow in (1) and the message in (3), +// but these messages can interleave and fail to match. To avoid this, +// first run with SYCL_PI_TRACE and collect a trace, and then with +// SHOULD_CRASH (without SYCL_PI_TRACE) to collect an error message. 
+// +// SYCL_DEVICELIB_INHIBIT_NATIVE=1 environment variable is used to force a mode +// in SYCL Runtime, so it doesn't look into a device extensions list and always +// link the fallback library. +// +// +// We also skip the native test entirely (see SKIP_IF_NO_EXT), since the assert +// extension is a new feature and may not be supported by the runtime used with +// SYCL. +// +// Overall this sounds stable enough. What could possibly go wrong? +// +// RUN: env SYCL_PI_TRACE=2 SHOULD_CRASH=1 SYCL_DEVICE_TYPE=CPU EXPECTED_SIGNAL=SIGABRT SKIP_IF_NO_EXT=1 %t.out 2>%t.stderr.native >%t.stdout.native +// RUN: FileCheck %s --input-file %t.stdout.native --check-prefixes=CHECK-NATIVE || FileCheck %s --input-file %t.stderr.native --check-prefix CHECK-NOTSUPPORTED +// RUN: FileCheck %s --input-file %t.stderr.native --check-prefixes=CHECK-MESSAGE || FileCheck %s --input-file %t.stderr.native --check-prefix CHECK-NOTSUPPORTED +// +// RUN: env SYCL_PI_TRACE=2 SYCL_DEVICELIB_INHIBIT_NATIVE=cl_intel_devicelib_assert SYCL_DEVICE_TYPE=CPU %t.out >%t.stdout.pi.fallback +// RUN: env SYCL_DEVICELIB_INHIBIT_NATIVE=cl_intel_devicelib_assert SYCL_DEVICE_TYPE=CPU %t.out >%t.stdout.msg.fallback +// RUN: FileCheck %s --input-file %t.stdout.pi.fallback --check-prefixes=CHECK-FALLBACK +// RUN: FileCheck %s --input-file %t.stdout.msg.fallback --check-prefixes=CHECK-MESSAGE +// +// CHECK-NATIVE: ---> piProgramBuild +// CHECK-FALLBACK: ---> piProgramLink +// +// Skip the test if the CPU RT doesn't support the extension yet: +// CHECK-NOTSUPPORTED: Device has no support for cl_intel_devicelib_assert +// +// Anyway, the same message has to be printed for both the fallback and the +// native modes (fallback prints to stdout, while native prints to stderr; we +// already handled this difference in the RUN lines): +// +// CHECK-MESSAGE: {{.*}}assert.cpp:{{[0-9]+}}: auto simple_vadd(const std::array &, const std::array &, std::array &)::(anonymous class)::operator()(cl::sycl::handler &)::(anonymous 
class)::operator()(cl::sycl::id<1>) const: global id: [{{[0-3]}},0,0], local id: [{{[0-3]}},0,0] Assertion `accessorC[wiID] == 0 && "Invalid value"` failed. +// +// Note that the work-item that hits the assert first may vary, since the order +// of execution is undefined. We catch only the first one (whatever id it is). + +#include +#include +#include +#include +#include +#include +#include + +using namespace cl::sycl; + +constexpr auto sycl_read = cl::sycl::access::mode::read; +constexpr auto sycl_write = cl::sycl::access::mode::write; + +const int EXIT_SKIP_TEST = 42; + +template +void simple_vadd(const std::array &VA, const std::array &VB, + std::array &VC) { + queue deviceQueue([](cl::sycl::exception_list ExceptionList) { + for (cl::sycl::exception_ptr_class ExceptionPtr : ExceptionList) { + try { + std::rethrow_exception(ExceptionPtr); + } catch (cl::sycl::exception &E) { + std::cerr << E.what() << std::endl; + } catch (...) { + std::cerr << "Unknown async exception was caught." << std::endl; + } + } + }); + device dev = deviceQueue.get_device(); + bool unsupported = true; + for (auto &ext : dev.get_info()) { + if (ext == "cl_intel_devicelib_assert") { + unsupported = false; + } + } + if (unsupported && getenv("SKIP_IF_NO_EXT")) { + fprintf(stderr, "Device has no support for cl_intel_devicelib_assert, " + "skipping the test\n"); + exit(EXIT_SKIP_TEST); + } + + + cl::sycl::range<1> numOfItems{N}; + cl::sycl::buffer bufferA(VA.data(), numOfItems); + cl::sycl::buffer bufferB(VB.data(), numOfItems); + cl::sycl::buffer bufferC(VC.data(), numOfItems); + + deviceQueue.submit([&](cl::sycl::handler &cgh) { + auto accessorA = bufferA.template get_access(cgh); + auto accessorB = bufferB.template get_access(cgh); + auto accessorC = bufferC.template get_access(cgh); + + cgh.parallel_for(numOfItems, [=](cl::sycl::id<1> wiID) { + accessorC[wiID] = accessorA[wiID] + accessorB[wiID]; + assert(accessorC[wiID] == 0 && "Invalid value"); + }); + }); + deviceQueue.wait_and_throw(); 
+} + +int main() { + int child = fork(); + if (child) { + int status = 0; + waitpid(child, &status, 0); + if (WIFEXITED(status) && WEXITSTATUS(status) == EXIT_SKIP_TEST) { + return 0; + } + if (getenv("SHOULD_CRASH")) { + if (!WIFSIGNALED(status)) { + fprintf(stderr, "error: process did not terminate by a signal\n"); + return 1; + } + } else { + if (WIFSIGNALED(status)) { + fprintf(stderr, "error: process should not terminate\n"); + return 1; + } + // We should not check anything if the child finished successfully and + // this was expected. + return 0; + } + int sig = WTERMSIG(status); + int expected = 0; + if (const char *env = getenv("EXPECTED_SIGNAL")) { + if (0 == strcmp(env, "SIGABRT")) { + expected = SIGABRT; + } else if (0 == strcmp(env, "SIGSEGV")) { + expected = SIGSEGV; + } + if (!expected) { + fprintf(stderr, "EXPECTED_SIGNAL should be set to either \"SIGABRT\", " + "or \"SIGSEGV\"!\n"); + return 1; + } + } + if (sig != expected) { + fprintf(stderr, "error: expected signal %d, got %d\n", expected, sig); + return 1; + } + return 0; + } + + // Turn the bufferization off to not lose the assert message if it is written + // to stdout. 
+ if (setvbuf(stdout, NULL, _IONBF, 0)) { + perror("failed to turn off bufferization on stdout"); + return 1; + } + + std::array A = {1, 2, 3}; + std::array B = {1, 2, 3}; + std::array C = {0, 0, 0}; + + simple_vadd(A, B, C); +} diff --git a/SYCL/Basic/devicelib/c99_complex_math_fp64_test.cpp b/SYCL/Basic/devicelib/c99_complex_math_fp64_test.cpp new file mode 100644 index 0000000000..c039025b11 --- /dev/null +++ b/SYCL/Basic/devicelib/c99_complex_math_fp64_test.cpp @@ -0,0 +1,256 @@ +// UNSUPPORTED: windows +// RUN: %clangxx -fsycl -c %s -o %t.o +// RUN: %clangxx -fsycl %t.o %sycl_libs_dir/libsycl-complex-fp64.o -o %t.out +#include +#include +#include +#include "math_utils.hpp" +#ifndef CMPLX +#define CMPLX(r, i) ((double __complex__){ (double)r, (double)i }) +#endif + +bool approx_equal_c99_cmplx(double __complex__ x, double __complex__ y) { + return approx_equal_fp(creal(x), creal(y)) && approx_equal_fp(cimag(x), cimag(y)); +} + +namespace s = cl::sycl; +constexpr s::access::mode sycl_read = s::access::mode::read; +constexpr s::access::mode sycl_write = s::access::mode::write; + +class DeviceComplexTimes; + +void device_c99_complex_times(s::queue &deviceQueue) { + double __complex__ buf_in3[4] = {CMPLX(0, 1), CMPLX(1, 1), + CMPLX(2, 3), CMPLX(4, 5)}; + double __complex__ buf_in4[4] = {CMPLX(1, 1), CMPLX(2, 1), + CMPLX(2, 2), CMPLX(3, 4)}; + double __complex__ buf_out2[4]; + + double __complex__ ref_results2[4] = {CMPLX(-1, 1), CMPLX(1, 3), + CMPLX(-2, 10), CMPLX(-8, 31)}; + s::range<1> numOfItems{4}; + { + s::buffer buffer4(buf_in3, numOfItems); + s::buffer buffer5(buf_in4, numOfItems); + s::buffer buffer6(buf_out2, numOfItems); + deviceQueue.submit([&](s::handler &cgh) { + auto buf_in3_access = buffer4.get_access(cgh); + auto buf_in4_access = buffer5.get_access(cgh); + auto buf_out2_access = buffer6.get_access(cgh); + cgh.parallel_for(numOfItems, [=](s::id<1>WIid) { + buf_out2_access[WIid] = buf_in3_access[WIid] * buf_in4_access[WIid]; + }); + }); + } + + for 
(size_t idx = 0; idx < 4; ++idx) { + assert(approx_equal_c99_cmplx(buf_out2[idx], ref_results2[idx])); + } +} + +class DeviceComplexDivides; + +void device_c99_complex_divides(s::queue &deviceQueue) { + double __complex__ buf_in3[8] = {CMPLX(-1, 1), CMPLX(1, 3), + CMPLX(-2, 10), CMPLX(-8, 31), + CMPLX(4, 2), CMPLX(-1, 0), + CMPLX(0, 10), CMPLX(0 , 0)}; + double __complex__ buf_in4[8] = {CMPLX(0, 1), CMPLX(1, 1), + CMPLX(2, 3), CMPLX(4, 5), + CMPLX(2, 0), CMPLX(0, 1), + CMPLX(0, 5), CMPLX(1, 0)}; + double __complex__ ref_results2[8] = {CMPLX(1, 1), CMPLX(2, 1), + CMPLX(2, 2), CMPLX(3, 4), + CMPLX(2, 1), CMPLX(0, 1), + CMPLX(2, 0), CMPLX(0, 0)}; + double __complex__ buf_out2[8]; + + s::range<1> numOfItems{8}; + { + s::buffer buffer4(buf_in3, numOfItems); + s::buffer buffer5(buf_in4, numOfItems); + s::buffer buffer6(buf_out2, numOfItems); + deviceQueue.submit([&](s::handler &cgh) { + auto buf_in3_access = buffer4.get_access(cgh); + auto buf_in4_access = buffer5.get_access(cgh); + auto buf_out2_access = buffer6.get_access(cgh); + cgh.parallel_for(numOfItems, [=](s::id<1>WIid) { + buf_out2_access[WIid] = buf_in3_access[WIid] / buf_in4_access[WIid]; + }); + }); + } + + for (size_t idx = 0; idx < 8; ++idx) { + assert(approx_equal_c99_cmplx(buf_out2[idx], ref_results2[idx])); + } +} + +class DeviceComplexSqrt; + +void device_c99_complex_sqrt(s::queue &deviceQueue) { + double __complex__ buf_in2[4] = {CMPLX(-1, 0), CMPLX(0, 2), + CMPLX(4, 0), CMPLX(-5, 12)}; + double __complex__ buf_out2[4]; + double __complex__ ref_results2[4] = {CMPLX(0, 1), CMPLX(1, 1), + CMPLX(2, 0), CMPLX(2, 3)}; + s::range<1> numOfItems{4}; + { + s::buffer buffer3(buf_in2, numOfItems); + s::buffer buffer4(buf_out2, numOfItems); + deviceQueue.submit([&](s::handler &cgh) { + auto buf_in2_access = buffer3.get_access(cgh); + auto buf_out2_access = buffer4.get_access(cgh); + cgh.parallel_for(numOfItems, [=](s::id<1>WIid) { + buf_out2_access[WIid] = csqrt(buf_in2_access[WIid]); + }); + }); + } + + for 
(size_t idx = 0; idx < 4; ++idx) { + assert(approx_equal_c99_cmplx(buf_out2[idx], ref_results2[idx])); + } +} + +class DeviceComplexAbs; + +void device_c99_complex_abs(s::queue &deviceQueue) { + double __complex__ buf_in2[4] = {CMPLX(0, 0), CMPLX(3, 4), + CMPLX(12, 5), CMPLX(INFINITY, 1)}; + double buf_out2[4]; + double ref_results2[4] = {0, 5, 13, INFINITY}; + s::range<1> numOfItems{4}; + { + s::buffer buffer3(buf_in2, numOfItems); + s::buffer buffer4(buf_out2, numOfItems); + deviceQueue.submit([&](s::handler &cgh) { + auto buf_in2_access = buffer3.get_access(cgh); + auto buf_out2_access = buffer4.get_access(cgh); + cgh.parallel_for(numOfItems, [=](s::id<1>WIid) { + buf_out2_access[WIid] = cabs(buf_in2_access[WIid]); + }); + }); + } + + for (size_t idx = 0; idx < 4; ++idx) { + assert(approx_equal_fp(buf_out2[idx], ref_results2[idx])); + } +} + +class DeviceComplexExp; + +void device_c99_complex_exp(s::queue &deviceQueue) { + double __complex__ buf_in2[4] = {CMPLX(0, 0), CMPLX(0, M_PI_2), + CMPLX(0, M_PI), CMPLX(1, M_PI_2)}; + double __complex__ buf_out2[4]; + double __complex__ ref_results2[4] = {CMPLX(1, 0), CMPLX(0, 1), + CMPLX(-1, 0),CMPLX(0, M_E)}; + s::range<1> numOfItems{4}; + { + s::buffer buffer3(buf_in2, numOfItems); + s::buffer buffer4(buf_out2, numOfItems); + deviceQueue.submit([&](s::handler &cgh) { + auto buf_in2_access = buffer3.get_access(cgh); + auto buf_out2_access = buffer4.get_access(cgh); + cgh.parallel_for(numOfItems, [=](s::id<1>WIid) { + buf_out2_access[WIid] = cexp(buf_in2_access[WIid]); + }); + }); + } + + for (size_t idx = 0; idx < 4; ++idx) { + assert(approx_equal_c99_cmplx(buf_out2[idx], ref_results2[idx])); + } +} + +class DeviceComplexLog; + +void device_c99_complex_log(s::queue &deviceQueue) { + double __complex__ buf_in2[4] = {CMPLX(1, 0), CMPLX(0, 1), + CMPLX(-1, 0), CMPLX(0, M_E)}; + double __complex__ buf_out2[4]; + double __complex__ ref_results2[4] = {CMPLX(0, 0), CMPLX(0, M_PI_2), + CMPLX(0, M_PI), CMPLX(1, M_PI_2)}; + 
s::range<1> numOfItems{4}; + { + s::buffer buffer3(buf_in2, numOfItems); + s::buffer buffer4(buf_out2, numOfItems); + deviceQueue.submit([&](s::handler &cgh) { + auto buf_in2_access = buffer3.get_access(cgh); + auto buf_out2_access = buffer4.get_access(cgh); + cgh.parallel_for(numOfItems, [=](s::id<1>WIid) { + buf_out2_access[WIid] = ::clog(buf_in2_access[WIid]); + }); + }); + } + + for (size_t idx = 0; idx < 4; ++idx) { + assert(approx_equal_c99_cmplx(buf_out2[idx], ref_results2[idx])); + } +} + +class DeviceComplexSin; + +void device_c99_complex_sin(s::queue &deviceQueue) { + double __complex__ buf_in2[2] = {CMPLX(0, 0), CMPLX(M_PI_2, 0)}; + double __complex__ buf_out2[2]; + double __complex__ ref_results2[2] = {CMPLX(0, 0), CMPLX(1, 0)}; + s::range<1> numOfItems{2}; + { + s::buffer buffer3(buf_in2, numOfItems); + s::buffer buffer4(buf_out2, numOfItems); + deviceQueue.submit([&](s::handler &cgh) { + auto buf_in2_access = buffer3.get_access(cgh); + auto buf_out2_access = buffer4.get_access(cgh); + cgh.parallel_for(numOfItems, [=](s::id<1>WIid) { + buf_out2_access[WIid] = csin(buf_in2_access[WIid]); + }); + }); + } + + for (size_t idx = 0; idx < 2; ++idx) { + assert(approx_equal_c99_cmplx(buf_out2[idx], ref_results2[idx])); + } +} + +class DeviceComplexCos; + +void device_c99_complex_cos(s::queue &deviceQueue) { + double __complex__ buf_in2[2] = {CMPLX(0, 0), CMPLX(M_PI, 0)}; + double __complex__ buf_out2[2]; + double __complex__ ref_results2[2] = {CMPLX(1, 0), CMPLX(-1, 0)}; + s::range<1> numOfItems{2}; + { + s::buffer buffer3(buf_in2, numOfItems); + s::buffer buffer4(buf_out2, numOfItems); + deviceQueue.submit([&](s::handler &cgh) { + auto buf_in2_access = buffer3.get_access(cgh); + auto buf_out2_access = buffer4.get_access(cgh); + cgh.parallel_for(numOfItems, [=](s::id<1>WIid) { + buf_out2_access[WIid] = ccos(buf_in2_access[WIid]); + }); + }); + } + + for (size_t idx = 0; idx < 2; ++idx) { + assert(approx_equal_c99_cmplx(buf_out2[idx], ref_results2[idx])); + } 
+} + +void device_c99_complex_test(s::queue &deviceQueue) { + device_c99_complex_times(deviceQueue); + device_c99_complex_divides(deviceQueue); + device_c99_complex_sqrt(deviceQueue); + device_c99_complex_abs(deviceQueue); + device_c99_complex_exp(deviceQueue); + device_c99_complex_log(deviceQueue); + device_c99_complex_sin(deviceQueue); + device_c99_complex_cos(deviceQueue); +} + +int main() { + s::queue deviceQueue; + if (deviceQueue.get_device().has_extension("cl_khr_fp64")) { + device_c99_complex_test(deviceQueue); + std::cout << "Pass" << std::endl; + } +} diff --git a/SYCL/Basic/devicelib/c99_complex_math_test.cpp b/SYCL/Basic/devicelib/c99_complex_math_test.cpp new file mode 100644 index 0000000000..704d80bd01 --- /dev/null +++ b/SYCL/Basic/devicelib/c99_complex_math_test.cpp @@ -0,0 +1,258 @@ +// UNSUPPORTED: windows +// RUN: %clangxx -fsycl -c %s -o %t.o +// RUN: %clangxx -fsycl %t.o %sycl_libs_dir/libsycl-complex.o -o %t.out +#include +#include +#include +#include "math_utils.hpp" + +#ifndef CMPLXF +#define CMPLXF(r, i) ((float __complex__){ (float)r, (float)i }) +#endif + +bool approx_equal_c99_cmplxf(float __complex__ x, float __complex__ y) { + return approx_equal_fp(crealf(x), crealf(y)) && approx_equal_fp(cimagf(x), cimagf(y)); +} + +namespace s = cl::sycl; +constexpr s::access::mode sycl_read = s::access::mode::read; +constexpr s::access::mode sycl_write = s::access::mode::write; + +class DeviceComplexTimes; + +void device_c99_complex_times(s::queue &deviceQueue) { + float __complex__ buf_in1[4] = {CMPLXF(0, 1), CMPLXF(1, 1), + CMPLXF(2, 3), CMPLXF(4, 5)}; + float __complex__ buf_in2[4] = {CMPLXF(1, 1), CMPLXF(2, 1), + CMPLXF(2, 2), CMPLXF(3, 4)}; + float __complex__ buf_out1[4]; + + float __complex__ ref_results1[4] = {CMPLXF(-1, 1), CMPLXF(1, 3), + CMPLXF(-2, 10), CMPLXF(-8, 31)}; + + s::range<1> numOfItems{4}; + { + s::buffer buffer1(buf_in1, numOfItems); + s::buffer buffer2(buf_in2, numOfItems); + s::buffer buffer3(buf_out1, numOfItems); + 
deviceQueue.submit([&](s::handler &cgh) { + auto buf_in1_access = buffer1.get_access(cgh); + auto buf_in2_access = buffer2.get_access(cgh); + auto buf_out1_access = buffer3.get_access(cgh); + cgh.parallel_for(numOfItems, [=](s::id<1>WIid) { + buf_out1_access[WIid] = buf_in1_access[WIid] * buf_in2_access[WIid]; + }); + }); + } + + for (size_t idx = 0; idx < 4; ++idx) { + assert(approx_equal_c99_cmplxf(buf_out1[idx], ref_results1[idx])); + } +} + +class DeviceComplexDivides; + +void device_c99_complex_divides(s::queue &deviceQueue) { + float __complex__ buf_in1[8] = {CMPLXF(-1, 1), CMPLXF(1, 3), + CMPLXF(-2, 10), CMPLXF(-8, 31), + CMPLXF(4, 2), CMPLXF(-1, 0), + CMPLXF(0, 10), CMPLXF(0 , 0)}; + float __complex__ buf_in2[8] = {CMPLXF(0, 1), CMPLXF(1, 1), + CMPLXF(2, 3), CMPLXF(4, 5), + CMPLXF(2, 0), CMPLXF(0, 1), + CMPLXF(0, 5), CMPLXF(1, 0)}; + float __complex__ ref_results1[8] = {CMPLXF(1, 1), CMPLXF(2, 1), + CMPLXF(2, 2), CMPLXF(3, 4), + CMPLXF(2, 1), CMPLXF(0, 1), + CMPLXF(2, 0), CMPLXF(0, 0)}; + float __complex__ buf_out1[8]; + + s::range<1> numOfItems{8}; + { + s::buffer buffer1(buf_in1, numOfItems); + s::buffer buffer2(buf_in2, numOfItems); + s::buffer buffer3(buf_out1,numOfItems); + deviceQueue.submit([&](s::handler &cgh) { + auto buf_in1_access = buffer1.get_access(cgh); + auto buf_in2_access = buffer2.get_access(cgh); + auto buf_out1_access = buffer3.get_access(cgh); + cgh.parallel_for(numOfItems, [=](s::id<1>WIid) { + buf_out1_access[WIid] = buf_in1_access[WIid] / buf_in2_access[WIid]; + }); + }); + } + + for (size_t idx = 0; idx < 8; ++idx) { + assert(approx_equal_c99_cmplxf(buf_out1[idx], ref_results1[idx])); + } +} + +class DeviceComplexSqrt; + +void device_c99_complex_sqrt(s::queue &deviceQueue) { + float __complex__ buf_in1[4] = {CMPLXF(-1, 0), CMPLXF(0, 2), + CMPLXF(4, 0), CMPLXF(-5, 12)}; + float __complex__ buf_out1[4]; + float __complex__ ref_results1[4] = {CMPLXF(0, 1), CMPLXF(1, 1), + CMPLXF(2, 0), CMPLXF(2, 3)}; + + s::range<1> numOfItems{4}; + { 
+ s::buffer buffer1(buf_in1, numOfItems); + s::buffer buffer2(buf_out1, numOfItems); + deviceQueue.submit([&](s::handler &cgh) { + auto buf_in1_access = buffer1.get_access(cgh); + auto buf_out1_access = buffer2.get_access(cgh); + cgh.parallel_for(numOfItems, [=](s::id<1>WIid) { + buf_out1_access[WIid] = csqrtf(buf_in1_access[WIid]); + }); + }); + } + + for (size_t idx = 0; idx < 4; ++idx) { + assert(approx_equal_c99_cmplxf(buf_out1[idx], ref_results1[idx])); + } +} + +class DeviceComplexAbs; + +void device_c99_complex_abs(s::queue &deviceQueue) { + float __complex__ buf_in1[4] = {CMPLXF(0, 0), CMPLXF(3, 4), + CMPLXF(12, 5), CMPLXF(INFINITY, 1)}; + float buf_out1[4]; + float ref_results1[4] = {0, 5, 13, INFINITY}; + + s::range<1> numOfItems{4}; + { + s::buffer buffer1(buf_in1, numOfItems); + s::buffer buffer2(buf_out1, numOfItems); + deviceQueue.submit([&](s::handler &cgh) { + auto buf_in1_access = buffer1.get_access(cgh); + auto buf_out1_access = buffer2.get_access(cgh); + cgh.parallel_for(numOfItems, [=](s::id<1>WIid) { + buf_out1_access[WIid] = cabsf(buf_in1_access[WIid]); + }); + }); + } + + for (size_t idx = 0; idx < 4; ++idx) { + assert(approx_equal_fp(buf_out1[idx], ref_results1[idx])); + } +} + +class DeviceComplexExp; + +void device_c99_complex_exp(s::queue &deviceQueue) { + float __complex__ buf_in1[4] = {CMPLXF(0, 0), CMPLXF(0, M_PI_2), + CMPLXF(0, M_PI), CMPLXF(1, M_PI_2)}; + float __complex__ buf_out1[4]; + float __complex__ ref_results1[4] = {CMPLXF(1, 0), CMPLXF(0, 1), + CMPLXF(-1, 0),CMPLXF(0, M_E)}; + s::range<1> numOfItems{4}; + { + s::buffer buffer1(buf_in1, numOfItems); + s::buffer buffer2(buf_out1, numOfItems); + deviceQueue.submit([&](s::handler &cgh) { + auto buf_in1_access = buffer1.get_access(cgh); + auto buf_out1_access = buffer2.get_access(cgh); + cgh.parallel_for(numOfItems, [=](s::id<1>WIid) { + buf_out1_access[WIid] = cexpf(buf_in1_access[WIid]); + }); + }); + } + + for (size_t idx = 0; idx < 4; ++idx) { + 
assert(approx_equal_c99_cmplxf(buf_out1[idx], ref_results1[idx])); + } +} + +class DeviceComplexLog; + +void device_c99_complex_log(s::queue &deviceQueue) { + float __complex__ buf_in1[4] = {CMPLXF(1, 0), CMPLXF(0, 1), + CMPLXF(-1, 0), CMPLXF(0, M_E)}; + float __complex__ buf_out1[4]; + float __complex__ ref_results1[4] = {CMPLXF(0, 0), CMPLXF(0, M_PI_2), + CMPLXF(0, M_PI), CMPLXF(1, M_PI_2)}; + s::range<1> numOfItems{4}; + { + s::buffer buffer1(buf_in1, numOfItems); + s::buffer buffer2(buf_out1, numOfItems); + deviceQueue.submit([&](s::handler &cgh) { + auto buf_in1_access = buffer1.get_access(cgh); + auto buf_out1_access = buffer2.get_access(cgh); + cgh.parallel_for(numOfItems, [=](s::id<1>WIid) { + buf_out1_access[WIid] = clogf(buf_in1_access[WIid]); + }); + }); + } + + for (size_t idx = 0; idx < 4; ++idx) { + assert(approx_equal_c99_cmplxf(buf_out1[idx], ref_results1[idx])); + } +} + +class DeviceComplexSin; + +void device_c99_complex_sin(s::queue &deviceQueue) { + float __complex__ buf_in1[2] = {CMPLXF(0, 0), CMPLXF(M_PI_2, 0)}; + float __complex__ buf_out1[2]; + float __complex__ ref_results1[2] = {CMPLXF(0, 0), CMPLXF(1, 0)}; + s::range<1> numOfItems{2}; + { + s::buffer buffer1(buf_in1, numOfItems); + s::buffer buffer2(buf_out1, numOfItems); + deviceQueue.submit([&](s::handler &cgh) { + auto buf_in1_access = buffer1.get_access(cgh); + auto buf_out1_access = buffer2.get_access(cgh); + cgh.parallel_for(numOfItems, [=](s::id<1>WIid) { + buf_out1_access[WIid] = csinf(buf_in1_access[WIid]); + }); + }); + } + + for (size_t idx = 0; idx < 2; ++idx) { + assert(approx_equal_c99_cmplxf(buf_out1[idx], ref_results1[idx])); + } +} + +class DeviceComplexCos; + +void device_c99_complex_cos(s::queue &deviceQueue) { + float __complex__ buf_in1[2] = {CMPLXF(0, 0), CMPLXF(M_PI, 0)}; + float __complex__ buf_out1[2]; + float __complex__ ref_results1[2] = {CMPLXF(1, 0), CMPLXF(-1, 0)}; + s::range<1> numOfItems{2}; + { + s::buffer buffer1(buf_in1, numOfItems); + s::buffer 
buffer2(buf_out1, numOfItems); + deviceQueue.submit([&](s::handler &cgh) { + auto buf_in1_access = buffer1.get_access(cgh); + auto buf_out1_access = buffer2.get_access(cgh); + cgh.parallel_for(numOfItems, [=](s::id<1>WIid) { + buf_out1_access[WIid] = ccosf(buf_in1_access[WIid]); + }); + }); + } + + for (size_t idx = 0; idx < 2; ++idx) { + assert(approx_equal_c99_cmplxf(buf_out1[idx], ref_results1[idx])); + } +} + +void device_c99_complex_test(s::queue &deviceQueue) { + device_c99_complex_times(deviceQueue); + device_c99_complex_divides(deviceQueue); + device_c99_complex_sqrt(deviceQueue); + device_c99_complex_abs(deviceQueue); + device_c99_complex_exp(deviceQueue); + device_c99_complex_log(deviceQueue); + device_c99_complex_sin(deviceQueue); + device_c99_complex_cos(deviceQueue); +} + +int main() { + s::queue deviceQueue; + device_c99_complex_test(deviceQueue); + std::cout << "Pass" << std::endl; +} diff --git a/SYCL/Basic/devicelib/cmath_fp64_test.cpp b/SYCL/Basic/devicelib/cmath_fp64_test.cpp new file mode 100644 index 0000000000..27da0dd11c --- /dev/null +++ b/SYCL/Basic/devicelib/cmath_fp64_test.cpp @@ -0,0 +1,118 @@ +// UNSUPPORTED: windows +// RUN: %clangxx -fsycl -c %s -o %t.o +// RUN: %clangxx -fsycl %t.o %sycl_libs_dir/libsycl-cmath-fp64.o -o %t.out +// RUN: %HOST_RUN_PLACEHOLDER %t.out +// RUN: %CPU_RUN_PLACEHOLDER %t.out +// RUN: %ACC_RUN_PLACEHOLDER %t.out +// REQUIRES: host, cpu, accelerator + +#include +#include +#include +#include "math_utils.hpp" + +namespace s = cl::sycl; +constexpr s::access::mode sycl_read = s::access::mode::read; +constexpr s::access::mode sycl_write = s::access::mode::write; + +#define TEST_NUM 38 + +double ref[TEST_NUM] = { +1, 0, 0, 0, 0, 0, 0, 1, 1, 0.5, +0, 2, 0, 0, 1, 0, 2, 0, 0, 0, +0, 0, 1, 0, 1, 2, 0, 1, 2, 5, +0, 0, 0, 0, 0.5, 0.5, NAN, NAN,}; + +double refIptr = 1; + +template +void device_cmath_test(s::queue &deviceQueue) { + s::range<1> numOfItems{TEST_NUM}; + T result[TEST_NUM] = {-1}; + + // Variable exponent is 
an integer value to store the exponent in frexp function + int exponent = -1; + + // Variable iptr stores the integral part of float point in modf function + T iptr = -1; + + // Variable quo stores the sign and some bits of x/y in remquo function + int quo = -1; + { + s::buffer buffer1(result, numOfItems); + s::buffer buffer2(&exponent, s::range<1>{1}); + s::buffer buffer3(&iptr, s::range<1>{1}); + s::buffer buffer4(&quo, s::range<1>{1}); + deviceQueue.submit([&](cl::sycl::handler &cgh) { + auto res_access = buffer1.template get_access(cgh); + auto exp_access = buffer2.template get_access(cgh); + auto iptr_access = buffer3.template get_access(cgh); + auto quo_access = buffer4.template get_access(cgh); + cgh.single_task([=]() { + int i = 0; + res_access[i++] = std::cos(0.0); + res_access[i++] = std::sin(0.0); + res_access[i++] = std::log(1.0); + res_access[i++] = std::acos(1.0); + res_access[i++] = std::asin(0.0); + res_access[i++] = std::atan(0.0); + res_access[i++] = std::atan2(0.0, 1.0); + res_access[i++] = std::cosh(0.0); + res_access[i++] = std::exp(0.0); + res_access[i++] = std::fmod(1.5, 1.0); + res_access[i++] = std::frexp(0.0, &exp_access[0]); + res_access[i++] = std::ldexp(1.0, 1); + res_access[i++] = std::log10(1.0); + res_access[i++] = std::modf(1.0, &iptr_access[0]); + res_access[i++] = std::pow(1.0, 1.0); + res_access[i++] = std::sinh(0.0); + res_access[i++] = std::sqrt(4.0); + res_access[i++] = std::tan(0.0); + res_access[i++] = std::tanh(0.0); + res_access[i++] = std::acosh(1.0); + res_access[i++] = std::asinh(0.0); + res_access[i++] = std::atanh(0.0); + res_access[i++] = std::cbrt(1.0); + res_access[i++] = std::erf(0.0); + res_access[i++] = std::erfc(0.0); + res_access[i++] = std::exp2(1.0); + res_access[i++] = std::expm1(0.0); + res_access[i++] = std::fdim(1.0, 0.0); + res_access[i++] = std::fma(1.0, 1.0, 1.0); + res_access[i++] = std::hypot(3.0, 4.0); + res_access[i++] = std::ilogb(1.0); + res_access[i++] = std::log1p(0.0); + res_access[i++] = 
std::log2(1.0); + res_access[i++] = std::logb(1.0); + res_access[i++] = std::remainder(0.5, 1.0); + res_access[i++] = std::remquo(0.5, 1.0, &quo_access[0]); + T a = NAN; + res_access[i++] = std::tgamma(a); + res_access[i++] = std::lgamma(a); + }); + }); + } + + // Compare result with reference + for (int i = 0; i < TEST_NUM; ++i) { + assert(approx_equal_fp(result[i], ref[i])); + } + + // Test modf integral part + assert(approx_equal_fp(iptr, refIptr)); + + // Test frexp exponent + assert(exponent == 0); + + // Test remquo sign + assert(quo == 0); +} + +int main() { + s::queue deviceQueue; + if (deviceQueue.get_device().has_extension("cl_khr_fp64")) { + device_cmath_test(deviceQueue); + std::cout << "Pass" << std::endl; + } + return 0; +} diff --git a/SYCL/Basic/devicelib/cmath_test.cpp b/SYCL/Basic/devicelib/cmath_test.cpp new file mode 100644 index 0000000000..550830b543 --- /dev/null +++ b/SYCL/Basic/devicelib/cmath_test.cpp @@ -0,0 +1,115 @@ +// UNSUPPORTED: windows +// RUN: %clangxx -fsycl -c %s -o %t.o +// RUN: %clangxx -fsycl %t.o %sycl_libs_dir/libsycl-cmath.o -o %t.out +// RUN: %HOST_RUN_PLACEHOLDER %t.out +// RUN: %CPU_RUN_PLACEHOLDER %t.out +// RUN: %ACC_RUN_PLACEHOLDER %t.out +// REQUIRES: host, cpu, accelerator +#include +#include +#include +#include "math_utils.hpp" + +namespace s = cl::sycl; +constexpr s::access::mode sycl_read = s::access::mode::read; +constexpr s::access::mode sycl_write = s::access::mode::write; + +#define TEST_NUM 38 + +float ref[TEST_NUM] = { +1, 0, 0, 0, 0, 0, 0, 1, 1, 0.5, +0, 2, 0, 0, 1, 0, 2, 0, 0, 0, +0, 0, 1, 0, 1, 2, 0, 1, 2, 5, +0, 0, 0, 0, 0.5, 0.5, NAN, NAN,}; + +float refIptr = 1; + +template +void device_cmath_test(s::queue &deviceQueue) { + s::range<1> numOfItems{TEST_NUM}; + T result[TEST_NUM] = {-1}; + + // Variable exponent is an integer value to store the exponent in frexp function + int exponent = -1; + + // Variable iptr stores the integral part of float point in modf function + T iptr = -1; + + // Variable quo 
stores the sign and some bits of x/y in remquo function + int quo = -1; + { + s::buffer buffer1(result, numOfItems); + s::buffer buffer2(&exponent, s::range<1>{1}); + s::buffer buffer3(&iptr, s::range<1>{1}); + s::buffer buffer4(&quo, s::range<1>{1}); + deviceQueue.submit([&](cl::sycl::handler &cgh) { + auto res_access = buffer1.template get_access(cgh); + auto exp_access = buffer2.template get_access(cgh); + auto iptr_access = buffer3.template get_access(cgh); + auto quo_access = buffer4.template get_access(cgh); + cgh.single_task([=]() { + int i = 0; + res_access[i++] = std::cos(0.0f); + res_access[i++] = std::sin(0.0f); + res_access[i++] = std::log(1.0f); + res_access[i++] = std::acos(1.0f); + res_access[i++] = std::asin(0.0f); + res_access[i++] = std::atan(0.0f); + res_access[i++] = std::atan2(0.0f, 1.0f); + res_access[i++] = std::cosh(0.0f); + res_access[i++] = std::exp(0.0f); + res_access[i++] = std::fmod(1.5f, 1.0f); + res_access[i++] = std::frexp(0.0f, &exp_access[0]); + res_access[i++] = std::ldexp(1.0f, 1); + res_access[i++] = std::log10(1.0f); + res_access[i++] = std::modf(1.0f, &iptr_access[0]); + res_access[i++] = std::pow(1.0f, 1.0f); + res_access[i++] = std::sinh(0.0f); + res_access[i++] = std::sqrt(4.0f); + res_access[i++] = std::tan(0.0f); + res_access[i++] = std::tanh(0.0f); + res_access[i++] = std::acosh(1.0f); + res_access[i++] = std::asinh(0.0f); + res_access[i++] = std::atanh(0.0f); + res_access[i++] = std::cbrt(1.0f); + res_access[i++] = std::erf(0.0f); + res_access[i++] = std::erfc(0.0f); + res_access[i++] = std::exp2(1.0f); + res_access[i++] = std::expm1(0.0f); + res_access[i++] = std::fdim(1.0f, 0.0f); + res_access[i++] = std::fma(1.0f, 1.0f, 1.0f); + res_access[i++] = std::hypot(3.0f, 4.0f); + res_access[i++] = std::ilogb(1.0f); + res_access[i++] = std::log1p(0.0f); + res_access[i++] = std::log2(1.0f); + res_access[i++] = std::logb(1.0f); + res_access[i++] = std::remainder(0.5f, 1.0f); + res_access[i++] = std::remquo(0.5f, 1.0f, 
&quo_access[0]); + T a = NAN; + res_access[i++] = std::tgamma(a); + res_access[i++] = std::lgamma(a); + }); + }); + } + + // Compare result with reference + for (int i = 0; i < TEST_NUM; ++i) { + assert(approx_equal_fp(result[i], ref[i])); + } + + // Test modf integral part + assert(approx_equal_fp(iptr, refIptr)); + + // Test frexp exponent + assert(exponent == 0); + + // Test remquo sign + assert(quo == 0); +} + +int main() { + s::queue deviceQueue; + device_cmath_test(deviceQueue); + std::cout << "Pass" << std::endl; + return 0; +} diff --git a/SYCL/Basic/devicelib/math_fp64_test.cpp b/SYCL/Basic/devicelib/math_fp64_test.cpp new file mode 100644 index 0000000000..4ba48151aa --- /dev/null +++ b/SYCL/Basic/devicelib/math_fp64_test.cpp @@ -0,0 +1,115 @@ +// REQUIRES: (host || cpu || accelerator) && linux +// RUN: %clangxx -fsycl -c %s -o %t.o +// RUN: %clangxx -fsycl %t.o %sycl_libs_dir/libsycl-cmath-fp64.o -o %t.out +// RUN: %HOST_RUN_PLACEHOLDER %t.out +// RUN: %CPU_RUN_PLACEHOLDER %t.out +// RUN: %ACC_RUN_PLACEHOLDER %t.out +#include "math_utils.hpp" +#include +#include +#include + +namespace s = cl::sycl; +constexpr s::access::mode sycl_read = s::access::mode::read; +constexpr s::access::mode sycl_write = s::access::mode::write; + +#define TEST_NUM 38 + +double ref_val[TEST_NUM] = { + 1, 0, 0, 0, 0, 0, 0, 1, 1, 0.5, + 0, 2, 0, 0, 1, 0, 2, 0, 0, 0, + 0, 0, 1, 0, 1, 2, 0, 1, 2, 5, + 0, 0, 0, 0, 0.5, 0.5, NAN, NAN}; + +double refIptr = 1; + +void device_math_test(s::queue &deviceQueue) { + s::range<1> numOfItems{TEST_NUM}; + double result[TEST_NUM] = {-1}; + + // Variable exponent is an integer value to store the exponent in frexp function + int exponent = -1; + + // Variable iptr stores the integral part of float point in modf function + double iptr = -1; + + // Variable quo stores the sign and some bits of x/y in remquo function + int quo = -1; + { + s::buffer buffer1(result, numOfItems); + s::buffer buffer2(&exponent, s::range<1>{1}); + s::buffer buffer3(&iptr, 
s::range<1>{1}); + s::buffer buffer4(&quo, s::range<1>{1}); + deviceQueue.submit([&](cl::sycl::handler &cgh) { + auto res_access = buffer1.template get_access(cgh); + auto exp_access = buffer2.template get_access(cgh); + auto iptr_access = buffer3.template get_access(cgh); + auto quo_access = buffer4.template get_access(cgh); + cgh.single_task([=]() { + int i = 0; + res_access[i++] = cos(0.0); + res_access[i++] = sin(0.0); + res_access[i++] = log(1.0); + res_access[i++] = acos(1.0); + res_access[i++] = asin(0.0); + res_access[i++] = atan(0.0); + res_access[i++] = atan2(0.0, 1.0); + res_access[i++] = cosh(0.0); + res_access[i++] = exp(0.0); + res_access[i++] = fmod(1.5, 1.0); + res_access[i++] = frexp(0.0, &exp_access[0]); + res_access[i++] = ldexp(1.0, 1); + res_access[i++] = log10(1.0); + res_access[i++] = modf(1.0, &iptr_access[0]); + res_access[i++] = pow(1.0, 1.0); + res_access[i++] = sinh(0.0); + res_access[i++] = sqrt(4.0); + res_access[i++] = tan(0.0); + res_access[i++] = tanh(0.0); + res_access[i++] = acosh(1.0); + res_access[i++] = asinh(0.0); + res_access[i++] = atanh(0.0); + res_access[i++] = cbrt(1.0); + res_access[i++] = erf(0.0); + res_access[i++] = erfc(0.0); + res_access[i++] = exp2(1.0); + res_access[i++] = expm1(0.0); + res_access[i++] = fdim(1.0, 0.0); + res_access[i++] = fma(1.0, 1.0, 1.0); + res_access[i++] = hypot(3.0, 4.0); + res_access[i++] = ilogb(1.0); + res_access[i++] = log1p(0.0); + res_access[i++] = log2(1.0); + res_access[i++] = logb(1.0); + res_access[i++] = remainder(0.5, 1.0); + res_access[i++] = remquo(0.5, 1.0, &quo_access[0]); + double a = NAN; + res_access[i++] = tgamma(a); + res_access[i++] = lgamma(a); + }); + }); + } + + // Compare result with reference + for (int i = 0; i < TEST_NUM; ++i) { + assert(approx_equal_fp(result[i], ref_val[i])); + } + + // Test modf integral part + assert(approx_equal_fp(iptr, refIptr)); + + // Test frexp exponent + assert(exponent == 0); + + // Test remquo sign + assert(quo == 0); +} + +int 
main() { + s::queue deviceQueue; + if (deviceQueue.get_device().has_extension("cl_khr_fp64")) { + device_math_test(deviceQueue); + std::cout << "Pass" << std::endl; + } + return 0; +} diff --git a/SYCL/Basic/devicelib/math_fp64_windows_test.cpp b/SYCL/Basic/devicelib/math_fp64_windows_test.cpp new file mode 100644 index 0000000000..e7cc317429 --- /dev/null +++ b/SYCL/Basic/devicelib/math_fp64_windows_test.cpp @@ -0,0 +1,132 @@ +// REQUIRES: (cpu || host || accelerator) && windows +// RUN: %clangxx -fsycl -c %s -o %t.o +// RUN: %clangxx -fsycl %t.o %sycl_libs_dir/../bin/libsycl-cmath-fp64.o -o %t.out +// RUN: %HOST_RUN_PLACEHOLDER %t.out +// RUN: %CPU_RUN_PLACEHOLDER %t.out +// RUN: %ACC_RUN_PLACEHOLDER %t.out +#include "math_utils.hpp" +#include +#include +#include + +namespace s = cl::sycl; +constexpr s::access::mode sycl_read = s::access::mode::read; +constexpr s::access::mode sycl_write = s::access::mode::write; + +#define TEST_NUM 41 + +double ref_val[TEST_NUM] = { + 1, 0, 0, 0, 0, 0, 0, 1, 1, 0.5, + 0, 2, 0, 0, 1, 0, 2, 0, 0, 0, + 0, 0, 1, 0, 1, 2, 0, 1, 2, 5, + 0, 0, 0, 0, 0.5, 0.5, NAN, NAN, 1, 2, 0}; + +double refIptr = 1; + +void device_math_test(s::queue &deviceQueue) { + s::range<1> numOfItems{TEST_NUM}; + double result[TEST_NUM] = {-1}; + + // Variable exponent is an integer value to store the exponent in frexp function + int exponent = -1; + + // Variable iptr stores the integral part of float point in modf function + double iptr = -1; + + // Variable quo stores the sign and some bits of x/y in remquo function + int quo = -1; + + // Variable enm stores the enum value returned by MSVC function + short enm[2] = {10, 10}; + { + s::buffer buffer1(result, numOfItems); + s::buffer buffer2(&exponent, s::range<1>{1}); + s::buffer buffer3(&iptr, s::range<1>{1}); + s::buffer buffer4(&quo, s::range<1>{1}); + s::buffer buffer5(enm, s::range<1>{2}); + deviceQueue.submit([&](cl::sycl::handler &cgh) { + auto res_access = buffer1.template get_access(cgh); + auto 
exp_access = buffer2.template get_access(cgh); + auto iptr_access = buffer3.template get_access(cgh); + auto quo_access = buffer4.template get_access(cgh); + auto enm_access = buffer5.template get_access(cgh); + cgh.single_task([=]() { + int i = 0; + res_access[i++] = cos(0.0); + res_access[i++] = sin(0.0); + res_access[i++] = log(1.0); + res_access[i++] = acos(1.0); + res_access[i++] = asin(0.0); + res_access[i++] = atan(0.0); + res_access[i++] = atan2(0.0, 1.0); + res_access[i++] = cosh(0.0); + res_access[i++] = exp(0.0); + res_access[i++] = fmod(1.5, 1.0); + res_access[i++] = frexp(0.0, &exp_access[0]); + res_access[i++] = ldexp(1.0, 1); + res_access[i++] = log10(1.0); + res_access[i++] = modf(1.0, &iptr_access[0]); + res_access[i++] = pow(1.0, 1.0); + res_access[i++] = sinh(0.0); + res_access[i++] = sqrt(4.0); + res_access[i++] = tan(0.0); + res_access[i++] = tanh(0.0); + res_access[i++] = acosh(1.0); + res_access[i++] = asinh(0.0); + res_access[i++] = atanh(0.0); + res_access[i++] = cbrt(1.0); + res_access[i++] = erf(0.0); + res_access[i++] = erfc(0.0); + res_access[i++] = exp2(1.0); + res_access[i++] = expm1(0.0); + res_access[i++] = fdim(1.0, 0.0); + res_access[i++] = fma(1.0, 1.0, 1.0); + res_access[i++] = hypot(3.0, 4.0); + res_access[i++] = ilogb(1.0); + res_access[i++] = log1p(0.0); + res_access[i++] = log2(1.0); + res_access[i++] = logb(1.0); + res_access[i++] = remainder(0.5, 1.0); + res_access[i++] = remquo(0.5, 1.0, &quo_access[0]); + double a = NAN; + res_access[i++] = tgamma(a); + res_access[i++] = lgamma(a); + enm_access[0] = _Dtest(&a); + a = 0.0; + enm_access[1] = _Exp(&a, 1.0, 0); + res_access[i++] = a; + res_access[i++] = _Cosh(0.0, 2.0); + res_access[i++] = _Sinh(0.0, 1.0); + }); + }); + } + + // Compare result with reference + for (int i = 0; i < TEST_NUM; ++i) { + assert(approx_equal_fp(result[i], ref_val[i])); + } + + // Test modf integral part + assert(approx_equal_fp(iptr, refIptr)); + + // Test frexp exponent + assert(exponent == 0); + 
+ // Test remquo sign + assert(quo == 0); + + // Test enum value returned by _Dtest + assert(enm[0] == _NANCODE); + + // Test enum value returned by _Exp + assert(enm[1] == _FINITE); +} + +int main() { + s::queue deviceQueue; + if (deviceQueue.get_device().has_extension("cl_khr_fp64")) { + device_math_test(deviceQueue); + std::cout << "Pass" << std::endl; + } + return 0; +} diff --git a/SYCL/Basic/devicelib/math_override_test.cpp b/SYCL/Basic/devicelib/math_override_test.cpp new file mode 100644 index 0000000000..829fc6360b --- /dev/null +++ b/SYCL/Basic/devicelib/math_override_test.cpp @@ -0,0 +1,49 @@ +// UNSUPPORTED: windows +// RUN: %clangxx -fsycl -c %s -o %t.o +// RUN: %clangxx -fsycl %t.o %sycl_libs_dir/libsycl-cmath.o -o %t.out +// RUN: %HOST_RUN_PLACEHOLDER %t.out +// REQUIRES: host +#include +#include +#include + +#include "math_utils.hpp" +namespace s = cl::sycl; +constexpr s::access::mode sycl_read = s::access::mode::read; +constexpr s::access::mode sycl_write = s::access::mode::write; + +// Dummy function provided by user to override device library +// version. 
// User-supplied sinf with C linkage. It returns x + 100 so that a caller can
// tell this definition apart from a real sine implementation.
SYCL_EXTERNAL
extern "C" float sinf(float x) { return x + 100.f; }

class DeviceTest;

// Calls sinf(0) and cosf(0) inside one single_task, stores each result through
// its own one-element buffer, and asserts on the host that sinf produced 100
// (the override above) while cosf produced 1.
// NOTE(review): the template arguments of s::buffer, get_access and
// single_task look like they were lost from this copy of the file (DeviceTest
// is declared but never named below) - confirm against the original source.
void device_test() {
  s::queue deviceQueue;
  s::range<1> numOfItems{1};
  float result_sin = 0;
  float result_cos = 0;
  {
    // presumably this inner scope ends the buffers' lifetime so that the
    // results are back in result_sin/result_cos before the assert - TODO
    // confirm against the SYCL buffer destruction rules
    s::buffer buffer1(&result_sin, numOfItems);
    s::buffer buffer2(&result_cos, numOfItems);
    deviceQueue.submit([&](s::handler &cgh) {
      auto res_access_sin = buffer1.get_access(cgh);
      auto res_access_cos = buffer2.get_access(cgh);
      cgh.single_task([=]() {
        // Should use the sin function defined by user, device
        // library version should be ignored here
        res_access_sin[0] = sinf(0.f);
        res_access_cos[0] = cosf(0.f);
      });
    });
  }

  // 100 can only come from the override; 1 is the ordinary value of cosf(0).
  assert(approx_equal_fp(result_sin, 100.f) && approx_equal_fp(result_cos, 1.f));
}

// Runs the override check and prints "Pass" once it has returned.
int main() {
  device_test();
  std::cout << "Pass" << std::endl;
  return 0;
}
integral part of float point in modf function + float iptr = -1; + + // Variable quo stores the sign and some bits of x/y in remquo function + int quo = -1; + { + s::buffer buffer1(result, numOfItems); + s::buffer buffer2(&exponent, s::range<1>{1}); + s::buffer buffer3(&iptr, s::range<1>{1}); + s::buffer buffer4(&quo, s::range<1>{1}); + deviceQueue.submit([&](cl::sycl::handler &cgh) { + auto res_access = buffer1.template get_access(cgh); + auto exp_access = buffer2.template get_access(cgh); + auto iptr_access = buffer3.template get_access(cgh); + auto quo_access = buffer4.template get_access(cgh); + cgh.single_task([=]() { + int i = 0; + res_access[i++] = cosf(0.0f); + res_access[i++] = sinf(0.0f); + res_access[i++] = logf(1.0f); + res_access[i++] = acosf(1.0f); + res_access[i++] = asinf(0.0f); + res_access[i++] = atanf(0.0f); + res_access[i++] = atan2f(0.0f, 1.0f); + res_access[i++] = coshf(0.0f); + res_access[i++] = expf(0.0f); + res_access[i++] = fmodf(1.5f, 1.0f); + res_access[i++] = frexpf(0.0f, &exp_access[0]); + res_access[i++] = ldexpf(1.0f, 1); + res_access[i++] = log10f(1.0f); + res_access[i++] = modff(1.0f, &iptr_access[0]); + res_access[i++] = powf(1.0f, 1.0f); + res_access[i++] = sinhf(0.0f); + res_access[i++] = sqrtf(4.0f); + res_access[i++] = tanf(0.0f); + res_access[i++] = tanhf(0.0f); + res_access[i++] = acoshf(1.0f); + res_access[i++] = asinhf(0.0f); + res_access[i++] = atanhf(0.0f); + res_access[i++] = cbrtf(1.0f); + res_access[i++] = erff(0.0f); + res_access[i++] = erfcf(0.0f); + res_access[i++] = exp2f(1.0f); + res_access[i++] = expm1f(0.0f); + res_access[i++] = fdimf(1.0f, 0.0f); + res_access[i++] = fmaf(1.0f, 1.0f, 1.0f); + res_access[i++] = hypotf(3.0f, 4.0f); + res_access[i++] = ilogbf(1.0f); + res_access[i++] = log1pf(0.0f); + res_access[i++] = log2f(1.0f); + res_access[i++] = logbf(1.0f); + res_access[i++] = remainderf(0.5f, 1.0f); + res_access[i++] = remquof(0.5f, 1.0f, &quo_access[0]); + float a = NAN; + res_access[i++] = tgammaf(a); + 
#ifndef MATH_UTILS
// The guard macro has to be defined, otherwise a second inclusion of this
// header in the same translation unit redefines approx_equal_fp.
#define MATH_UTILS
#include <cmath>
#include <limits>

// Since it is not proper to compare floating-point values using operator ==,
// this function measures whether the result of a cmath function computed in a
// kernel is close to the reference value. The tolerance is 100 times the
// machine epsilon of T. T must be a floating-point type.
//
// Rules, in order:
//  - if either value is NaN, they compare equal only when both are NaN;
//  - if either value is infinite, they compare equal only when they are the
//    same infinity;
//  - if both values are non-zero, the difference is checked relative to the
//    larger magnitude;
//  - otherwise (at least one value is exactly zero) the other value is
//    checked against the tolerance as an absolute bound.
template <typename T>
bool approx_equal_fp(T x, T y) {

  // At least one input is nan
  if (std::isnan(x) || std::isnan(y))
    return std::isnan(x) && std::isnan(y);

  // At least one input is inf
  if (std::isinf(x) || std::isinf(y))
    return (x == y);

  // two finite
  T threshold = std::numeric_limits<T>::epsilon() * 100;
  if (x != 0 && y != 0) {
    T max_v = std::fmax(std::abs(x), std::abs(y));
    return std::abs(x - y) < threshold * max_v;
  }
  // A relative bound is meaningless against zero, so fall back to an absolute
  // one on whichever operand is non-zero (two zeros trivially pass).
  return x != 0 ? std::abs(x) < threshold : std::abs(y) < threshold;
}

#endif // MATH_UTILS
= acosf(1.0f); + res_access[i++] = asinf(0.0f); + res_access[i++] = atanf(0.0f); + res_access[i++] = atan2f(0.0f, 1.0f); + res_access[i++] = coshf(0.0f); + res_access[i++] = expf(0.0f); + res_access[i++] = fmodf(1.5f, 1.0f); + res_access[i++] = log10f(1.0f); + res_access[i++] = modff(1.0f, &iptr_access[0]); + res_access[i++] = powf(1.0f, 1.0f); + res_access[i++] = sinhf(0.0f); + res_access[i++] = sqrtf(4.0f); + res_access[i++] = tanf(0.0f); + res_access[i++] = tanhf(0.0f); + res_access[i++] = acoshf(1.0f); + res_access[i++] = asinhf(0.0f); + res_access[i++] = atanhf(0.0f); + res_access[i++] = cbrtf(1.0f); + res_access[i++] = erff(0.0f); + res_access[i++] = erfcf(0.0f); + res_access[i++] = exp2f(1.0f); + res_access[i++] = expm1f(0.0f); + res_access[i++] = fdimf(1.0f, 0.0f); + res_access[i++] = fmaf(1.0f, 1.0f, 1.0f); + res_access[i++] = hypotf(3.0f, 4.0f); + res_access[i++] = ilogbf(1.0f); + res_access[i++] = log1pf(0.0f); + res_access[i++] = log2f(1.0f); + res_access[i++] = logbf(1.0f); + res_access[i++] = remainderf(0.5f, 1.0f); + res_access[i++] = remquof(0.5f, 1.0f, &quo_access[0]); + float a = NAN; + res_access[i++] = tgammaf(a); + res_access[i++] = lgammaf(a); + enm_access[0] = _FDtest(&a); + a = 0.0f; + enm_access[1] = _FExp(&a, 1.0f, 0); + res_access[i++] = a; + res_access[i++] = _FCosh(0.0f, 2.0f); + res_access[i++] = _FSinh(0.0f, 1.0f); + }); + }); + } + + // Compare result with reference + for (int i = 0; i < TEST_NUM; ++i) { + assert(approx_equal_fp(result[i], ref_val[i])); + } + + // Test modf integral part + assert(approx_equal_fp(iptr, refIptr)); + + // Test remquo sign + assert(quo == 0); + + // Test enum value returned by _FDtest + assert(enm[0] == _NANCODE); + + // Test enum value returned by _FExp + assert(enm[1] == _FINITE); +} + +int main() { + s::queue deviceQueue; + device_math_test(deviceQueue); + std::cout << "Pass" << std::endl; + return 0; +} diff --git a/SYCL/Basic/devicelib/std_complex_math_fp64_test.cpp 
// Approximate equality for complex numbers: two values match when their real
// parts and their imaginary parts each satisfy approx_equal_fp (declared in
// math_utils.hpp). T is the underlying floating-point type.
template <typename T>
bool approx_equal_cmplx(std::complex<T> x, std::complex<T> y) {
  return approx_equal_fp(x.real(), y.real()) &&
         approx_equal_fp(x.imag(), y.imag());
}
complex(INFINITY, 0.), + complex(0., M_PI_2), + complex(INFINITY, M_PI_2), + complex(INFINITY, 0.), + complex(0., 0.), + complex(0., M_PI_2), + + complex(1., -4.), + complex(18., -7.), + complex(1.557407724654902, 0.), + complex(0, 0.761594155955765), + complex(M_PI_2, 0.), + complex(M_PI_2, 0.549306144334055)}; + +std::array ref2_results = {0., 25., 169., INFINITY, 0., + 5., 13., INFINITY, 0., M_PI_2}; + +void device_complex_test(s::queue &deviceQueue) { + s::range<1> numOfItems1{TestArraySize1}; + s::range<1> numOfItems2{TestArraySize2}; + std::array, TestArraySize1> result1; + std::array result2; + { + s::buffer, 1> buffer1(result1.data(), numOfItems1); + s::buffer buffer2(result2.data(), numOfItems2); + deviceQueue.submit([&](s::handler &cgh) { + auto buf_out1_access = buffer1.get_access(cgh); + auto buf_out2_access = buffer2.get_access(cgh); + cgh.single_task([=]() { + int index = 0; + buf_out1_access[index++] = + complex(0., 1.) * complex(1., 1.); + buf_out1_access[index++] = + complex(1., 1.) * complex(2., 1.); + buf_out1_access[index++] = + complex(2., 3.) * complex(2., 2.); + buf_out1_access[index++] = + complex(4., 5.) * complex(3., 4.); + buf_out1_access[index++] = + complex(-1., 1.) / complex(0., 1.); + buf_out1_access[index++] = + complex(1., 3.) / complex(1., 1.); + buf_out1_access[index++] = + complex(-2., 10.) / complex(2., 3.); + buf_out1_access[index++] = + complex(-8., 31.) / complex(4., 5.); + buf_out1_access[index++] = + complex(4., 2.) / complex(2., 0.); + buf_out1_access[index++] = + complex(-1., 0.) / complex(0., 1.); + buf_out1_access[index++] = + complex(0., 10.) / complex(0., 5.); + buf_out1_access[index++] = + complex(0., 0.) 
/ complex(1., 0.); + buf_out1_access[index++] = std::sqrt(complex(-1., 0.)); + buf_out1_access[index++] = std::sqrt(complex(0., 2.)); + buf_out1_access[index++] = std::sqrt(complex(4., 0.)); + buf_out1_access[index++] = std::sqrt(complex(-5., 12.)); + buf_out1_access[index++] = std::exp(complex(0., 0.)); + buf_out1_access[index++] = std::exp(complex(0., M_PI_2)); + buf_out1_access[index++] = std::exp(complex(0., M_PI)); + buf_out1_access[index++] = std::exp(complex(1., M_PI_2)); + buf_out1_access[index++] = std::log(complex(1., 0.)); + buf_out1_access[index++] = std::log(complex(0., 1.)); + buf_out1_access[index++] = std::log(complex(-1., 0.)); + buf_out1_access[index++] = std::log(complex(0., M_E)); + buf_out1_access[index++] = std::sin(complex(0., 0.)); + buf_out1_access[index++] = std::sin(complex(M_PI_2, 0.)); + buf_out1_access[index++] = std::cos(complex(0., 0.)); + buf_out1_access[index++] = std::cos(complex(M_PI, 0.)); + buf_out1_access[index++] = std::log10(complex(0., 0.)); + buf_out1_access[index++] = std::polar(1.); + buf_out1_access[index++] = std::polar(10., 0.); + buf_out1_access[index++] = std::polar(100.); + buf_out1_access[index++] = std::polar(200., 0.); + buf_out1_access[index++] = std::proj(complex(1., 2.)); + buf_out1_access[index++] = std::proj(complex(INFINITY, -1.)); + buf_out1_access[index++] = std::proj(complex(0., -INFINITY)); + buf_out1_access[index++] = std::pow(complex(-1., 0.), 0.5); + buf_out1_access[index++] = std::acos(complex(0., 0.)); + buf_out1_access[index++] = std::sinh(complex(0., 0.)); + buf_out1_access[index++] = std::cosh(complex(0., 0.)); + buf_out1_access[index++] = std::cosh(complex(INFINITY, 0.)); + buf_out1_access[index++] = std::tanh(complex(0., 0.)); + buf_out1_access[index++] = std::tanh(complex(INFINITY, 1.)); + buf_out1_access[index++] = std::asinh(complex(0., 0.)); + buf_out1_access[index++] = std::asinh(complex(1., INFINITY)); + buf_out1_access[index++] = std::asinh(complex(INFINITY, 1.)); + 
buf_out1_access[index++] = std::acosh(complex(0., 0.)); + buf_out1_access[index++] = std::acosh(complex(1., INFINITY)); + buf_out1_access[index++] = std::acosh(complex(INFINITY, 1.)); + buf_out1_access[index++] = std::atanh(complex(0., 0.)); + buf_out1_access[index++] = std::atanh(complex(1., INFINITY)); + buf_out1_access[index++] = std::conj(complex(1., 4.)); + buf_out1_access[index++] = std::conj(complex(18., 7.)); + buf_out1_access[index++] = std::tan(complex(1., 0.)); + buf_out1_access[index++] = std::tan(complex(0., 1.)); + buf_out1_access[index++] = std::asin(complex(1., 0.)); + buf_out1_access[index++] = std::atan(complex(0., 2.)); + + index = 0; + buf_out2_access[index++] = std::norm(complex(0., 0.)); + buf_out2_access[index++] = std::norm(complex(3., 4.)); + buf_out2_access[index++] = std::norm(complex(12., 5.)); + buf_out2_access[index++] = std::norm(complex(INFINITY, 1.)); + buf_out2_access[index++] = std::abs(complex(0., 0.)); + buf_out2_access[index++] = std::abs(complex(3., 4.)); + buf_out2_access[index++] = std::abs(complex(12., 5.)); + buf_out2_access[index++] = std::abs(complex(INFINITY, 1.)); + buf_out2_access[index++] = std::arg(complex(1., 0.)); + buf_out2_access[index++] = std::arg(complex(0., 1.)); + }); + }); + } + + for (size_t idx = 0; idx < TestArraySize1; ++idx) { + assert(approx_equal_cmplx(result1[idx], ref1_results[idx])); + } + for (size_t idx = 0; idx < TestArraySize2; ++idx) { + assert(approx_equal_fp(result2[idx], ref2_results[idx])); + } +} + +int main() { + s::queue deviceQueue; + if (deviceQueue.get_device().has_extension("cl_khr_fp64")) { + device_complex_test(deviceQueue); + std::cout << "Pass" << std::endl; + } +} diff --git a/SYCL/Basic/devicelib/std_complex_math_test.cpp b/SYCL/Basic/devicelib/std_complex_math_test.cpp new file mode 100644 index 0000000000..c8e585ef6c --- /dev/null +++ b/SYCL/Basic/devicelib/std_complex_math_test.cpp @@ -0,0 +1,204 @@ +// UNSUPPORTED: windows +// RUN: %clangxx -fsycl -c %s -o %t.o +// RUN: 
// Approximate equality for complex numbers: two values match when their real
// parts and their imaginary parts each satisfy approx_equal_fp (declared in
// math_utils.hpp). T is the underlying floating-point type.
template <typename T>
bool approx_equal_cmplx(std::complex<T> x, std::complex<T> y) {
  return approx_equal_fp(x.real(), y.real()) &&
         approx_equal_fp(x.imag(), y.imag());
}
complex(1.557408f, 0.f), + complex(0.f, 0.761594f), + complex(M_PI_2, 0.f), + complex(M_PI_2, 0.549306f)}; + +std::array ref2_results = {0.f, 25.f, 169.f, INFINITY, 0.f, + 5.f, 13.f, INFINITY, 0.f, M_PI_2}; + +void device_complex_test(s::queue &deviceQueue) { + s::range<1> numOfItems1{TestArraySize1}; + s::range<1> numOfItems2{TestArraySize2}; + std::array, TestArraySize1> result1; + std::array result2; + { + s::buffer, 1> buffer1(result1.data(), numOfItems1); + s::buffer buffer2(result2.data(), numOfItems2); + deviceQueue.submit([&](s::handler &cgh) { + auto buf_out1_access = buffer1.get_access(cgh); + auto buf_out2_access = buffer2.get_access(cgh); + cgh.single_task([=]() { + int index = 0; + buf_out1_access[index++] = + complex(0.f, 1.f) * complex(1.f, 1.f); + buf_out1_access[index++] = + complex(1.f, 1.f) * complex(2.f, 1.f); + buf_out1_access[index++] = + complex(2.f, 3.f) * complex(2.f, 2.f); + buf_out1_access[index++] = + complex(4.f, 5.f) * complex(3.f, 4.f); + buf_out1_access[index++] = + complex(-1.f, 1.f) / complex(0.f, 1.f); + buf_out1_access[index++] = + complex(1.f, 3.f) / complex(1.f, 1.f); + buf_out1_access[index++] = + complex(-2.f, 10.f) / complex(2.f, 3.f); + buf_out1_access[index++] = + complex(-8.f, 31.f) / complex(4.f, 5.f); + buf_out1_access[index++] = + complex(4.f, 2.f) / complex(2.f, 0.f); + buf_out1_access[index++] = + complex(-1.f, 0.f) / complex(0.f, 1.f); + buf_out1_access[index++] = + complex(0.f, 10.f) / complex(0.f, 5.f); + buf_out1_access[index++] = + complex(0.f, 0.f) / complex(1.f, 0.f); + buf_out1_access[index++] = std::sqrt(complex(-1.f, 0.f)); + buf_out1_access[index++] = std::sqrt(complex(0.f, 2.f)); + buf_out1_access[index++] = std::sqrt(complex(4.f, 0.f)); + buf_out1_access[index++] = std::sqrt(complex(-5.f, 12.f)); + buf_out1_access[index++] = std::exp(complex(0.f, 0.f)); + buf_out1_access[index++] = std::exp(complex(0.f, M_PI_2)); + buf_out1_access[index++] = std::exp(complex(0.f, M_PI)); + buf_out1_access[index++] = 
std::exp(complex(1.f, M_PI_2)); + buf_out1_access[index++] = std::log(complex(1.f, 0.f)); + buf_out1_access[index++] = std::log(complex(0.f, 1.f)); + buf_out1_access[index++] = std::log(complex(-1.f, 0.f)); + buf_out1_access[index++] = std::log(complex(0.f, M_E)); + buf_out1_access[index++] = std::sin(complex(0.f, 0.f)); + buf_out1_access[index++] = std::sin(complex(M_PI_2, 0.f)); + buf_out1_access[index++] = std::cos(complex(0.f, 0.f)); + buf_out1_access[index++] = std::cos(complex(M_PI, 0.f)); + buf_out1_access[index++] = std::log10(complex(0.f, 0.f)); + buf_out1_access[index++] = std::polar(1.f); + buf_out1_access[index++] = std::polar(10.f, 0.f); + buf_out1_access[index++] = std::polar(100.f); + buf_out1_access[index++] = std::polar(200.f, 0.f); + buf_out1_access[index++] = std::proj(complex(1.f, 2.f)); + buf_out1_access[index++] = std::proj(complex(INFINITY, -1.f)); + buf_out1_access[index++] = std::proj(complex(0.f, -INFINITY)); + buf_out1_access[index++] = std::pow(complex(-1.f, 0.f), 0.5f); + buf_out1_access[index++] = std::acos(complex(0.f, 0.f)); + buf_out1_access[index++] = std::sinh(complex(0.f, 0.f)); + buf_out1_access[index++] = std::cosh(complex(0.f, 0.f)); + buf_out1_access[index++] = std::cosh(complex(INFINITY, 0.f)); + buf_out1_access[index++] = std::tanh(complex(0.f, 0.f)); + buf_out1_access[index++] = std::tanh(complex(INFINITY, 1.f)); + buf_out1_access[index++] = std::asinh(complex(0.f, 0.f)); + buf_out1_access[index++] = std::asinh(complex(1.f, INFINITY)); + buf_out1_access[index++] = std::asinh(complex(INFINITY, 1.f)); + buf_out1_access[index++] = std::acosh(complex(0.f, 0.f)); + buf_out1_access[index++] = std::acosh(complex(1.f, INFINITY)); + buf_out1_access[index++] = std::acosh(complex(INFINITY, 1.f)); + buf_out1_access[index++] = std::atanh(complex(0.f, 0.f)); + buf_out1_access[index++] = std::atanh(complex(1.f, INFINITY)); + buf_out1_access[index++] = std::conj(complex(1.f, 4.f)); + buf_out1_access[index++] = std::conj(complex(18.f, 
// Issues the four barrier forms - handler::barrier(), queue::submit_barrier()
// and the wait-list overload of each - interleaved with empty single_task
// kernels on three queues that share one context. The program asserts nothing
// itself; the RUN lines enable SYCL_PI_TRACE and the CHECK lines after main
// look for one piEnqueueEventsWaitWithBarrier entry per barrier call.
// NOTE(review): the single_task calls appear to have lost their kernel-name
// template arguments in this copy of the file - confirm against the original.
int main() {
  sycl::context Context;
  sycl::queue Q1(Context, sycl::default_selector{});

  // Two empty kernels so that the first barrier has work queued before it.
  Q1.submit([&](sycl::handler &cgh) {
    cgh.single_task([]() {});
  });
  Q1.submit([&](sycl::handler &cgh) {
    cgh.single_task([]() {});
  });

  // call handler::barrier()
  Q1.submit([&](sycl::handler &cgh) {
    cgh.barrier();
  });

  Q1.submit([&](sycl::handler &cgh) {
    cgh.single_task([]() {});
  });
  Q1.submit([&](sycl::handler &cgh) {
    cgh.single_task([]() {});
  });

  // call queue::submit_barrier()
  Q1.submit_barrier();

  // Two more queues on the same context, used for the wait-list variants.
  sycl::queue Q2(Context, sycl::default_selector{});
  sycl::queue Q3(Context, sycl::default_selector{});

  // Events from Q1 and Q2 that the barrier submitted to Q3 will wait on.
  auto Event1 = Q1.submit([&](sycl::handler &cgh) {
    cgh.single_task([]() {});
  });

  auto Event2 = Q2.submit([&](sycl::handler &cgh) {
    cgh.single_task([]() {});
  });

  // call handler::barrier(const vector_class &WaitList)
  Q3.submit([&](cl::sycl::handler &cgh) {
    cgh.barrier({Event1, Event2});
  });

  Q3.submit([&](sycl::handler &cgh) {
    cgh.single_task([]() {});
  });

  // Fresh events for the queue-level wait-list barrier.
  auto Event3 = Q1.submit([&](sycl::handler &cgh) {
    cgh.single_task([]() {});
  });

  auto Event4 = Q2.submit([&](sycl::handler &cgh) {
    cgh.single_task([]() {});
  });

  // call queue::submit_barrier(const vector_class &WaitList)
  Q3.submit_barrier({Event3, Event4});

  Q3.submit([&](sycl::handler &cgh) {
    cgh.single_task([]() {});
  });

  return 0;
}
// Command-group functor for asm_16_matrix_mult.cpp. Each work-item writes the
// value 7 into the output buffer, obtained either from an inline-asm "mov" of
// the immediate 0x7 (device compilation with INLINE_ASM defined) or from a
// plain assignment (all other builds).
// NOTE(review): despite the file name there is no matrix multiply here; the
// diff header shows the same blob id (6ae1debb67) as asm_16_no_input_int.cpp,
// so the two files are byte-identical - confirm this is intended.
// NOTE(review): the template parameter list, the WithOutputBuffer argument,
// the get_access mode and the parallel_for kernel name look stripped from this
// copy of the file - confirm against the original source.
template
struct KernelFunctor : WithOutputBuffer {
  // problem_size is forwarded unchanged to the WithOutputBuffer base.
  KernelFunctor(size_t problem_size) : WithOutputBuffer(problem_size) {}

  // Submits one parallel_for over getOutputBufferSize() work-items with a
  // required sub-group size of 16.
  void operator()(cl::sycl::handler &cgh) {
    auto C = this->getOutputBuffer().template get_access(cgh);
    cgh.parallel_for>(
        cl::sycl::range<1>{this->getOutputBufferSize()}, [=](cl::sycl::id<1> wiID) [[cl::intel_reqd_sub_group_size(16)]] {
          volatile int output = 0;
#if defined(INLINE_ASM) && defined(__SYCL_DEVICE_ONLY__)
          // (M1,16) matches the sub-group size requested above.
          asm volatile("mov (M1,16) %0(0,0)<1> 0x7:d"
                       : "=rw"(output));
#else
          // Reference path: same observable result without inline asm.
          output = 7;
#endif
          C[wiID] = output;
        });
  }
};

// Exit status: 0 when launchInlineASMTest returns false (presumably the test
// could not be run on this device - see asmhelper.h) or when every output
// element equals 7; 1 otherwise.
int main() {
  KernelFunctor<> f(DEFAULT_PROBLEM_SIZE);
  if (!launchInlineASMTest(f))
    return 0;

  if (verify_all_the_same(f.getOutputBufferData(), 7))
    return 0;

  return 1;
}
// Command-group functor for asm_16_no_input_int.cpp: an inline-asm statement
// with one output operand and no inputs. Each work-item stores 7 into the
// output buffer; the value comes from an inline-asm "mov" of the immediate 0x7
// on device builds with INLINE_ASM defined and from a plain assignment
// otherwise.
// NOTE(review): the template parameter list, the WithOutputBuffer argument,
// the get_access mode and the parallel_for kernel name look stripped from this
// copy of the file - confirm against the original source.
template
struct KernelFunctor : WithOutputBuffer {
  // problem_size is forwarded unchanged to the WithOutputBuffer base.
  KernelFunctor(size_t problem_size) : WithOutputBuffer(problem_size) {}

  // Submits one parallel_for over getOutputBufferSize() work-items with a
  // required sub-group size of 16.
  void operator()(cl::sycl::handler &cgh) {
    auto C = this->getOutputBuffer().template get_access(cgh);
    cgh.parallel_for>(
        cl::sycl::range<1>{this->getOutputBufferSize()}, [=](cl::sycl::id<1> wiID) [[cl::intel_reqd_sub_group_size(16)]] {
          volatile int output = 0;
#if defined(INLINE_ASM) && defined(__SYCL_DEVICE_ONLY__)
          // "=rw"(output) is the only operand: one output, no inputs.
          asm volatile("mov (M1,16) %0(0,0)<1> 0x7:d"
                       : "=rw"(output));
#else
          // Reference path: same observable result without inline asm.
          output = 7;
#endif
          C[wiID] = output;
        });
  }
};

// Exit status: 0 when launchInlineASMTest returns false (presumably the test
// could not be run on this device - see asmhelper.h) or when every output
// element equals 7; 1 otherwise.
int main() {
  KernelFunctor<> f(DEFAULT_PROBLEM_SIZE);
  if (!launchInlineASMTest(f))
    return 0;

  if (verify_all_the_same(f.getOutputBufferData(), 7))
    return 0;

  return 1;
}
void operator()(cl::sycl::handler &cgh) { + auto C = this->getOutputBuffer().template get_access(cgh); + cgh.parallel_for>( + cl::sycl::range<1>{this->getOutputBufferSize()}, [=](cl::sycl::id<1> wiID) [[cl::intel_reqd_sub_group_size(16)]] { + for (int i = 0; i < 10; ++i) { +#if defined(INLINE_ASM) && defined(__SYCL_DEVICE_ONLY__) + asm("fence_sw"); + C[wiID] += i; + +#else + C[wiID] += i; +#endif + } + }); + } +}; + +int main() { + KernelFunctor<> f(DEFAULT_PROBLEM_SIZE); + if (!launchInlineASMTest(f)) + return 0; + + if (verify_all_the_same(f.getOutputBufferData(), 45)) + return 0; + + return 1; +} diff --git a/SYCL/Basic/feature-tests/inline-asm/asm_8_empty.cpp b/SYCL/Basic/feature-tests/inline-asm/asm_8_empty.cpp new file mode 100644 index 0000000000..97fae0ed4e --- /dev/null +++ b/SYCL/Basic/feature-tests/inline-asm/asm_8_empty.cpp @@ -0,0 +1,40 @@ +// UNSUPPORTED: cuda +// REQUIRES: gpu,linux +// RUN: %clangxx -fsycl %s -DINLINE_ASM -o %t.out +// RUN: %t.out +// RUN: %clangxx -fsycl %s -o %t.ref.out +// RUN: %t.ref.out + +#include "include/asmhelper.h" +#include +#include +#include + +using dataType = cl::sycl::cl_int; + +template +struct KernelFunctor : WithOutputBuffer { + KernelFunctor(size_t problem_size) : WithOutputBuffer(problem_size) {} + + void operator()(cl::sycl::handler &cgh) { + auto C = this->getOutputBuffer().template get_access(cgh); + cgh.parallel_for>( + cl::sycl::range<1>{this->getOutputBufferSize()}, [=](cl::sycl::id<1> wiID) [[cl::intel_reqd_sub_group_size(8)]] { + C[wiID] = 43; +#if defined(INLINE_ASM) && defined(__SYCL_DEVICE_ONLY__) + asm volatile(""); +#endif + }); + } +}; + +int main() { + KernelFunctor<> f(DEFAULT_PROBLEM_SIZE); + if (!launchInlineASMTest(f)) + return 0; + + if (verify_all_the_same(f.getOutputBufferData(), 43)) + return 0; + + return 1; +} diff --git a/SYCL/Basic/feature-tests/inline-asm/asm_8_no_input_int.cpp b/SYCL/Basic/feature-tests/inline-asm/asm_8_no_input_int.cpp new file mode 100644 index 
0000000000..6d1dcbb832 --- /dev/null +++ b/SYCL/Basic/feature-tests/inline-asm/asm_8_no_input_int.cpp @@ -0,0 +1,44 @@ +// UNSUPPORTED: cuda +// REQUIRES: gpu,linux +// RUN: %clangxx -fsycl %s -DINLINE_ASM -o %t.out +// RUN: %t.out +// RUN: %clangxx -fsycl %s -o %t.ref.out +// RUN: %t.ref.out + +#include "include/asmhelper.h" +#include +#include +#include + +using dataType = cl::sycl::cl_int; + +template +struct KernelFunctor : WithOutputBuffer { + KernelFunctor(size_t problem_size) : WithOutputBuffer(problem_size) {} + + void operator()(cl::sycl::handler &cgh) { + auto C = this->getOutputBuffer().template get_access(cgh); + cgh.parallel_for>( + cl::sycl::range<1>{this->getOutputBufferSize()}, [=](cl::sycl::id<1> wiID) [[cl::intel_reqd_sub_group_size(8)]] { + volatile int output = 0; +#if defined(INLINE_ASM) && defined(__SYCL_DEVICE_ONLY__) + asm volatile("mov (M1,8) %0(0,0)<1> 0x7:d" + : "=rw"(output)); +#else + output = 7; +#endif + C[wiID] = output; + }); + } +}; + +int main() { + KernelFunctor<> f(DEFAULT_PROBLEM_SIZE); + if (!launchInlineASMTest(f)) + return 0; + + if (verify_all_the_same(f.getOutputBufferData(), 7)) + return 0; + + return 1; +} diff --git a/SYCL/Basic/feature-tests/inline-asm/asm_arbitrary_ops_order.cpp b/SYCL/Basic/feature-tests/inline-asm/asm_arbitrary_ops_order.cpp new file mode 100644 index 0000000000..28d0af1d45 --- /dev/null +++ b/SYCL/Basic/feature-tests/inline-asm/asm_arbitrary_ops_order.cpp @@ -0,0 +1,59 @@ +// UNSUPPORTED: cuda +// REQUIRES: gpu,linux +// RUN: %clangxx -fsycl %s -DINLINE_ASM -o %t.out +// RUN: %t.out +// RUN: %clangxx -fsycl %s -o %t.ref.out +// RUN: %t.ref.out + +#include "include/asmhelper.h" +#include +#include +#include + +using dataType = cl::sycl::cl_int; + +template +struct KernelFunctor : WithInputBuffers, WithOutputBuffer { + KernelFunctor(const std::vector &input1, const std::vector &input2, const std::vector &input3) : WithInputBuffers(input1, input2, input3), WithOutputBuffer(input1.size()) {} + + void 
operator()(cl::sycl::handler &cgh) { + auto A = this->getInputBuffer(0).template get_access(cgh); + auto B = this->getInputBuffer(1).template get_access(cgh); + auto C = this->getInputBuffer(2).template get_access(cgh); + auto D = this->getOutputBuffer().template get_access(cgh); + + cgh.parallel_for>( + cl::sycl::range<1>{this->getOutputBufferSize()}, [=](cl::sycl::id<1> wiID) [[cl::intel_reqd_sub_group_size(8)]] { +#if defined(INLINE_ASM) && defined(__SYCL_DEVICE_ONLY__) + asm("mad (M1, 8) %0(0, 0)<1> %3(0, 0)<1;1,0> %1(0, 0)<1;1,0> %2(0, 0)<1;1,0>" + : "=rw"(D[wiID]) + : "rw"(B[wiID]), "rw"(C[wiID]), "rw"(A[wiID])); +#else + D[wiID] = A[wiID] * B[wiID] + C[wiID]; +#endif + }); + } +}; + +int main() { + std::vector inputA(DEFAULT_PROBLEM_SIZE), inputB(DEFAULT_PROBLEM_SIZE), inputC(DEFAULT_PROBLEM_SIZE); + for (int i = 0; i < DEFAULT_PROBLEM_SIZE; i++) { + inputA[i] = i; + inputB[i] = i; + inputC[i] = DEFAULT_PROBLEM_SIZE - i * i; + } + + KernelFunctor<> f(inputA, inputB, inputC); + if (!launchInlineASMTest(f)) + return 0; + + auto &D = f.getOutputBufferData(); + for (int i = 0; i < DEFAULT_PROBLEM_SIZE; ++i) { + if (D[i] != inputA[i] * inputB[i] + inputC[i]) { + std::cerr << "At index: " << i << ". 
"; + std::cerr << D[i] << " != " << inputA[i] * inputB[i] + inputC[i] << "\n"; + return 1; + } + } + return 0; +} diff --git a/SYCL/Basic/feature-tests/inline-asm/asm_decl_in_scope.cpp b/SYCL/Basic/feature-tests/inline-asm/asm_decl_in_scope.cpp new file mode 100644 index 0000000000..db30e20f5e --- /dev/null +++ b/SYCL/Basic/feature-tests/inline-asm/asm_decl_in_scope.cpp @@ -0,0 +1,67 @@ +// UNSUPPORTED: cuda +// REQUIRES: gpu,linux +// RUN: %clangxx -fsycl %s -DINLINE_ASM -o %t.out +// RUN: %t.out +// RUN: %clangxx -fsycl %s -o %t.ref.out +// RUN: %t.ref.out + +#include "include/asmhelper.h" +#include +#include +#include + +using dataType = cl::sycl::cl_int; + +template +struct KernelFunctor : WithInputBuffers, WithOutputBuffer { + KernelFunctor(const std::vector &input1, const std::vector &input2) : WithInputBuffers(input1, input2), WithOutputBuffer(input1.size()) {} + + void operator()(cl::sycl::handler &cgh) { + auto A = this->getInputBuffer(0).template get_access(cgh); + auto B = this->getInputBuffer(1).template get_access(cgh); + auto C = this->getOutputBuffer().template get_access(cgh); + + cgh.parallel_for>( + cl::sycl::range<1>{this->getOutputBufferSize()}, + [=](cl::sycl::id<1> wiID) [[cl::intel_reqd_sub_group_size(16)]] { + // declaration of temp within and outside the scope +#if defined(INLINE_ASM) && defined(__SYCL_DEVICE_ONLY__) + asm("{\n" + ".decl temp v_type=G type=d num_elts=16 align=GRF\n" + "mov (M1, 16) temp(0, 0)<1> %1(0, 0)<1;1,0>\n" + "mov (M1, 16) %0(0, 0)<1> temp(0, 0)<1;1,0>\n" + "}\n" + ".decl temp v_type=G type=d num_elts=16 align=GRF\n" + "mul (M1, 16) temp(0, 0)<1> %2(0, 0)<1;1,0> %0(0, 0)<1;1,0>\n" + "mov (M1, 16) %0(0, 0)<1> temp(0, 0)<1;1,0>\n" + : "+rw"(C[wiID]) + : "rw"(A[wiID]), "rw"(B[wiID])); +#else + C[wiID] = A[wiID]; + C[wiID] *= B[wiID]; +#endif + }); + } +}; + +int main() { + std::vector inputA(DEFAULT_PROBLEM_SIZE), inputB(DEFAULT_PROBLEM_SIZE); + for (int i = 0; i < DEFAULT_PROBLEM_SIZE; i++) { + inputA[i] = i; + 
inputB[i] = 2; + } + + KernelFunctor<> f(inputA, inputB); + if (!launchInlineASMTest(f)) + return 0; + + auto &C = f.getOutputBufferData(); + for (int i = 0; i < DEFAULT_PROBLEM_SIZE; ++i) { + if (C[i] != inputA[i] * inputB[i]) { + std::cerr << "At index: " << i << ". "; + std::cerr << C[i] << " != " << inputA[i] * inputB[i] << "\n"; + return 1; + } + } + return 0; +} diff --git a/SYCL/Basic/feature-tests/inline-asm/asm_float_add.cpp b/SYCL/Basic/feature-tests/inline-asm/asm_float_add.cpp new file mode 100644 index 0000000000..c23b084317 --- /dev/null +++ b/SYCL/Basic/feature-tests/inline-asm/asm_float_add.cpp @@ -0,0 +1,59 @@ +// UNSUPPORTED: cuda +// REQUIRES: gpu,linux +// RUN: %clangxx -fsycl %s -DINLINE_ASM -o %t.out +// RUN: %t.out +// RUN: %clangxx -fsycl %s -o %t.ref.out +// RUN: %t.ref.out + +#include "include/asmhelper.h" +#include +#include +#include +#include + +using dataType = cl::sycl::cl_double; + +template +struct KernelFunctor : WithInputBuffers, WithOutputBuffer { + KernelFunctor(const std::vector &input1, const std::vector &input2) : WithInputBuffers(input1, input2), WithOutputBuffer(input1.size()) {} + + void operator()(cl::sycl::handler &cgh) { + auto A = this->getInputBuffer(0).template get_access(cgh); + auto B = this->getInputBuffer(1).template get_access(cgh); + auto C = this->getOutputBuffer().template get_access(cgh); + + cgh.parallel_for>( + cl::sycl::range<1>{this->getOutputBufferSize()}, [=](cl::sycl::id<1> wiID) [[cl::intel_reqd_sub_group_size(8)]] { +#if defined(INLINE_ASM) && defined(__SYCL_DEVICE_ONLY__) + asm("add (M1, 8) %0(0, 0)<1> %1(0, 0)<1;1,0> %2(0, 0)<1;1,0>" + : "=rw"(C[wiID]) + : "rw"(A[wiID]), "rw"(B[wiID])); +#else + C[wiID] = A[wiID] + B[wiID]; +#endif + }); + } +}; + +int main() { + std::vector inputA(DEFAULT_PROBLEM_SIZE), inputB(DEFAULT_PROBLEM_SIZE); + for (int i = 0; i < DEFAULT_PROBLEM_SIZE; i++) { + inputA[i] = (double)1 / std::pow(2, i); + inputB[i] = (double)2 / std::pow(2, i); + } + + KernelFunctor<> 
f(inputA, inputB); + if (!launchInlineASMTest(f)) + return 0; + + auto &C = f.getOutputBufferData(); + for (int i = 0; i < DEFAULT_PROBLEM_SIZE; i++) { + if (C[i] != inputA[i] + inputB[i]) { + std::cerr << "At index: " << i << ". "; + std::cerr << C[i] << " != " << inputA[i] + inputB[i] << "\n"; + return 1; + } + } + + return 0; +} diff --git a/SYCL/Basic/feature-tests/inline-asm/asm_float_imm_arg.cpp b/SYCL/Basic/feature-tests/inline-asm/asm_float_imm_arg.cpp new file mode 100644 index 0000000000..c9683cf020 --- /dev/null +++ b/SYCL/Basic/feature-tests/inline-asm/asm_float_imm_arg.cpp @@ -0,0 +1,56 @@ +// UNSUPPORTED: cuda +// REQUIRES: gpu,linux +// RUN: %clangxx -fsycl %s -DINLINE_ASM -o %t.out +// RUN: %t.out +// RUN: %clangxx -fsycl %s -o %t.ref.out +// RUN: %t.ref.out + +#include "include/asmhelper.h" +#include +#include +#include +#include + +constexpr double IMM_ARGUMENT = 0.5; +using dataType = cl::sycl::cl_double; + +template +struct KernelFunctor : WithInputBuffers, WithOutputBuffer { + KernelFunctor(const std::vector &input) : WithInputBuffers(input), WithOutputBuffer(input.size()) {} + + void operator()(cl::sycl::handler &cgh) { + auto A = this->getInputBuffer(0).template get_access(cgh); + auto B = this->getOutputBuffer().template get_access(cgh); + + cgh.parallel_for>( + cl::sycl::range<1>{this->getOutputBufferSize()}, [=](cl::sycl::id<1> wiID) [[cl::intel_reqd_sub_group_size(8)]] { +#if defined(INLINE_ASM) && defined(__SYCL_DEVICE_ONLY__) + asm("mul (M1, 8) %0(0, 0)<1> %1(0, 0)<1;1,0> %2" + : "=rw"(B[wiID]) + : "rw"(A[wiID]), "rw"(IMM_ARGUMENT)); +#else + B[wiID] = A[wiID] * IMM_ARGUMENT; +#endif + }); + } +}; + +int main() { + std::vector input(DEFAULT_PROBLEM_SIZE); + for (int i = 0; i < DEFAULT_PROBLEM_SIZE; i++) + input[i] = (double)1 / std::pow(2, i); + + KernelFunctor<> f(input); + if (!launchInlineASMTest(f)) + return 0; + + auto &B = f.getOutputBufferData(); + for (int i = 0; i < DEFAULT_PROBLEM_SIZE; ++i) { + if (B[i] != input[i] * 
IMM_ARGUMENT) { + std::cerr << "At index: " << i << ". "; + std::cerr << B[i] << " != " << input[i] * IMM_ARGUMENT << "\n"; + return 1; + } + } + return 0; +} diff --git a/SYCL/Basic/feature-tests/inline-asm/asm_float_neg.cpp b/SYCL/Basic/feature-tests/inline-asm/asm_float_neg.cpp new file mode 100644 index 0000000000..290b089890 --- /dev/null +++ b/SYCL/Basic/feature-tests/inline-asm/asm_float_neg.cpp @@ -0,0 +1,57 @@ +// UNSUPPORTED: cuda +// REQUIRES: gpu,linux +// RUN: %clangxx -fsycl %s -DINLINE_ASM -o %t.out +// RUN: %t.out +// RUN: %clangxx -fsycl %s -o %t.ref.out +// RUN: %t.ref.out + +#include "include/asmhelper.h" +#include +#include +#include + +using dataType = cl::sycl::cl_float; + +template +struct KernelFunctor : WithInputBuffers, WithOutputBuffer { + KernelFunctor(const std::vector &input) : WithInputBuffers(input), WithOutputBuffer(input.size()) {} + + void operator()(cl::sycl::handler &cgh) { + auto A = this->getInputBuffer().template get_access(cgh); + auto B = this->getOutputBuffer().template get_access(cgh); + + cgh.parallel_for>( + cl::sycl::range<1>{this->getOutputBufferSize()}, [=](cl::sycl::id<1> wiID) [[cl::intel_reqd_sub_group_size(8)]] { +#if defined(INLINE_ASM) && defined(__SYCL_DEVICE_ONLY__) + asm("mov (M1, 8) %0(0, 0)<1> (-)%1(0, 0)<1;1,0>" + : "=rw"(B[wiID]) + : "rw"(A[wiID])); +#else + B[wiID] = -A[wiID]; +#endif + }); + } + + size_t problem_size = 0; +}; + +int main() { + std::vector input(DEFAULT_PROBLEM_SIZE); + for (int i = 0; i < DEFAULT_PROBLEM_SIZE; i++) + input[i] = 1.0 / (i + 1); // i + 1 keeps the first element finite (no division by zero at i == 0) + + KernelFunctor<> f(input); + if (!launchInlineASMTest(f)) + return 0; + + auto &R = f.getOutputBufferData(); + for (int i = 0; i < DEFAULT_PROBLEM_SIZE; ++i) { + if (R[i] != -input[i]) { + std::cerr << "At index: " << i << ". 
"; + std::cerr << R[i] << " != " << -input[i] << "\n"; + return 1; + } + } + + return 0; +} diff --git a/SYCL/Basic/feature-tests/inline-asm/asm_imm_arg.cpp b/SYCL/Basic/feature-tests/inline-asm/asm_imm_arg.cpp new file mode 100644 index 0000000000..2dba04d117 --- /dev/null +++ b/SYCL/Basic/feature-tests/inline-asm/asm_imm_arg.cpp @@ -0,0 +1,55 @@ +// UNSUPPORTED: cuda +// REQUIRES: gpu,linux +// RUN: %clangxx -fsycl %s -DINLINE_ASM -o %t.out +// RUN: %t.out +// RUN: %clangxx -fsycl %s -o %t.ref.out +// RUN: %t.ref.out + +#include "include/asmhelper.h" +#include +#include +#include + +constexpr int CONST_ARGUMENT = 0xabc; +using dataType = cl::sycl::cl_int; + +template +struct KernelFunctor : WithInputBuffers, WithOutputBuffer { + KernelFunctor(const std::vector &input) : WithInputBuffers(input), WithOutputBuffer(input.size()) {} + + void operator()(cl::sycl::handler &cgh) { + auto A = this->getInputBuffer(0).template get_access(cgh); + auto B = this->getOutputBuffer().template get_access(cgh); + + cgh.parallel_for>( + cl::sycl::range<1>{this->getOutputBufferSize()}, [=](cl::sycl::id<1> wiID) [[cl::intel_reqd_sub_group_size(8)]] { +#if defined(INLINE_ASM) && defined(__SYCL_DEVICE_ONLY__) + asm("add (M1, 8) %0(0, 0)<1> %1(0, 0)<1;1,0> %2" + : "=rw"(B[wiID]) + : "rw"(A[wiID]), "rw"(CONST_ARGUMENT)); +#else + B[wiID] = A[wiID] + CONST_ARGUMENT; +#endif + }); + } +}; + +int main() { + std::vector input(DEFAULT_PROBLEM_SIZE); + for (int i = 0; i < DEFAULT_PROBLEM_SIZE; i++) + input[i] = i; + + KernelFunctor<> f(input); + if (!launchInlineASMTest(f)) + return 0; + + auto &B = f.getOutputBufferData(); + for (int i = 0; i < DEFAULT_PROBLEM_SIZE; ++i) { + if (B[i] != input[i] + CONST_ARGUMENT) { + std::cerr << "At index: " << i << ". 
"; + std::cerr << B[i] << " != " << input[i] + CONST_ARGUMENT << "\n"; + return 1; + } + } + return 0; +} diff --git a/SYCL/Basic/feature-tests/inline-asm/asm_mul.cpp b/SYCL/Basic/feature-tests/inline-asm/asm_mul.cpp new file mode 100644 index 0000000000..726abcf787 --- /dev/null +++ b/SYCL/Basic/feature-tests/inline-asm/asm_mul.cpp @@ -0,0 +1,57 @@ +// UNSUPPORTED: cuda +// REQUIRES: gpu,linux +// RUN: %clangxx -fsycl %s -DINLINE_ASM -o %t.out +// RUN: %t.out +// RUN: %clangxx -fsycl %s -o %t.ref.out +// RUN: %t.ref.out + +#include "include/asmhelper.h" +#include +#include +#include + +using dataType = cl::sycl::cl_int; + +template +struct KernelFunctor : WithInputBuffers, WithOutputBuffer { + KernelFunctor(const std::vector &input1, const std::vector &input2) : WithInputBuffers(input1, input2), WithOutputBuffer(input1.size()) {} + void operator()(cl::sycl::handler &cgh) { + auto A = this->getInputBuffer(0).template get_access(cgh); + auto B = this->getInputBuffer(1).template get_access(cgh); + auto C = this->getOutputBuffer().template get_access(cgh); + + cgh.parallel_for>( + cl::sycl::range<1>{this->getOutputBufferSize()}, [=](cl::sycl::id<1> wiID) [[cl::intel_reqd_sub_group_size(8)]] { +#if defined(INLINE_ASM) && defined(__SYCL_DEVICE_ONLY__) + asm("mul (M1, 8) %0(0, 0)<1> %1(0, 0)<1;1,0> %2(0, 0)<1;1,0>" + : "=rw"(C[wiID]) + : "rw"(A[wiID]), "rw"(B[wiID])); +#else + C[wiID] = A[wiID] * B[wiID]; +#endif + }); + } +}; + +int main() { + std::vector inputA(DEFAULT_PROBLEM_SIZE), inputB(DEFAULT_PROBLEM_SIZE); + for (int i = 0; i < DEFAULT_PROBLEM_SIZE; i++) { + inputA[i] = i; + inputB[i] = DEFAULT_PROBLEM_SIZE - i; + } + + KernelFunctor<> f(inputA, inputB); + if (!launchInlineASMTest(f)) + return 0; + + auto &C = f.getOutputBufferData(); + for (int i = 0; i < DEFAULT_PROBLEM_SIZE; ++i) { + if (C[i] != inputA[i] * inputB[i]) { + std::cerr << "At index: " << i << ". 
"; + std::cerr << C[i] << " != " << inputA[i] * inputB[i] << "\n"; + return 1; + } + } + + return 0; +} diff --git a/SYCL/Basic/feature-tests/inline-asm/asm_multiple_instructions.cpp b/SYCL/Basic/feature-tests/inline-asm/asm_multiple_instructions.cpp new file mode 100644 index 0000000000..e8cf02a529 --- /dev/null +++ b/SYCL/Basic/feature-tests/inline-asm/asm_multiple_instructions.cpp @@ -0,0 +1,59 @@ +// UNSUPPORTED: cuda +// REQUIRES: gpu,linux +// RUN: %clangxx -fsycl %s -DINLINE_ASM -o %t.out +// RUN: %t.out +// RUN: %clangxx -fsycl %s -o %t.ref.out +// RUN: %t.ref.out + +#include "include/asmhelper.h" +#include +#include +#include + +using dataType = cl::sycl::cl_int; + +template +struct KernelFunctor : WithInputBuffers, WithOutputBuffer { + KernelFunctor(const std::vector &input1, const std::vector &input2, const std::vector &input3) : WithInputBuffers(input1, input2, input3), WithOutputBuffer(input1.size()) {} + + void operator()(cl::sycl::handler &cgh) { + auto A = this->getInputBuffer(0).template get_access(cgh); + auto B = this->getInputBuffer(1).template get_access(cgh); + auto C = this->getInputBuffer(2).template get_access(cgh); + auto D = this->getOutputBuffer().template get_access(cgh); + + cgh.parallel_for>( + cl::sycl::range<1>{this->getOutputBufferSize()}, [=](cl::sycl::id<1> wiID) [[cl::intel_reqd_sub_group_size(8)]] { +#if defined(INLINE_ASM) && defined(__SYCL_DEVICE_ONLY__) + asm("{\n" + "add (M1, 8) %1(0, 0)<1> %1(0, 0)<1;1,0> %2(0, 0)<1;1,0>\n" + "add (M1, 8) %1(0, 0)<1> %1(0, 0)<1;1,0> %3(0, 0)<1;1,0>\n" + "mov (M1, 8) %0(0, 0)<1> %1(0, 0)<1;1,0>\n" + "}\n" + : "=rw"(D[wiID]), "+rw"(A[wiID]) + : "rw"(B[wiID]), "rw"(C[wiID])); +#else + A[wiID] += B[wiID]; + A[wiID] += C[wiID]; + D[wiID] = A[wiID]; +#endif + }); + } +}; + +int main() { + std::vector inputA(DEFAULT_PROBLEM_SIZE), inputB(DEFAULT_PROBLEM_SIZE), inputC(DEFAULT_PROBLEM_SIZE); + for (int i = 0; i < DEFAULT_PROBLEM_SIZE; i++) { + inputA[i] = inputB[i] = i; + inputC[i] = 
DEFAULT_PROBLEM_SIZE - 2 * i; // A[i] + B[i] + C[i] = DEFAULT_PROBLEM_SIZE + } + + KernelFunctor<> f(inputA, inputB, inputC); + if (!launchInlineASMTest(f)) + return 0; + + if (verify_all_the_same(f.getOutputBufferData(), (dataType)DEFAULT_PROBLEM_SIZE)) + return 0; + + return 1; +} diff --git a/SYCL/Basic/feature-tests/inline-asm/asm_no_operands.cpp b/SYCL/Basic/feature-tests/inline-asm/asm_no_operands.cpp new file mode 100644 index 0000000000..3a3a919caa --- /dev/null +++ b/SYCL/Basic/feature-tests/inline-asm/asm_no_operands.cpp @@ -0,0 +1,34 @@ +// UNSUPPORTED: cuda +// REQUIRES: gpu,linux +// RUN: %clangxx -fsycl %s -DINLINE_ASM -o %t.out +// RUN: %t.out +// RUN: %clangxx -fsycl %s -o %t.ref.out +// RUN: %t.ref.out + +#include "include/asmhelper.h" +#include +class no_operands_kernel; + +int main() { + // Creating SYCL queue + cl::sycl::queue Queue; + cl::sycl::device Device = Queue.get_device(); + + if (!isInlineASMSupported(Device) || !Device.has_extension("cl_intel_required_subgroup_size")) { + std::cout << "Skipping test\n"; + return 0; + } + // Size of index space for kernel + cl::sycl::range<1> NumOfWorkItems{16}; + + // Submitting command group(work) to queue and waiting for it to finish + Queue.submit([&](cl::sycl::handler &cgh) { + // Executing kernel + cgh.parallel_for( + NumOfWorkItems, [=](cl::sycl::id<1> WIid) [[cl::intel_reqd_sub_group_size(8)]] { +#if defined(INLINE_ASM) && defined(__SYCL_DEVICE_ONLY__) + asm("barrier"); +#endif + }); + }).wait(); +} diff --git a/SYCL/Basic/feature-tests/inline-asm/asm_no_output.cpp b/SYCL/Basic/feature-tests/inline-asm/asm_no_output.cpp new file mode 100644 index 0000000000..ff6c65d48b --- /dev/null +++ b/SYCL/Basic/feature-tests/inline-asm/asm_no_output.cpp @@ -0,0 +1,47 @@ +// UNSUPPORTED: cuda +// REQUIRES: gpu,linux +// RUN: %clangxx -fsycl %s -DINLINE_ASM -o %t.out +// RUN: %t.out +// RUN: %clangxx -fsycl %s -o %t.ref.out +// RUN: %t.ref.out + +#include "include/asmhelper.h" +#include +#include +#include + +using dataType = cl::sycl::cl_int; + +template
+struct KernelFunctor : WithOutputBuffer { + KernelFunctor(size_t problem_size) : WithOutputBuffer(problem_size) {} + + void operator()(cl::sycl::handler &cgh) { + auto C = this->getOutputBuffer().template get_access(cgh); + cgh.parallel_for>( + cl::sycl::range<1>{this->getOutputBufferSize()}, [=](cl::sycl::id<1> wiID) [[cl::intel_reqd_sub_group_size(8)]] { + volatile int local_var = 47; + local_var += C[0]; +#if defined(INLINE_ASM) && defined(__SYCL_DEVICE_ONLY__) + asm volatile("{\n" + ".decl temp v_type=G type=w num_elts=8 align=GRF\n" + "mov (M1,16) temp(0, 0)<1> %0(0,0)<1;1,0>\n" + "}\n" ::"rw"(local_var)); +#else + volatile int temp = 0; + temp = local_var; +#endif + }); + } +}; + +int main() { + KernelFunctor<> f(DEFAULT_PROBLEM_SIZE); + if (!launchInlineASMTest(f)) + return 0; + + if (verify_all_the_same(f.getOutputBufferData(), 0)) + return 0; + + return 1; +} diff --git a/SYCL/Basic/feature-tests/inline-asm/asm_plus_mod.cpp b/SYCL/Basic/feature-tests/inline-asm/asm_plus_mod.cpp new file mode 100644 index 0000000000..f65cda777e --- /dev/null +++ b/SYCL/Basic/feature-tests/inline-asm/asm_plus_mod.cpp @@ -0,0 +1,58 @@ +// UNSUPPORTED: cuda +// REQUIRES: gpu,linux +// RUN: %clangxx -fsycl %s -DINLINE_ASM -o %t.out +// RUN: %t.out +// RUN: %clangxx -fsycl %s -o %t.ref.out +// RUN: %t.ref.out + +#include "include/asmhelper.h" +#include +#include +#include + +using dataType = cl::sycl::cl_int; + +template +struct KernelFunctor : WithInputBuffers, WithOutputBuffer { + KernelFunctor(const std::vector &input1, const std::vector &input2) : WithInputBuffers(input1), WithOutputBuffer(input2) {} + + void operator()(cl::sycl::handler &cgh) { + auto A = this->getInputBuffer(0).template get_access(cgh); + auto B = this->getOutputBuffer().template get_access(cgh); + + cgh.parallel_for>( + cl::sycl::range<1>{this->getOutputBufferSize()}, [=](cl::sycl::id<1> wiID) [[cl::intel_reqd_sub_group_size(16)]] { +#if defined(INLINE_ASM) && defined(__SYCL_DEVICE_ONLY__) + asm("add 
(M1, 16) %0(0, 0)<1> %0(0, 0)<1;1,0> %1(0, 0)<1;1,0>" + : "+rw"(B[wiID]) + : "rw"(A[wiID])); +#else + B[wiID] += A[wiID]; +#endif + }); + } +}; + +int main() { + std::vector inputA(DEFAULT_PROBLEM_SIZE), inputB(DEFAULT_PROBLEM_SIZE), R(DEFAULT_PROBLEM_SIZE); + for (int i = 0; i < DEFAULT_PROBLEM_SIZE; i++) { + inputA[i] = i; + inputB[i] = DEFAULT_PROBLEM_SIZE - i; + R[i] = inputA[i] + inputB[i]; + } + + KernelFunctor<> f(inputA, inputB); + if (!launchInlineASMTest(f)) + return 0; + + auto &B = f.getOutputBufferData(); + for (int i = 0; i < DEFAULT_PROBLEM_SIZE; ++i) { + if (B[i] != R[i]) { + std::cerr << "At index: " << i << ". "; + std::cerr << B[i] << " != " << R[i] << "\n"; + return 1; + } + } + + return 0; +} diff --git a/SYCL/Basic/feature-tests/inline-asm/include/asmhelper.h b/SYCL/Basic/feature-tests/inline-asm/include/asmhelper.h new file mode 100644 index 0000000000..75585e1611 --- /dev/null +++ b/SYCL/Basic/feature-tests/inline-asm/include/asmhelper.h @@ -0,0 +1,128 @@ +#include + +#include +#include +#include + +constexpr const size_t DEFAULT_PROBLEM_SIZE = 16; + +template +struct WithOutputBuffer { + WithOutputBuffer(size_t size) { + _output_buffer_data.resize(size); + _output_buffer.reset(new cl::sycl::buffer(_output_buffer_data.data(), _output_buffer_data.size())); + } + + WithOutputBuffer(const std::vector &data) { + _output_buffer_data = data; + _output_buffer.reset(new cl::sycl::buffer(_output_buffer_data.data(), _output_buffer_data.size())); + } + + const std::vector &getOutputBufferData() { + // We cannot access the data while the buffer is still alive, so release it first + _output_buffer.reset(); + return _output_buffer_data; + } + + size_t getOutputBufferSize() const { + return _output_buffer_data.size(); + } + +protected: + cl::sycl::buffer &getOutputBuffer() { + return *_output_buffer; + } + + // Functor is being passed by-copy into cl::sycl::queue::submit and destroyed + // one more time in there. We need to make sure that buffer is only released + // once.
+ std::shared_ptr> _output_buffer = nullptr; + std::vector _output_buffer_data; +}; + +template +struct WithInputBuffers { + + template + WithInputBuffers(Args... inputs) { + static_assert(sizeof...(Args) == N, "All input buffers must be initialized"); + constructorHelper<0>(inputs...); + } + + cl::sycl::buffer &getInputBuffer(size_t i = 0) { + return *_input_buffers[i]; + } + +protected: + std::shared_ptr> _input_buffers[N] = {nullptr}; + std::vector _input_buffers_data[N]; + +private: + template + void constructorHelper(const std::vector &data, Args... rest) { + _input_buffers_data[Index] = data; + _input_buffers[Index].reset(new cl::sycl::buffer(_input_buffers_data[Index].data(), _input_buffers_data[Index].size())); + constructorHelper(rest...); + } + + template + void constructorHelper() { + // nothing to do, recursion stop + } +}; + +inline bool isInlineASMSupported(sycl::device Device) { + + sycl::string_class DriverVersion = Device.get_info(); + sycl::string_class DeviceVendorName = Device.get_info(); + // TODO: query for some extension/capability/whatever once interface is + // defined + if (DeviceVendorName.find("Intel") == sycl::string_class::npos) + return false; + if (DriverVersion.length() < 5) + return false; + if (DriverVersion[2] != '.') + return false; + const int Major = std::stoi(DriverVersion.substr(0, 2), nullptr, 10); + const int Minor = std::stoi(DriverVersion.substr(3, 2), nullptr, 10); + return Major > 20 || (Major == 20 && Minor >= 12); // 20.12 or newer; the minor number only matters within major 20 +} + +/// checks if device supports inline asm feature and launches a test +/// +/// \returns false if test wasn't launched (i.e. was skipped) and true otherwise +template +bool launchInlineASMTest(F &f, bool requires_particular_sg_size = true) { + try { + cl::sycl::queue deviceQueue(cl::sycl::gpu_selector{}); + cl::sycl::device device = deviceQueue.get_device(); + +#if defined(INLINE_ASM) + if (!isInlineASMSupported(device)) { + std::cout << "Skipping test\n"; + return false; + } +#endif + + if (requires_particular_sg_size &&
!device.has_extension("cl_intel_required_subgroup_size")) { + std::cout << "Skipping test\n"; + return false; + } + + deviceQueue.submit(f).wait(); + } catch (const cl::sycl::exception &e) { + std::cerr << "Caught exception: " << e.what() << std::endl; + } + return true; +} + +template +bool verify_all_the_same(const std::vector &input, T reference_value) { + for (size_t i = 0; i < input.size(); ++i) // size_t index matches input.size() + if (input[i] != reference_value) { + std::cerr << "At index: " << i << " "; + std::cerr << input[i] << " != " << reference_value << "\n"; + return false; + } + return true; +} diff --git a/SYCL/Basic/feature-tests/inline-asm/letter_example.cpp b/SYCL/Basic/feature-tests/inline-asm/letter_example.cpp new file mode 100644 index 0000000000..22bf26648e --- /dev/null +++ b/SYCL/Basic/feature-tests/inline-asm/letter_example.cpp @@ -0,0 +1,66 @@ +// UNSUPPORTED: cuda +// REQUIRES: gpu,linux +// RUN: %clangxx -fsycl %s -DINLINE_ASM -o %t.out +// RUN: %t.out +// RUN: %clangxx -fsycl %s -o %t.ref.out +// RUN: %t.ref.out + +#include "include/asmhelper.h" +#include +#include + +constexpr size_t problem_size = 16; + +class kernel_name; + +int main() { + cl::sycl::queue q; + cl::sycl::device Device = q.get_device(); + + if (!isInlineASMSupported(Device) || !Device.has_extension("cl_intel_required_subgroup_size")) { + std::cout << "Skipping test\n"; + return 0; + } + auto ctx = q.get_context(); + int *a = (int *)malloc_shared(sizeof(int) * problem_size, q.get_device(), ctx); + for (int i = 0; i < problem_size; i++) { + a[i] = i; + } + q.submit([&](cl::sycl::handler &cgh) { + cgh.parallel_for( + cl::sycl::range<1>(problem_size), [=](cl::sycl::id<1> idx) + [[cl::intel_reqd_sub_group_size(16)]] { +#if defined(INLINE_ASM) && defined(__SYCL_DEVICE_ONLY__) + int i = idx[0]; + asm volatile("{\n.decl V52 v_type=G type=d num_elts=16 align=GRF\n" + "svm_gather.4.1 (M1, 16) %0.0 V52.0\n" + "add(M1, 16) V52(0, 0)<1> V52(0, 0)<1; 1, 0> 0x1:w\n" + "svm_scatter.4.1 (M1, 16) %0.0 V52.0\n}" + : + :
"rw"(&a[i])); +#else + a[idx[0]]++; +#endif + }); + }).wait(); + + bool currect = true; + for (int i = 0; i < problem_size; i++) { + if (a[i] != (i + 1)) { + currect = false; + std::cerr << "error in a[" << i << "]=" + << a[i] << "!=" << (i + 1) << std::endl; + break; + } + } + + if (!currect) { + std::cerr << "Error" << std::endl; + cl::sycl::free(a, ctx); + return 1; + } + + std::cerr << "Pass" << std::endl; + cl::sycl::free(a, ctx); + return 0; +} diff --git a/SYCL/Basic/feature-tests/inline-asm/malloc_shared_32.cpp b/SYCL/Basic/feature-tests/inline-asm/malloc_shared_32.cpp new file mode 100644 index 0000000000..8f058851c2 --- /dev/null +++ b/SYCL/Basic/feature-tests/inline-asm/malloc_shared_32.cpp @@ -0,0 +1,92 @@ +// UNSUPPORTED: cuda +// REQUIRES: gpu,linux +// RUN: %clangxx -fsycl %s -DINLINE_ASM -o %t.out +// RUN: %t.out +// RUN: %clangxx -fsycl %s -o %t.ref.out +// RUN: %t.ref.out + +#include "include/asmhelper.h" +#include +#include + +constexpr size_t problem_size = 32; + +class kernel_name; + +int main() { + cl::sycl::queue q; + + cl::sycl::device Device = q.get_device(); + + if (!isInlineASMSupported(Device) || !Device.has_extension("cl_intel_required_subgroup_size")) { + std::cout << "Skipping test\n"; + return 0; + } + + auto ctx = q.get_context(); + int *a = (int *)malloc_shared(sizeof(int) * problem_size, q.get_device(), ctx); + int *b = (int *)malloc_shared(sizeof(int) * problem_size, q.get_device(), ctx); + int *c = (int *)malloc_shared(sizeof(int) * problem_size, q.get_device(), ctx); + for (int i = 0; i < problem_size; i++) { + b[i] = -10; + a[i] = i; + c[i] = i; + } + + q.submit([&](cl::sycl::handler &cgh) { + cgh.parallel_for( + cl::sycl::range<1>(problem_size), + [=](cl::sycl::id<1> idx) + [[cl::intel_reqd_sub_group_size(32)]] { + int i = idx[0]; +#if defined(INLINE_ASM) && defined(__SYCL_DEVICE_ONLY__) + asm volatile(R"a( + { + .decl V52 v_type=G type=d num_elts=16 align=GRF + .decl V53 v_type=G type=d num_elts=16 align=GRF + .decl V54 
v_type=G type=d num_elts=16 align=GRF + .decl V55 v_type=G type=d num_elts=16 align=GRF + .decl V56 v_type=G type=d num_elts=16 align=GRF + .decl V57 v_type=G type=d num_elts=16 align=GRF + svm_gather.4.1 (M1, 16) %2.0 V54.0 + svm_gather.4.1 (M1, 16) %3.0 V55.0 + svm_gather.4.1 (M1, 16) %4.0 V56.0 + svm_gather.4.1 (M1, 16) %5.0 V57.0 + mul (M1, 16) V52(0,0)<1> V54(0,0)<1;1,0> V56(0,0)<1;1,0> + mul (M1, 16) V53(0,0)<1> V55(0,0)<1;1,0> V57(0,0)<1;1,0> + svm_scatter.4.1 (M1, 16) %0.0 V52.0 + svm_scatter.4.1 (M1, 16) %1.0 V53.0 + } + )a" ::"rw"(&b[i]), + "rw"(&b[i] + 16), "rw"(&a[i]), "rw"(&a[i] + 16), "rw"(&c[i]), + "rw"(&c[i] + 16)); +#else + b[i] = a[i] * c[i]; +#endif + }); + }).wait(); + + bool currect = true; + for (int i = 0; i < problem_size; i++) { + if (b[i] != a[i] * c[i]) { + currect = false; + std::cerr << "error in a[" << i << "]=" + << b[i] << "!=" << a[i] * c[i] << std::endl; + break; + } + } + + if (!currect) { + std::cerr << "Error" << std::endl; + cl::sycl::free(a, ctx); + cl::sycl::free(b, ctx); + cl::sycl::free(c, ctx); + return 1; + } + + std::cerr << "Pass" << std::endl; + cl::sycl::free(a, ctx); + cl::sycl::free(b, ctx); + cl::sycl::free(c, ctx); + return 0; +} diff --git a/SYCL/Basic/feature-tests/inline-asm/malloc_shared_in_out_dif.cpp b/SYCL/Basic/feature-tests/inline-asm/malloc_shared_in_out_dif.cpp new file mode 100644 index 0000000000..a6994bd379 --- /dev/null +++ b/SYCL/Basic/feature-tests/inline-asm/malloc_shared_in_out_dif.cpp @@ -0,0 +1,69 @@ +// UNSUPPORTED: cuda +// REQUIRES: gpu,linux +// RUN: %clangxx -fsycl %s -DINLINE_ASM -o %t.out +// RUN: %t.out + +#include "include/asmhelper.h" +#include +#include + +constexpr size_t problem_size = 100; + +class kernel_name; + +int main() { + cl::sycl::queue q; + + cl::sycl::device Device = q.get_device(); + + if (!isInlineASMSupported(Device) || !Device.has_extension("cl_intel_required_subgroup_size")) { + std::cout << "Skipping test\n"; + return 0; + } + + auto ctx = q.get_context(); + int
*a = (int *)malloc_shared(sizeof(int) * problem_size, q.get_device(), ctx); + int *b = (int *)malloc_shared(sizeof(int) * problem_size, q.get_device(), ctx); + for (int i = 0; i < problem_size; i++) { + b[i] = -1; + a[i] = i; + } + + q.submit([&](cl::sycl::handler &cgh) { + cgh.parallel_for( + cl::sycl::range<1>(problem_size), [=](cl::sycl::id<1> idx) [[cl::intel_reqd_sub_group_size(16)]] { + int i = idx[0]; + volatile int tmp = a[i]; + tmp += 1; +#if defined(INLINE_ASM) && defined(__SYCL_DEVICE_ONLY__) + asm volatile(" add (M1, 16) %0(0,0)<1> %0(0,0)<1;1,0> %1(0,0)<1;1,0>" + : "+rw"(b[i]) + : "rw"(tmp)); +#else + b[i] += tmp; +#endif + }); + }).wait(); + + bool currect = true; + for (int i = 0; i < problem_size; i++) { + if (b[i] != a[i]) { + currect = false; + std::cerr << "error in a[" << i << "]=" + << b[i] << "!=" << a[i] << std::endl; + break; + } + } + + if (!currect) { + std::cerr << "Error" << std::endl; + cl::sycl::free(a, ctx); + cl::sycl::free(b, ctx); + return 1; + } + + std::cerr << "Pass" << std::endl; + cl::sycl::free(a, ctx); + cl::sycl::free(b, ctx); + return 0; +} diff --git a/SYCL/Basic/feature-tests/inline-asm/malloc_shared_no_input.cpp b/SYCL/Basic/feature-tests/inline-asm/malloc_shared_no_input.cpp new file mode 100644 index 0000000000..22cd47abd6 --- /dev/null +++ b/SYCL/Basic/feature-tests/inline-asm/malloc_shared_no_input.cpp @@ -0,0 +1,61 @@ +// UNSUPPORTED: cuda +// REQUIRES: gpu,linux +// RUN: %clangxx -fsycl %s -DINLINE_ASM -o %t.out +// RUN: %t.out +// RUN: %clangxx -fsycl %s -o %t.ref.out +// RUN: %t.ref.out + +#include "include/asmhelper.h" +#include +#include + +constexpr size_t problem_size = 16; + +class kernel_name; + +int main() { + cl::sycl::queue q; + cl::sycl::device Device = q.get_device(); + + if (!isInlineASMSupported(Device) || !Device.has_extension("cl_intel_required_subgroup_size")) { + std::cout << "Skipping test\n"; + return 0; + } + auto ctx = q.get_context(); + int *a = (int *)malloc_shared(sizeof(int) * 
problem_size, q.get_device(), ctx); + for (int i = 0; i < problem_size; i++) + a[i] = i; + + q.submit([&](cl::sycl::handler &cgh) { + cgh.parallel_for( + cl::sycl::range<1>(problem_size), [=](cl::sycl::id<1> idx) [[cl::intel_reqd_sub_group_size(16)]] { + int i = idx[0]; +#if defined(INLINE_ASM) && defined(__SYCL_DEVICE_ONLY__) + asm volatile("mov (M1, 16) %0(0,0)<1> 0x7:d" + : "=rw"(a[i])); +#else + a[i] = 7; +#endif + }); + }).wait(); + + bool currect = true; + for (int i = 0; i < problem_size; i++) { + if (a[i] != 7) { + currect = false; + std::cerr << "error in a[" << i << "]=" + << a[i] << "!=" << 7 << std::endl; + break; + } + } + + if (!currect) { + std::cerr << "Error" << std::endl; + cl::sycl::free(a, ctx); + return 1; + } + + std::cerr << "Pass" << std::endl; + cl::sycl::free(a, ctx); + return 0; +} diff --git a/SYCL/Basic/fpga_tests/Inputs/fpga_device.cpp b/SYCL/Basic/fpga_tests/Inputs/fpga_device.cpp new file mode 100644 index 0000000000..7a5658c8fe --- /dev/null +++ b/SYCL/Basic/fpga_tests/Inputs/fpga_device.cpp @@ -0,0 +1,24 @@ +//==--------------- fpga_device.cpp - AOT compilation for fpga -------------==// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "CL/sycl.hpp" + +using namespace cl::sycl; + +const double big[] = {3, 2, 1, 5, 6, 7}; +void foo(double &result, queue q, int x) { + buffer buf(&result, 1); + buffer big_buf(big, sizeof(big) / sizeof(double)); + q.submit([&](handler &cgh) { + auto acc = buf.get_access(cgh); + auto big_acc = big_buf.get_access(cgh); + cgh.single_task([=]() { + acc[0] = big_acc[x]; + }); + }); +} diff --git a/SYCL/Basic/fpga_tests/Inputs/fpga_host.cpp b/SYCL/Basic/fpga_tests/Inputs/fpga_host.cpp new file mode 100644 index 0000000000..ab24b26c22 --- /dev/null +++ b/SYCL/Basic/fpga_tests/Inputs/fpga_host.cpp @@ -0,0 +1,23 @@ +//==--------------- fpga_host.cpp - AOT compilation for fpga ---------------==// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "CL/sycl.hpp" +#include + +using namespace cl::sycl; + +void foo(double &, queue q, int x); + +int main(void) { + queue q(accelerator_selector{}); + + double result; + foo(result, q, 3); + assert(result == 5); + return 0; +} diff --git a/SYCL/Basic/fpga_tests/fpga_aocx.cpp b/SYCL/Basic/fpga_tests/fpga_aocx.cpp new file mode 100644 index 0000000000..a5c8a3d5ce --- /dev/null +++ b/SYCL/Basic/fpga_tests/fpga_aocx.cpp @@ -0,0 +1,24 @@ +//==----- fpga_aocx.cpp - AOT compilation for fpga using aoc with aocx -----==// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +// REQUIRES: aoc, accelerator + +/// E2E test for AOCX creation/use/run for FPGA +// Produce an archive with device (AOCX) image +// RUN: %clangxx -fsycl -fintelfpga -fsycl-link=image %S/Inputs/fpga_device.cpp -o %t_image.a +// Produce a host object +// RUN: %clangxx -fsycl -fintelfpga %S/Inputs/fpga_host.cpp -c -o %t.o + +// AOCX with source +// RUN: %clangxx -fsycl -fintelfpga %S/Inputs/fpga_host.cpp %t_image.a -o %t_aocx_src.out +// AOCX with object +// RUN: %clangxx -fsycl -fintelfpga %t.o %t_image.a -o %t_aocx_obj.out +// +// RUN: env SYCL_DEVICE_TYPE=ACC %t_aocx_src.out +// RUN: env SYCL_DEVICE_TYPE=ACC %t_aocx_obj.out +// XFAIL:* diff --git a/SYCL/Basic/fpga_tests/fpga_aocx_win.cpp b/SYCL/Basic/fpga_tests/fpga_aocx_win.cpp new file mode 100644 index 0000000000..675cebebcd --- /dev/null +++ b/SYCL/Basic/fpga_tests/fpga_aocx_win.cpp @@ -0,0 +1,24 @@ +//==--- fpga_aocx_win.cpp - AOT compilation for fpga using aoc with aocx ---==// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +// REQUIRES: aoc, accelerator +// REQUIRES: system-windows + +/// E2E test for AOCX creation/use/run for FPGA +// Produce an archive with device (AOCX) image +// RUN: %clang_cl -fsycl -fintelfpga -fsycl-link=image %S/Inputs/fpga_device.cpp -o %t_image.lib +// Produce a host object +// RUN: %clang_cl -fsycl -fintelfpga -DHOST_PART %S/Inputs/fpga_host.cpp -c -o %t.obj + +// AOCX with source +// RUN: %clang_cl -fsycl -fintelfpga -DHOST_PART %S/Inputs/fpga_host.cpp %t_image.lib -o %t_aocx_src.out +// AOCX with object +// RUN: %clang_cl -fsycl -fintelfpga %t.obj %t_image.lib -o %t_aocx_obj.out +// +// RUN: env SYCL_DEVICE_TYPE=ACC %t_aocx_src.out +// RUN: env SYCL_DEVICE_TYPE=ACC %t_aocx_obj.out diff --git a/SYCL/Basic/fpga_tests/fpga_io_pipes.cpp b/SYCL/Basic/fpga_tests/fpga_io_pipes.cpp new file mode 100644 index 0000000000..9826e3cd93 --- /dev/null +++ b/SYCL/Basic/fpga_tests/fpga_io_pipes.cpp @@ -0,0 +1,134 @@ +// RUN: %clangxx -fsycl %s -o %t.out +// RUN: %ACC_RUN_PLACEHOLDER %t.out +//==------------ fpga_io_pipes.cpp - SYCL FPGA pipes test ------------------==// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// REQUIRES: accelerator +// XFAIL: accelerator +#include +#include +#include +#include + +#include "io_pipe_def.h" + +// TODO: run is disabled, since no support added in FPGA backend yet. Check +// implementation correctness from CXX and SYCL languages perspective. + +// This test is supposed to be run only on Intel FPGA emulator. Change it when +// we have more experience with IO pipe feature in SYCL. 
+// The emulator creates files (one for I pipe, another for O pipe) with the +// appropriate naming, where a data flowing through a pipe can be stored. +// So in the test we need to create these files and use them appropriately. +// The name is taken as IO pipe ID. +const size_t InputData = 42; +const std::string InputFileName = "0.txt"; +const std::string OutputFileName = "1.txt"; + +void createInputFile(const std::string &filename) { + std::ofstream Input(filename); + if (Input.is_open()) { + Input << InputData; + Input.close(); + } +} + +int validateOutputFile(const std::string &filename) { + std::ifstream Output(filename); + std::string Line; + std::vector Result; + if (Output.is_open()) { + // In the test we write only one number into the pipe, but a backend might + // have a bug of incorrect interpretetion of capacity of the pipe. In this + // case let's read all the lines of the output file to catch this. + while (std::getline(Output, Line)) + Result.push_back(stoi(Line)); + Output.close(); + } + if (Result.size() != 1 || Result[0] != InputData) { + std::cout << "Result mismatches " << Result[0] << " Vs expected " + << InputData << std::endl; + return -1; + } + + return 0; +} + +// Test for simple non-blocking pipes +int test_io_nb_pipe(cl::sycl::queue Queue) { + createInputFile(InputFileName); + + cl::sycl::buffer writeBuf(1); + Queue.submit([&](cl::sycl::handler &cgh) { + auto write_acc = writeBuf.get_access(cgh); + + cgh.single_task([=]() { + bool SuccessCodeI = false; + do { + write_acc[0] = intelfpga::ethernet_read_pipe::read(SuccessCodeI); + } while (!SuccessCodeI); + bool SuccessCodeO = false; + do { + intelfpga::ethernet_write_pipe::write(write_acc[0], SuccessCodeO); + } while (!SuccessCodeO); + }); + }); + + auto readHostBuffer = writeBuf.get_access(); + if (readHostBuffer[0] != InputData) { + std::cout << "Read from a file mismatches " << readHostBuffer[0] + << " Vs expected " << InputData << std::endl; + + return -1; + } + + return 
validateOutputFile(OutputFileName); +} + +// Test for simple blocking pipes +int test_io_bl_pipe(cl::sycl::queue Queue) { + createInputFile(InputFileName); + + cl::sycl::buffer writeBuf(1); + Queue.submit([&](cl::sycl::handler &cgh) { + auto write_acc = writeBuf.get_access(cgh); + + cgh.single_task([=]() { + write_acc[0] = intelfpga::ethernet_read_pipe::read(); + intelfpga::ethernet_write_pipe::write(write_acc[0]); + }); + }); + + auto readHostBuffer = writeBuf.get_access(); + if (readHostBuffer[0] != InputData) { + std::cout << "Read from a file mismatches " << readHostBuffer[0] + << " Vs expected " << InputData << std::endl; + + return -1; + } + + return validateOutputFile(OutputFileName); +} + +int main() { + cl::sycl::queue Queue{cl::sycl::intel::fpga_emulator_selector{}}; + + if (!Queue.get_device() + .get_info()) { + std::cout << "SYCL_INTEL_data_flow_pipes not supported, skipping" + << std::endl; + return 0; + } + + // Non-blocking pipes + int Result = test_io_nb_pipe(Queue); + + // Blocking pipes + // Each test returns 0 on success and -1 on failure, so the codes are OR-ed: + // AND-ing them would let one passing test clear another test's failure. + Result |= test_io_bl_pipe(Queue); + + return Result; +} diff --git a/SYCL/Basic/fpga_tests/fpga_pipes.cpp b/SYCL/Basic/fpga_tests/fpga_pipes.cpp new file mode 100644 index 0000000000..8872f296c9 --- /dev/null +++ b/SYCL/Basic/fpga_tests/fpga_pipes.cpp @@ -0,0 +1,326 @@ +// RUN: %clangxx -fsycl %s -o %t.out -fsycl-targets=%sycl_triple +// RUN: %CPU_RUN_PLACEHOLDER %t.out +// RUN: %GPU_RUN_PLACEHOLDER %t.out +// RUN: %ACC_RUN_PLACEHOLDER %t.out +// REQUIRES: cpu, gpu, accelerator +//==------------- fpga_pipes.cpp - SYCL FPGA pipes test --------------------==// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +#include +#include +#include + +// Size of an array passing through a pipe +constexpr size_t N = 10; + +// For simple non-blocking pipes with explicit type +class some_nb_pipe; + +// For non-blocking pipes created with namespaces set +namespace some { +class nb_pipe; +} + +// For non-blocking template pipes +template +class templ_nb_pipe; + +// For non-blocking multiple pipes +template +using PipeMulNb = cl::sycl::intel::pipe, int>; + +// For simple blocking pipes with explicit type +class some_bl_pipe; + +// For blocking pipes created with namespaces set +namespace some { +class bl_pipe; +} + +// For blocking template pipes +template +class templ_bl_pipe; + +// For blocking multiple pipes +template +using PipeMulBl = cl::sycl::intel::pipe, int>; + +// Kernel names +template +class writer; +template +class reader; + +// Test for simple non-blocking pipes +template +int test_simple_nb_pipe(cl::sycl::queue Queue) { + int data[] = {0}; + + using Pipe = cl::sycl::intel::pipe; + + cl::sycl::buffer readBuf(data, 1); + Queue.submit([&](cl::sycl::handler &cgh) { + cgh.single_task>([=]() { + bool SuccessCode = false; + do { + Pipe::write(42, SuccessCode); + } while (!SuccessCode); + }); + }); + + cl::sycl::buffer writeBuf(data, 1); + Queue.submit([&](cl::sycl::handler &cgh) { + auto write_acc = writeBuf.get_access(cgh); + + cgh.single_task>([=]() { + bool SuccessCode = false; + do { + write_acc[0] = Pipe::read(SuccessCode); + } while (!SuccessCode); + }); + }); + + auto readHostBuffer = writeBuf.get_access(); + if (readHostBuffer[0] != 42) { + std::cout << "Test: " << TestNumber << "\nResult mismatches " + << readHostBuffer[0] << " Vs expected " << 42 << std::endl; + + return -1; + } + + return 0; +} + +// Test for multiple non-blocking pipes +template +int test_multiple_nb_pipe(cl::sycl::queue Queue) { + int data[] = {0}; + + 
Queue.submit([&](cl::sycl::handler &cgh) { + cgh.single_task>([=]() { + bool SuccessCode = false; + do { + PipeMulNb<1>::write(19, SuccessCode); + } while (!SuccessCode); + }); + }); + + Queue.submit([&](cl::sycl::handler &cgh) { + cgh.single_task>([=]() { + bool SuccessCode = false; + do { + PipeMulNb<2>::write(23, SuccessCode); + } while (!SuccessCode); + }); + }); + + cl::sycl::buffer writeBuf(data, 1); + Queue.submit([&](cl::sycl::handler &cgh) { + auto write_acc = writeBuf.get_access(cgh); + cgh.single_task>([=]() { + bool SuccessCodeA = false; + int Value = 0; + do { + Value = PipeMulNb<1>::read(SuccessCodeA); + } while (!SuccessCodeA); + write_acc[0] = Value; + bool SuccessCodeB = false; + do { + Value = PipeMulNb<2>::read(SuccessCodeB); + } while (!SuccessCodeB); + write_acc[0] += Value; + }); + }); + + auto readHostBuffer = writeBuf.get_access(); + if (readHostBuffer[0] != 42) { + std::cout << "Test: " << TestNumber << "\nResult mismatches " + << readHostBuffer[0] << " Vs expected " << 42 << std::endl; + + return -1; + } + + return 0; +} + +// Test for array passing through a non-blocking pipe +template +int test_array_th_nb_pipe(cl::sycl::queue Queue) { + int data[N] = {0}; + using AnotherNbPipe = cl::sycl::intel::pipe; + + Queue.submit([&](cl::sycl::handler &cgh) { + cgh.single_task>([=]() { + bool SuccessCode = false; + for (size_t i = 0; i != N; ++i) { + do { + AnotherNbPipe::write(i, SuccessCode); + } while (!SuccessCode); + } + }); + }); + + cl::sycl::buffer writeBuf(data, N); + Queue.submit([&](cl::sycl::handler &cgh) { + auto write_acc = writeBuf.get_access(cgh); + cgh.single_task>([=]() { + for (size_t i = 0; i != N; ++i) { + bool SuccessCode = false; + do { + write_acc[i] = AnotherNbPipe::read(SuccessCode); + } while (!SuccessCode); + } + }); + }); + + auto readHostBuffer = writeBuf.get_access(); + for (size_t i = 0; i != N; ++i) { + // Braces keep the failure return inside the mismatch check; without them + // the function returned -1 on the first iteration unconditionally. + if (readHostBuffer[i] != i) { + std::cout << "Test: " << TestNumber << "\nResult mismatches " + << readHostBuffer[i]
<< " Vs expected " << i << std::endl; + return -1; + } + } + + return 0; +} + +// Test for simple blocking pipes +template +int test_simple_bl_pipe(cl::sycl::queue Queue) { + int data[] = {0}; + + using Pipe = cl::sycl::intel::pipe; + + cl::sycl::buffer readBuf(data, 1); + Queue.submit([&](cl::sycl::handler &cgh) { + cgh.single_task>([=]() { + Pipe::write(42); + }); + }); + + cl::sycl::buffer writeBuf(data, 1); + Queue.submit([&](cl::sycl::handler &cgh) { + auto write_acc = writeBuf.get_access(cgh); + + cgh.single_task>([=]() { + write_acc[0] = Pipe::read(); + }); + }); + + auto readHostBuffer = writeBuf.get_access(); + if (readHostBuffer[0] != 42) { + std::cout << "Test: " << TestNumber << "\nResult mismatches " + << readHostBuffer[0] << " Vs expected " << 42 << std::endl; + + return -1; + } + + return 0; +} + +// Test for multiple blocking pipes +template +int test_multiple_bl_pipe(cl::sycl::queue Queue) { + int data[] = {0}; + + Queue.submit([&](cl::sycl::handler &cgh) { + cgh.single_task>([=]() { + PipeMulBl<1>::write(19); + }); + }); + + Queue.submit([&](cl::sycl::handler &cgh) { + cgh.single_task>([=]() { + PipeMulBl<2>::write(23); + }); + }); + + cl::sycl::buffer writeBuf(data, 1); + Queue.submit([&](cl::sycl::handler &cgh) { + auto write_acc = writeBuf.get_access(cgh); + cgh.single_task>([=]() { + write_acc[0] = PipeMulBl<1>::read(); + write_acc[0] += PipeMulBl<2>::read(); + }); + }); + + auto readHostBuffer = writeBuf.get_access(); + if (readHostBuffer[0] != 42) { + std::cout << "Test: " << TestNumber << "\nResult mismatches " + << readHostBuffer[0] << " Vs expected " << 42 << std::endl; + + return -1; + } + + return 0; +} + +// Test for array passing through a blocking pipe +template +int test_array_th_bl_pipe(cl::sycl::queue Queue) { + int data[N] = {0}; + using AnotherBlPipe = cl::sycl::intel::pipe; + + Queue.submit([&](cl::sycl::handler &cgh) { + cgh.single_task>([=]() { + for (size_t i = 0; i != N; ++i) + AnotherBlPipe::write(i); + }); + }); + +
cl::sycl::buffer writeBuf(data, N); + Queue.submit([&](cl::sycl::handler &cgh) { + auto write_acc = writeBuf.get_access(cgh); + cgh.single_task>([=]() { + for (size_t i = 0; i != N; ++i) + write_acc[i] = AnotherBlPipe::read(); + }); + }); + + auto readHostBuffer = writeBuf.get_access(); + for (size_t i = 0; i != N; ++i) { + // Braces keep the failure return inside the mismatch check; without them + // the function returned -1 on the first iteration unconditionally. + if (readHostBuffer[i] != i) { + std::cout << "Test: " << TestNumber << "\nResult mismatches " + << readHostBuffer[i] << " Vs expected " << i << std::endl; + return -1; + } + } + + return 0; +} + +int main() { + cl::sycl::queue Queue; + + if (!Queue.get_device() + .get_info()) { + std::cout << "SYCL_INTEL_data_flow_pipes not supported, skipping" + << std::endl; + return 0; + } + + // Every test returns 0 on success and -1 on failure. The codes are OR-ed so + // that a single failing test makes the process exit non-zero; AND-ing them + // would keep Result at 0 as soon as any one test passed. + // Non-blocking pipes + int Result = test_simple_nb_pipe(Queue); + Result |= test_simple_nb_pipe(Queue); + class forward_nb_pipe; + Result |= test_simple_nb_pipe(Queue); + Result |= test_simple_nb_pipe, /*test number*/ 4>(Queue); + Result |= test_multiple_nb_pipe(Queue); + + // Blocking pipes + Result |= test_simple_bl_pipe(Queue); + Result |= test_simple_bl_pipe(Queue); + class forward_bl_pipe; + Result |= test_simple_bl_pipe(Queue); + Result |= test_simple_bl_pipe, /*test number*/ 9>(Queue); + Result |= test_multiple_bl_pipe(Queue); + + // Test for an array data passing through a pipe + Result |= test_array_th_nb_pipe(Queue); + Result |= test_array_th_bl_pipe(Queue); + + return Result; +} diff --git a/SYCL/Basic/fpga_tests/fpga_pipes_legacy_ns.cpp b/SYCL/Basic/fpga_tests/fpga_pipes_legacy_ns.cpp new file mode 100644 index 0000000000..254103fdf5 --- /dev/null +++ b/SYCL/Basic/fpga_tests/fpga_pipes_legacy_ns.cpp @@ -0,0 +1,63 @@ +// RUN: %clangxx -fsycl %s -o %t.out +// RUN: %ACC_RUN_PLACEHOLDER %t.out +// REQUIRES: accelerator +// XFAIL:* +//==-------- fpga_pipes_legacy_ns.cpp - SYCL FPGA pipes test ---------------==// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +#include +#include + +class some_nb_pipe; + +// Test for simple non-blocking pipes in legacy namespace (cl::sycl::) +template +int test_simple_nb_pipe(cl::sycl::queue Queue) { + int data[] = {0}; + + using Pipe = cl::sycl::pipe; + + cl::sycl::buffer readBuf(data, 1); + Queue.submit([&](cl::sycl::handler &cgh) { + cgh.single_task([=]() { + bool SuccessCode = false; + do { + Pipe::write(42, SuccessCode); + } while (!SuccessCode); + }); + }); + + cl::sycl::buffer writeBuf(data, 1); + Queue.submit([&](cl::sycl::handler &cgh) { + auto write_acc = writeBuf.get_access(cgh); + + cgh.single_task([=]() { + bool SuccessCode = false; + do { + write_acc[0] = Pipe::read(SuccessCode); + } while (!SuccessCode); + }); + }); + + auto readHostBuffer = writeBuf.get_access(); + if (readHostBuffer[0] != 42) { + std::cout <<"Result mismatches " << readHostBuffer[0] << " Vs expected " + << 42 << std::endl; + + return -1; + } + + return 0; +} + + +int main() { + cl::sycl::queue Queue; + + // Non-blocking pipes + return test_simple_nb_pipe(Queue); +} diff --git a/SYCL/Basic/fpga_tests/fpga_queue.cpp b/SYCL/Basic/fpga_tests/fpga_queue.cpp new file mode 100644 index 0000000000..01d2e3cf08 --- /dev/null +++ b/SYCL/Basic/fpga_tests/fpga_queue.cpp @@ -0,0 +1,168 @@ +// REQUIRES: opencl + +// RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple %s -o %t.out -L %opencl_libs_dir -lOpenCL +// RUN: %ACC_RUN_PLACEHOLDER %t.out +// RUN: %CPU_RUN_PLACEHOLDER %t.out +// RUN: %GPU_RUN_PLACEHOLDER %t.out + +//==------------- fpga_queue.cpp - SYCL FPGA queues test -------------------==// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +#include +#include +#include + +using namespace cl::sycl; + +const int dataSize = 32; +const int maxNumQueues = 256; + +void GetCLQueue(event sycl_event, std::set& cl_queues) { + try { + cl_command_queue cl_queue; + cl_event cl_event = sycl_event.get(); + cl_int error = clGetEventInfo(cl_event, CL_EVENT_COMMAND_QUEUE, + sizeof(cl_queue), &cl_queue, nullptr); + assert(CL_SUCCESS == error && "Failed to obtain queue from OpenCL event"); + + cl_queues.insert(cl_queue); + } catch (invalid_object_error e) { + std::cout << "Failed to get OpenCL queue from SYCL event: " << e.what() + << std::endl; + } +} + +int getExpectedQueueNumber(cl_device_id device_id, int default_value) { + cl_command_queue_properties reportedProps; + cl_int iRet = clGetDeviceInfo(device_id, + CL_DEVICE_QUEUE_ON_HOST_PROPERTIES, + sizeof(reportedProps), + &reportedProps, + NULL); + assert(CL_SUCCESS == iRet && "Failed to obtain queue info from ocl device"); + return (reportedProps & CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE) + ? 1 : default_value; +} + +int main() { + int data[dataSize] = {0}; + + { + queue Queue; + std::set cl_queues; + event sycl_event; + + // Purpose of this test is to check how many OpenCL queues are being + // created from 1 SYCL queue for FPGA device. For that we submit 3 kernels + // expecting 3 OpenCL queues created as a result. + buffer bufA (data, range<1>(dataSize)); + buffer bufB (data, range<1>(dataSize)); + buffer bufC (data, range<1>(dataSize)); + + sycl_event = Queue.submit([&](handler& cgh) { + auto writeBuffer = bufA.get_access(cgh); + + // Create a range. + auto myRange = range<1>(dataSize); + + // Create a kernel. 
+ auto myKernel = ([=](id<1> idx) { + writeBuffer[idx] = idx[0]; + }); + + cgh.parallel_for(myRange, myKernel); + }); + GetCLQueue(sycl_event, cl_queues); + + sycl_event = Queue.submit([&](handler& cgh) { + auto writeBuffer = bufB.get_access(cgh); + + // Create a range. + auto myRange = range<1>(dataSize); + + // Create a kernel. + auto myKernel = ([=](id<1> idx) { + writeBuffer[idx] = idx[0]; + }); + + cgh.parallel_for(myRange, myKernel); + }); + GetCLQueue(sycl_event, cl_queues); + + sycl_event = Queue.submit([&](handler& cgh) { + auto readBufferA = bufA.get_access(cgh); + auto readBufferB = bufB.get_access(cgh); + auto writeBuffer = bufC.get_access(cgh); + + // Create a range. + auto myRange = range<1>(dataSize); + + // Create a kernel. + auto myKernel = ([=](id<1> idx) { + writeBuffer[idx] = readBufferA[idx] + readBufferB[idx]; + }); + + cgh.parallel_for(myRange, myKernel); + }); + GetCLQueue(sycl_event, cl_queues); + + int result = cl_queues.size(); + device dev = Queue.get_device(); + int expected_result = dev.is_host() ? 0 : getExpectedQueueNumber(dev.get(), 3); + + if (expected_result != result) { + std::cout << "Result Num of queues = " << result << std::endl + << "Expected Num of queues = "<< expected_result << std::endl; + + return -1; + } + + auto readBufferC = bufC.get_access(); + for (size_t i = 0; i != dataSize; ++i) { + if (readBufferC[i] != 2 * i) { + std::cout << "Result mismatches " << readBufferC[i] << " Vs expected " + << 2 * i << " for index " << i << std::endl; + } + } + } + + { + queue Queue; + std::set cl_queues; + event sycl_event; + + // Check limits of OpenCL queues creation for accelerator device. + buffer buf (&data[0], range<1>(1)); + + for (size_t i = 0; i != maxNumQueues + 1; ++i) { + sycl_event = Queue.submit([&](handler& cgh) { + auto Buffer = buf.get_access(cgh); + + // Create a kernel. 
+ auto myKernel = ([=]() { + Buffer[0] = 0; + }); + + cgh.single_task(myKernel); + }); + GetCLQueue(sycl_event, cl_queues); + } + + int result = cl_queues.size(); + device dev = Queue.get_device(); + int expected_result = dev.is_host() ? 0 : getExpectedQueueNumber(dev.get(), maxNumQueues); + + if (expected_result != result) { + std::cout << "Result Num of queues = " << result << std::endl + << "Expected Num of queues = " << expected_result << std::endl; + + return -1; + } + } + + return 0; +} diff --git a/SYCL/Basic/fpga_tests/global_fpga_device_selector.cpp b/SYCL/Basic/fpga_tests/global_fpga_device_selector.cpp new file mode 100644 index 0000000000..ee387a3710 --- /dev/null +++ b/SYCL/Basic/fpga_tests/global_fpga_device_selector.cpp @@ -0,0 +1,18 @@ +// REQUIRES: aoc, accelerator + +// RUN: %clangxx -fsycl -fintelfpga -std=c++17 %s -o %t.out +// RUN: %ACC_RUN_PLACEHOLDER %t.out +// XFAIL:* + +#include +#include + +// Check that FPGA emulator device is found if we try to initialize inline global +// variable using fpga_emulator_selector parameter. 
+ +inline cl::sycl::queue fpga_emu_queue_inlined{ + cl::sycl::intel::fpga_emulator_selector{}}; + +int main() { + return 0; +} diff --git a/SYCL/Basic/fpga_tests/io_pipe_def.h b/SYCL/Basic/fpga_tests/io_pipe_def.h new file mode 100644 index 0000000000..bbfa2f3a0a --- /dev/null +++ b/SYCL/Basic/fpga_tests/io_pipe_def.h @@ -0,0 +1,12 @@ +#include + +namespace intelfpga { +template struct ethernet_pipe_id { + static constexpr unsigned id = ID; +}; + +using ethernet_read_pipe = + sycl::intel::kernel_readable_io_pipe, int, 0>; +using ethernet_write_pipe = + sycl::intel::kernel_writeable_io_pipe, int, 0>; +} // namespace intelfpga diff --git a/SYCL/Basic/fpga_tests/pipes_info.cpp b/SYCL/Basic/fpga_tests/pipes_info.cpp new file mode 100644 index 0000000000..58180c50bc --- /dev/null +++ b/SYCL/Basic/fpga_tests/pipes_info.cpp @@ -0,0 +1,36 @@ +// RUN: %clangxx -fsycl %s -o %t.out +// RUN: %HOST_RUN_PLACEHOLDER %t.out +// RUN: %CPU_RUN_PLACEHOLDER %t.out +// RUN: %GPU_RUN_PLACEHOLDER %t.out +// RUN: %ACC_RUN_PLACEHOLDER %t.out +//==--------- pipes_info.cpp - SYCL device pipe info test --*- C++ -*-------==// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include + +int main() { + cl::sycl::queue Queue; + cl::sycl::device Device = Queue.get_device(); + cl::sycl::platform Platform = Device.get_platform(); + + // Query if the device supports kernel to kernel pipe feature + bool IsSupported = + Device.get_info(); + + // Query for platform string. We expect only Intel FPGA platforms to support + // SYCL_INTEL_data_flow_pipes extension. 
+ std::string platform_name = + Platform.get_info(); + bool SupposedToBeSupported = + (platform_name == "Intel(R) FPGA Emulation Platform for OpenCL(TM)" || + platform_name == "Intel(R) FPGA SDK for OpenCL(TM)") + ? true + : false; + + return (SupposedToBeSupported != IsSupported); +} diff --git a/SYCL/Basic/functor/kernel_functor.cpp b/SYCL/Basic/functor/kernel_functor.cpp new file mode 100644 index 0000000000..995208c80a --- /dev/null +++ b/SYCL/Basic/functor/kernel_functor.cpp @@ -0,0 +1,180 @@ +// RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple -o %t.out %s +// RUN: cd %T +// RUN: %HOST_RUN_PLACEHOLDER %t.out +// RUN: %CPU_RUN_PLACEHOLDER %t.out +// RUN: %GPU_RUN_PLACEHOLDER %t.out +// REQUIRES: cpu, host, accelerator + +//==--- kernel_functor.cpp - Functors as SYCL kernel test ------------------==// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include + +#include + +constexpr auto sycl_read_write = cl::sycl::access::mode::read_write; +constexpr auto sycl_global_buffer = cl::sycl::access::target::global_buffer; + +// Case 1: +// - functor class is defined in an anonymous namespace +// - the '()' operator: +// * does not have parameters (to be used in 'single_task'). +// * has no 'const' qualifier +namespace { +class Functor1 { +public: + Functor1( + int X_, + cl::sycl::accessor &Acc_) + : X(X_), Acc(Acc_) {} + + void operator()() { Acc[0] += X; } + +private: + int X; + cl::sycl::accessor Acc; +}; +} + +// Case 2: +// - functor class is defined in a namespace +// - the '()' operator: +// * does not have parameters (to be used in 'single_task'). 
+// * has the 'const' qualifier +namespace ns { +class Functor2 { +public: + Functor2( + int X_, + cl::sycl::accessor &Acc_) + : X(X_), Acc(Acc_) {} + + // cl::sycl::accessor's operator [] is const, hence 'const' is possible below + void operator()() const { Acc[0] += X; } + +private: + int X; + cl::sycl::accessor Acc; +}; +} + +// Case 3: +// - functor class is templated and defined in the translation unit scope +// - the '()' operator: +// * has a parameter of type cl::sycl::id<1> (to be used in 'parallel_for'). +// * has no 'const' qualifier +template class TmplFunctor { +public: + TmplFunctor( + T X_, cl::sycl::accessor &Acc_) + : X(X_), Acc(Acc_) {} + + void operator()(cl::sycl::id<1> id) { Acc[id] += X; } + +private: + T X; + cl::sycl::accessor Acc; +}; + +// Case 4: +// - functor class is templated and defined in the translation unit scope +// - the '()' operator: +// * has a parameter of type cl::sycl::id<1> (to be used in 'parallel_for'). +// * has the 'const' qualifier +template class TmplConstFunctor { +public: + TmplConstFunctor( + T X_, cl::sycl::accessor &Acc_) + : X(X_), Acc(Acc_) {} + + void operator()(cl::sycl::id<1> id) const { Acc[id] += X; } + +private: + T X; + cl::sycl::accessor Acc; +}; + +// Exercise non-templated functors in 'single_task'. +int foo(int X) { + int A[] = { 10 }; + { + cl::sycl::queue Q; + cl::sycl::buffer Buf(A, 1); + + Q.submit([&](cl::sycl::handler &cgh) { + auto Acc = Buf.get_access(cgh); + Functor1 F(X, Acc); + + cgh.single_task(F); + }); + Q.submit([&](cl::sycl::handler &cgh) { + auto Acc = Buf.get_access(cgh); + ns::Functor2 F(X, Acc); + + cgh.single_task(F); + }); + Q.submit([&](cl::sycl::handler &cgh) { + auto Acc = Buf.get_access(cgh); + ns::Functor2 F(X, Acc); + + cgh.single_task(F); + }); + } + return A[0]; +} + +// Element count of a built-in array. The whole expansion and the argument are +// parenthesized so the macro stays correct inside larger expressions. +#define ARR_LEN(x) (sizeof(x) / sizeof((x)[0])) + +// Exercise templated functors in 'parallel_for'.
+template T bar(T X) { + T A[] = {(T)10, (T)10 }; + { + cl::sycl::queue Q; + cl::sycl::buffer Buf(A, ARR_LEN(A)); + + Q.submit([&](cl::sycl::handler &cgh) { + auto Acc = + Buf.template get_access(cgh); + TmplFunctor F(X, Acc); + + cgh.parallel_for(cl::sycl::range<1>(ARR_LEN(A)), F); + }); + // Spice with lambdas to make sure functors and lambdas work together. + Q.submit([&](cl::sycl::handler &cgh) { + auto Acc = + Buf.template get_access(cgh); + cgh.parallel_for( + cl::sycl::range<1>(ARR_LEN(A)), + [=](cl::sycl::id<1> id) { Acc[id] += X; }); + }); + Q.submit([&](cl::sycl::handler &cgh) { + auto Acc = + Buf.template get_access(cgh); + TmplConstFunctor F(X, Acc); + + cgh.parallel_for(cl::sycl::range<1>(ARR_LEN(A)), F); + }); + } + T res = (T)0; + + for (int i = 0; i < ARR_LEN(A); i++) + res += A[i]; + return res; +} + +int main() { + const int Res1 = foo(10); + const int Res2 = bar(10); + const int Gold1 = 40; + const int Gold2 = 80; + + assert(Res1 == Gold1); + assert(Res2 == Gold2); + + return 0; +} diff --git a/SYCL/Basic/group-algorithm/all_of.cpp b/SYCL/Basic/group-algorithm/all_of.cpp new file mode 100644 index 0000000000..be37442d32 --- /dev/null +++ b/SYCL/Basic/group-algorithm/all_of.cpp @@ -0,0 +1,77 @@ +// UNSUPPORTED: cuda +// OpenCL C 2.x alike work-group functions not yet supported by CUDA. 
+// +// RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple %s -o %t.out +// RUN: %HOST_RUN_PLACEHOLDER %t.out +// RUN: %CPU_RUN_PLACEHOLDER %t.out +// RUN: %GPU_RUN_PLACEHOLDER %t.out +// RUN: %ACC_RUN_PLACEHOLDER %t.out + +#include +#include +#include +#include +using namespace sycl; +using namespace sycl::intel; + +template +class all_of_kernel; + +struct GeZero { + bool operator()(int i) const { return i >= 0; } +}; +struct IsEven { + bool operator()(int i) const { return (i % 2) == 0; } +}; +struct LtZero { + bool operator()(int i) const { return i < 0; } +}; + +template +void test(queue q, InputContainer input, OutputContainer output, + Predicate pred) { + typedef class all_of_kernel kernel_name; + size_t N = input.size(); + size_t G = 16; + { + buffer in_buf(input.data(), input.size()); + buffer out_buf(output.data(), output.size()); + + q.submit([&](handler &cgh) { + auto in = in_buf.get_access(cgh); + auto out = out_buf.get_access(cgh); + cgh.parallel_for(nd_range<1>(G, G), [=](nd_item<1> it) { + group<1> g = it.get_group(); + int lid = it.get_local_id(0); + out[0] = all_of(g, pred(in[lid])); + out[1] = all_of(g, in[lid], pred); + out[2] = all_of(g, in.get_pointer(), in.get_pointer() + N, pred); + }); + }); + } + bool expected = std::all_of(input.begin(), input.end(), pred); + assert(output[0] == expected); + assert(output[1] == expected); + assert(output[2] == expected); +} + +int main() { + queue q; + std::string version = q.get_device().get_info(); + if (version < std::string("2.0")) { + std::cout << "Skipping test\n"; + return 0; + } + + constexpr int N = 32; + std::array input; + std::array output; + std::iota(input.begin(), input.end(), 0); + std::fill(output.begin(), output.end(), false); + + test(q, input, output, GeZero()); + test(q, input, output, IsEven()); + test(q, input, output, LtZero()); + + std::cout << "Test passed." 
<< std::endl; +} diff --git a/SYCL/Basic/group-algorithm/any_of.cpp b/SYCL/Basic/group-algorithm/any_of.cpp new file mode 100644 index 0000000000..c9607e9159 --- /dev/null +++ b/SYCL/Basic/group-algorithm/any_of.cpp @@ -0,0 +1,79 @@ +// UNSUPPORTED: cuda +// OpenCL C 2.x alike work-group functions not yet supported by CUDA. +// +// RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple %s -o %t.out +// RUN: %HOST_RUN_PLACEHOLDER %t.out +// RUN: %CPU_RUN_PLACEHOLDER %t.out +// RUN: %GPU_RUN_PLACEHOLDER %t.out +// RUN: %ACC_RUN_PLACEHOLDER %t.out + +#include +#include +#include +#include +using namespace sycl; +using namespace sycl::intel; + +template +class any_of_kernel; + +struct GeZero { + bool operator()(int i) const { return i >= 0; } +}; +struct IsEven { + bool operator()(int i) const { return (i % 2) == 0; } +}; +struct LtZero { + bool operator()(int i) const { return i < 0; } +}; + +template +void test(queue q, InputContainer input, OutputContainer output, + Predicate pred) { + typedef typename InputContainer::value_type InputT; + typedef typename OutputContainer::value_type OutputT; + typedef class any_of_kernel kernel_name; + size_t N = input.size(); + size_t G = 16; + { + buffer in_buf(input.data(), input.size()); + buffer out_buf(output.data(), output.size()); + + q.submit([&](handler &cgh) { + auto in = in_buf.template get_access(cgh); + auto out = out_buf.template get_access(cgh); + cgh.parallel_for(nd_range<1>(G, G), [=](nd_item<1> it) { + group<1> g = it.get_group(); + int lid = it.get_local_id(0); + out[0] = any_of(g, pred(in[lid])); + out[1] = any_of(g, in[lid], pred); + out[2] = any_of(g, in.get_pointer(), in.get_pointer() + N, pred); + }); + }); + } + bool expected = std::any_of(input.begin(), input.end(), pred); + assert(output[0] == expected); + assert(output[1] == expected); + assert(output[2] == expected); +} + +int main() { + queue q; + std::string version = q.get_device().get_info(); + if (version < std::string("2.0")) { + std::cout << "Skipping 
test\n"; + return 0; + } + + constexpr int N = 32; + std::array input; + std::array output; + std::iota(input.begin(), input.end(), 0); + std::fill(output.begin(), output.end(), false); + + test(q, input, output, GeZero()); + test(q, input, output, IsEven()); + test(q, input, output, LtZero()); + + std::cout << "Test passed." << std::endl; +} diff --git a/SYCL/Basic/group-algorithm/broadcast.cpp b/SYCL/Basic/group-algorithm/broadcast.cpp new file mode 100644 index 0000000000..387ae8430c --- /dev/null +++ b/SYCL/Basic/group-algorithm/broadcast.cpp @@ -0,0 +1,65 @@ +// UNSUPPORTED: cuda +// OpenCL C 2.x alike work-group functions not yet supported by CUDA. +// +// RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple %s -o %t.out +// RUN: %HOST_RUN_PLACEHOLDER %t.out +// RUN: %CPU_RUN_PLACEHOLDER %t.out +// RUN: %GPU_RUN_PLACEHOLDER %t.out +// RUN: %ACC_RUN_PLACEHOLDER %t.out + +#include +#include +#include +#include +using namespace sycl; +using namespace sycl::intel; + +class broadcast_kernel; + +template +void test(queue q, InputContainer input, OutputContainer output) { + typedef typename InputContainer::value_type InputT; + typedef typename OutputContainer::value_type OutputT; + typedef class broadcast_kernel kernel_name; + size_t N = input.size(); + size_t G = 4; + range<2> R(G, G); + { + buffer in_buf(input.data(), input.size()); + buffer out_buf(output.data(), output.size()); + + q.submit([&](handler &cgh) { + auto in = in_buf.template get_access(cgh); + auto out = out_buf.template get_access(cgh); + cgh.parallel_for(nd_range<2>(R, R), [=](nd_item<2> it) { + group<2> g = it.get_group(); + int lid = it.get_local_linear_id(); + out[0] = broadcast(g, in[lid]); + out[1] = broadcast(g, in[lid], group<2>::id_type(1, 2)); + out[2] = broadcast(g, in[lid], group<2>::linear_id_type(2 * G + 1)); + }); + }); + } + assert(output[0] == input[0]); + assert(output[1] == input[1 * G + 2]); + assert(output[2] == input[2 * G + 1]); +} + +int main() { + queue q; + std::string version 
= q.get_device().get_info(); + if (version < std::string("2.0")) { + std::cout << "Skipping test\n"; + return 0; + } + + constexpr int N = 16; + std::array input; + std::array output; + std::iota(input.begin(), input.end(), 1); + std::fill(output.begin(), output.end(), false); + + test(q, input, output); + + std::cout << "Test passed." << std::endl; +} diff --git a/SYCL/Basic/group-algorithm/exclusive_scan.cpp b/SYCL/Basic/group-algorithm/exclusive_scan.cpp new file mode 100644 index 0000000000..22d0644355 --- /dev/null +++ b/SYCL/Basic/group-algorithm/exclusive_scan.cpp @@ -0,0 +1,147 @@ +// UNSUPPORTED: cuda +// OpenCL C 2.x alike work-group functions not yet supported by CUDA. +// +// RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple %s -o %t.out +// RUN: %HOST_RUN_PLACEHOLDER %t.out +// RUN: %CPU_RUN_PLACEHOLDER %t.out +// RUN: %GPU_RUN_PLACEHOLDER %t.out +// RUN: %ACC_RUN_PLACEHOLDER %t.out + +#include +#include +#include +#include +#include +#include +using namespace sycl; +using namespace sycl::intel; + +template +class exclusive_scan_kernel; + +// std::exclusive_scan isn't implemented yet, so use serial implementation +// instead +namespace emu { +template +OutputIterator exclusive_scan(InputIterator first, InputIterator last, + OutputIterator result, T init, + BinaryOperation binary_op) { + T partial = init; + for (InputIterator it = first; it != last; ++it) { + *(result++) = partial; + partial = binary_op(partial, *it); + } + return result; +} +} // namespace emu + +template +void test(queue q, InputContainer input, OutputContainer output, + BinaryOperation binary_op, + typename OutputContainer::value_type identity) { + typedef typename InputContainer::value_type InputT; + typedef typename OutputContainer::value_type OutputT; + typedef class exclusive_scan_kernel kernel_name0; + typedef class exclusive_scan_kernel kernel_name1; + typedef class exclusive_scan_kernel kernel_name2; + typedef class exclusive_scan_kernel kernel_name3; + OutputT init = 42; + 
size_t N = input.size(); + size_t G = 16; + std::vector expected(N); + { + buffer in_buf(input.data(), input.size()); + buffer out_buf(output.data(), output.size()); + q.submit([&](handler &cgh) { + auto in = in_buf.template get_access(cgh); + auto out = out_buf.template get_access(cgh); + cgh.parallel_for(nd_range<1>(G, G), [=](nd_item<1> it) { + group<1> g = it.get_group(); + int lid = it.get_local_id(0); + out[lid] = exclusive_scan(g, in[lid], binary_op); + }); + }); + } + emu::exclusive_scan(input.begin(), input.begin() + G, expected.begin(), + identity, binary_op); + assert(std::equal(output.begin(), output.begin() + G, expected.begin())); + + { + buffer in_buf(input.data(), input.size()); + buffer out_buf(output.data(), output.size()); + q.submit([&](handler &cgh) { + auto in = in_buf.template get_access(cgh); + auto out = out_buf.template get_access(cgh); + cgh.parallel_for(nd_range<1>(G, G), [=](nd_item<1> it) { + group<1> g = it.get_group(); + int lid = it.get_local_id(0); + out[lid] = exclusive_scan(g, in[lid], init, binary_op); + }); + }); + } + emu::exclusive_scan(input.begin(), input.begin() + G, expected.begin(), init, + binary_op); + assert(std::equal(output.begin(), output.begin() + G, expected.begin())); + + { + buffer in_buf(input.data(), input.size()); + buffer out_buf(output.data(), output.size()); + q.submit([&](handler &cgh) { + auto in = in_buf.template get_access(cgh); + auto out = out_buf.template get_access(cgh); + cgh.parallel_for(nd_range<1>(G, G), [=](nd_item<1> it) { + group<1> g = it.get_group(); + exclusive_scan(g, in.get_pointer(), in.get_pointer() + N, + out.get_pointer(), binary_op); + }); + }); + } + emu::exclusive_scan(input.begin(), input.begin() + N, expected.begin(), + identity, binary_op); + assert(std::equal(output.begin(), output.begin() + N, expected.begin())); + + { + buffer in_buf(input.data(), input.size()); + buffer out_buf(output.data(), output.size()); + q.submit([&](handler &cgh) { + auto in = in_buf.template 
get_access(cgh); + auto out = out_buf.template get_access(cgh); + cgh.parallel_for(nd_range<1>(G, G), [=](nd_item<1> it) { + group<1> g = it.get_group(); + exclusive_scan(g, in.get_pointer(), in.get_pointer() + N, + out.get_pointer(), init, binary_op); + }); + }); + } + emu::exclusive_scan(input.begin(), input.begin() + N, expected.begin(), init, + binary_op); + assert(std::equal(output.begin(), output.begin() + N, expected.begin())); +} + +int main() { + queue q; + std::string version = q.get_device().get_info(); + if (version < std::string("2.0")) { + std::cout << "Skipping test\n"; + return 0; + } + + constexpr int N = 32; + std::array input; + std::array output; + std::iota(input.begin(), input.end(), 0); + std::fill(output.begin(), output.end(), 0); + +#if __cplusplus >= 201402L + test(q, input, output, plus<>(), 0); + test(q, input, output, minimum<>(), std::numeric_limits::max()); + test(q, input, output, maximum<>(), std::numeric_limits::lowest()); +#endif + test(q, input, output, plus(), 0); + test(q, input, output, minimum(), std::numeric_limits::max()); + test(q, input, output, maximum(), std::numeric_limits::lowest()); + + std::cout << "Test passed." << std::endl; +} diff --git a/SYCL/Basic/group-algorithm/inclusive_scan.cpp b/SYCL/Basic/group-algorithm/inclusive_scan.cpp new file mode 100644 index 0000000000..edea0142ef --- /dev/null +++ b/SYCL/Basic/group-algorithm/inclusive_scan.cpp @@ -0,0 +1,147 @@ +// UNSUPPORTED: cuda +// OpenCL C 2.x alike work-group functions not yet supported by CUDA. 
+// +// RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple %s -o %t.out +// RUN: %HOST_RUN_PLACEHOLDER %t.out +// RUN: %CPU_RUN_PLACEHOLDER %t.out +// RUN: %GPU_RUN_PLACEHOLDER %t.out +// RUN: %ACC_RUN_PLACEHOLDER %t.out + +#include +#include +#include +#include +#include +#include +using namespace sycl; +using namespace sycl::intel; + +template +class inclusive_scan_kernel; + +// std::inclusive_scan isn't implemented yet, so use serial implementation +// instead +namespace emu { +template +OutputIterator inclusive_scan(InputIterator first, InputIterator last, + OutputIterator result, BinaryOperation binary_op, + T init) { + T partial = init; + for (InputIterator it = first; it != last; ++it) { + partial = binary_op(partial, *it); + *(result++) = partial; + } + return result; +} +} // namespace emu + +template +void test(queue q, InputContainer input, OutputContainer output, + BinaryOperation binary_op, + typename OutputContainer::value_type identity) { + typedef typename InputContainer::value_type InputT; + typedef typename OutputContainer::value_type OutputT; + typedef class inclusive_scan_kernel kernel_name0; + typedef class inclusive_scan_kernel kernel_name1; + typedef class inclusive_scan_kernel kernel_name2; + typedef class inclusive_scan_kernel kernel_name3; + OutputT init = 42; + size_t N = input.size(); + size_t G = 16; + std::vector expected(N); + { + buffer in_buf(input.data(), input.size()); + buffer out_buf(output.data(), output.size()); + q.submit([&](handler &cgh) { + auto in = in_buf.template get_access(cgh); + auto out = out_buf.template get_access(cgh); + cgh.parallel_for(nd_range<1>(G, G), [=](nd_item<1> it) { + group<1> g = it.get_group(); + int lid = it.get_local_id(0); + out[lid] = inclusive_scan(g, in[lid], binary_op); + }); + }); + } + emu::inclusive_scan(input.begin(), input.begin() + G, expected.begin(), + binary_op, identity); + assert(std::equal(output.begin(), output.begin() + G, expected.begin())); + + { + buffer in_buf(input.data(), 
input.size()); + buffer out_buf(output.data(), output.size()); + q.submit([&](handler &cgh) { + auto in = in_buf.template get_access(cgh); + auto out = out_buf.template get_access(cgh); + cgh.parallel_for(nd_range<1>(G, G), [=](nd_item<1> it) { + group<1> g = it.get_group(); + int lid = it.get_local_id(0); + out[lid] = inclusive_scan(g, in[lid], binary_op, init); + }); + }); + } + emu::inclusive_scan(input.begin(), input.begin() + G, expected.begin(), + binary_op, init); + assert(std::equal(output.begin(), output.begin() + G, expected.begin())); + + { + buffer in_buf(input.data(), input.size()); + buffer out_buf(output.data(), output.size()); + q.submit([&](handler &cgh) { + auto in = in_buf.template get_access(cgh); + auto out = out_buf.template get_access(cgh); + cgh.parallel_for(nd_range<1>(G, G), [=](nd_item<1> it) { + group<1> g = it.get_group(); + inclusive_scan(g, in.get_pointer(), in.get_pointer() + N, + out.get_pointer(), binary_op); + }); + }); + } + emu::inclusive_scan(input.begin(), input.begin() + N, expected.begin(), + binary_op, identity); + assert(std::equal(output.begin(), output.begin() + N, expected.begin())); + + { + buffer in_buf(input.data(), input.size()); + buffer out_buf(output.data(), output.size()); + q.submit([&](handler &cgh) { + auto in = in_buf.template get_access(cgh); + auto out = out_buf.template get_access(cgh); + cgh.parallel_for(nd_range<1>(G, G), [=](nd_item<1> it) { + group<1> g = it.get_group(); + inclusive_scan(g, in.get_pointer(), in.get_pointer() + N, + out.get_pointer(), binary_op, init); + }); + }); + } + emu::inclusive_scan(input.begin(), input.begin() + N, expected.begin(), + binary_op, init); + assert(std::equal(output.begin(), output.begin() + N, expected.begin())); +} + +int main() { + queue q; + std::string version = q.get_device().get_info(); + if (version < std::string("2.0")) { + std::cout << "Skipping test\n"; + return 0; + } + + constexpr int N = 32; + std::array input; + std::array output; + 
std::iota(input.begin(), input.end(), 0); + std::fill(output.begin(), output.end(), 0); + +#if __cplusplus >= 201402L + test(q, input, output, plus<>(), 0); + test(q, input, output, minimum<>(), std::numeric_limits::max()); + test(q, input, output, maximum<>(), std::numeric_limits::lowest()); +#endif + test(q, input, output, plus(), 0); + test(q, input, output, minimum(), std::numeric_limits::max()); + test(q, input, output, maximum(), std::numeric_limits::lowest()); + + std::cout << "Test passed." << std::endl; +} diff --git a/SYCL/Basic/group-algorithm/leader.cpp b/SYCL/Basic/group-algorithm/leader.cpp new file mode 100644 index 0000000000..f6c645f610 --- /dev/null +++ b/SYCL/Basic/group-algorithm/leader.cpp @@ -0,0 +1,50 @@ +// UNSUPPORTED: cuda +// OpenCL C 2.x alike work-group functions not yet supported by CUDA. +// +// RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple %s -o %t.out +// RUN: %HOST_RUN_PLACEHOLDER %t.out +// RUN: %CPU_RUN_PLACEHOLDER %t.out +// RUN: %GPU_RUN_PLACEHOLDER %t.out +// RUN: %ACC_RUN_PLACEHOLDER %t.out + +#include +#include +using namespace sycl; +using namespace sycl::intel; + +class leader_kernel; + +void test(queue q) { + typedef class leader_kernel kernel_name; + int out = 0; + size_t G = 4; + + range<2> R(G, G); + { + buffer out_buf(&out, 1); + + q.submit([&](handler &cgh) { + auto out = out_buf.template get_access(cgh); + cgh.parallel_for(nd_range<2>(R, R), [=](nd_item<2> it) { + group<2> g = it.get_group(); + if (leader(g)) { + out[0] += 1; + } + }); + }); + } + assert(out == 1); +} + +int main() { + queue q; + std::string version = q.get_device().get_info(); + if (version < std::string("2.0")) { + std::cout << "Skipping test\n"; + return 0; + } + + test(q); + + std::cout << "Test passed." 
<< std::endl; +} diff --git a/SYCL/Basic/group-algorithm/none_of.cpp b/SYCL/Basic/group-algorithm/none_of.cpp new file mode 100644 index 0000000000..51a68ab9c7 --- /dev/null +++ b/SYCL/Basic/group-algorithm/none_of.cpp @@ -0,0 +1,77 @@ +// UNSUPPORTED: cuda +// OpenCL C 2.x alike work-group functions not yet supported by CUDA. +// +// RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple %s -o %t.out +// RUN: %HOST_RUN_PLACEHOLDER %t.out +// RUN: %CPU_RUN_PLACEHOLDER %t.out +// RUN: %GPU_RUN_PLACEHOLDER %t.out +// RUN: %ACC_RUN_PLACEHOLDER %t.out + +#include +#include +#include +#include +using namespace sycl; +using namespace sycl::intel; + +template +class none_of_kernel; + +struct GeZero { + bool operator()(int i) const { return i >= 0; } +}; +struct IsEven { + bool operator()(int i) const { return (i % 2) == 0; } +}; +struct LtZero { + bool operator()(int i) const { return i < 0; } +}; + +template +void test(queue q, InputContainer input, OutputContainer output, + Predicate pred) { + typedef class none_of_kernel kernel_name; + size_t N = input.size(); + size_t G = 16; + { + buffer in_buf(input.data(), input.size()); + buffer out_buf(output.data(), output.size()); + + q.submit([&](handler &cgh) { + auto in = in_buf.get_access(cgh); + auto out = out_buf.get_access(cgh); + cgh.parallel_for(nd_range<1>(G, G), [=](nd_item<1> it) { + group<1> g = it.get_group(); + int lid = it.get_local_id(0); + out[0] = none_of(g, pred(in[lid])); + out[1] = none_of(g, in[lid], pred); + out[2] = none_of(g, in.get_pointer(), in.get_pointer() + N, pred); + }); + }); + } + bool expected = std::none_of(input.begin(), input.end(), pred); + assert(output[0] == expected); + assert(output[1] == expected); + assert(output[2] == expected); +} + +int main() { + queue q; + std::string version = q.get_device().get_info(); + if (version < std::string("2.0")) { + std::cout << "Skipping test\n"; + return 0; + } + + constexpr int N = 32; + std::array input; + std::array output; + std::iota(input.begin(), 
input.end(), 0); + std::fill(output.begin(), output.end(), false); + + test(q, input, output, GeZero()); + test(q, input, output, IsEven()); + test(q, input, output, LtZero()); + + std::cout << "Test passed." << std::endl; +} diff --git a/SYCL/Basic/group-algorithm/reduce.cpp b/SYCL/Basic/group-algorithm/reduce.cpp new file mode 100644 index 0000000000..10a458b019 --- /dev/null +++ b/SYCL/Basic/group-algorithm/reduce.cpp @@ -0,0 +1,85 @@ +// UNSUPPORTED: cuda +// OpenCL C 2.x alike work-group functions not yet supported by CUDA. +// +// RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple %s -o %t.out +// RUN: %HOST_RUN_PLACEHOLDER %t.out +// RUN: %CPU_RUN_PLACEHOLDER %t.out +// RUN: %GPU_RUN_PLACEHOLDER %t.out +// RUN: %ACC_RUN_PLACEHOLDER %t.out + +#include +#include +#include +#include +#include +using namespace sycl; +using namespace sycl::intel; + +template +class reduce_kernel; + +template +void test(queue q, InputContainer input, OutputContainer output, + BinaryOperation binary_op, + typename OutputContainer::value_type identity) { + typedef typename InputContainer::value_type InputT; + typedef typename OutputContainer::value_type OutputT; + typedef class reduce_kernel kernel_name; + OutputT init = 42; + size_t N = input.size(); + size_t G = 16; + { + buffer in_buf(input.data(), input.size()); + buffer out_buf(output.data(), output.size()); + + q.submit([&](handler &cgh) { + auto in = in_buf.template get_access(cgh); + auto out = out_buf.template get_access(cgh); + cgh.parallel_for(nd_range<1>(G, G), [=](nd_item<1> it) { + group<1> g = it.get_group(); + int lid = it.get_local_id(0); + out[0] = reduce(g, in[lid], binary_op); + out[1] = reduce(g, in[lid], init, binary_op); + out[2] = reduce(g, in.get_pointer(), in.get_pointer() + N, binary_op); + out[3] = + reduce(g, in.get_pointer(), in.get_pointer() + N, init, binary_op); + }); + }); + } + // std::reduce is not implemented yet, so use std::accumulate instead + assert(output[0] == std::accumulate(input.begin(), 
input.begin() + G, + identity, binary_op)); + assert(output[1] == + std::accumulate(input.begin(), input.begin() + G, init, binary_op)); + assert(output[2] == + std::accumulate(input.begin(), input.end(), identity, binary_op)); + assert(output[3] == + std::accumulate(input.begin(), input.end(), init, binary_op)); +} + +int main() { + queue q; + std::string version = q.get_device().get_info(); + if (version < std::string("2.0")) { + std::cout << "Skipping test\n"; + return 0; + } + + constexpr int N = 32; + std::array input; + std::array output; + std::iota(input.begin(), input.end(), 0); + std::fill(output.begin(), output.end(), 0); + +#if __cplusplus >= 201402L + test(q, input, output, plus<>(), 0); + test(q, input, output, minimum<>(), std::numeric_limits::max()); + test(q, input, output, maximum<>(), std::numeric_limits::lowest()); +#endif + test(q, input, output, plus(), 0); + test(q, input, output, minimum(), std::numeric_limits::max()); + test(q, input, output, maximum(), std::numeric_limits::lowest()); + + std::cout << "Test passed." << std::endl; +} diff --git a/SYCL/Basic/helpers.hpp b/SYCL/Basic/helpers.hpp new file mode 100644 index 0000000000..e5ca8f768f --- /dev/null +++ b/SYCL/Basic/helpers.hpp @@ -0,0 +1,76 @@ +//==------------------- helpers.hpp - test helpers ------------------------==// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include + + +using namespace cl; + +// Formats a vector-like value as "[ e0, e1, ... ]". The member print() and +// operator<< write to the stream they are given; the static print() writes +// to std::cout. +template +class VecPrinter { +public: + VecPrinter(const VecT &Vec) : MVec(Vec) {} + + // Writes the stored vector to the caller-supplied stream so that + // operator<< honours its target instead of always using std::cout. + void print(std::ostream &Out) const { + Out << "[ "; + printHelper(Out, MVec); + Out << " ]"; + } + + // Convenience overload: prints Elem1 to std::cout. + static void print(const VecT &Elem1) { + std::cout << "[ "; + printHelper(std::cout, Elem1); + std::cout << " ]"; + } + +private: + // Emits one element to Out, followed by ", " unless Idx + 1 == EndIdx, + // then recurses into printHelper for the remaining elements. + template + static void printHelper(std::ostream &Out, const VecT &Elem1) { + Out << (typename VecT::element_type)(Elem1.template swizzle()); + if (Idx + 1 != EndIdx) + Out << ", "; + printHelper(Out, Elem1); + } + // Empty-bodied specialization (presumably the recursion terminator). + template <> + static void printHelper(std::ostream &Out, const VecT &Elem1) {} + + VecT MVec; +}; + +template +VecPrinter printableVec(const VecT &Vec) { + return VecPrinter(Vec); +} + +template +std::ostream &operator<<(std::ostream &Out, + const VecPrinter &VecP) { + VecP.print(Out); + return Out; +} + +class TestQueue : public sycl::queue { +public: + TestQueue(const sycl::device_selector &DevSelector, + const sycl::property_list &PropList = {}) + : sycl::queue(DevSelector, + [](sycl::exception_list ExceptionList) { + for (sycl::exception_ptr_class ExceptionPtr : + ExceptionList) { + try { + std::rethrow_exception(ExceptionPtr); + } catch (sycl::exception &E) { + std::cerr << E.what() << std::endl; + } + } + abort(); + }, + PropList) {} + + ~TestQueue() { wait_and_throw(); } +}; diff --git a/SYCL/Basic/host-interop-task/host-task-dependency.cpp b/SYCL/Basic/host-interop-task/host-task-dependency.cpp new file mode 100644 index 0000000000..8432950101 --- /dev/null +++ b/SYCL/Basic/host-interop-task/host-task-dependency.cpp @@ -0,0 +1,200 @@ +// RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple %s -o %t.out %threads_lib +// RUN: %CPU_RUN_PLACEHOLDER SYCL_PI_TRACE=-1 %t.out 2>&1 %CPU_CHECK_PLACEHOLDER +// RUN: %GPU_RUN_PLACEHOLDER
SYCL_PI_TRACE=-1 %t.out 2>&1 %GPU_CHECK_PLACEHOLDER +// RUN: %ACC_RUN_PLACEHOLDER SYCL_PI_TRACE=-1 %t.out 2>&1 %ACC_CHECK_PLACEHOLDER +// +// TODO: Behaviour is unstable for level zero on Windows. Enable when fixed. +// UNSUPPORTED: windows && level0 +// REQUIRES: cpu, gpu, accelerator + +#include +#include +#include +#include +#include + +#include + +namespace S = cl::sycl; + +struct Context { + std::atomic_bool Flag; + S::queue &Queue; + S::buffer Buf1; + S::buffer Buf2; + S::buffer Buf3; + std::mutex Mutex; + std::condition_variable CV; +}; + +void Thread1Fn(Context *Ctx) { + // 0. initialize resulting buffer with a priori wrong result + { + S::accessor + Acc(Ctx->Buf1); + + for (size_t Idx = 0; Idx < Acc.get_count(); ++Idx) + Acc[Idx] = -1; + } + + { + S::accessor + Acc(Ctx->Buf2); + + for (size_t Idx = 0; Idx < Acc.get_count(); ++Idx) + Acc[Idx] = -2; + } + + { + S::accessor + Acc(Ctx->Buf3); + + for (size_t Idx = 0; Idx < Acc.get_count(); ++Idx) + Acc[Idx] = -3; + } + + // 1. submit task writing to buffer 1 + Ctx->Queue.submit([&](S::handler &CGH) { + S::accessor + GeneratorAcc(Ctx->Buf1, CGH); + + auto GeneratorKernel = [GeneratorAcc] { + for (size_t Idx = 0; Idx < GeneratorAcc.get_count(); ++Idx) + GeneratorAcc[Idx] = Idx; + }; + + CGH.single_task(GeneratorKernel); + }); + + // 2. submit host task writing from buf 1 to buf 2 + auto HostTaskEvent = Ctx->Queue.submit([&](S::handler &CGH) { + S::accessor + CopierSrcAcc(Ctx->Buf1, CGH); + S::accessor + CopierDstAcc(Ctx->Buf2, CGH); + + auto CopierHostTask = [CopierSrcAcc, CopierDstAcc, &Ctx] { + for (size_t Idx = 0; Idx < CopierDstAcc.get_count(); ++Idx) + CopierDstAcc[Idx] = CopierSrcAcc[Idx]; + + bool Expected = false; + bool Desired = true; + // Do the exchange outside assert(): with NDEBUG the assert expression is + // not evaluated, so the flag would never be set and the thread waiting + // on CV would block forever. + const bool Exchanged = + Ctx->Flag.compare_exchange_strong(Expected, Desired); + assert(Exchanged && "Flag was already set"); + (void)Exchanged; + + { + std::lock_guard Lock(Ctx->Mutex); + Ctx->CV.notify_all(); + } + }; + + CGH.codeplay_host_task(CopierHostTask); + }); + + // 3.
submit simple task to move data between two buffers + Ctx->Queue.submit([&](S::handler &CGH) { + S::accessor + SrcAcc(Ctx->Buf2, CGH); + S::accessor + DstAcc(Ctx->Buf3, CGH); + + CGH.depends_on(HostTaskEvent); + + auto CopierKernel = [SrcAcc, DstAcc] { + for (size_t Idx = 0; Idx < DstAcc.get_count(); ++Idx) + DstAcc[Idx] = SrcAcc[Idx]; + }; + + CGH.single_task(CopierKernel); + }); + + // 4. check data in buffer #3 + { + S::accessor + Acc(Ctx->Buf3); + + bool Failure = false; + + for (size_t Idx = 0; Idx < Acc.get_count(); ++Idx) { + fprintf(stderr, "Third buffer [%3zu] = %i\n", Idx, Acc[Idx]); + + Failure |= (Acc[Idx] != Idx); + } + + assert(!Failure && "Invalid data in third buffer"); + } +} + +void Thread2Fn(Context *Ctx) { + std::unique_lock Lock(Ctx->Mutex); + + // T2.1. Wait until flag F is set eq true. + Ctx->CV.wait(Lock, [Ctx] { return Ctx->Flag.load(); }); + + assert(Ctx->Flag.load()); +} + +void test() { + auto EH = [](S::exception_list EL) { + for (const std::exception_ptr &E : EL) { + throw E; + } + }; + + S::queue Queue(EH); + + Context Ctx{{false}, Queue, {10}, {10}, {10}, {}, {}}; + + // 0. setup: thread 1 T1: exec smth; thread 2 T2: waits; init flag F = false + auto A1 = std::async(std::launch::async, Thread1Fn, &Ctx); + auto A2 = std::async(std::launch::async, Thread2Fn, &Ctx); + + A1.get(); + A2.get(); + + assert(Ctx.Flag.load()); + + // 3. 
check via host accessor that buf 2 contains valid data + { + S::accessor + ResultAcc(Ctx.Buf2); + + bool Failure = false; + for (size_t Idx = 0; Idx < ResultAcc.get_count(); ++Idx) { + fprintf(stderr, "Second buffer [%3zu] = %i\n", Idx, ResultAcc[Idx]); + + Failure |= (ResultAcc[Idx] != Idx); + } + + assert(!Failure && "Invalid data in result buffer"); + } +} + +int main() { + test(); + + return 0; +} + +// launch of GeneratorTask kernel +// CHECK:---> piKernelCreate( +// CHECK: GeneratorTask +// CHECK:---> piEnqueueKernelLaunch( +// prepare for host task +// CHECK:---> piEnqueueMemBufferMap( +// launch of CopierTask kernel +// CHECK:---> piKernelCreate( +// CHECK: CopierTask +// CHECK:---> piEnqueueKernelLaunch( +// TODO need to check for piEventsWait as "wait on dependencies of host task". +// At the same time this piEventsWait may occur anywhere after +// piEnqueueMemBufferMap ("prepare for host task"). diff --git a/SYCL/Basic/host-interop-task/host-task-two-queues.cpp b/SYCL/Basic/host-interop-task/host-task-two-queues.cpp new file mode 100644 index 0000000000..08c1aa313e --- /dev/null +++ b/SYCL/Basic/host-interop-task/host-task-two-queues.cpp @@ -0,0 +1,82 @@ +// RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple %s -o %t.out +// RUN: %CPU_RUN_PLACEHOLDER %t.out +// RUN: %GPU_RUN_PLACEHOLDER %t.out +// RUN: %ACC_RUN_PLACEHOLDER %t.out +// +// TODO: Flaky fail on Level Zero that is why mark as unsupported temporarily. 
+// UNSUPPORTED: level0, opencl +// REQUIRES: cpu, gpu, accelerator + +#include +#include + +namespace S = cl::sycl; + +#define WIDTH 5 +#define HEIGHT 5 + +void test() { + auto EH = [](S::exception_list EL) { + for (const std::exception_ptr &E : EL) { + throw E; + } + }; + + S::queue Q1(EH); + S::queue Q2(EH); + + std::vector DataA(WIDTH * HEIGHT, 2); + std::vector DataB(WIDTH * HEIGHT, 3); + std::vector DataC(WIDTH * HEIGHT, 1); + + S::buffer BufA{DataA.data(), S::range<2>{WIDTH, HEIGHT}}; + S::buffer BufB{DataB.data(), S::range<2>{WIDTH, HEIGHT}}; + S::buffer BufC{DataC.data(), S::range<2>{WIDTH, HEIGHT}}; + + auto CG1 = [&](S::handler &CGH) { + auto AccA = BufA.get_access(CGH); + auto AccB = BufB.get_access(CGH); + auto AccC = BufC.get_access(CGH); + auto Kernel = [=](S::nd_item<2> Item) { + size_t W = Item.get_global_id(0); + size_t H = Item.get_global_id(1); + AccC[W][H] += AccA[W][H] * AccB[W][H]; + }; + CGH.parallel_for(S::nd_range<2>({WIDTH, HEIGHT}, {1, 1}), Kernel); + }; + + auto CG2 = [&](S::handler &CGH) { + auto AccA = BufA.get_access(CGH); + auto AccB = BufB.get_access(CGH); + auto AccC = BufC.get_access(CGH); + + CGH.codeplay_host_task([=] { + for (size_t I = 0; I < WIDTH; ++I) + for (size_t J = 0; J < HEIGHT; ++J) { + std::cout << "C[" << I << "][" << J << "] = " << AccC[I][J] + << std::endl; + } + }); + }; + + static const size_t NTIMES = 4; + + for (size_t Idx = 0; Idx < NTIMES; ++Idx) { + Q1.submit(CG1); + Q2.submit(CG2); + Q2.submit(CG1); + Q1.submit(CG2); + } + + Q1.wait_and_throw(); + Q2.wait_and_throw(); + + for (size_t I = 0; I < WIDTH; ++I) + for (size_t J = 0; J < HEIGHT; ++J) + assert(DataC[I * HEIGHT + J] == (1 + 2 * 3 * NTIMES * 2)); +} + +int main(void) { + test(); + return 0; +} diff --git a/SYCL/Basic/lit.cfg.py b/SYCL/Basic/lit.cfg.py new file mode 100644 index 0000000000..968b64f77b --- /dev/null +++ b/SYCL/Basic/lit.cfg.py @@ -0,0 +1,210 @@ +# -*- Python -*- + +import os +import platform +import re +import subprocess +import 
tempfile +from distutils.spawn import find_executable + +import lit.formats +import lit.util + +from lit.llvm import llvm_config + +# Configuration file for the 'lit' test runner. + +# name: The name of this test suite. +config.name = 'SYCL' + +# testFormat: The test format to use to interpret tests. +# +# For now we require '&&' between commands, until they get globally killed and +# the test runner updated. +config.test_format = lit.formats.ShTest() + +# suffixes: A list of file extensions to treat as test files. +config.suffixes = ['.c', '.cpp'] #add .spv. Currently not clear what to do with those + +config.excludes = ['Inputs'] + +# test_source_root: The root path where tests are located. +config.test_source_root = os.path.dirname(__file__) + +# test_exec_root: The root path where tests should be run. +config.test_exec_root = os.path.join(config.sycl_obj_root, 'test') + +# Propagate some variables from the host environment. +llvm_config.with_system_environment(['PATH', 'OCL_ICD_FILENAME', 'SYCL_DEVICE_ALLOWLIST', 'SYCL_CONFIG_FILE_NAME']) + +config.substitutions.append( ('%clang_cc1', ' ' + config.dpcpp_compiler + ' -cc1 ') ) +config.substitutions.append( ('%clangxx', ' ' + config.dpcpp_compiler) ) +config.substitutions.append( ('%clang_cl', ' ' + config.dpcpp_compiler) ) +config.substitutions.append( ('%clang', ' ' + config.dpcpp_compiler) ) +config.substitutions.append( ('%threads_lib', config.sycl_threads_lib) ) + +llvm_config.with_environment('PATH', config.lit_tools_dir, append_path=True) + +# Configure LD_LIBRARY_PATH or corresponding os-specific alternatives +if platform.system() == "Linux": + config.available_features.add('linux') + llvm_config.with_system_environment('LD_LIBRARY_PATH') + llvm_config.with_environment('LD_LIBRARY_PATH', config.sycl_libs_dir, append_path=True) + +elif platform.system() == "Windows": + config.available_features.add('windows') + llvm_config.with_system_environment('LIB') + llvm_config.with_environment('LIB', 
config.sycl_libs_dir, append_path=True) + +elif platform.system() == "Darwin": + # FIXME: surely there is a more elegant way to instantiate the Xcode directories. + llvm_config.with_system_environment('CPATH') + llvm_config.with_environment('CPATH', "/Applications/Xcode.app/Contents/Developer/Toolchains/XcodeDefault.xctoolchain/usr/include/c++/v1", append_path=True) + llvm_config.with_environment('CPATH', "/Applications/Xcode.app/Contents/Developer/Platforms/MacOSX.platform/Developer/SDKs/MacOSX.sdk/usr/include/", append_path=True) + llvm_config.with_environment('DYLD_LIBRARY_PATH', config.sycl_libs_dir) + +llvm_config.with_environment('PATH', config.sycl_tools_dir, append_path=True) + +config.substitutions.append( ('%sycl_libs_dir', config.sycl_libs_dir ) ) +config.substitutions.append( ('%sycl_include', config.sycl_include ) ) +#config.substitutions.append( ('%sycl_source_dir', config.sycl_source_dir) ) +config.substitutions.append( ('%opencl_libs_dir', config.opencl_libs_dir) ) +config.substitutions.append( ('%opencl_include_dir', config.opencl_include_dir) ) +#config.substitutions.append( ('%cuda_toolkit_include', config.cuda_toolkit_include) ) + +llvm_config.use_clang() + +llvm_config.add_tool_substitutions(['llvm-spirv'], [config.sycl_tools_dir]) + +if not config.sycl_be: + config.sycl_be='PI_OPENCL' + +config.substitutions.append( ('%sycl_be', config.sycl_be) ) +lit_config.note("Backend: {BACKEND}".format(BACKEND=config.sycl_be)) + +if config.dump_ir_supported: + config.available_features.add('dump_ir') + +cuda = False +if ( config.sycl_be == "PI_OPENCL" and ( + 'cpu' in config.target_devices.split(',') or + 'gpu' in config.target_devices.split(',') or + 'acc' in config.target_devices.split(','))): + config.available_features.add('opencl') +elif ( config.sycl_be == "PI_CUDA" ): + config.available_features.add('cuda') + cuda = True +elif ( config.sycl_be == "PI_LEVEL0" ): + config.available_features.add('level0') + +# Configure device-specific substitutions 
based on availability of corresponding +# devices/runtimes + +found_at_least_one_device = False + +host_run_substitute = "true" +host_run_on_linux_substitute = "true " +host_check_substitute = "" +host_check_on_linux_substitute = "" + +if 'host' in config.target_devices.split(','): + found_at_least_one_device = True + lit_config.note("Test HOST device") + host_run_substitute = "env SYCL_DEVICE_TYPE=HOST SYCL_BE={SYCL_BE} ".format(SYCL_BE=config.sycl_be) + host_check_substitute = "| FileCheck %s" + config.available_features.add('host') + if platform.system() == "Linux": + host_run_on_linux_substitute = "env SYCL_DEVICE_TYPE=HOST SYCL_BE={SYCL_BE} ".format(SYCL_BE=config.sycl_be) + host_check_on_linux_substitute = "| FileCheck %s" +else: + lit_config.warning("HOST device not used") + +config.substitutions.append( ('%HOST_RUN_PLACEHOLDER', host_run_substitute) ) +config.substitutions.append( ('%HOST_RUN_ON_LINUX_PLACEHOLDER', host_run_on_linux_substitute) ) +config.substitutions.append( ('%HOST_CHECK_PLACEHOLDER', host_check_substitute) ) +config.substitutions.append( ('%HOST_CHECK_ON_LINUX_PLACEHOLDER', host_check_on_linux_substitute) ) + +cpu_run_substitute = "true" +cpu_run_on_linux_substitute = "true " +cpu_check_substitute = "" +cpu_check_on_linux_substitute = "" + +if 'cpu' in config.target_devices.split(','): + found_at_least_one_device = True + lit_config.note("Test CPU device") + cpu_run_substitute = "env SYCL_DEVICE_TYPE=CPU SYCL_BE={SYCL_BE} ".format(SYCL_BE=config.sycl_be) + cpu_check_substitute = "| FileCheck %s" + config.available_features.add('cpu') + if platform.system() == "Linux": + cpu_run_on_linux_substitute = "env SYCL_DEVICE_TYPE=CPU SYCL_BE={SYCL_BE} ".format(SYCL_BE=config.sycl_be) + cpu_check_on_linux_substitute = "| FileCheck %s" +else: + lit_config.warning("CPU device not used") + +config.substitutions.append( ('%CPU_RUN_PLACEHOLDER', cpu_run_substitute) ) +config.substitutions.append( ('%CPU_RUN_ON_LINUX_PLACEHOLDER', 
cpu_run_on_linux_substitute) ) +config.substitutions.append( ('%CPU_CHECK_PLACEHOLDER', cpu_check_substitute) ) +config.substitutions.append( ('%CPU_CHECK_ON_LINUX_PLACEHOLDER', cpu_check_on_linux_substitute) ) + +gpu_run_substitute = "true" +gpu_run_on_linux_substitute = "true " +gpu_check_substitute = "" +gpu_check_on_linux_substitute = "" + +if 'gpu' in config.target_devices.split(','): + found_at_least_one_device = True + lit_config.note("Test GPU device") + gpu_run_substitute = " env SYCL_DEVICE_TYPE=GPU SYCL_BE={SYCL_BE} ".format(SYCL_BE=config.sycl_be) + gpu_check_substitute = "| FileCheck %s" + config.available_features.add('gpu') + + if platform.system() == "Linux": + gpu_run_on_linux_substitute = "env SYCL_DEVICE_TYPE=GPU SYCL_BE={SYCL_BE} ".format(SYCL_BE=config.sycl_be) + gpu_check_on_linux_substitute = "| FileCheck %s" +else: + lit_config.warning("GPU device not used") + +config.substitutions.append( ('%GPU_RUN_PLACEHOLDER', gpu_run_substitute) ) +config.substitutions.append( ('%GPU_RUN_ON_LINUX_PLACEHOLDER', gpu_run_on_linux_substitute) ) +config.substitutions.append( ('%GPU_CHECK_PLACEHOLDER', gpu_check_substitute) ) +config.substitutions.append( ('%GPU_CHECK_ON_LINUX_PLACEHOLDER', gpu_check_on_linux_substitute) ) + +acc_run_substitute = "true" +acc_check_substitute = "" +if 'acc' in config.target_devices.split(','): + found_at_least_one_device = True + lit_config.note("Tests accelerator device") + acc_run_substitute = " env SYCL_DEVICE_TYPE=ACC " + acc_check_substitute = "| FileCheck %s" + config.available_features.add('accelerator') +else: + lit_config.warning("Accelerator device not used") +config.substitutions.append( ('%ACC_RUN_PLACEHOLDER', acc_run_substitute) ) +config.substitutions.append( ('%ACC_CHECK_PLACEHOLDER', acc_check_substitute) ) + +if cuda: + config.substitutions.append( ('%sycl_triple', "nvptx64-nvidia-cuda-sycldevice" ) ) +else: + config.substitutions.append( ('%sycl_triple', "spir64-unknown-linux-sycldevice" ) ) + +if 
find_executable('sycl-ls'): + config.available_features.add('sycl-ls') + +# Device AOT compilation tools aren't part of the SYCL project, +# so they need to be pre-installed on the machine +aot_tools = ["ocloc", "aoc", "opencl-aot"] + +for aot_tool in aot_tools: + if find_executable(aot_tool) is not None: + lit_config.note("Found pre-installed AOT device compiler " + aot_tool) + config.available_features.add(aot_tool) + else: + lit_config.warning("Couldn't find pre-installed AOT device compiler " + aot_tool) + +# Set timeout for test 1 min +try: + import psutil + lit_config.maxIndividualTestTime = 60 +except ImportError: + pass + diff --git a/SYCL/Basic/lit.site.cfg.py.in b/SYCL/Basic/lit.site.cfg.py.in new file mode 100644 index 0000000000..e93c4e7386 --- /dev/null +++ b/SYCL/Basic/lit.site.cfg.py.in @@ -0,0 +1,29 @@ +@LIT_SITE_CFG_IN_HEADER@ + +import sys +import platform + +dpcpp_root_dir=os.path.dirname(os.path.dirname("@CMAKE_CXX_COMPILER@")) + +config.llvm_tools_dir = os.path.join(dpcpp_root_dir, 'bin') +config.lit_tools_dir = os.path.dirname("@TEST_SUITE_LIT@") +config.dump_ir_supported = "@DUMP_IR_SUPPORTED@" if "@DUMP_IR_SUPPORTED@" else False +config.sycl_tools_dir = config.llvm_tools_dir +config.sycl_include = os.path.join(dpcpp_root_dir, 'include', 'sycl') +config.sycl_obj_root = "@CMAKE_CURRENT_BINARY_DIR@" +#config.sycl_source_dir = "@SYCL_SOURCE_DIR@/source" +config.sycl_libs_dir = os.path.join(dpcpp_root_dir, ('bin' if platform.system() == "Windows" else 'lib')) +config.target_triple = "x86_64-unknown-unknown-gnu" +config.host_triple = "x86_64-unknown-unknown-gnu" +config.opencl_libs_dir = config.sycl_libs_dir +config.opencl_include_dir = config.sycl_include +config.target_devices = lit_config.params.get("target_devices", "@SYCL_TARGET_DEVICES@") +config.sycl_be = lit_config.params.get("sycl_be", "@SYCL_BE@") +config.sycl_threads_lib = '@SYCL_THREADS_LIB@' + +config.dpcpp_compiler = "@CMAKE_CXX_COMPILER@" + +import lit.llvm 
+lit.llvm.initialize(lit_config, config) + +lit_config.load_config(config, "@CMAKE_CURRENT_SOURCE_DIR@/lit.cfg.py") diff --git a/SYCL/Basic/spec_const/spec_const_hw.cpp b/SYCL/Basic/spec_const/spec_const_hw.cpp new file mode 100644 index 0000000000..251f862b40 --- /dev/null +++ b/SYCL/Basic/spec_const/spec_const_hw.cpp @@ -0,0 +1,121 @@ +// RUN: %clangxx -fsycl %s -o %t.out +// RUN: %HOST_RUN_PLACEHOLDER %t.out +// RUN: %CPU_RUN_PLACEHOLDER %t.out +// RUN: %GPU_RUN_PLACEHOLDER %t.out +// RUN: %ACC_RUN_PLACEHOLDER %t.out +// UNSUPPORTED: cuda || level0 +// +//==----------- spec_const_hw.cpp ------------------------------------------==// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// The test checks that the specialization constant feature works correctly - +// tool chain processes them correctly and runtime can correctly execute the +// program. + +#include + +#include +#include + +class MyInt32Const; +class MyFloatConst; + +using namespace sycl; + +class KernelAAAi; +class KernelBBBf; + +int val = 10; + +// Fetch a value at runtime. +int get_value() { return val; } + +float foo( + const cl::sycl::experimental::spec_constant &f32) { + return f32; +} + +int main(int argc, char **argv) { + val = argc + 16; + + cl::sycl::queue q(default_selector{}, [](exception_list l) { + for (auto ep : l) { + try { + std::rethrow_exception(ep); + } catch (cl::sycl::exception &e0) { + std::cout << e0.what(); + } catch (std::exception &e1) { + std::cout << e1.what(); + } catch (...) 
{ + std::cout << "*** catch (...)\n"; + } + } + }); + + std::cout << "Running on " << q.get_device().get_info() + << "\n"; + std::cout << "val = " << val << "\n"; + cl::sycl::program program1(q.get_context()); + cl::sycl::program program2(q.get_context()); + + int goldi = (int)get_value(); + // TODO make this floating point once supported by the compiler + float goldf = (float)get_value(); + + cl::sycl::experimental::spec_constant i32 = + program1.set_spec_constant(goldi); + + cl::sycl::experimental::spec_constant f32 = + program2.set_spec_constant(goldf); + + program1.build_with_kernel_type(); + // Use an option (does not matter which exactly) to test different internal + // SYCL RT execution path + program2.build_with_kernel_type("-cl-fast-relaxed-math"); + + std::vector veci(1); + std::vector vecf(1); + try { + cl::sycl::buffer bufi(veci.data(), veci.size()); + cl::sycl::buffer buff(vecf.data(), vecf.size()); + + q.submit([&](cl::sycl::handler &cgh) { + auto acci = bufi.get_access(cgh); + cgh.single_task( + program1.get_kernel(), + [=]() { + acci[0] = i32.get(); + }); + }); + q.submit([&](cl::sycl::handler &cgh) { + auto accf = buff.get_access(cgh); + cgh.single_task( + program2.get_kernel(), + [=]() { + accf[0] = foo(f32); + }); + }); + } catch (cl::sycl::exception &e) { + std::cout << "*** Exception caught: " << e.what() << "\n"; + return 1; + } + bool passed = true; + int vali = veci[0]; + + if (vali != goldi) { + std::cout << "*** ERROR: " << vali << " != " << goldi << "(gold)\n"; + passed = false; + } + int valf = vecf[0]; + + if (valf != goldf) { + std::cout << "*** ERROR: " << valf << " != " << goldf << "(gold)\n"; + passed = false; + } + std::cout << (passed ? "passed\n" : "FAILED\n"); + return passed ? 
0 : 1; +} diff --git a/SYCL/Basic/spec_const/spec_const_redefine.cpp b/SYCL/Basic/spec_const/spec_const_redefine.cpp new file mode 100644 index 0000000000..075b33c70c --- /dev/null +++ b/SYCL/Basic/spec_const/spec_const_redefine.cpp @@ -0,0 +1,112 @@ +// RUN: %clangxx -fsycl %s -o %t.out +// RUN: env SYCL_PI_TRACE=2 %CPU_RUN_PLACEHOLDER %t.out 2>&1 %CPU_CHECK_PLACEHOLDER +// RUN: env SYCL_PI_TRACE=2 %GPU_RUN_PLACEHOLDER %t.out 2>&1 %GPU_CHECK_PLACEHOLDER +// RUN: env SYCL_PI_TRACE=2 %ACC_RUN_PLACEHOLDER %t.out 2>&1 %ACC_CHECK_PLACEHOLDER +// UNSUPPORTED: cuda || level0 || host || accelerator +// +//==----------- spec_const_redefine.cpp ------------------------------------==// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// The test checks that: +// - a specialization constant can be redefined and the correct new value is +// used after redefinition. +// - the program is JITted only once per unique set of specialization +// constant values. + +#include + +#include +#include + +class SC0; +class SC1; +class KernelAAA; + +using namespace sycl; + +int val = 0; + +// Fetch a value at runtime. +int get_value() { return val; } + +int main(int argc, char **argv) { + val = argc; + + cl::sycl::queue q(default_selector{}, [](exception_list l) { + for (auto ep : l) { + try { + std::rethrow_exception(ep); + } catch (cl::sycl::exception &e0) { + std::cout << e0.what(); + } catch (std::exception &e1) { + std::cout << e1.what(); + } catch (...)
{ + std::cout << "*** catch (...)\n"; + } + } + }); + + std::cout << "Running on " << q.get_device().get_info() + << "\n"; + bool passed = true; + int x = get_value(); + + const int sc_vals[][2] = { + {1 + x, 2 + x}, + {2 + x, 3 + x}, + {1 + x, 2 + x}, // same as first - program in cache must be used + {2 + x, 3 + x} // same as second - program in cache must be used + }; + constexpr int n_sc_sets = sizeof(sc_vals) / sizeof(sc_vals[0]); + std::vector vec(n_sc_sets); + + for (int i = 0; i < n_sc_sets; i++) { + cl::sycl::program program(q.get_context()); + const int *sc_set = &sc_vals[i][0]; + cl::sycl::experimental::spec_constant sc0 = + program.set_spec_constant(sc_set[0]); + cl::sycl::experimental::spec_constant sc1 = + program.set_spec_constant(sc_set[1]); + + program.build_with_kernel_type(); + + try { + cl::sycl::buffer buf(vec.data(), vec.size()); + + q.submit([&](cl::sycl::handler &cgh) { + auto acc = buf.get_access(cgh); + cgh.single_task( + program.get_kernel(), + [=]() { + acc[i] = sc0.get() + sc1.get(); + }); + }); + } catch (cl::sycl::exception &e) { + std::cout << "*** Exception caught: " << e.what() << "\n"; + return 1; + } + int val = vec[i]; + int gold = sc_set[0] + sc_set[1]; + + std::cout << "val = " << val << " gold = " << gold << "\n"; + + if (val != gold) { + std::cout << "*** ERROR[" << i << "]: " << val << " != " << gold << "(gold)\n"; + passed = false; + } + } + std::cout << (passed ? "passed\n" : "FAILED\n"); + return passed ? 
0 : 1; +} + +// --- Check that only two JIT compilation happened: +// CHECK-NOT: ---> piProgramLink +// CHECK: ---> piProgramLink +// CHECK: ---> piProgramLink +// CHECK-NOT: ---> piProgramLink +// --- Check that the test completed with expected results: +// CHECK: passed diff --git a/SYCL/Basic/struct_param/non-standard-layout.cpp b/SYCL/Basic/struct_param/non-standard-layout.cpp new file mode 100644 index 0000000000..f5db9cc0f8 --- /dev/null +++ b/SYCL/Basic/struct_param/non-standard-layout.cpp @@ -0,0 +1,45 @@ +// RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple %s -o %t.out +// RUN: %HOST_RUN_PLACEHOLDER %t.out +// RUN: %CPU_RUN_PLACEHOLDER %t.out +// RUN: %GPU_RUN_PLACEHOLDER %t.out +// RUN: %ACC_RUN_PLACEHOLDER %t.out + +#include + +using namespace cl::sycl; + +struct F1 {}; +struct F2 {}; +struct F : F1, F2 { + cl::sycl::cl_char x; +}; + +bool test0() { + F S; + S.x = 0; + F S0; + S0.x = 1; + { + buffer Buf(&S0, range<1>(1)); + queue myQueue; + myQueue.submit([&](handler &cgh) { + auto B = Buf.get_access(cgh); + cgh.single_task([=] { B[0] = S; }); + }); + } + bool Passed = (S.x == S0.x); + + if (!Passed) { + std::cout << "test0 failed" << std::endl; + } + return Passed; +} + +int main() { + + bool Pass = test0(); + + std::cout << "Test " << (Pass ? "passed" : "FAILED") << std::endl; + return Pass ? 0 : 1; + +} diff --git a/SYCL/Basic/struct_param/struct_kernel_param.cpp b/SYCL/Basic/struct_param/struct_kernel_param.cpp new file mode 100644 index 0000000000..9ffe4724ce --- /dev/null +++ b/SYCL/Basic/struct_param/struct_kernel_param.cpp @@ -0,0 +1,137 @@ +// RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple %s -o %t.out +// RUN: %HOST_RUN_PLACEHOLDER %t.out +// RUN: %CPU_RUN_PLACEHOLDER %t.out +// RUN: %GPU_RUN_PLACEHOLDER %t.out +// RUN: %ACC_RUN_PLACEHOLDER %t.out + +//==-struct_kernel_param.cpp-Checks passing structs as kernel params--------==// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. 
+// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include +#include +#include +#include +#include + +using namespace cl::sycl; + +struct MyNestedStruct { + bool operator==(const MyNestedStruct &Rhs) { + return (FldArr[0] == Rhs.FldArr[0] && FldFloat == Rhs.FldFloat); + } + cl::sycl::cl_char FldArr[1]; + cl::sycl::cl_float FldFloat; +}; + +struct MyStruct { + bool operator==(const MyStruct &Rhs) { + return (FldChar == Rhs.FldChar && FldLong == Rhs.FldLong && + FldShort == Rhs.FldShort && FldUint == Rhs.FldUint && + FldStruct == Rhs.FldStruct && + std::equal(std::begin(FldArr), std::end(FldArr), + std::begin(Rhs.FldArr)) && + FldInt == Rhs.FldInt); + } + cl::sycl::cl_char FldChar; + cl::sycl::cl_long FldLong; + cl::sycl::cl_short FldShort; + cl::sycl::cl_uint FldUint; + MyNestedStruct FldStruct; + cl::sycl::cl_short FldArr[3]; + cl::sycl::cl_int FldInt; +}; + +MyStruct GlobS; + +static void printStruct(const MyStruct &S0) { + std::cout << "{ " << (int)S0.FldChar << ", " << S0.FldLong << ", " + << S0.FldShort << ", " << S0.FldUint << " { { " + << (int)S0.FldStruct.FldArr[0] << " }, " << S0.FldStruct.FldFloat + << " }, { " << S0.FldArr[0] << ", " << S0.FldArr[1] << ", " + << S0.FldArr[2] << " }, " << S0.FldInt << " }"; +} + +bool test0() { + MyStruct S = GlobS; + MyStruct S0 = {0}; + { + buffer Buf(&S0, range<1>(1)); + queue myQueue; + myQueue.submit([&](handler &cgh) { + auto B = Buf.get_access(cgh); + cgh.single_task([=] { B[0] = S; }); + }); + } + bool Passed = (S == S0); + + if (!Passed) { + std::cout << "test0 failed" << std::endl; + std::cout << "test0 input:" << std::endl; + printStruct(S); + std::cout << std::endl; + std::cout << "test0 result:\n"; + printStruct(S0); + std::cout << std::endl; + } + return Passed; +} + +bool test1() { + range<3> ice(8, 9, 10); + uint ice2 = 888; + uint result[4] = {0}; 
+ + { + buffer Buffer((unsigned int *)result, range<1>(4)); + queue myQueue; + myQueue.submit([&](handler &cgh) { + auto B = Buffer.get_access(cgh); + cgh.parallel_for(range<1>{4}, [=](id<1> index) { + B[index.get(0)] = index.get(0) > 2 ? ice2 : ice.get(index.get(0)); + }); + }); + } + + bool Passed = true; + + for (unsigned long i = 0; i < 4; ++i) { + if (i <= 2) { + if (result[i] != ice[i]) + Passed = false; + } else { + if (result[i] != ice2) + Passed = false; + } + } + if (!Passed) + std::cout << "test1 failed" << std::endl; + + return Passed; +} + +int main(int argc, char **argv) { + cl::sycl::cl_char PartChar = argc; + cl::sycl::cl_short PartShort = argc << 8; + cl::sycl::cl_int PartInt = argc << 16; + cl::sycl::cl_uint PartUint = argc << 16; + cl::sycl::cl_long PartLong = ((cl::sycl::cl_long)argc) << 32; + cl::sycl::cl_float PartFloat = argc; + + GlobS = {PartChar, + PartLong, + PartShort, + PartUint, + {{PartChar}, PartFloat}, + {PartShort, PartShort, PartShort}, + PartInt}; + + bool Pass = test0() & test1(); + + std::cout << "Test " << (Pass ? "passed" : "FAILED") << std::endl; + return Pass ? 0 : 1; +} diff --git a/SYCL/Basic/sub_group/attributes.cpp b/SYCL/Basic/sub_group/attributes.cpp new file mode 100644 index 0000000000..ac7e655532 --- /dev/null +++ b/SYCL/Basic/sub_group/attributes.cpp @@ -0,0 +1,125 @@ +// UNSUPPORTED: cuda +// CUDA compilation and runtime do not yet support sub-groups. +// +// RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple %s -o %t.out +// RUN: %HOST_RUN_PLACEHOLDER %t.out +// RUN: %CPU_RUN_PLACEHOLDER %t.out +// RUN: %GPU_RUN_PLACEHOLDER %t.out +// RUN: %ACC_RUN_PLACEHOLDER %t.out +//==------- attributes.cpp - SYCL sub_group attributes test ----*- C++ -*---==// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "helper.hpp" + +#include + +#define KERNEL_FUNCTOR_WITH_SIZE(SIZE) \ + class KernelFunctor##SIZE { \ + public: \ + [[cl::intel_reqd_sub_group_size(SIZE)]] void \ + operator()(cl::sycl::nd_item<1> Item) { \ + const auto GID = Item.get_global_id(); \ + } \ + }; + +KERNEL_FUNCTOR_WITH_SIZE(1); +KERNEL_FUNCTOR_WITH_SIZE(2); +KERNEL_FUNCTOR_WITH_SIZE(4); +KERNEL_FUNCTOR_WITH_SIZE(8); +KERNEL_FUNCTOR_WITH_SIZE(16); + +#undef KERNEL_FUNCTOR_WITH_SIZE + +inline uint32_t flp2(uint32_t X) { + X = X | (X >> 1); + X = X | (X >> 2); + X = X | (X >> 4); + X = X | (X >> 8); + X = X | (X >> 16); + return X - (X >> 1); +} + +template inline void submit(cl::sycl::queue &Q) { + Q.submit([](cl::sycl::handler &cgh) { + Fn F; + cgh.parallel_for(cl::sycl::nd_range<1>{64, 16}, F); + }); +} + +int main() { + queue Queue; + device Device = Queue.get_device(); + + // According to specification, this kernel query requires `cl_khr_subgroups` + // or `cl_intel_subgroups`, and also `cl_intel_required_subgroup_size` + if ((!Device.has_extension("cl_intel_subgroups") && + !Device.has_extension("cl_khr_subgroups")) || + !Device.has_extension("cl_intel_required_subgroup_size")) { + std::cout << "Skipping test\n"; + return 0; + } + + try { + const auto SGSizes = Device.get_info(); + + for (const auto SGSize : SGSizes) { + // Get the previous power of 2 + auto ReqdSize = flp2(SGSize); + + cl::sycl::program Prog(Queue.get_context()); + + // Store the `cl::sycl::kernel` into a vector because `cl::sycl::kernel` + // doesn't have default constructor + cl::sycl::vector_class TheKernel; + + switch (ReqdSize) { + case 16: + Prog.build_with_kernel_type(); + TheKernel.push_back(Prog.get_kernel()); + submit(Queue); + break; + case 8: + Prog.build_with_kernel_type(); + TheKernel.push_back(Prog.get_kernel()); + submit(Queue); + break; + case 4: + 
Prog.build_with_kernel_type(); + TheKernel.push_back(Prog.get_kernel()); + submit(Queue); + break; + case 2: + Prog.build_with_kernel_type(); + TheKernel.push_back(Prog.get_kernel()); + submit(Queue); + break; + case 1: + Prog.build_with_kernel_type(); + TheKernel.push_back(Prog.get_kernel()); + submit(Queue); + break; + default: + throw feature_not_supported("sub-group size is not supported", + PI_INVALID_OPERATION); + } + + auto Kernel = TheKernel[0]; + + auto Res = Kernel.get_sub_group_info< + cl::sycl::info::kernel_sub_group::compile_sub_group_size>(Device); + + exit_if_not_equal(Res, ReqdSize, "compile_sub_group_size"); + } + } catch (exception e) { + std::cout << "SYCL exception caught: " << e.what(); + return 1; + } + + std::cout << "Test passed.\n"; + return 0; +} diff --git a/SYCL/Basic/sub_group/barrier.cpp b/SYCL/Basic/sub_group/barrier.cpp new file mode 100644 index 0000000000..cafe008512 --- /dev/null +++ b/SYCL/Basic/sub_group/barrier.cpp @@ -0,0 +1,90 @@ +// UNSUPPORTED: cuda +// CUDA compilation and runtime do not yet support sub-groups. +// +// RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple %s -o %t.out +// RUN: %HOST_RUN_PLACEHOLDER %t.out +// RUN: %CPU_RUN_PLACEHOLDER %t.out +// RUN: %GPU_RUN_PLACEHOLDER %t.out +// RUN: %ACC_RUN_PLACEHOLDER %t.out + +//==---------- barrier.cpp - SYCL sub_group barrier test -------*- C++ -*---==// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "helper.hpp" +#include +#include +#include +template class sycl_subgr; +using namespace cl::sycl; +template void check(queue &Queue, size_t G = 240, size_t L = 60) { + try { + nd_range<1> NdRange(G, L); + std::vector data(G); + std::iota(data.begin(), data.end(), sizeof(T)); + buffer addbuf(data.data(), range<1>(G)); + buffer sgsizebuf(1); + Queue.submit([&](handler &cgh) { + auto addacc = addbuf.template get_access(cgh); + auto sgsizeacc = sgsizebuf.get_access(cgh); + + cgh.parallel_for>(NdRange, [=](nd_item<1> NdItem) { + intel::sub_group SG = NdItem.get_sub_group(); + size_t lid = SG.get_local_id().get(0); + size_t gid = NdItem.get_global_id(0); + size_t SGoff = gid - lid; + + T res = 0; + for (size_t i = 0; i <= lid; i++) { + res += addacc[SGoff + i]; + } + SG.barrier(access::fence_space::global_space); + addacc[gid] = res; + if (NdItem.get_global_id(0) == 0) + sgsizeacc[0] = SG.get_max_local_range()[0]; + }); + }); + auto addacc = addbuf.template get_access(); + auto sgsizeacc = sgsizebuf.get_access(); + + size_t sg_size = sgsizeacc[0]; + int WGid = -1, SGid = 0; + T add = 0; + for (int j = 0; j < G; j++) { + if (j % L % sg_size == 0) { + SGid++; + add = 0; + } + if (j % L == 0) { + WGid++; + SGid = 0; + } + add += j + sizeof(T); + exit_if_not_equal(addacc[j], add, "barrier"); + } + } catch (exception e) { + std::cout << "SYCL exception caught: " << e.what(); + exit(1); + } +} +int main() { + queue Queue; + if (!core_sg_supported(Queue.get_device())) { + std::cout << "Skipping test\n"; + return 0; + } + check(Queue); + check(Queue); + check(Queue); + check(Queue); + check(Queue); + if (Queue.get_device().has_extension("cl_khr_fp64")) { + check(Queue); + } + std::cout << "Test passed." 
<< std::endl; + return 0; +} diff --git a/SYCL/Basic/sub_group/broadcast.cpp b/SYCL/Basic/sub_group/broadcast.cpp new file mode 100644 index 0000000000..fba93ee7a2 --- /dev/null +++ b/SYCL/Basic/sub_group/broadcast.cpp @@ -0,0 +1,87 @@ +// UNSUPPORTED: cuda +// CUDA compilation and runtime do not yet support sub-groups. + +// RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple %s -o %t.out +// RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple -D SG_GPU %s -o %t_gpu.out +// RUN: %HOST_RUN_PLACEHOLDER %t.out +// RUN: %CPU_RUN_PLACEHOLDER %t.out +// RUN: %GPU_RUN_PLACEHOLDER %t_gpu.out +// RUN: %ACC_RUN_PLACEHOLDER %t.out + +//==--------- broadcast.cpp - SYCL sub_group broadcast test ----*- C++ -*---==// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "helper.hpp" +#include +template +class sycl_subgr; +using namespace cl::sycl; +template +void check(queue &Queue) { + const int G = 240, L = 60; + try { + nd_range<1> NdRange(G, L); + buffer syclbuf(G); + buffer sgsizebuf(1); + Queue.submit([&](handler &cgh) { + auto syclacc = syclbuf.template get_access(cgh); + auto sgsizeacc = sgsizebuf.get_access(cgh); + cgh.parallel_for>(NdRange, [=](nd_item<1> NdItem) { + intel::sub_group SG = NdItem.get_sub_group(); + /*Broadcast GID of element with SGLID == SGID */ + syclacc[NdItem.get_global_id()] = + broadcast(SG, T(NdItem.get_global_id(0)), SG.get_group_id()); + if (NdItem.get_global_id(0) == 0) + sgsizeacc[0] = SG.get_max_local_range()[0]; + }); + }); + auto syclacc = syclbuf.template get_access(); + auto sgsizeacc = sgsizebuf.get_access(); + size_t sg_size = sgsizeacc[0]; + if (sg_size == 0) + sg_size = L; + int WGid = -1, SGid = 0; + for (int j = 0; j < G; j++) { + if (j % L % sg_size == 0) { + SGid++; + } + if (j % L == 0) 
{ + WGid++; + SGid = 0; + } + exit_if_not_equal(syclacc[j], L * WGid + SGid + SGid * sg_size, + "broadcasted value"); + } + } catch (exception e) { + std::cout << "SYCL exception caught: " << e.what(); + exit(1); + } +} +int main() { + queue Queue; + if (!core_sg_supported(Queue.get_device())) { + std::cout << "Skipping test\n"; + return 0; + } + check(Queue); + check(Queue); + check(Queue); + check(Queue); + check(Queue); + // broadcast half type is not supported in OCL CPU RT +#ifdef SG_GPU + if (Queue.get_device().has_extension("cl_khr_fp16")) { + check(Queue); + } +#endif + if (Queue.get_device().has_extension("cl_khr_fp64")) { + check(Queue); + } + std::cout << "Test passed." << std::endl; + return 0; +} diff --git a/SYCL/Basic/sub_group/common.cpp b/SYCL/Basic/sub_group/common.cpp new file mode 100644 index 0000000000..b9b526709c --- /dev/null +++ b/SYCL/Basic/sub_group/common.cpp @@ -0,0 +1,93 @@ +// UNSUPPORTED: cuda +// CUDA compilation and runtime do not yet support sub-groups. +// +// RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple %s -o %t.out +// RUN: %HOST_RUN_PLACEHOLDER %t.out +// RUN: %CPU_RUN_PLACEHOLDER %t.out +// RUN: %GPU_RUN_PLACEHOLDER %t.out +// RUN: %ACC_RUN_PLACEHOLDER %t.out + +//==-------------- common.cpp - SYCL sub_group common test -----*- C++ -*---==// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "helper.hpp" +#include +using namespace cl::sycl; +struct Data { + unsigned int local_id; + unsigned int local_range; + unsigned int max_local_range; + unsigned int group_id; + unsigned int group_range; + unsigned int uniform_group_range; +}; + +void check(queue &Queue, unsigned int G, unsigned int L) { + + try { + nd_range<1> NdRange(G, L); + buffer syclbuf(G); + + Queue.submit([&](handler &cgh) { + auto syclacc = syclbuf.get_access(cgh); + cgh.parallel_for(NdRange, [=](nd_item<1> NdItem) { + intel::sub_group SG = NdItem.get_sub_group(); + syclacc[NdItem.get_global_id()].local_id = SG.get_local_id().get(0); + syclacc[NdItem.get_global_id()].local_range = + SG.get_local_range().get(0); + syclacc[NdItem.get_global_id()].max_local_range = + SG.get_max_local_range().get(0); + syclacc[NdItem.get_global_id()].group_id = SG.get_group_id().get(0); + syclacc[NdItem.get_global_id()].group_range = SG.get_group_range(); + syclacc[NdItem.get_global_id()].uniform_group_range = + SG.get_uniform_group_range(); + }); + }); + auto syclacc = syclbuf.get_access(); + unsigned int max_sg = get_sg_size(Queue.get_device()); + unsigned int num_sg = L / max_sg + (L % max_sg ? 1 : 0); + for (int j = 0; j < G; j++) { + unsigned int group_id = j % L / max_sg; + unsigned int local_range = + (group_id + 1 == num_sg) ? (L - group_id * max_sg) : max_sg; + exit_if_not_equal(syclacc[j].local_id, j % L % max_sg, "local_id"); + exit_if_not_equal(syclacc[j].local_range, local_range, "local_range"); + // TODO: Currently workgroup size affects this parameter on CPU and does + // not on GPU. Remove the `if` once the behavior is aligned.
+ if (Queue.get_device().get_info() == + info::device_type::cpu) { + exit_if_not_equal(syclacc[j].max_local_range, std::min(max_sg, L), + "max_local_range"); + } else { + exit_if_not_equal(syclacc[j].max_local_range, max_sg, + "max_local_range"); + } + exit_if_not_equal(syclacc[j].group_id, group_id, "group_id"); + exit_if_not_equal(syclacc[j].group_range, num_sg, "group_range"); + exit_if_not_equal(syclacc[j].uniform_group_range, num_sg, + "uniform_group_range"); + } + } catch (exception e) { + std::cout << "SYCL exception caught: " << e.what(); + exit(1); + } +} +int main() { + queue Queue; + if (!core_sg_supported(Queue.get_device())) { + std::cout << "Skipping test\n"; + return 0; + } + + check(Queue, 240, 80); + check(Queue, 8, 4); + check(Queue, 24, 12); + check(Queue, 1024, 256); + std::cout << "Test passed." << std::endl; + return 0; +} diff --git a/SYCL/Basic/sub_group/common_ocl.cpp b/SYCL/Basic/sub_group/common_ocl.cpp new file mode 100644 index 0000000000..fd38c84969 --- /dev/null +++ b/SYCL/Basic/sub_group/common_ocl.cpp @@ -0,0 +1,111 @@ +// REQUIRES: opencl + +// RUN: %clang_cc1 -x cl -cl-std=CL2.0 %S/sg.cl -triple spir64-unknown-unknown -emit-llvm-bc -o %T/kernel_ocl.bc -include opencl-c.h +// RUN: llvm-spirv %T/kernel_ocl.bc -o %T/kernel_ocl.spv +// RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple %s -o %t.out -L %opencl_libs_dir -lOpenCL +// RUN: %HOST_RUN_PLACEHOLDER %t.out +// RUN: %CPU_RUN_PLACEHOLDER %t.out %T/kernel_ocl.spv +// RUN: %GPU_RUN_PLACEHOLDER %t.out %T/kernel_ocl.spv +// RUN: %ACC_RUN_PLACEHOLDER %t.out %T/kernel_ocl.spv + +//==--- common_ocl.cpp - basic SG methods in SYCL vs OpenCL ---*- C++ -*---==// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "helper.hpp" +#include +#include +#include +#include + +using namespace cl::sycl; +struct Data { + unsigned int local_id; + unsigned int local_range; + unsigned int max_local_range; + unsigned int group_id; + unsigned int group_range; + unsigned int uniform_group_range; +}; + +void check(queue &Queue, const int G, const int L, const char *SpvFile) { + try { + nd_range<1> NdRange(G, L); + buffer oclbuf(G); + buffer syclbuf(G); + + std::ifstream File(SpvFile, std::ios::binary); + if (!File.is_open()) { + std::cerr << std::strerror(errno); + throw compile_program_error("Cannot open SPIRV file\n", PI_INVALID_VALUE); + } + File.seekg(0, std::ios::end); + vector_class Spv(File.tellg()); + File.seekg(0); + File.read(Spv.data(), Spv.size()); + File.close(); + int Err; + cl_program ClProgram = clCreateProgramWithIL(Queue.get_context().get(), + Spv.data(), Spv.size(), &Err); + CHECK_OCL_CODE(Err); + CHECK_OCL_CODE( + clBuildProgram(ClProgram, 0, nullptr, nullptr, nullptr, nullptr)); + program Prog(Queue.get_context(), ClProgram); + Queue.submit([&](handler &cgh) { + auto oclacc = oclbuf.get_access(cgh); + cgh.set_args(oclacc); + cgh.parallel_for(NdRange, Prog.get_kernel("ocl_subgr")); + }); + auto oclacc = oclbuf.get_access(); + + Queue.submit([&](handler &cgh) { + auto syclacc = syclbuf.get_access(cgh); + cgh.parallel_for(NdRange, [=](nd_item<1> NdItem) { + intel::sub_group SG = NdItem.get_sub_group(); + syclacc[NdItem.get_global_id()].local_id = SG.get_local_id().get(0); + syclacc[NdItem.get_global_id()].local_range = + SG.get_local_range().get(0); + syclacc[NdItem.get_global_id()].max_local_range = + SG.get_max_local_range().get(0); + syclacc[NdItem.get_global_id()].group_id = SG.get_group_id().get(0); + syclacc[NdItem.get_global_id()].group_range = SG.get_group_range(); + 
syclacc[NdItem.get_global_id()].uniform_group_range = + SG.get_uniform_group_range(); + }); + }); + auto syclacc = syclbuf.get_access(); + for (int j = 0; j < G; j++) { + exit_if_not_equal(syclacc[j].local_id, oclacc[j].local_id, "local_id"); + exit_if_not_equal(syclacc[j].local_range, oclacc[j].local_range, + "local_range"); + exit_if_not_equal(syclacc[j].max_local_range, oclacc[j].max_local_range, + "max_local_range"); + exit_if_not_equal(syclacc[j].group_id, oclacc[j].group_id, "group_id"); + exit_if_not_equal(syclacc[j].group_range, oclacc[j].group_range, + "group_range"); + exit_if_not_equal(syclacc[j].uniform_group_range, + oclacc[j].uniform_group_range, "uniform_group_range"); + } + } catch (exception e) { + std::cout << "SYCL exception caught: " << e.what(); + exit(1); + } +} +int main(int argc, char **argv) { + queue Queue; + if (!core_sg_supported(Queue.get_device()) || argc != 2) { + std::cout << "Skipping test\n"; + return 0; + } + + check(Queue, 240, 80, argv[1]); + check(Queue, 8, 4, argv[1]); + check(Queue, 24, 12, argv[1]); + check(Queue, 1024, 256, argv[1]); + std::cout << "Test passed." << std::endl; + return 0; +} diff --git a/SYCL/Basic/sub_group/helper.hpp b/SYCL/Basic/sub_group/helper.hpp new file mode 100644 index 0000000000..2476ed999d --- /dev/null +++ b/SYCL/Basic/sub_group/helper.hpp @@ -0,0 +1,157 @@ +//==---------- helper.hpp - SYCL sub_group helper functions ----------------==// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +#include +#include +#include + +using namespace cl::sycl; + +template struct utils { + static T1 add_vec(const vec &v); + static bool cmp_vec(const vec &v, const vec &r); + static std::string stringify_vec(const vec &v); +}; +template struct utils { + static T2 add_vec(const vec &v) { return v.s0(); } + static bool cmp_vec(const vec &v, const vec &r) { + return v.s0() == r.s0(); + } + static std::string stringify_vec(const vec &v) { + return std::to_string((T2)v.s0()); + } +}; +template struct utils { + static T2 add_vec(const vec &v) { return v.s0() + v.s1(); } + static bool cmp_vec(const vec &v, const vec &r) { + return v.s0() == r.s0() && v.s1() == r.s1(); + } + static std::string stringify_vec(const vec &v) { + return std::string("(") + std::to_string((T2)v.s0()) + ", " + + std::to_string((T2)v.s1()) + " )"; + } +}; +template struct utils { + static T2 add_vec(const vec &v) { + return v.s0() + v.s1() + v.s2() + v.s3(); + } + static bool cmp_vec(const vec &v, const vec &r) { + return v.s0() == r.s0() && v.s1() == r.s1() && v.s2() == r.s2() && + v.s3() == r.s3(); + } + static std::string stringify_vec(const vec &v) { + return std::string("(") + std::to_string((T2)v.s0()) + ", " + + std::to_string((T2)v.s1()) + ", " + std::to_string((T2)v.s2()) + ", " + + std::to_string((T2)v.s3()) + " )"; + } +}; +template struct utils { + static T2 add_vec(const vec &v) { + return v.s0() + v.s1() + v.s2() + v.s3() + v.s4() + v.s5() + v.s6() + + v.s7(); + } + static bool cmp_vec(const vec &v, const vec &r) { + return v.s0() == r.s0() && v.s1() == r.s1() && v.s2() == r.s2() && + v.s3() == r.s3() && v.s4() == r.s4() && v.s5() == r.s5() && + v.s6() == r.s6() && v.s7() == r.s7(); + } + static std::string stringify_vec(const vec &v) { + return std::string("(") + std::to_string((T2)v.s0()) + ", " + + std::to_string((T2)v.s1()) + ", " + 
std::to_string((T2)v.s2()) + ", " + + std::to_string((T2)v.s3()) + ", " + std::to_string((T2)v.s4()) + ", " + + std::to_string((T2)v.s5()) + ", " + std::to_string((T2)v.s6()) + ", " + + std::to_string((T2)v.s7()) + " )"; + } +}; + +template struct utils { + static T2 add_vec(const vec &v) { + return v.s0() + v.s1() + v.s2() + v.s3() + v.s4() + v.s5() + v.s6() + + v.s7() + v.s8() + v.s9() + v.sA() + v.sB() + v.sC() + v.sD() + + v.sE() + v.sF(); + } + static bool cmp_vec(const vec &v, const vec &r) { + return v.s0() == r.s0() && v.s1() == r.s1() && v.s2() == r.s2() && + v.s3() == r.s3() && v.s4() == r.s4() && v.s5() == r.s5() && + v.s6() == r.s6() && v.s7() == r.s7() && v.s8() == r.s8() && + v.s9() == r.s9() && v.sA() == r.sA() && v.sB() == r.sB() && + v.sC() == r.sC() && v.sD() == r.sD() && v.sE() == r.sE() && + v.sF() == r.sF(); + } + static std::string stringify_vec(const vec &v) { + return std::string("(") + std::to_string((T2)v.s0()) + ", " + + std::to_string((T2)v.s1()) + ", " + std::to_string((T2)v.s2()) + ", " + + std::to_string((T2)v.s3()) + ", " + std::to_string((T2)v.s4()) + ", " + + std::to_string((T2)v.s5()) + ", " + std::to_string((T2)v.s6()) + ", " + + std::to_string((T2)v.s7()) + ", " + std::to_string((T2)v.s8()) + ", " + + std::to_string((T2)v.s9()) + ", " + std::to_string((T2)v.sA()) + ", " + + std::to_string((T2)v.sB()) + ", " + std::to_string((T2)v.sC()) + ", " + + std::to_string((T2)v.sD()) + ", " + std::to_string((T2)v.sE()) + ", " + + std::to_string((T2)v.sF()) + " )"; + } +}; + +template void exit_if_not_equal(T val, T ref, const char *name) { + if (std::is_floating_point::value) { + if (std::fabs(val - ref) > 0.01) { + std::cout << "Unexpected result for " << name << ": " << (double)val + << " expected value: " << (double)ref << std::endl; + exit(1); + } + } else { + if ((val - ref) != 0) { + std::cout << "Unexpected result for " << name << ": " << (long)val + << " expected value: " << (long)ref << std::endl; + exit(1); + } + } +} + 
+template <> inline void exit_if_not_equal(half val, half ref, const char 
*name) { + int16_t cmp_val = reinterpret_cast(val); + int16_t cmp_ref = reinterpret_cast(ref); + if (std::abs(cmp_val - cmp_ref) > 1) { + std::cout << "Unexpected result for " << name << ": " << (float)val + << " expected value: " << (float)ref << std::endl; + exit(1); + } +} + +template +void exit_if_not_equal_vec(vec val, vec ref, const char *name) { + if (!utils::cmp_vec(ref, val)) { + std::cout << "Unexpected result for " << name << ": " + << utils::stringify_vec(val) + << " expected value: " << utils::stringify_vec(ref) + << std::endl; + + exit(1); + } +} + +/* CPU returns max number of SG, GPU returns max SG size for + * CL_DEVICE_MAX_NUM_SUB_GROUPS device parameter. This function aligns the + * value. + * */ +inline size_t get_sg_size(const device &Device) { + size_t max_num_sg = Device.get_info(); + if (Device.get_info() == info::device_type::cpu) { + size_t max_wg_size = Device.get_info(); + return max_wg_size / max_num_sg; + } + if (Device.get_info() == info::device_type::gpu) { + return max_num_sg; + } + std::cout << "Unexpected device type" << std::endl; + exit(1); +} + +inline bool core_sg_supported(const device &Device) { + return (Device.has_extension("cl_khr_subgroups") || + Device.get_info().find(" 2.1") != + string_class::npos); +} diff --git a/SYCL/Basic/sub_group/info.cpp b/SYCL/Basic/sub_group/info.cpp new file mode 100644 index 0000000000..58fc06bd5e --- /dev/null +++ b/SYCL/Basic/sub_group/info.cpp @@ -0,0 +1,93 @@ +// REQUIRES: opencl + +// RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple %s -o %t.out +// RUN: %HOST_RUN_PLACEHOLDER %t.out +// RUN: %CPU_RUN_PLACEHOLDER %t.out +// RUN: %GPU_RUN_PLACEHOLDER %t.out +// RUN: %ACC_RUN_PLACEHOLDER %t.out + +//==------------- info.cpp - SYCL sub_group parameters test ----*- C++ -*---==// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "helper.hpp" +#include +class kernel_sg; +using namespace cl::sycl; + +int main() { + queue Queue; + device Device = Queue.get_device(); + + /* Basic sub-group functionality is supported as part of cl_khr_subgroups + * extension or as core OpenCL 2.1 feature. */ + if (!core_sg_supported(Device)) { + std::cout << "Skipping test\n"; + return 0; + } + /* Check info::device parameters. */ + Device.get_info(); + Device.get_info(); + /* sub_group_sizes can be queried only if cl_intel_required_subgroup_size + * extension is supported by device*/ + if (Device.has_extension("cl_intel_required_subgroup_size")) + Device.get_info(); + + try { + size_t max_sg_num = get_sg_size(Device); + size_t max_wg_size = Device.get_info(); + program Prog(Queue.get_context()); + /* TODO: replace with pure SYCL code when fixed problem with consumption + * kernels defined using program objects on GPU device + Prog.build_with_kernel_type(); + kernel Kernel = Prog.get_kernel(); + + Queue.submit([&](cl::sycl::handler &cgh) { + cgh.parallel_for( + nd_range<2>(range<2>(50, 40), range<2>(10, 20)), Kernel, + [=](nd_item<2> index) {}); + });*/ + Prog.build_with_source("kernel void " + "kernel_sg(global double* a, global double* b, " + "global double* c) {*a=*b+*c; }\n"); + kernel Kernel = Prog.get_kernel("kernel_sg"); + uint32_t Res = 0; + for (auto r : {range<3>(3, 4, 5), range<3>(1, 1, 1), range<3>(4, 2, 1), + range<3>(32, 3, 4), range<3>(7, 9, 11)}) { + Res = Kernel.get_sub_group_info< + info::kernel_sub_group::max_sub_group_size>(Device, r); + bool Expected = (Res == r.size() || Res == max_sg_num); + exit_if_not_equal(Expected, true, + "max_sub_group_size"); + } + + Res = Kernel.get_sub_group_info< + info::kernel_sub_group::compile_num_sub_groups>(Device); + + /* Number of sub-groups is not specified in kernel or IL*/ + 
exit_if_not_equal(Res, 0, 
"compile_num_sub_groups"); + + // According to specification, this kernel query requires `cl_khr_subgroups` + // or `cl_intel_subgroups` + if ((Device.has_extension("cl_khr_subgroups") || + Device.has_extension("cl_intel_subgroups")) && + Device.has_extension("cl_intel_required_subgroup_size")) { + Res = Kernel.get_sub_group_info< + info::kernel_sub_group::compile_sub_group_size>(Device); + + /* Required sub-group size is not specified in kernel or IL*/ + exit_if_not_equal(Res, 0, "compile_sub_group_size"); + } + + } catch (const exception &e) { + std::cout << "SYCL exception caught: " << e.what(); + return 1; + } + + std::cout << "Test passed.\n"; + return 0; +} diff --git a/SYCL/Basic/sub_group/load_store.cpp b/SYCL/Basic/sub_group/load_store.cpp new file mode 100644 index 0000000000..109ae20336 --- /dev/null +++ b/SYCL/Basic/sub_group/load_store.cpp @@ -0,0 +1,205 @@ +// UNSUPPORTED: cuda +// CUDA compilation and runtime do not yet support sub-groups. +// +// RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple %s -o %t.out +// RUN: %HOST_RUN_PLACEHOLDER %t.out +// RUN: %CPU_RUN_PLACEHOLDER %t.out +// RUN: %GPU_RUN_PLACEHOLDER %t.out +// RUN: %ACC_RUN_PLACEHOLDER %t.out +// +//==----------- load_store.cpp - SYCL sub_group load/store test ------------==// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "helper.hpp" +#include +template class sycl_subgr; + +using namespace cl::sycl; + +template void check(queue &Queue) { + const int G = 1024, L = 64; + try { + nd_range<1> NdRange(G, L); + buffer syclbuf(G); + buffer sgsizebuf(1); + { + auto acc = syclbuf.template get_access(); + for (int i = 0; i < G; i++) { + acc[i] = i; + acc[i] += 0.1; // Check that floating point types are not casted to int + } + } + Queue.submit([&](handler &cgh) { + auto acc = syclbuf.template get_access(cgh); + auto sgsizeacc = sgsizebuf.get_access(cgh); + accessor LocalMem( + {L}, cgh); + cgh.parallel_for>(NdRange, [=](nd_item<1> NdItem) { + intel::sub_group SG = NdItem.get_sub_group(); + if (SG.get_group_id().get(0) % N == 0) { + size_t SGOffset = + SG.get_group_id().get(0) * SG.get_max_local_range().get(0); + size_t WGSGoffset = NdItem.get_group(0) * L + SGOffset; + multi_ptr mp( + &acc[WGSGoffset]); + multi_ptr MPL( + &LocalMem[SGOffset]); + // Add all values in read block + vec v(utils::add_vec(SG.load(mp))); + SG.store(MPL, v); + vec t(utils::add_vec(SG.load(MPL))); + SG.store(mp, t); + } + if (NdItem.get_global_id(0) == 0) + sgsizeacc[0] = SG.get_max_local_range()[0]; + }); + }); + auto acc = syclbuf.template get_access(); + auto sgsizeacc = sgsizebuf.get_access(); + size_t sg_size = sgsizeacc[0]; + int WGid = -1, SGid = 0; + for (int j = 0; j < (G - (sg_size * N)); j++) { + if (j % L % sg_size == 0) { + SGid++; + } + if (j % L == 0) { + WGid++; + SGid = 0; + } + T ref = 0; + if (SGid % N) { + ref = acc[j - (SGid % N) * sg_size]; + } else { + for (int i = 0; i < N; i++) { + ref += (T)(j + i * sg_size) + 0.1; + } + ref *= N; + } + /* There is no defined out-of-range behavior for these functions. 
*/ + if ((SGid + N) * sg_size < L) { + std::string s("Vector<"); + s += std::string(typeid(ref).name()) + std::string(",") + + std::to_string(N) + std::string(">[") + std::to_string(j) + + std::string("]"); + exit_if_not_equal(acc[j], ref, s.c_str()); + } + } + } catch (exception e) { + std::cout << "SYCL exception caught: " << e.what(); + exit(1); + } +} +template void check(queue &Queue) { + const int G = 128, L = 64; + try { + nd_range<1> NdRange(G, L); + buffer syclbuf(G); + buffer sgsizebuf(1); + { + auto acc = syclbuf.template get_access(); + for (int i = 0; i < G; i++) { + acc[i] = i; + acc[i] += 0.1; // Check that floating point types are not casted to int + } + } + + Queue.submit([&](handler &cgh) { + auto acc = syclbuf.template get_access(cgh); + auto sgsizeacc = sgsizebuf.get_access(cgh); + accessor LocalMem( + {L}, cgh); + cgh.parallel_for>(NdRange, [=](nd_item<1> NdItem) { + intel::sub_group SG = NdItem.get_sub_group(); + if (NdItem.get_global_id(0) == 0) + sgsizeacc[0] = SG.get_max_local_range()[0]; + size_t SGOffset = + SG.get_group_id().get(0) * SG.get_max_local_range().get(0); + size_t WGSGoffset = NdItem.get_group(0) * L + SGOffset; + multi_ptr mp(&acc[WGSGoffset]); + multi_ptr MPL( + &LocalMem[SGOffset]); + T s = SG.load(mp) + (T)SG.get_local_id().get(0); + SG.store(MPL, s); + T t = SG.load(MPL) + (T)SG.get_local_id().get(0); + SG.store(mp, t); + }); + }); + auto acc = syclbuf.template get_access(); + auto sgsizeacc = sgsizebuf.get_access(); + size_t sg_size = sgsizeacc[0]; + int WGid = -1, SGid = 0; + for (int j = 0; j < G; j++) { + if (j % L % sg_size == 0) { + SGid++; + } + if (j % L == 0) { + WGid++; + SGid = 0; + } + std::string s("Scalar<"); + s += std::string(typeid(acc[j]).name()) + std::string(">[") + + std::to_string(j) + std::string("]"); + + exit_if_not_equal(acc[j], (T)(j + 2 * (j % L % sg_size)) + 0.1, + s.c_str()); + } + + } catch (exception e) { + std::cout << "SYCL exception caught: " << e.what(); + exit(1); + } +} + +int main() 
{ + queue Queue; + if (!Queue.get_device().has_extension("cl_intel_subgroups") && + !Queue.get_device().has_extension("cl_intel_subgroups_short")) { + std::cout << "Skipping test\n"; + return 0; + } + if (Queue.get_device().has_extension("cl_intel_subgroups")) { + typedef bool aligned_char __attribute__((aligned(16))); + check(Queue); + typedef int aligned_int __attribute__((aligned(16))); + check(Queue); + check(Queue); + check(Queue); + check(Queue); + check(Queue); + typedef unsigned int aligned_uint __attribute__((aligned(16))); + check(Queue); + check(Queue); + check(Queue); + check(Queue); + check(Queue); + typedef float aligned_float __attribute__((aligned(16))); + check(Queue); + check(Queue); + check(Queue); + check(Queue); + check(Queue); + } + if (Queue.get_device().has_extension("cl_intel_subgroups_short")) { + typedef short aligned_short __attribute__((aligned(16))); + check(Queue); + check(Queue); + check(Queue); + check(Queue); + check(Queue); + if (Queue.get_device().has_extension("cl_khr_fp16")) { + typedef half aligned_half __attribute__((aligned(16))); + check(Queue); + check(Queue); + check(Queue); + check(Queue); + check(Queue); + } + } + std::cout << "Test passed." << std::endl; + return 0; +} diff --git a/SYCL/Basic/sub_group/reduce.cpp b/SYCL/Basic/sub_group/reduce.cpp new file mode 100644 index 0000000000..03ac01362b --- /dev/null +++ b/SYCL/Basic/sub_group/reduce.cpp @@ -0,0 +1,125 @@ +// UNSUPPORTED: cuda +// CUDA compilation and runtime do not yet support sub-groups. +// +// RUN: %clangxx -fsycl -std=c++14 %s -o %t.out +// RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple -std=c++14 -D SG_GPU %s -o %t_gpu.out +// RUN: %HOST_RUN_PLACEHOLDER %t.out +// RUN: %CPU_RUN_PLACEHOLDER %t.out +// RUN: %GPU_RUN_PLACEHOLDER %t_gpu.out +// RUN: %ACC_RUN_PLACEHOLDER %t.out + +//==--------------- reduce.cpp - SYCL sub_group reduce test ----*- C++ -*---==// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. 
+// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "helper.hpp" +#include + +template +class sycl_subgr; + +using namespace cl::sycl; + +template +void check_op(queue &Queue, T init, BinaryOperation op, bool skip_init = false, + size_t G = 240, size_t L = 60) { + try { + nd_range<1> NdRange(G, L); + buffer buf(G); + Queue.submit([&](handler &cgh) { + auto acc = buf.template get_access(cgh); + cgh.parallel_for>( + NdRange, [=](nd_item<1> NdItem) { + intel::sub_group sg = NdItem.get_sub_group(); + if (skip_init) { + acc[NdItem.get_global_id(0)] = + reduce(sg, T(NdItem.get_global_id(0)), op); + } else { + acc[NdItem.get_global_id(0)] = + reduce(sg, T(NdItem.get_global_id(0)), init, op); + } + }); + }); + auto acc = buf.template get_access(); + size_t sg_size = get_sg_size(Queue.get_device()); + int WGid = -1, SGid = 0; + T result = init; + for (int j = 0; j < G; j++) { + if (j % L % sg_size == 0) { + SGid++; + result = init; + for (int i = j; (i % L && i % L % sg_size) || (i == j); i++) { + result = op(result, T(i)); + } + } + if (j % L == 0) { + WGid++; + SGid = 0; + } + std::string name = + std::string("reduce_") + typeid(BinaryOperation).name(); + exit_if_not_equal(acc[j], result, name.c_str()); + } + } catch (exception e) { + std::cout << "SYCL exception caught: " << e.what(); + exit(1); + } +} + +template +void check(queue &Queue, size_t G = 240, size_t L = 60) { + // limit data range for half to avoid rounding issues + if (std::is_same::value) { + G = 64; + L = 32; + } + + check_op(Queue, T(L), intel::plus(), false, G, L); + check_op(Queue, T(0), intel::plus(), true, G, L); + + check_op(Queue, T(0), intel::minimum(), false, G, L); + check_op(Queue, T(G), intel::minimum(), true, G, L); + + check_op(Queue, T(G), intel::maximum(), false, G, L); + check_op(Queue, T(0), intel::maximum(), true, G, L); 
+ +#if __cplusplus >= 201402L + check_op(Queue, T(L), intel::plus<>(), false, G, L); + check_op(Queue, T(0), intel::plus<>(), true, G, L); + + check_op(Queue, T(0), intel::minimum<>(), false, G, L); + check_op(Queue, T(G), intel::minimum<>(), true, G, L); + + check_op(Queue, T(G), intel::maximum<>(), false, G, L); + check_op(Queue, T(0), intel::maximum<>(), true, G, L); +#endif +} + +int main() { + queue Queue; + if (!core_sg_supported(Queue.get_device())) { + std::cout << "Skipping test\n"; + return 0; + } + + check(Queue); + check(Queue); + check(Queue); + check(Queue); + check(Queue); + // reduce half type is not supported in OCL CPU RT +#ifdef SG_GPU + if (Queue.get_device().has_extension("cl_khr_fp16")) { + check(Queue); + } +#endif + if (Queue.get_device().has_extension("cl_khr_fp64")) { + check(Queue); + } + std::cout << "Test passed." << std::endl; + return 0; +} diff --git a/SYCL/Basic/sub_group/scan.cpp b/SYCL/Basic/sub_group/scan.cpp new file mode 100644 index 0000000000..70a5115cd4 --- /dev/null +++ b/SYCL/Basic/sub_group/scan.cpp @@ -0,0 +1,160 @@ +// UNSUPPORTED: cuda +// CUDA compilation and runtime do not yet support sub-groups. +// +// RUN: %clangxx -fsycl -std=c++14 %s -o %t.out +// RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple -std=c++14 -D SG_GPU %s -o %t_gpu.out +// RUN: %HOST_RUN_PLACEHOLDER %t.out +// RUN: %CPU_RUN_PLACEHOLDER %t.out +// RUN: %GPU_RUN_PLACEHOLDER %t_gpu.out +// RUN: %ACC_RUN_PLACEHOLDER %t.out + +//==--------------- scan.cpp - SYCL sub_group scan test --------*- C++ -*---==// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "helper.hpp" +#include +#include + +template +class sycl_subgr; + +using namespace cl::sycl; + +template +void check_op(queue &Queue, T init, BinaryOperation op, bool skip_init = false, + size_t G = 120, size_t L = 60) { + try { + nd_range<1> NdRange(G, L); + buffer exbuf(G), inbuf(G); + Queue.submit([&](handler &cgh) { + auto exacc = exbuf.template get_access(cgh); + auto inacc = inbuf.template get_access(cgh); + cgh.parallel_for>( + NdRange, [=](nd_item<1> NdItem) { + intel::sub_group sg = NdItem.get_sub_group(); + if (skip_init) { + exacc[NdItem.get_global_id(0)] = + exclusive_scan(sg, T(NdItem.get_global_id(0)), op); + inacc[NdItem.get_global_id(0)] = + inclusive_scan(sg, T(NdItem.get_global_id(0)), op); + } else { + exacc[NdItem.get_global_id(0)] = + exclusive_scan(sg, T(NdItem.get_global_id(0)), init, op); + inacc[NdItem.get_global_id(0)] = + inclusive_scan(sg, T(NdItem.get_global_id(0)), op, init); + } + }); + }); + auto exacc = exbuf.template get_access(); + auto inacc = inbuf.template get_access(); + size_t sg_size = get_sg_size(Queue.get_device()); + int WGid = -1, SGid = 0; + T result = init; + for (int j = 0; j < G; j++) { + if (j % L % sg_size == 0) { + SGid++; + result = init; + } + if (j % L == 0) { + WGid++; + SGid = 0; + } + std::string exname = + std::string("scan_exc_") + typeid(BinaryOperation).name(); + std::string inname = + std::string("scan_inc_") + typeid(BinaryOperation).name(); + exit_if_not_equal(exacc[j], result, exname.c_str()); + result = op(result, T(j)); + exit_if_not_equal(inacc[j], result, inname.c_str()); + } + } catch (exception e) { + std::cout << "SYCL exception caught: " << e.what(); + exit(1); + } +} + +template +void check(queue &Queue, size_t G = 120, size_t L = 60) { + // limit data range for half to avoid rounding issues + if (std::is_same::value) { + G = 64; + L = 
32; + } + + check_op(Queue, T(L), intel::plus(), false, G, L); + check_op(Queue, T(0), intel::plus(), true, G, L); + + check_op(Queue, T(0), intel::minimum(), false, G, L); + if (std::is_floating_point::value || + std::is_same::value) { + check_op(Queue, std::numeric_limits::infinity(), intel::minimum(), + true, G, L); + } else { + check_op(Queue, std::numeric_limits::max(), intel::minimum(), true, + G, L); + } + + check_op(Queue, T(G), intel::maximum(), false, G, L); + if (std::is_floating_point::value || + std::is_same::value) { + check_op(Queue, -std::numeric_limits::infinity(), intel::maximum(), + true, G, L); + } else { + check_op(Queue, std::numeric_limits::min(), intel::maximum(), true, + G, L); + } + +#if __cplusplus >= 201402L + check_op(Queue, T(L), intel::plus<>(), false, G, L); + check_op(Queue, T(0), intel::plus<>(), true, G, L); + + check_op(Queue, T(0), intel::minimum<>(), false, G, L); + if (std::is_floating_point::value || + std::is_same::value) { + check_op(Queue, std::numeric_limits::infinity(), intel::minimum<>(), + true, G, L); + } else { + check_op(Queue, std::numeric_limits::max(), intel::minimum<>(), true, + G, L); + } + + check_op(Queue, T(G), intel::maximum<>(), false, G, L); + if (std::is_floating_point::value || + std::is_same::value) { + check_op(Queue, -std::numeric_limits::infinity(), intel::maximum<>(), + true, G, L); + } else { + check_op(Queue, std::numeric_limits::min(), intel::maximum<>(), true, + G, L); + } +#endif +} + +int main() { + queue Queue; + if (!core_sg_supported(Queue.get_device())) { + std::cout << "Skipping test\n"; + return 0; + } + check(Queue); + check(Queue); + check(Queue); + check(Queue); + check(Queue); + // scan half type is not supported in OCL CPU RT +#ifdef SG_GPU + if (Queue.get_device().has_extension("cl_khr_fp16")) { + check(Queue); + } +#endif + if (Queue.get_device().has_extension("cl_khr_fp64")) { + check(Queue); + } + std::cout << "Test passed." 
<< std::endl; + return 0; +} diff --git a/SYCL/Basic/sub_group/sg.cl b/SYCL/Basic/sub_group/sg.cl new file mode 100644 index 0000000000..0dcee41298 --- /dev/null +++ b/SYCL/Basic/sub_group/sg.cl @@ -0,0 +1,25 @@ +//==-------------- sg.cl - OpenCL reference kernel file --------*- C++ -*---==// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +// ===--------------------------------------------------------------------=== // +#pragma OPENCL EXTENSION cl_khr_subgroups : enable +struct Data { + uint local_id; + uint local_range; + uint max_local_range; + uint group_id; + uint group_range; + uint uniform_group_range; +}; +__kernel void ocl_subgr(__global struct Data *a) { + uint id = get_global_id(0); + a[id].local_id = get_sub_group_local_id(); + a[id].local_range = get_sub_group_size(); + a[id].max_local_range = get_max_sub_group_size(); + a[id].group_id = get_sub_group_id(); + a[id].group_range = get_num_sub_groups(); + a[id].uniform_group_range = get_num_sub_groups(); +} diff --git a/SYCL/Basic/sub_group/shuffle.cpp b/SYCL/Basic/sub_group/shuffle.cpp new file mode 100644 index 0000000000..bd7e11c89e --- /dev/null +++ b/SYCL/Basic/sub_group/shuffle.cpp @@ -0,0 +1,265 @@ +// UNSUPPORTED: cuda +// CUDA compilation and runtime do not yet support sub-groups. +// +// RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple %s -o %t.out +// RUN: %HOST_RUN_PLACEHOLDER %t.out +// RUN: %CPU_RUN_PLACEHOLDER %t.out +// RUN: %GPU_RUN_PLACEHOLDER %t.out +// RUN: %ACC_RUN_PLACEHOLDER %t.out +// +//==------------ shuffle.cpp - SYCL sub_group shuffle test -----*- C++ -*---==// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "helper.hpp" +#include +template class sycl_subgr; + +using namespace cl::sycl; + +// TODO remove this workaround when clang will support correct generation of +// half typename in integration header +struct wa_half; + +template +void check(queue &Queue, size_t G = 240, size_t L = 60) { + try { + nd_range<1> NdRange(G, L); + buffer> buf2(G); + buffer> buf2_up(G); + buffer> buf2_down(G); + buffer> buf(G); + buffer> buf_up(G); + buffer> buf_down(G); + buffer> buf_xor(G); + buffer sgsizebuf(1); + Queue.submit([&](handler &cgh) { + auto acc2 = buf2.template get_access(cgh); + auto acc2_up = buf2_up.template get_access(cgh); + auto acc2_down = + buf2_down.template get_access(cgh); + + auto acc = buf.template get_access(cgh); + auto acc_up = buf_up.template get_access(cgh); + auto acc_down = + buf_down.template get_access(cgh); + auto acc_xor = buf_xor.template get_access(cgh); + auto sgsizeacc = sgsizebuf.get_access(cgh); + + cgh.parallel_for>(NdRange, [=](nd_item<1> NdItem) { + intel::sub_group SG = NdItem.get_sub_group(); + uint32_t wggid = NdItem.get_global_id(0); + uint32_t sgid = SG.get_group_id().get(0); + vec vwggid(wggid), vsgid(sgid); + if (wggid == 0) + sgsizeacc[0] = SG.get_max_local_range()[0]; + /* 1 for odd subgroups and 2 for even*/ + acc2[NdItem.get_global_id()] = + SG.shuffle(vec(1), vec(2), + (sgid % 2) ? 
1 : SG.get_max_local_range()[0]); + /* GID-SGID */ + acc2_up[NdItem.get_global_id()] = SG.shuffle_up(vwggid, vwggid, sgid); + /* GID-SGID or SGLID if GID+SGID > SGsize*/ + acc2_down[NdItem.get_global_id()] = + SG.shuffle_down(vwggid, vec(SG.get_local_id().get(0)), sgid); + + /*GID of middle element in every subgroup*/ + acc[NdItem.get_global_id()] = + SG.shuffle(vwggid, SG.get_max_local_range()[0] / 2); + /* Save GID-SGID */ + acc_up[NdItem.get_global_id()] = SG.shuffle_up(vwggid, sgid); + /* Save GID+SGID */ + acc_down[NdItem.get_global_id()] = SG.shuffle_down(vwggid, sgid); + /* Save GID XOR SGID */ + acc_xor[NdItem.get_global_id()] = SG.shuffle_xor(vwggid, sgid); + }); + }); + auto acc = buf.template get_access(); + auto acc_up = buf_up.template get_access(); + auto acc_down = buf_down.template get_access(); + auto acc2 = buf2.template get_access(); + auto acc2_up = buf2_up.template get_access(); + auto acc2_down = buf2_down.template get_access(); + auto acc_xor = buf_xor.template get_access(); + auto sgsizeacc = sgsizebuf.get_access(); + + size_t sg_size = sgsizeacc[0]; + int SGid = 0; + for (int j = 0; j < G; j++) { + if (j % L % sg_size == 0) { + SGid++; + } + if (j % L == 0) { + SGid = 0; + } + /*GID of middle element in every subgroup*/ + exit_if_not_equal_vec( + acc[j], vec(j / L * L + SGid * sg_size + sg_size / 2), + "shuffle"); + /* 1 for odd subgroups and 2 for even*/ + exit_if_not_equal_vec(acc2[j], vec((SGid % 2) ? 
1 : 2), + "shuffle2"); + /* Value GID+SGID for all element except last SGID in SG*/ + if (j % L % sg_size + SGid < sg_size && j % L + SGid < L) { + exit_if_not_equal_vec(acc_down[j], vec(j + SGid), "shuffle_down"); + exit_if_not_equal_vec(acc2_down[j], vec(j + SGid), + "shuffle2_down"); + } else { /* SGLID for GID+SGid */ + if (j % L + SGid < L) /* Do not go out LG*/ + exit_if_not_equal_vec(acc2_down[j], + vec((j + SGid) % L % sg_size), + "shuffle2_down"); + } + /* Value GID-SGID for all element except first SGID in SG*/ + if (j % L % sg_size >= SGid) { + exit_if_not_equal_vec(acc_up[j], vec(j - SGid), "shuffle_up"); + exit_if_not_equal_vec(acc2_up[j], vec(j - SGid), "shuffle2_up"); + } else { /* SGLID for GID-SGid */ + if (j % L - SGid + sg_size < L) /* Do not go out LG*/ + exit_if_not_equal_vec(acc2_up[j], vec(j - SGid + sg_size), + "shuffle2_up"); + } + /* GID XOR SGID */ + exit_if_not_equal_vec(acc_xor[j], vec(j ^ SGid), "shuffle_xor"); + } + } catch (exception e) { + std::cout << "SYCL exception caught: " << e.what(); + exit(1); + } +} + +template void check(queue &Queue, size_t G = 240, size_t L = 60) { + try { + nd_range<1> NdRange(G, L); + buffer buf2(G); + buffer buf2_up(G); + buffer buf2_down(G); + buffer buf(G); + buffer buf_up(G); + buffer buf_down(G); + buffer buf_xor(G); + buffer sgsizebuf(1); + Queue.submit([&](handler &cgh) { + auto acc2 = buf2.template get_access(cgh); + auto acc2_up = buf2_up.template get_access(cgh); + auto acc2_down = + buf2_down.template get_access(cgh); + + auto acc = buf.template get_access(cgh); + auto acc_up = buf_up.template get_access(cgh); + auto acc_down = + buf_down.template get_access(cgh); + auto acc_xor = buf_xor.template get_access(cgh); + auto sgsizeacc = sgsizebuf.get_access(cgh); + + cgh.parallel_for>(NdRange, [=](nd_item<1> NdItem) { + intel::sub_group SG = NdItem.get_sub_group(); + uint32_t wggid = NdItem.get_global_id(0); + uint32_t sgid = SG.get_group_id().get(0); + if (wggid == 0) + sgsizeacc[0] = 
SG.get_max_local_range()[0]; + /* 1 for odd subgroups and 2 for even*/ + acc2[NdItem.get_global_id()] = + SG.shuffle(1, 2, (sgid % 2) ? 1 : SG.get_max_local_range()[0]); + /* GID-SGID */ + acc2_up[NdItem.get_global_id()] = SG.shuffle_up(wggid, wggid, sgid); + /* GID-SGID or SGLID if GID+SGID > SGsize*/ + acc2_down[NdItem.get_global_id()] = + SG.shuffle_down(wggid, SG.get_local_id().get(0), sgid); + + /*GID of middle element in every subgroup*/ + acc[NdItem.get_global_id()] = + SG.shuffle(wggid, SG.get_max_local_range()[0] / 2); + /* Save GID-SGID */ + acc_up[NdItem.get_global_id()] = SG.shuffle_up(wggid, sgid); + /* Save GID+SGID */ + acc_down[NdItem.get_global_id()] = SG.shuffle_down(wggid, sgid); + /* Save GID XOR SGID */ + acc_xor[NdItem.get_global_id()] = SG.shuffle_xor(wggid, sgid); + }); + }); + auto acc = buf.template get_access(); + auto acc_up = buf_up.template get_access(); + auto acc_down = buf_down.template get_access(); + auto acc2 = buf2.template get_access(); + auto acc2_up = buf2_up.template get_access(); + auto acc2_down = buf2_down.template get_access(); + auto acc_xor = buf_xor.template get_access(); + auto sgsizeacc = sgsizebuf.get_access(); + + size_t sg_size = sgsizeacc[0]; + int SGid = 0; + for (int j = 0; j < G; j++) { + if (j % L % sg_size == 0) { + SGid++; + } + if (j % L == 0) { + SGid = 0; + } + /*GID of middle element in every subgroup*/ + exit_if_not_equal(acc[j], j / L * L + SGid * sg_size + sg_size / 2, + "shuffle"); + /* 1 for odd subgroups and 2 for even*/ + exit_if_not_equal(acc2[j], (SGid % 2) ? 
1 : 2, "shuffle2"); + /* Value GID+SGID for all element except last SGID in SG*/ + if (j % L % sg_size + SGid < sg_size && j % L + SGid < L) { + exit_if_not_equal(acc_down[j], j + SGid, "shuffle_down"); + exit_if_not_equal(acc2_down[j], j + SGid, "shuffle2_down"); + } else { /* SGLID for GID+SGid */ + if (j % L + SGid < L) /* Do not go out LG*/ + exit_if_not_equal(acc2_down[j], (j + SGid) % L % sg_size, + "shuffle2_down"); + } + /* Value GID-SGID for all element except first SGID in SG*/ + if (j % L % sg_size >= SGid) { + exit_if_not_equal(acc_up[j], j - SGid, "shuffle_up"); + exit_if_not_equal(acc2_up[j], j - SGid, "shuffle2_up"); + } else { /* SGLID for GID-SGid */ + if (j % L - SGid + sg_size < L) /* Do not go out LG*/ + exit_if_not_equal(acc2_up[j], j - SGid + sg_size, "shuffle2_up"); + } + /* GID XOR SGID */ + exit_if_not_equal(acc_xor[j], j ^ SGid, "shuffle_xor"); + } + } catch (exception e) { + std::cout << "SYCL exception caught: " << e.what(); + exit(1); + } +} +int main() { + queue Queue; + if (!Queue.get_device().has_extension("cl_intel_subgroups")) { + std::cout << "Skipping test\n"; + return 0; + } + + if (Queue.get_device().has_extension("cl_intel_subgroups_short")) { + check(Queue); + check(Queue); + } + check(Queue); + check(Queue); + check(Queue); + check(Queue); + check(Queue); + check(Queue); + check(Queue); + check(Queue); + check(Queue); + check(Queue); + check(Queue); + check(Queue); + if (Queue.get_device().has_extension("cl_khr_fp16")) { + check(Queue); + } + check(Queue); + if (Queue.get_device().has_extension("cl_khr_fp64")) { + check(Queue); + } + std::cout << "Test passed." << std::endl; + return 0; +} diff --git a/SYCL/Basic/sub_group/vote.cpp b/SYCL/Basic/sub_group/vote.cpp new file mode 100644 index 0000000000..df6c5595fb --- /dev/null +++ b/SYCL/Basic/sub_group/vote.cpp @@ -0,0 +1,89 @@ +// UNSUPPORTED: cuda +// CUDA compilation and runtime do not yet support sub-groups. 
+// +// RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple %s -o %t.out +// RUN: %HOST_RUN_PLACEHOLDER %t.out +// RUN: %CPU_RUN_PLACEHOLDER %t.out +// RUN: %GPU_RUN_PLACEHOLDER %t.out +// RUN: %ACC_RUN_PLACEHOLDER %t.out + +//==--------------- vote.cpp - SYCL sub_group vote test --*- C++ -*---------==// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "helper.hpp" +#include +using namespace cl::sycl; + +void check(queue Queue, const int G, const int L, const int D, const int R) { + try { + int max_sg = + Queue.get_device().get_info(); + int num_sg = (L) / max_sg + ((L) % max_sg ? 1 : 0); + range<1> GRange(G), LRange(L); + nd_range<1> NdRange(GRange, LRange); + buffer sganybuf(G); + buffer sgallbuf(G); + + // Initialise buffer with zeros + Queue.submit([&](handler &cgh) { + auto sganyacc = sganybuf.get_access(cgh); + auto sgallacc = sgallbuf.get_access(cgh); + cgh.parallel_for(range<1>{(unsigned)G}, [=](id<1> index) { + sganyacc[index] = 0; + sgallacc[index] = 0; + }); + }); + + Queue.submit([&](handler &cgh) { + auto sganyacc = sganybuf.get_access(cgh); + auto sgallacc = sgallbuf.get_access(cgh); + cgh.parallel_for(NdRange, [=](nd_item<1> NdItem) { + sganyacc[NdItem.get_global_id()] = 0; + sgallacc[NdItem.get_global_id()] = 0; + }); + }); + + Queue.submit([&](handler &cgh) { + auto sganyacc = sganybuf.get_access(cgh); + auto sgallacc = sgallbuf.get_access(cgh); + cgh.parallel_for(NdRange, [=](nd_item<1> NdItem) { + intel::sub_group SG = NdItem.get_sub_group(); + /* Set to 1 if any local ID in subgroup divided by D has remainder R */ + if (any_of(SG, SG.get_local_id().get(0) % D == R)) { + sganyacc[NdItem.get_global_id()] = 1; + } + /* Set to 1 if remainder of division of subgroup local ID by D is less + * than R 
for all work items in subgroup */ + if (all_of(SG, SG.get_local_id().get(0) % D < R)) { + sgallacc[NdItem.get_global_id()] = 1; + } + }); + }); + auto sganyacc = sganybuf.get_access(); + auto sgallacc = sgallbuf.get_access(); + for (int j = 0; j < G; j++) { + exit_if_not_equal(sganyacc[j], (int)(D > R), "any"); + exit_if_not_equal(sgallacc[j], (int)(D <= R), "all"); + } + + } catch (exception e) { + std::cout << "SYCL exception caught: " << e.what(); + exit(1); + } +} +int main() { + queue Queue; + if (!core_sg_supported(Queue.get_device())) { + std::cout << "Skipping test\n"; + return 0; + } + check(Queue, 240, 80, 9, 8); + check(Queue, 24, 12, 9, 10); + check(Queue, 1024, 256, 9, 8); + std::cout << "Test passed." << std::endl; +} diff --git a/SYCL/Basic/usm/allocator_vector.cpp b/SYCL/Basic/usm/allocator_vector.cpp new file mode 100644 index 0000000000..e111ce873e --- /dev/null +++ b/SYCL/Basic/usm/allocator_vector.cpp @@ -0,0 +1,130 @@ +// XFAIL: cuda || level0 +// piextUSM*Alloc functions for CUDA are not behaving as described in +// https://github.com/intel/llvm/blob/sycl/sycl/doc/extensions/USM/USM.adoc +// https://github.com/intel/llvm/blob/sycl/sycl/doc/extensions/USM/cl_intel_unified_shared_memory.asciidoc +// +// RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple %s -o %t1.out +// RUN: %HOST_RUN_PLACEHOLDER %t1.out +// RUN: %CPU_RUN_PLACEHOLDER %t1.out +// RUN: %GPU_RUN_PLACEHOLDER %t1.out + +//==---- allocator_vector.cpp - Allocator Container test -------------------==// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include + +#include + +using namespace cl::sycl; + +const int N = 8; + +class foo; +int main() { + queue q; + auto dev = q.get_device(); + auto ctxt = q.get_context(); + + if (dev.get_info()) { + usm_allocator alloc(ctxt, dev); + + std::vector vec(alloc); + vec.resize(N); + + for (int i = 0; i < N; i++) { + vec[i] = i; + } + + int *res = &vec[0]; + int *vals = &vec[0]; + + auto e1 = q.submit([=](handler &h) { + h.single_task([=]() { + for (int i = 1; i < N; i++) { + res[0] += vals[i]; + } + }); + }); + + e1.wait(); + + int answer = (N * (N - 1)) / 2; + + if (vec[0] != answer) + return -1; + } + + if (dev.get_info()) { + usm_allocator alloc(ctxt, dev); + + std::vector vec(alloc); + vec.resize(N); + + for (int i = 0; i < N; i++) { + vec[i] = i; + } + + int *res = &vec[0]; + int *vals = &vec[0]; + + auto e1 = q.submit([=](handler &h) { + h.single_task([=]() { + for (int i = 1; i < N; i++) { + res[0] += vals[i]; + } + }); + }); + + e1.wait(); + + int answer = (N * (N - 1)) / 2; + + if (vec[0] != answer) + return -1; + } + + if (dev.get_info()) { + usm_allocator alloc(ctxt, dev); + + std::vector vec(alloc); + vec.resize(N); + + int *res = &vec[0]; + int *vals = &vec[0]; + + auto e0 = q.submit([=](handler &h) { + h.single_task([=]() { + res[0] = 0; + for (int i = 0; i < N; i++) { + vals[i] = i; + } + }); + }); + + auto e1 = q.submit([=](handler &h) { + h.depends_on(e0); + h.single_task([=]() { + for (int i = 1; i < N; i++) { + res[0] += vals[i]; + } + }); + }); + + e1.wait(); + + int answer = (N * (N - 1)) / 2; + int result; + q.memcpy(&result, res, sizeof(int)); + q.wait(); + + if (result != answer) + return -1; + } + + return 0; +} diff --git a/SYCL/Basic/usm/allocator_vector_fail.cpp b/SYCL/Basic/usm/allocator_vector_fail.cpp new file mode 100644 index 0000000000..5a310c97ff --- /dev/null +++ 
b/SYCL/Basic/usm/allocator_vector_fail.cpp @@ -0,0 +1,48 @@ +// XFAIL: cuda +// piextUSM*Alloc functions for CUDA are not behaving as described in +// https://github.com/intel/llvm/blob/sycl/sycl/doc/extensions/USM/USM.adoc +// https://github.com/intel/llvm/blob/sycl/sycl/doc/extensions/USM/cl_intel_unified_shared_memory.asciidoc +// +// RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple %s -o %t1.out +// RUN: %HOST_RUN_PLACEHOLDER %t1.out +// RUN: %CPU_RUN_PLACEHOLDER %t1.out +// RUN: %GPU_RUN_PLACEHOLDER %t1.out + +//==-- allocator_vector_fail.cpp - Device Memory Allocator fail test -------==// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include + +#include + +using namespace cl::sycl; + +const int N = 8; + +class foo; +int main() { + queue q; + auto dev = q.get_device(); + auto ctxt = q.get_context(); + + if (dev.get_info()) { + try { + usm_allocator alloc(ctxt, dev); + std::vector vec(alloc); + + // This statement should throw an exception since + // device pointers may not be accessed on the host. 
+ vec.assign(N, 42); + } catch (feature_not_supported) { + return 0; + } + + return -1; + } + return 0; +} diff --git a/SYCL/Basic/usm/allocatorll.cpp b/SYCL/Basic/usm/allocatorll.cpp new file mode 100644 index 0000000000..f40a1bb84f --- /dev/null +++ b/SYCL/Basic/usm/allocatorll.cpp @@ -0,0 +1,88 @@ +// XFAIL: cuda +// piextUSM*Alloc functions for CUDA are not behaving as described in +// https://github.com/intel/llvm/blob/sycl/sycl/doc/extensions/USM/USM.adoc +// https://github.com/intel/llvm/blob/sycl/sycl/doc/extensions/USM/cl_intel_unified_shared_memory.asciidoc +// +// RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple %s -o %t1.out +// RUN: %HOST_RUN_PLACEHOLDER %t1.out +// RUN: %CPU_RUN_PLACEHOLDER %t1.out +// RUN: %GPU_RUN_PLACEHOLDER %t1.out + +//==---- allocatorll.cpp - Device Memory Linked List Allocator test --------==// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include + +using namespace cl::sycl; + +int numNodes = 4; + +struct Node { + Node() : pNext(nullptr), Num(0xDEADBEEF) {} + + Node *pNext; + uint32_t Num; +}; + +class foo; +int main() { + queue q; + auto dev = q.get_device(); + auto ctxt = q.get_context(); + + if (!dev.get_info()) + return 0; + + usm_allocator alloc(ctxt, dev); + Node h_cur; + + Node *d_head = alloc.allocate(1); + Node *d_cur = d_head; + + for (int i = 0; i < numNodes; i++) { + h_cur.Num = i * 2; + + if (i != (numNodes - 1)) { + h_cur.pNext = alloc.allocate(1); + } else { + h_cur.pNext = nullptr; + } + + event e0 = q.memcpy(d_cur, &h_cur, sizeof(Node)); + e0.wait(); + + d_cur = h_cur.pNext; + } + + auto e1 = q.submit([=](handler &cgh) { + cgh.single_task([=]() { + Node *pHead = d_head; + while (pHead) { + pHead->Num = pHead->Num * 2 + 1; + pHead = pHead->pNext; + } + }); + }); + 
+ e1.wait(); + + d_cur = d_head; + for (int i = 0; i < numNodes; i++) { + event c = q.memcpy(&h_cur, d_cur, sizeof(Node)); + c.wait(); + alloc.deallocate(d_cur, 1); + + const int want = i * 4 + 1; + if (h_cur.Num != want) { + return -2; + } + d_cur = h_cur.pNext; + } + + return 0; +} diff --git a/SYCL/Basic/usm/badmalloc.cpp b/SYCL/Basic/usm/badmalloc.cpp new file mode 100644 index 0000000000..2c14c41676 --- /dev/null +++ b/SYCL/Basic/usm/badmalloc.cpp @@ -0,0 +1,78 @@ +// UNSUPPORTED: windows +// XFAIL: cuda +// piextUSM*Alloc functions for CUDA are not behaving as described in +// https://github.com/intel/llvm/blob/sycl/sycl/doc/extensions/USM/USM.adoc +// https://github.com/intel/llvm/blob/sycl/sycl/doc/extensions/USM/cl_intel_unified_shared_memory.asciidoc +// +// RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple %s -o %t1.out +// RUN: %HOST_RUN_PLACEHOLDER %t1.out +// RUN: %CPU_RUN_PLACEHOLDER %t1.out +// RUN: %GPU_RUN_PLACEHOLDER %t1.out + +//==----------------- badmalloc.cpp - Bad Mallocs test ---------------------==// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +// This test verifies that things fail in the proper way when they should. 
+ +#include +#include + +using namespace cl::sycl; + +int main(int argc, char *argv[]) { + queue q; + + // Good size, bad type + auto p = malloc(8, q, usm::alloc::unknown); + if (p != nullptr) + return 1; + + // Bad size, host + p = malloc(-1, q, usm::alloc::host); + std::cout << "p = " << p << std::endl; + if (p != nullptr) + return 2; + p = malloc(-1, q, usm::alloc::device); + std::cout << "p = " << p << std::endl; + if (p != nullptr) + return 3; + p = malloc(-1, q, usm::alloc::shared); + std::cout << "p = " << p << std::endl; + if (p != nullptr) + return 4; + p = malloc(-1, q, usm::alloc::unknown); + std::cout << "p = " << p << std::endl; + if (p != nullptr) + return 5; + + // Bad size, auto aligned + p = aligned_alloc(0, -1, q, usm::alloc::host); + std::cout << "p = " << p << std::endl; + if (p != nullptr) + return 6; + p = aligned_alloc(0, -1, q, usm::alloc::device); + std::cout << "p = " << p << std::endl; + if (p != nullptr) + return 7; + p = aligned_alloc(0, -1, q, usm::alloc::shared); + std::cout << "p = " << p << std::endl; + if (p != nullptr) + return 8; + p = aligned_alloc(0, -1, q, usm::alloc::unknown); + std::cout << "p = " << p << std::endl; + if (p != nullptr) + return 9; + + // Allocs of 0 undefined, but bad type + p = aligned_alloc(4, 0, q, usm::alloc::unknown); + std::cout << "p = " << p << std::endl; + if (p != nullptr) + return 10; + + return 0; +} diff --git a/SYCL/Basic/usm/depends_on.cpp b/SYCL/Basic/usm/depends_on.cpp new file mode 100644 index 0000000000..0e8602b838 --- /dev/null +++ b/SYCL/Basic/usm/depends_on.cpp @@ -0,0 +1,86 @@ +// XFAIL: cuda +// piextUSM*Alloc functions for CUDA are not behaving as described in +// https://github.com/intel/llvm/blob/sycl/sycl/doc/extensions/USM/USM.adoc +// https://github.com/intel/llvm/blob/sycl/sycl/doc/extensions/USM/cl_intel_unified_shared_memory.asciidoc +// +// RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple %s -o %t1.out +// RUN: %HOST_RUN_PLACEHOLDER %t1.out +// RUN: %CPU_RUN_PLACEHOLDER 
%t1.out +// RUN: %GPU_RUN_PLACEHOLDER %t1.out + +//==----------------- depends_on.cpp - depends_on test ---------------------==// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include + +using namespace cl::sycl; + +class foo; +int main() { + const int N = 4; + const int MAGIC_NUM = 42; + + queue q; + auto dev = q.get_device(); + auto ctxt = q.get_context(); + + if (!(dev.get_info() && + dev.get_info() && + dev.get_info())) + return 0; + + int *darray = (int *)malloc_device(N * sizeof(int), dev, ctxt); + if (darray == nullptr) { + return -1; + } + int *sarray = (int *)malloc_shared(N * sizeof(int), dev, ctxt); + + if (sarray == nullptr) { + return -1; + } + + int *harray = (int *)malloc_host(N * sizeof(int), ctxt); + if (harray == nullptr) { + return -1; + } + + event e; + auto eInit = q.submit([&](handler &cgh) { + cgh.depends_on(e); + cgh.single_task([=]() { + for (int i = 0; i < N; i++) { + sarray[i] = MAGIC_NUM - 1; + harray[i] = 1; + } + }); + }); + + auto eMemset = q.memset(darray, 0, N * sizeof(int)); + + auto eKernel = q.submit([=](handler &cgh) { + cgh.depends_on({eInit, eMemset}); + cgh.single_task([=]() { + for (int i = 0; i < N; i++) { + sarray[i] += darray[i] + harray[i]; + } + }); + }); + + eKernel.wait(); + + for (int i = 0; i < N; i++) { + if (sarray[i] != MAGIC_NUM) { + return -2; + } + } + free(darray, ctxt); + free(sarray, ctxt); + free(harray, ctxt); + + return 0; +} diff --git a/SYCL/Basic/usm/dmemll.cpp b/SYCL/Basic/usm/dmemll.cpp new file mode 100644 index 0000000000..bbbbae1213 --- /dev/null +++ b/SYCL/Basic/usm/dmemll.cpp @@ -0,0 +1,93 @@ +// XFAIL: cuda +// piextUSM*Alloc functions for CUDA are not behaving as described in +// 
https://github.com/intel/llvm/blob/sycl/sycl/doc/extensions/USM/USM.adoc +// https://github.com/intel/llvm/blob/sycl/sycl/doc/extensions/USM/cl_intel_unified_shared_memory.asciidoc +// +// RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple %s -o %t1.out +// RUN: %HOST_RUN_PLACEHOLDER %t1.out +// RUN: %CPU_RUN_PLACEHOLDER %t1.out +// RUN: %GPU_RUN_PLACEHOLDER %t1.out + +//==------------------- dmemll.cpp - Device Memory Linked List test --------==// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include + +using namespace cl::sycl; + +int numNodes = 4; + +struct Node { + Node() : pNext(nullptr), Num(0xDEADBEEF) {} + + Node *pNext; + uint32_t Num; +}; + +class foo; +int main() { + queue q; + auto dev = q.get_device(); + auto ctxt = q.get_context(); + + if (!dev.get_info()) + return 0; + + Node h_cur; + + Node *d_head = (Node *)malloc_device(sizeof(Node), dev, ctxt); + if (d_head == nullptr) { + return -1; + } + Node *d_cur = d_head; + + for (int i = 0; i < numNodes; i++) { + h_cur.Num = i * 2; + + if (i != (numNodes - 1)) { + h_cur.pNext = (Node *)malloc_device(sizeof(Node), dev, ctxt); + if (h_cur.pNext == nullptr) { + return -1; + } + } else { + h_cur.pNext = nullptr; + } + + event e0 = q.memcpy(d_cur, &h_cur, sizeof(Node)); + e0.wait(); + + d_cur = h_cur.pNext; + } + + auto e1 = q.submit([=](handler &cgh) { + cgh.single_task([=]() { + Node *pHead = d_head; + while (pHead) { + pHead->Num = pHead->Num * 2 + 1; + pHead = pHead->pNext; + } + }); + }); + + e1.wait(); + + d_cur = d_head; + for (int i = 0; i < numNodes; i++) { + event c = q.memcpy(&h_cur, d_cur, sizeof(Node)); + c.wait(); + free(d_cur, ctxt); + + const int want = i * 4 + 1; + if (h_cur.Num != want) { + return -2; + } + d_cur = h_cur.pNext; + } + + return 0; +} 
diff --git a/SYCL/Basic/usm/dmemllaligned.cpp b/SYCL/Basic/usm/dmemllaligned.cpp new file mode 100644 index 0000000000..6daeb8adca --- /dev/null +++ b/SYCL/Basic/usm/dmemllaligned.cpp @@ -0,0 +1,90 @@ +// RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple %s -o %t1.out +// RUN: %HOST_RUN_PLACEHOLDER %t1.out +// RUN: %CPU_RUN_PLACEHOLDER %t1.out +// RUN: %GPU_RUN_PLACEHOLDER %t1.out + +//==---- dmemllaligned.cpp - Aligned Device Memory Linked List test --------==// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include + +using namespace cl::sycl; + +int numNodes = 4; + +struct Node { + Node() : pNext(nullptr), Num(0xDEADBEEF) {} + + Node *pNext; + uint32_t Num; +}; + +class foo; +int main() { + queue q; + auto dev = q.get_device(); + auto ctxt = q.get_context(); + + if (!dev.get_info()) + return 0; + + Node h_cur; + + Node *d_head = + (Node *)aligned_alloc_device(alignof(Node), sizeof(Node), dev, ctxt); + if (d_head == nullptr) { + return -1; + } + Node *d_cur = d_head; + + for (int i = 0; i < numNodes; i++) { + h_cur.Num = i * 2; + + if (i != (numNodes - 1)) { + h_cur.pNext = + (Node *)aligned_alloc_device(alignof(Node), sizeof(Node), dev, ctxt); + if (h_cur.pNext == nullptr) { + return -1; + } + } else { + h_cur.pNext = nullptr; + } + + event e0 = q.memcpy(d_cur, &h_cur, sizeof(Node)); + e0.wait(); + + d_cur = h_cur.pNext; + } + + auto e1 = q.submit([=](handler &cgh) { + cgh.single_task([=]() { + Node *pHead = d_head; + while (pHead) { + pHead->Num = pHead->Num * 2 + 1; + pHead = pHead->pNext; + } + }); + }); + + e1.wait(); + + d_cur = d_head; + for (int i = 0; i < numNodes; i++) { + event c = q.memcpy(&h_cur, d_cur, sizeof(Node)); + c.wait(); + free(d_cur, ctxt); + + const int want = i * 4 + 1; + if (h_cur.Num != 
want) { + return -2; + } + d_cur = h_cur.pNext; + } + + return 0; +} diff --git a/SYCL/Basic/usm/findplatforms.hpp b/SYCL/Basic/usm/findplatforms.hpp new file mode 100644 index 0000000000..592464385a --- /dev/null +++ b/SYCL/Basic/usm/findplatforms.hpp @@ -0,0 +1,45 @@ +//==------------------- findplatforms.hpp ----------------------------------==// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +bool findPlatformAndDevice(cl_device_type deviceType, + cl_platform_id &platformOut, cl_device_id &deviceOut) { + cl_uint numPlatforms; + cl_int errorCode; + + errorCode = clGetPlatformIDs(0, nullptr, &numPlatforms); + if (errorCode != CL_SUCCESS) return false; + + std::vector platforms(numPlatforms); + errorCode = clGetPlatformIDs(numPlatforms, platforms.data(), nullptr); + if (errorCode != CL_SUCCESS) return false; + + for (auto platform : platforms) { + cl_uint numDevices = 0; + errorCode = + clGetDeviceIDs(platform, deviceType, 0, nullptr, &numDevices); + + // This has to check both codes because if a platform has 0 devices + // of deviceType, clGetDeviceIDs returns CL_DEVICE_NOT_FOUND. + // We don't want to bail yet as the next platform might have it. + // We bail out here if we see something other than those two error codes. 
+ if (!(errorCode == CL_SUCCESS || errorCode == CL_DEVICE_NOT_FOUND)) + return false; + + if (numDevices) { + std::vector devices(numDevices); + errorCode = clGetDeviceIDs(platform, deviceType, numDevices, + devices.data(), nullptr); + if (errorCode != CL_SUCCESS) return false; + + platformOut = platform; + deviceOut = devices[0]; + return true; + } + } + + return false; +} diff --git a/SYCL/Basic/usm/hmemll.cpp b/SYCL/Basic/usm/hmemll.cpp new file mode 100644 index 0000000000..72dd514b74 --- /dev/null +++ b/SYCL/Basic/usm/hmemll.cpp @@ -0,0 +1,86 @@ +// XFAIL: cuda +// piextUSM*Alloc functions for CUDA are not behaving as described in +// https://github.com/intel/llvm/blob/sycl/sycl/doc/extensions/USM/USM.adoc +// https://github.com/intel/llvm/blob/sycl/sycl/doc/extensions/USM/cl_intel_unified_shared_memory.asciidoc +// +// RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple %s -o %t1.out +// RUN: %HOST_RUN_PLACEHOLDER %t1.out +// RUN: %CPU_RUN_PLACEHOLDER %t1.out +// RUN: %GPU_RUN_PLACEHOLDER %t1.out + +//==------------------- hmemll.cpp - Host Memory Linked List test ----------==// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include + +using namespace cl::sycl; + +int numNodes = 4; + +struct Node { + Node() : pNext(nullptr), Num(0xDEADBEEF) {} + + Node *pNext; + uint32_t Num; +}; + +class foo; +int main() { + queue q; + auto ctxt = q.get_context(); + auto dev = q.get_device(); + + if (!dev.get_info()) + return 0; + + Node *h_head = (Node *)malloc_host(sizeof(Node), ctxt); + if (h_head == nullptr) { + return -1; + } + Node *h_cur = h_head; + + for (int i = 0; i < numNodes; i++) { + h_cur->Num = i * 2; + + if (i != (numNodes - 1)) { + h_cur->pNext = (Node *)malloc_host(sizeof(Node), ctxt); + if (h_cur->pNext == nullptr) { + return -1; + } + } else { + h_cur->pNext = nullptr; + } + + h_cur = h_cur->pNext; + } + + auto e1 = q.submit([=](handler &cgh) { + cgh.single_task([=]() { + Node *pHead = h_head; + while (pHead) { + pHead->Num = pHead->Num * 2 + 1; + pHead = pHead->pNext; + } + }); + }); + + e1.wait(); + + h_cur = h_head; + for (int i = 0; i < numNodes; i++) { + const int want = i * 4 + 1; + if (h_cur->Num != want) { + return -2; + } + Node *old = h_cur; + h_cur = h_cur->pNext; + free(old, ctxt); + } + + return 0; +} diff --git a/SYCL/Basic/usm/hmemllaligned.cpp b/SYCL/Basic/usm/hmemllaligned.cpp new file mode 100644 index 0000000000..b08038d068 --- /dev/null +++ b/SYCL/Basic/usm/hmemllaligned.cpp @@ -0,0 +1,82 @@ +// RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple %s -o %t1.out +// RUN: %HOST_RUN_PLACEHOLDER %t1.out +// RUN: %CPU_RUN_PLACEHOLDER %t1.out +// RUN: %GPU_RUN_PLACEHOLDER %t1.out + +//==---- hmemllaligned.cpp - Aligned Host Memory Linked List test ----------==// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include + +using namespace cl::sycl; + +int numNodes = 4; + +struct Node { + Node() : pNext(nullptr), Num(0xDEADBEEF) {} + + Node *pNext; + uint32_t Num; +}; + +class foo; +int main() { + queue q; + auto ctxt = q.get_context(); + auto dev = q.get_device(); + + if (!dev.get_info()) + return 0; + + Node *h_head = (Node *)aligned_alloc_host(alignof(Node), sizeof(Node), ctxt); + if (h_head == nullptr) { + return -1; + } + Node *h_cur = h_head; + + for (int i = 0; i < numNodes; i++) { + h_cur->Num = i * 2; + + if (i != (numNodes - 1)) { + h_cur->pNext = + (Node *)aligned_alloc_host(alignof(Node), sizeof(Node), ctxt); + if (h_cur->pNext == nullptr) { + return -1; + } + } else { + h_cur->pNext = nullptr; + } + + h_cur = h_cur->pNext; + } + + auto e1 = q.submit([=](handler &cgh) { + cgh.single_task([=]() { + Node *pHead = h_head; + while (pHead) { + pHead->Num = pHead->Num * 2 + 1; + pHead = pHead->pNext; + } + }); + }); + + e1.wait(); + + h_cur = h_head; + for (int i = 0; i < numNodes; i++) { + const int want = i * 4 + 1; + if (h_cur->Num != want) { + return -2; + } + Node *old = h_cur; + h_cur = h_cur->pNext; + free(old, ctxt); + } + + return 0; +} diff --git a/SYCL/Basic/usm/math.cpp b/SYCL/Basic/usm/math.cpp new file mode 100644 index 0000000000..583a9fb9cd --- /dev/null +++ b/SYCL/Basic/usm/math.cpp @@ -0,0 +1,134 @@ +// RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple %s -o %t.out +// RUN: %HOST_RUN_PLACEHOLDER %t.out +// RUN: %CPU_RUN_PLACEHOLDER %t.out + +// REQUIRES: cpu +// XFAIL: cuda +// TODO: ptxas fatal : Unresolved extern function '_Z20__spirv_ocl_lgamma_rfPi' + +#include + +#include +#include +#include + +namespace s = cl::sycl; + +int main() { + s::queue myQueue; + + if (myQueue.get_device() + .get_info()) { + // fract with unified shared memory + { + s::cl_float r{0}; + s::cl_float i{999}; + { + s::cl_float 
*Buf = (s::cl_float *)s::malloc_shared( + sizeof(cl_float) * 2, myQueue.get_device(), myQueue.get_context()); + s::malloc_shared(100, myQueue.get_device(), myQueue.get_context()); + myQueue.submit([&](s::handler &cgh) { + cgh.single_task( + [=]() { Buf[0] = s::fract(s::cl_float{1.5f}, &Buf[1]); }); + }); + myQueue.wait(); + r = Buf[0]; + i = Buf[1]; + s::free(Buf, myQueue.get_context()); + } + assert(r == 0.5f); + assert(i == 1.0f); + } + + // vector fract with unified shared memory + { + s::cl_float2 *Buf = (s::cl_float2 *)s::malloc_shared( + sizeof(cl_float2) * 2, myQueue.get_device(), myQueue.get_context()); + myQueue.submit([&](s::handler &cgh) { + cgh.single_task([=]() { + Buf[0] = s::fract(s::cl_float2{1.5f, 2.5f}, &Buf[1]); + }); + }); + myQueue.wait(); + + s::cl_float r1 = Buf[0].x(); + s::cl_float r2 = Buf[0].y(); + s::cl_float i1 = Buf[1].x(); + s::cl_float i2 = Buf[1].y(); + + assert(r1 == 0.5f); + assert(r2 == 0.5f); + assert(i1 == 1.0f); + assert(i2 == 2.0f); + } + + // lgamma_r with unified shared memory + { + s::cl_float r{0}; + s::cl_int i{999}; + { + s::buffer BufR(&r, s::range<1>(1)); + s::cl_int *BufI = (s::cl_int *)s::malloc_shared( + sizeof(cl_int) * 2, myQueue.get_device(), myQueue.get_context()); + myQueue.submit([&](s::handler &cgh) { + auto AccR = BufR.get_access(cgh); + cgh.single_task( + [=]() { AccR[0] = s::lgamma_r(s::cl_float{10.f}, BufI); }); + }); + myQueue.wait(); + i = *BufI; + s::free(BufI, myQueue.get_context()); + } + assert(r > 12.8017f && r < 12.8019f); // ~12.8018 + assert(i == 1); // tgamma of 10 is ~362880.0 + } + + // lgamma_r with unified shared memory + { + s::cl_float r{0}; + s::cl_int i{999}; + { + s::buffer BufR(&r, s::range<1>(1)); + s::cl_int *BufI = (s::cl_int *)s::malloc_shared( + sizeof(cl_int) * 2, myQueue.get_device(), myQueue.get_context()); + myQueue.submit([&](s::handler &cgh) { + auto AccR = BufR.get_access(cgh); + cgh.single_task( + [=]() { AccR[0] = s::lgamma_r(s::cl_float{-2.4f}, BufI); }); + }); + 
myQueue.wait(); + i = *BufI; + s::free(BufI, myQueue.get_context()); + } + assert(r > 0.1024f && r < 0.1026f); // ~0.102583 + assert(i == -1); // tgamma of -2.4 is ~-1.1080299470333461 + } + + // vector lgamma_r with unified shared memory + { + s::cl_float2 r{0, 0}; + s::cl_int2 i{0, 0}; + s::buffer BufR(&r, s::range<1>(1)); + s::cl_int2 *BufI = (s::cl_int2 *)s::malloc_shared( + sizeof(cl_int2) * 2, myQueue.get_device(), myQueue.get_context()); + myQueue.submit([&](s::handler &cgh) { + auto AccR = BufR.get_access(cgh); + cgh.single_task([=]() { + AccR[0] = s::lgamma_r(s::cl_float2{10.f, -2.4f}, BufI); + }); + }); + myQueue.wait(); + + s::cl_float r1 = r.x(); + s::cl_float r2 = r.y(); + s::cl_int i1 = BufI->x(); + s::cl_int i2 = BufI->y(); + + assert(r1 > 12.8017f && r1 < 12.8019f); // ~12.8018 + assert(r2 > 0.1024f && r2 < 0.1026f); // ~0.102583 + assert(i1 == 1); // tgamma of 10 is ~362880.0 + assert(i2 == -1); // tgamma of -2.4 is ~-1.1080299470333461 + } + } + return 0; +} diff --git a/SYCL/Basic/usm/memadvise.cpp b/SYCL/Basic/usm/memadvise.cpp new file mode 100644 index 0000000000..87e4c6f47b --- /dev/null +++ b/SYCL/Basic/usm/memadvise.cpp @@ -0,0 +1,87 @@ +// XFAIL: cuda +// SYCL runtime and piextUSM*Alloc functions for CUDA not behaving as described +// in: https://github.com/intel/llvm/blob/sycl/sycl/doc/extensions/USM/USM.adoc +// +// RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple %s -o %t1.out +// RUN: %HOST_RUN_PLACEHOLDER %t1.out +// RUN: %CPU_RUN_PLACEHOLDER %t1.out +// RUN: %GPU_RUN_PLACEHOLDER %t1.out + +//==---------------- memadvise.cpp - Shared Memory Linked List test --------==// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include
+
+using namespace cl::sycl;
+
+int numNodes = 4;
+
+struct Node {
+  Node() : pNext(nullptr), Num(0xDEADBEEF) {}
+
+  Node *pNext;
+  uint32_t Num;
+};
+
+class foo;
+int main() {
+  queue q;
+  auto dev = q.get_device();
+  auto ctxt = q.get_context();
+  if (!dev.get_info())
+    return 0;
+
+  Node *s_head = (Node *)malloc_shared(sizeof(Node), dev, ctxt);
+  if (s_head == nullptr) {
+    return -1;
+  }
+  q.mem_advise(s_head, sizeof(Node), PI_MEM_ADVICE_SET_READ_MOSTLY);
+  Node *s_cur = s_head;
+
+  for (int i = 0; i < numNodes; i++) {
+    s_cur->Num = i * 2;
+
+    if (i != (numNodes - 1)) {
+      s_cur->pNext = (Node *)malloc_shared(sizeof(Node), dev, ctxt);
+      if (s_cur->pNext == nullptr) {
+        return -1;
+      }
+      q.mem_advise(s_cur->pNext, sizeof(Node), PI_MEM_ADVICE_SET_READ_MOSTLY);
+    } else {
+      s_cur->pNext = nullptr;
+    }
+
+    s_cur = s_cur->pNext;
+  }
+
+  auto e1 = q.submit([=](handler &cgh) {
+    cgh.single_task([=]() {
+      Node *pHead = s_head;
+      while (pHead) {
+        pHead->Num = pHead->Num * 2 + 1;
+        pHead = pHead->pNext;
+      }
+    });
+  });
+
+  e1.wait();
+
+  s_cur = s_head;
+  int mismatches = 0; // tallied, not returned early, so every node is freed
+  for (int i = 0; i < numNodes; i++) {
+    const uint32_t want = i * 4 + 1; // host stored i * 2, kernel did * 2 + 1
+    if (s_cur->Num != want) {
+      ++mismatches;
+    }
+    Node *old = s_cur;
+    s_cur = s_cur->pNext;
+    free(old, ctxt);
+  }
+
+  return mismatches == 0 ? 0 : -2;
+}
diff --git a/SYCL/Basic/usm/memcpy.cpp b/SYCL/Basic/usm/memcpy.cpp
new file mode 100644
index 0000000000..0b933d0f00
--- /dev/null
+++ b/SYCL/Basic/usm/memcpy.cpp
@@ -0,0 +1,63 @@
+//==---- memcpy.cpp - USM memcpy test --------------------------------------==//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+// XFAIL: cuda
+// piextUSM*Alloc functions for CUDA are not behaving as described in
+// https://github.com/intel/llvm/blob/sycl/sycl/doc/extensions/USM/USM.adoc
+// https://github.com/intel/llvm/blob/sycl/sycl/doc/extensions/USM/cl_intel_unified_shared_memory.asciidoc
+//
+// RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple %s -o %t1.out
+// RUN: %CPU_RUN_PLACEHOLDER %t1.out
+// RUN: %GPU_RUN_PLACEHOLDER %t1.out
+
+#include
+
+using namespace cl::sycl;
+
+static constexpr int count = 100;
+
+int main() {
+  queue q([](exception_list el) {
+    for (auto &e : el)
+      std::rethrow_exception(e);
+  });
+  if (q.get_device().get_info()) {
+    float *src = (float *)malloc_shared(sizeof(float) * count, q.get_device(),
+                                        q.get_context());
+    float *dest = (float *)malloc_shared(sizeof(float) * count, q.get_device(),
+                                         q.get_context());
+    for (int i = 0; i < count; i++)
+      src[i] = i;
+
+    event init_copy = q.submit(
+        [&](handler &cgh) { cgh.memcpy(dest, src, sizeof(float) * count); });
+
+    q.submit([&](handler &cgh) {
+      cgh.depends_on(init_copy);
+      cgh.single_task([=]() {
+        for (int i = 0; i < count; i++)
+          dest[i] *= 2;
+      });
+    });
+    q.wait_and_throw();
+
+    for (int i = 0; i < count; i++) {
+      assert(dest[i] == i * 2);
+    }
+
+    try {
+      // Copying to nullptr should throw.
+      q.submit([&](handler &cgh) {
+        cgh.memcpy(nullptr, src, sizeof(float) * count);
+      });
+      q.wait_and_throw();
+      assert(false && "Expected error from copying to nullptr");
+    } catch (const runtime_error &) { // by reference: no copy, no slicing
+    }
+  }
+  return 0;
+}
diff --git a/SYCL/Basic/usm/memset.cpp b/SYCL/Basic/usm/memset.cpp
new file mode 100644
index 0000000000..313fa4cbda
--- /dev/null
+++ b/SYCL/Basic/usm/memset.cpp
@@ -0,0 +1,59 @@
+// XFAIL: cuda
+// piextUSM*Alloc functions for CUDA are not behaving as described in
+// https://github.com/intel/llvm/blob/sycl/sycl/doc/extensions/USM/USM.adoc
+// https://github.com/intel/llvm/blob/sycl/sycl/doc/extensions/USM/cl_intel_unified_shared_memory.asciidoc
+//
+// RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple %s -o %t1.out
+// RUN: %CPU_RUN_PLACEHOLDER %t1.out
+// RUN: %GPU_RUN_PLACEHOLDER %t1.out
+
+//==---- memset.cpp - USM memset test --------------------------------------==//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+#include
+
+using namespace cl::sycl;
+
+static constexpr int count = 100;
+
+int main() {
+  queue q([](exception_list el) {
+    for (auto &e : el)
+      std::rethrow_exception(e);
+  });
+  if (q.get_device().get_info()) {
+    uint32_t *src = (uint32_t *)malloc_shared(sizeof(uint32_t) * count,
+                                              q.get_device(), q.get_context());
+
+    event init_copy = q.submit(
+        [&](handler &cgh) { cgh.memset(src, 0x15, sizeof(uint32_t) * count); });
+
+    q.submit([&](handler &cgh) {
+      cgh.depends_on(init_copy);
+      cgh.single_task([=]() {
+        for (int i = 0; i < count; i++)
+          src[i] *= 2;
+      });
+    });
+    q.wait_and_throw();
+
+    for (int i = 0; i < count; i++) {
+      assert(src[i] == 0x2a2a2a2a);
+    }
+
+    try {
+      // Filling to nullptr should throw.
+      q.submit([&](handler &cgh) {
+        cgh.memset(nullptr, 0, sizeof(uint32_t) * count);
+      });
+      q.wait_and_throw();
+      assert(false && "Expected error from writing to nullptr");
+    } catch (const runtime_error &) { // by reference: no copy, no slicing
+    }
+  }
+  return 0;
+}
diff --git a/SYCL/Basic/usm/mixed.cpp b/SYCL/Basic/usm/mixed.cpp
new file mode 100644
index 0000000000..afb06370c5
--- /dev/null
+++ b/SYCL/Basic/usm/mixed.cpp
@@ -0,0 +1,79 @@
+// XFAIL: cuda
+// piextUSM*Alloc functions for CUDA are not behaving as described in
+// https://github.com/intel/llvm/blob/sycl/sycl/doc/extensions/USM/USM.adoc
+// https://github.com/intel/llvm/blob/sycl/sycl/doc/extensions/USM/cl_intel_unified_shared_memory.asciidoc
+//
+// RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple %s -o %t1.out
+// RUN: %HOST_RUN_PLACEHOLDER %t1.out
+// RUN: %CPU_RUN_PLACEHOLDER %t1.out
+// RUN: %GPU_RUN_PLACEHOLDER %t1.out
+
+//==------------------- mixed.cpp - Mixed Memory test ---------------------==//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include + +using namespace cl::sycl; + +class foo; +int main() { + const int N = 4; + const int MAGIC_NUM = 42; + + queue q; + auto dev = q.get_device(); + auto ctxt = q.get_context(); + + if (!(dev.get_info() && + dev.get_info() && + dev.get_info())) + return 0; + + int *darray = (int *)malloc_device(N * sizeof(int), dev, ctxt); + if (darray == nullptr) { + return -1; + } + int *sarray = (int *)malloc_shared(N * sizeof(int), dev, ctxt); + + if (sarray == nullptr) { + return -1; + } + + int *harray = (int *)malloc_host(N * sizeof(int), ctxt); + if (harray == nullptr) { + return -1; + } + for (int i = 0; i < N; i++) { + sarray[i] = MAGIC_NUM - 1; + harray[i] = 1; + } + + auto e0 = q.memset(darray, 0, N * sizeof(int)); + e0.wait(); + + auto e1 = q.submit([=](handler &cgh) { + cgh.single_task([=]() { + for (int i = 0; i < N; i++) { + sarray[i] += darray[i] + harray[i]; + } + }); + }); + + e1.wait(); + + for (int i = 0; i < N; i++) { + if (sarray[i] != MAGIC_NUM) { + return -2; + } + } + free(darray, ctxt); + free(sarray, ctxt); + free(harray, ctxt); + + return 0; +} diff --git a/SYCL/Basic/usm/mixed2.cpp b/SYCL/Basic/usm/mixed2.cpp new file mode 100644 index 0000000000..72c15cf055 --- /dev/null +++ b/SYCL/Basic/usm/mixed2.cpp @@ -0,0 +1,79 @@ +// XFAIL: cuda +// piextUSM*Alloc functions for CUDA are not behaving as described in +// https://github.com/intel/llvm/blob/sycl/sycl/doc/extensions/USM/USM.adoc +// https://github.com/intel/llvm/blob/sycl/sycl/doc/extensions/USM/cl_intel_unified_shared_memory.asciidoc +// +// RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple %s -o %t1.out +// RUN: %HOST_RUN_PLACEHOLDER %t1.out +// RUN: %CPU_RUN_PLACEHOLDER %t1.out +// RUN: %GPU_RUN_PLACEHOLDER %t1.out + +//==------------------- mixed2.cpp - Mixed Memory test ---------------------==// +// +// Part of the LLVM Project, under the 
Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include + +using namespace cl::sycl; + +class foo; +int main() { + const int N = 4; + const int MAGIC_NUM = 42; + + queue q; + auto dev = q.get_device(); + auto ctxt = q.get_context(); + + if (!(dev.get_info() && + dev.get_info() && + dev.get_info())) + return 0; + + int *darray = (int *)malloc(N * sizeof(int), dev, ctxt, usm::alloc::device); + if (darray == nullptr) { + return -1; + } + int *sarray = (int *)malloc(N * sizeof(int), dev, ctxt, usm::alloc::shared); + + if (sarray == nullptr) { + return -1; + } + + int *harray = (int *)malloc(N * sizeof(int), dev, ctxt, usm::alloc::host); + if (harray == nullptr) { + return -1; + } + for (int i = 0; i < N; i++) { + sarray[i] = MAGIC_NUM - 1; + harray[i] = 1; + } + + auto e0 = q.memset(darray, 0, N * sizeof(int)); + e0.wait(); + + auto e1 = q.submit([=](handler &cgh) { + cgh.single_task([=]() { + for (int i = 0; i < N; i++) { + sarray[i] += darray[i] + harray[i]; + } + }); + }); + + e1.wait(); + + for (int i = 0; i < N; i++) { + if (sarray[i] != MAGIC_NUM) { + return -2; + } + } + free(darray, ctxt); + free(sarray, ctxt); + free(harray, ctxt); + + return 0; +} diff --git a/SYCL/Basic/usm/mixed2template.cpp b/SYCL/Basic/usm/mixed2template.cpp new file mode 100644 index 0000000000..7add1dcb33 --- /dev/null +++ b/SYCL/Basic/usm/mixed2template.cpp @@ -0,0 +1,92 @@ +// XFAIL: cuda +// piextUSM*Alloc functions for CUDA are not behaving as described in +// https://github.com/intel/llvm/blob/sycl/sycl/doc/extensions/USM/USM.adoc +// https://github.com/intel/llvm/blob/sycl/sycl/doc/extensions/USM/cl_intel_unified_shared_memory.asciidoc +// +// RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple %s -o %t1.out +// RUN: %HOST_RUN_PLACEHOLDER %t1.out +// RUN: %CPU_RUN_PLACEHOLDER 
%t1.out +// RUN: %GPU_RUN_PLACEHOLDER %t1.out + +//==---------- mixed2template.cpp - Mixed Memory with Templatestest --------==// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include + +using namespace cl::sycl; + +class foo; +int main() { + const int N = 4; + const int MAGIC_NUM = 42; + + queue q; + auto dev = q.get_device(); + auto ctxt = q.get_context(); + + if (!(dev.get_info() && + dev.get_info() && + dev.get_info())) + return 0; + + int *darray = malloc(N, dev, ctxt, usm::alloc::device); + if (darray == nullptr) { + return -1; + } + int *sarray = malloc(N, dev, ctxt, usm::alloc::shared); + + if (sarray == nullptr) { + return -1; + } + + int *harray = malloc(N, dev, ctxt, usm::alloc::host); + if (harray == nullptr) { + return -1; + } + for (int i = 0; i < N; i++) { + sarray[i] = MAGIC_NUM - 1; + harray[i] = 1; + } + + auto e0 = q.memset(darray, 0, N * sizeof(int)); + e0.wait(); + + auto e1 = q.submit([=](handler &cgh) { + cgh.single_task([=]() { + for (int i = 0; i < N; i++) { + sarray[i] += darray[i] + harray[i]; + } + }); + }); + + e1.wait(); + + for (int i = 0; i < N; i++) { + if (sarray[i] != MAGIC_NUM) { + return -2; + } + } + free(darray, ctxt); + free(sarray, ctxt); + free(harray, ctxt); + + float *hfarray = malloc(N, q, usm::alloc::host); + if (hfarray == nullptr) + return -3; + + free(hfarray, ctxt); + + double *sdarray = + aligned_alloc(alignof(double), N, q, usm::alloc::shared); + if (sdarray == nullptr) + return -4; + + free(sdarray, ctxt); + + return 0; +} diff --git a/SYCL/Basic/usm/mixed_queue.cpp b/SYCL/Basic/usm/mixed_queue.cpp new file mode 100644 index 0000000000..c4174dd508 --- /dev/null +++ b/SYCL/Basic/usm/mixed_queue.cpp @@ -0,0 +1,108 @@ +// XFAIL: cuda +// piextUSM*Alloc functions for 
CUDA are not behaving as described in +// https://github.com/intel/llvm/blob/sycl/sycl/doc/extensions/USM/USM.adoc +// https://github.com/intel/llvm/blob/sycl/sycl/doc/extensions/USM/cl_intel_unified_shared_memory.asciidoc +// +// RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple %s -o %t1.out +// RUN: %HOST_RUN_PLACEHOLDER %t1.out +// RUN: %CPU_RUN_PLACEHOLDER %t1.out +// RUN: %GPU_RUN_PLACEHOLDER %t1.out + +//==-------------- mixed_queue.cpp - Mixed Memory test ---------------------==// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include + +using namespace cl::sycl; + +class foo; +int main() { + const int N = 4; + const int MAGIC_NUM = 42; + const int SIZE = N * sizeof(int); + queue q; + auto dev = q.get_device(); + if (!(dev.get_info() && + dev.get_info() && + dev.get_info())) + return 0; + + int *ptr = (int *)malloc_device(SIZE, q); + if (ptr == nullptr) { + return -1; + } + free(ptr, q); + + ptr = (int *)malloc(SIZE, q, usm::alloc::device); + if (ptr == nullptr) { + return -1; + } + free(ptr, q); + + ptr = (int *)aligned_alloc_device(alignof(int), SIZE, q); + if (ptr == nullptr) { + return -1; + } + free(ptr, q); + + ptr = (int *)aligned_alloc(alignof(int), SIZE, q, usm::alloc::device); + if (ptr == nullptr) { + return -1; + } + free(ptr, q); + + ptr = (int *)malloc_shared(SIZE, q); + if (ptr == nullptr) { + return -1; + } + free(ptr, q); + + ptr = (int *)malloc(SIZE, q, usm::alloc::shared); + if (ptr == nullptr) { + return -1; + } + free(ptr, q); + + ptr = (int *)aligned_alloc_shared(alignof(int), SIZE, q); + if (ptr == nullptr) { + return -1; + } + free(ptr, q); + + ptr = (int *)aligned_alloc(alignof(int), SIZE, q, usm::alloc::shared); + if (ptr == nullptr) { + return -1; + } + free(ptr, q); + + ptr = (int 
*)malloc_host(SIZE, q); + if (ptr == nullptr) { + return -1; + } + free(ptr, q); + + ptr = (int *)malloc(SIZE, q, usm::alloc::host); + if (ptr == nullptr) { + return -1; + } + free(ptr, q); + + ptr = (int *)aligned_alloc_host(alignof(int), SIZE, q); + if (ptr == nullptr) { + return -1; + } + free(ptr, q); + + ptr = (int *)aligned_alloc(alignof(int), SIZE, q, usm::alloc::host); + if (ptr == nullptr) { + return -1; + } + free(ptr, q); + + return 0; +} diff --git a/SYCL/Basic/usm/multictxt.cpp b/SYCL/Basic/usm/multictxt.cpp new file mode 100644 index 0000000000..59536945ed --- /dev/null +++ b/SYCL/Basic/usm/multictxt.cpp @@ -0,0 +1,66 @@ +// RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple %s -o %t1.out +// RUN: %t1.out + +// REQUIRES: cpu, gpu + +//==----------------- multictxt.cpp - Multi Context USM test ---------------==// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include + +using namespace cl::sycl; + +// The multictxt test here is a sanity check that USM selects the right +// implementation when presented with multiple contexts. The extra context +// only needs to exist for this test to do its job. 
+ +void GpuCpuCpu() { + queue gpu_q(gpu_selector{}); + queue cpu_q(cpu_selector{}); + device dev = cpu_q.get_device(); + context ctx = cpu_q.get_context(); + if (dev.get_info()) { + void *ptr = malloc_shared(128, dev, ctx); + + free(ptr, ctx); + } +} + +void CpuGpuGpu() { + queue cpu_q(cpu_selector{}); + queue gpu_q(gpu_selector{}); + device dev = gpu_q.get_device(); + context ctx = gpu_q.get_context(); + + if (dev.get_info()) { + void *ptr = malloc_shared(128, dev, ctx); + + free(ptr, ctx); + } +} + +void GpuCpuGpu() { + queue gpu_q(gpu_selector{}); + queue cpu_q(cpu_selector{}); + device dev = gpu_q.get_device(); + context ctx = gpu_q.get_context(); + + if (dev.get_info()) { + void *ptr = malloc_shared(128, dev, ctx); + + free(ptr, ctx); + } +} + +int main() { + GpuCpuCpu(); + CpuGpuGpu(); + GpuCpuGpu(); + + return 0; +} diff --git a/SYCL/Basic/usm/pfor_flatten.cpp b/SYCL/Basic/usm/pfor_flatten.cpp new file mode 100644 index 0000000000..c629a143b9 --- /dev/null +++ b/SYCL/Basic/usm/pfor_flatten.cpp @@ -0,0 +1,71 @@ +// UNSUPPORTED: cuda +// CUDA does not support the unnamed lambda extension. +// +// RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple -fsycl-unnamed-lambda %s -o %t1.out +// RUN: %HOST_RUN_PLACEHOLDER %t1.out +// RUN: %CPU_RUN_PLACEHOLDER %t1.out +// RUN: %GPU_RUN_PLACEHOLDER %t1.out + +//==--------------- pfor_flatten.cpp - Kernel Launch Flattening test -------==// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include + +using namespace cl::sycl; + +class foo; +int main() { + int *array = nullptr; + const int N = 42; + const int MAGIC_NUM = 42; + + queue q; + auto ctxt = q.get_context(); + + array = (int *)malloc_host(N * sizeof(int), q); + if (array == nullptr) { + return -1; + } + + range<1> R{N}; + auto e1 = q.parallel_for(R, [=](id<1> ID) { + int i = ID[0]; + array[i] = MAGIC_NUM-4; + }); + + + auto e2 = q.parallel_for(R, e1, [=](id<1> ID) { + int i = ID[0]; + array[i] += 2; + }); + + auto e3 = + q.parallel_for(nd_range<1>{R, range<1>{1}}, {e1, e2}, [=](nd_item<1> ID) { + int i = ID.get_global_id(0); + array[i]++; + }); + + auto e4 = q.single_task({e3}, [=]() { + for (int i = 0; i < N; i++) { + array[i]++; + } + }); + + q.single_task(e4, [=]() { array[0] = array[0]; }); + + q.wait(); + + for (int i = 0; i < N; i++) { + if (array[i] != MAGIC_NUM) { + return -1; + } + } + free(array, ctxt); + + return 0; +} diff --git a/SYCL/Basic/usm/pointer_query.cpp b/SYCL/Basic/usm/pointer_query.cpp new file mode 100644 index 0000000000..87ab37dbef --- /dev/null +++ b/SYCL/Basic/usm/pointer_query.cpp @@ -0,0 +1,123 @@ +// RUN: %clangxx -fsycl %s -o %t1.out +// RUN: %HOST_RUN_PLACEHOLDER %t1.out + +//==-------------- pointer_query.cpp - Pointer Query test ------------------==// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include
+
+using namespace cl::sycl;
+
+int main() {
+  int *array = nullptr;
+  const int N = 4;
+  queue q;
+  auto dev = q.get_device();
+  auto ctxt = q.get_context();
+
+  if (!(dev.get_info() &&
+        dev.get_info() &&
+        dev.get_info()))
+    return 0;
+
+  usm::alloc Kind;
+  device D;
+
+  // Test device allocs
+  array = (int *)malloc_device(N * sizeof(int), q);
+  if (array == nullptr) {
+    return 1;
+  }
+  Kind = get_pointer_type(array, ctxt);
+  if (ctxt.is_host()) {
+    // for now, host device treats all allocations
+    // as host allocations
+    if (Kind != usm::alloc::host) {
+      return 2;
+    }
+  } else {
+    if (Kind != usm::alloc::device) {
+      return 3;
+    }
+  }
+  D = get_pointer_device(array, ctxt);
+  if (D != dev) {
+    return 4;
+  }
+  free(array, ctxt);
+
+  // Test shared allocs
+  array = (int *)malloc_shared(N * sizeof(int), q);
+  if (array == nullptr) {
+    return 5;
+  }
+  Kind = get_pointer_type(array, ctxt);
+  if (ctxt.is_host()) {
+    // for now, host device treats all allocations
+    // as host allocations
+    if (Kind != usm::alloc::host) {
+      return 6;
+    }
+  } else {
+    if (Kind != usm::alloc::shared) {
+      return 7;
+    }
+  }
+  D = get_pointer_device(array, ctxt);
+  if (D != dev) {
+    return 8;
+  }
+  free(array, ctxt);
+
+  // Test host allocs
+  array = (int *)malloc_host(N * sizeof(int), q);
+  if (array == nullptr) {
+    return 9;
+  }
+  Kind = get_pointer_type(array, ctxt);
+  if (Kind != usm::alloc::host) {
+    return 10;
+  }
+  D = get_pointer_device(array, ctxt);
+  auto Devs = ctxt.get_devices();
+  auto result = std::find(Devs.begin(), Devs.end(), D);
+  if (result == Devs.end()) {
+    // Returned device was not in queried context
+    return 11;
+  }
+  free(array, ctxt);
+
+  // Test invalid ptrs
+  Kind = get_pointer_type(nullptr, ctxt);
+  if (Kind != usm::alloc::unknown) {
+    return 15; // was 11, which is already used by the host-alloc check above
+  }
+
+  // next checks only valid for non-host contexts
+  array = (int *)malloc(N * sizeof(int));
+  Kind = get_pointer_type(array, ctxt);
+  int Result = 0; // exit code; set instead of returning so array is freed
+  if (!ctxt.is_host()) {
+    if (Kind != usm::alloc::unknown) {
+      Result = 12;
+    } else {
+      try {
+        D = get_pointer_device(array, ctxt);
+        Result = 13; // the query did not throw for a plain malloc pointer
+      } catch (const runtime_error &) {
+        // expected outcome: the query throws, Result stays 0
+      }
+    }
+  } else if (Kind != usm::alloc::host) {
+    // host ctxts always report host
+    Result = 14;
+  }
+  free(array);
+
+  return Result;
+}
diff --git a/SYCL/Basic/usm/prefetch.cpp b/SYCL/Basic/usm/prefetch.cpp
new file mode 100644
index 0000000000..a92786a055
--- /dev/null
+++ b/SYCL/Basic/usm/prefetch.cpp
@@ -0,0 +1,69 @@
+//==---- prefetch.cpp - USM prefetch test ----------------------------------==//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+// RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple %s -o %t1.out
+// RUN: %HOST_RUN_PLACEHOLDER %t1.out
+// RUN: %CPU_RUN_PLACEHOLDER %t1.out
+
+#include
+
+using namespace cl::sycl;
+
+static constexpr int count = 100;
+
+int main() {
+  queue q([](exception_list el) {
+    for (auto &e : el)
+      std::rethrow_exception(e); // rethrow the held exception, not the pointer
+  });
+  if (q.get_device().get_info()) {
+    float *src = (float *)malloc_shared(sizeof(float) * count, q.get_device(),
+                                        q.get_context());
+    float *dest = (float *)malloc_shared(sizeof(float) * count, q.get_device(),
+                                         q.get_context());
+    for (int i = 0; i < count; i++)
+      src[i] = i;
+
+    // Test handler::prefetch
+    {
+      event init_prefetch = q.submit(
+          [&](handler &cgh) { cgh.prefetch(src, sizeof(float) * count); });
+
+      q.submit([&](handler &cgh) {
+        cgh.depends_on(init_prefetch);
+        cgh.single_task([=]() {
+          for (int i = 0; i < count; i++)
+            dest[i] = 2 * src[i];
+        });
+      });
+      q.wait_and_throw();
+
+      for (int i = 0; i < count; i++) {
+        assert(dest[i] == i * 2);
+      }
+    }
+
+    // Test queue::prefetch
+    {
+      event init_prefetch = q.prefetch(src, sizeof(float) * count);
+
+      q.submit([&](handler &cgh) {
+        cgh.depends_on(init_prefetch);
+        cgh.single_task([=]() {
+          for (int i = 0; i < count; i++)
+            dest[i] = 3 * src[i];
+        });
+      });
+      q.wait_and_throw();
+
+      for (int i = 0; i < count; i++) {
+        assert(dest[i] == i * 3);
+      }
+    }
+  }
+  return 0;
+}
diff --git a/SYCL/Basic/usm/queue_wait.cpp b/SYCL/Basic/usm/queue_wait.cpp
new file mode 100644
index 0000000000..692c7f43f5
--- /dev/null
+++ b/SYCL/Basic/usm/queue_wait.cpp
@@ -0,0 +1,48 @@
+// XFAIL: cuda
+// piextUSM*Alloc functions for CUDA are not behaving as described in
+// https://github.com/intel/llvm/blob/sycl/sycl/doc/extensions/USM/USM.adoc
+// https://github.com/intel/llvm/blob/sycl/sycl/doc/extensions/USM/cl_intel_unified_shared_memory.asciidoc
+//
+// RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple %s -o %t.out
+// RUN: %HOST_RUN_PLACEHOLDER %t.out
+// RUN: %CPU_RUN_PLACEHOLDER %t.out
+// RUN: %GPU_RUN_PLACEHOLDER %t.out
+
+#include
+
+#include
+#include
+
+using namespace cl::sycl;
+
+// This test checks that queue USM functions are properly waited for during
+// calls to queue::wait().
+ +int main() { + const std::size_t Size = 32; + queue Q; + std::cout << Q.is_host() << std::endl; + device Dev = Q.get_device(); + context Ctx = Q.get_context(); + if (!(Dev.get_info() && + Dev.get_info())) + return 0; + + unsigned char *DevArr = (unsigned char *)malloc_device(Size, Dev, Ctx); + assert(DevArr); + unsigned char *HostArr = (unsigned char *)malloc_host(Size, Ctx); + assert(HostArr); + + Q.memset(DevArr, 42, Size); + Q.wait(); + Q.memcpy(HostArr, DevArr, Size); + Q.wait(); + + for (std::size_t i = 0; i < Size; ++i) + assert(HostArr[i] == 42); + + free(DevArr, Ctx); + free(HostArr, Ctx); + + return 0; +} diff --git a/SYCL/Basic/usm/smemll.cpp b/SYCL/Basic/usm/smemll.cpp new file mode 100644 index 0000000000..eff0429287 --- /dev/null +++ b/SYCL/Basic/usm/smemll.cpp @@ -0,0 +1,86 @@ +// XFAIL: cuda +// piextUSM*Alloc functions for CUDA are not behaving as described in +// https://github.com/intel/llvm/blob/sycl/sycl/doc/extensions/USM/USM.adoc +// https://github.com/intel/llvm/blob/sycl/sycl/doc/extensions/USM/cl_intel_unified_shared_memory.asciidoc +// +// RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple %s -o %t1.out +// RUN: %HOST_RUN_PLACEHOLDER %t1.out +// RUN: %CPU_RUN_PLACEHOLDER %t1.out +// RUN: %GPU_RUN_PLACEHOLDER %t1.out + +//==------------------- smemll.cpp - Shared Memory Linked List test --------==// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include + +using namespace cl::sycl; + +int numNodes = 4; + +struct Node { + Node() : pNext(nullptr), Num(0xDEADBEEF) {} + + Node *pNext; + uint32_t Num; +}; + +class foo; +int main() { + queue q; + auto dev = q.get_device(); + auto ctxt = q.get_context(); + + if (!dev.get_info()) + return 0; + + Node *s_head = (Node *)malloc_shared(sizeof(Node), dev, ctxt); + if (s_head == nullptr) { + return -1; + } + Node *s_cur = s_head; + + for (int i = 0; i < numNodes; i++) { + s_cur->Num = i * 2; + + if (i != (numNodes - 1)) { + s_cur->pNext = (Node *)malloc_shared(sizeof(Node), dev, ctxt); + if (s_cur->pNext == nullptr) { + return -1; + } + } else { + s_cur->pNext = nullptr; + } + + s_cur = s_cur->pNext; + } + + auto e1 = q.submit([=](handler &cgh) { + cgh.single_task([=]() { + Node *pHead = s_head; + while (pHead) { + pHead->Num = pHead->Num * 2 + 1; + pHead = pHead->pNext; + } + }); + }); + + e1.wait(); + + s_cur = s_head; + for (int i = 0; i < numNodes; i++) { + const int want = i * 4 + 1; + if (s_cur->Num != want) { + return -2; + } + Node *old = s_cur; + s_cur = s_cur->pNext; + free(old, ctxt); + } + + return 0; +} diff --git a/SYCL/Basic/usm/smemllaligned.cpp b/SYCL/Basic/usm/smemllaligned.cpp new file mode 100644 index 0000000000..6e7ec35400 --- /dev/null +++ b/SYCL/Basic/usm/smemllaligned.cpp @@ -0,0 +1,83 @@ +// RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple %s -o %t1.out +// RUN: %HOST_RUN_PLACEHOLDER %t1.out +// RUN: %CPU_RUN_PLACEHOLDER %t1.out +// RUN: %GPU_RUN_PLACEHOLDER %t1.out + +//==---- smemllaligned.cpp - Aligned Shared Memory Linked List test --------==// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include + +using namespace cl::sycl; + +int numNodes = 4; + +struct Node { + Node() : pNext(nullptr), Num(0xDEADBEEF) {} + + Node *pNext; + uint32_t Num; +}; + +class foo; +int main() { + queue q; + auto dev = q.get_device(); + auto ctxt = q.get_context(); + + if (!dev.get_info()) + return 0; + + Node *s_head = + (Node *)aligned_alloc_shared(alignof(Node), sizeof(Node), dev, ctxt); + if (s_head == nullptr) { + return -1; + } + Node *s_cur = s_head; + + for (int i = 0; i < numNodes; i++) { + s_cur->Num = i * 2; + + if (i != (numNodes - 1)) { + s_cur->pNext = + (Node *)aligned_alloc_shared(alignof(Node), sizeof(Node), dev, ctxt); + if (s_cur->pNext == nullptr) { + return -1; + } + } else { + s_cur->pNext = nullptr; + } + + s_cur = s_cur->pNext; + } + + auto e1 = q.submit([=](handler &cgh) { + cgh.single_task([=]() { + Node *pHead = s_head; + while (pHead) { + pHead->Num = pHead->Num * 2 + 1; + pHead = pHead->pNext; + } + }); + }); + + e1.wait(); + + s_cur = s_head; + for (int i = 0; i < numNodes; i++) { + const int want = i * 4 + 1; + if (s_cur->Num != want) { + return -2; + } + Node *old = s_cur; + s_cur = s_cur->pNext; + free(old, ctxt); + } + + return 0; +} diff --git a/SYCL/CMakeLists.txt b/SYCL/CMakeLists.txt new file mode 100644 index 0000000000..a6694ae1f4 --- /dev/null +++ b/SYCL/CMakeLists.txt @@ -0,0 +1,5 @@ +add_subdirectory(Basic) +#add_subdirectory(External) +#add_subdirectory(MultiSource) +#add_subdirectory(Parallel) +#add_subdirectory(SingleSource) diff --git a/SYCL/README.md b/SYCL/README.md new file mode 100644 index 0000000000..cd84a0dc6d --- /dev/null +++ b/SYCL/README.md @@ -0,0 +1,7 @@ +SYCL-related tests directory. + + - Basic - tests used for sanity testing. Building, executing and checks are defined using insource comments with LIT syntax. 
+ - External - contains infrastructure for running tests whose sources are stored outside of this repository.
+ - MultiSource - SYCL-related tests which depend on multiple source files.
+ - SingleSource - SYCL tests with a single source file.
+ - Parallel - tests which produce a highly parallel load on the target device. It is recommended to run such tests in a single thread.
diff --git a/cmake/caches/clang_fsycl.cmake b/cmake/caches/clang_fsycl.cmake
new file mode 100644
index 0000000000..b35fcf023d
--- /dev/null
+++ b/cmake/caches/clang_fsycl.cmake
@@ -0,0 +1,4 @@
+# Default open source clang configuration with SYCL support.
+
+set(CMAKE_BUILD_TYPE "Release" CACHE STRING "")
+set(CMAKE_CXX_FLAGS "-fsycl" CACHE STRING "")
diff --git a/cmake/caches/clang_fsycl_cuda.cmake b/cmake/caches/clang_fsycl_cuda.cmake
new file mode 100644
index 0000000000..549f426ab0
--- /dev/null
+++ b/cmake/caches/clang_fsycl_cuda.cmake
@@ -0,0 +1,4 @@
+# Default open source clang configuration with SYCL support for the CUDA backend.
+
+set(CMAKE_BUILD_TYPE "Release" CACHE STRING "")
+set(CMAKE_CXX_FLAGS "-fsycl -fsycl-targets=nvptx64-nvidia-cuda-sycldevice -Xsycl-target-backend --cuda-gpu-arch=sm_32" CACHE STRING "")
diff --git a/cmake/caches/dpcpp.cmake b/cmake/caches/dpcpp.cmake
new file mode 100644
index 0000000000..1e31ebbfcc
--- /dev/null
+++ b/cmake/caches/dpcpp.cmake
@@ -0,0 +1,5 @@
+# Default dpcpp compiler configuration.
+
+# No extra command line arguments are needed to support SYCL.
+set(CMAKE_BUILD_TYPE "Release" CACHE STRING "")
+set(SYCL_CXX_FLAGS "" CACHE STRING "")