From a34975d5a1fde659b44ce3d378b93bfa24916f1f Mon Sep 17 00:00:00 2001 From: "Meterelliyoz, Mesut" Date: Mon, 18 May 2020 22:09:11 -0700 Subject: [PATCH 1/5] Enabling Unified Shared Memory (USM) interfaces --- cmake/FindMKL.cmake | 3 +- docs/domains/blas/asum.rst | 112 +- docs/domains/blas/axpy.rst | 134 +- docs/domains/blas/axpy_batch.rst | 160 + .../blas/blas-level-1-routines.inc.rst | 92 - docs/domains/blas/blas-level-1-routines.rst | 76 + .../blas/blas-level-2-routines.inc.rst | 130 - docs/domains/blas/blas-level-2-routines.rst | 104 + .../blas/blas-level-3-routines.inc.rst | 90 - docs/domains/blas/blas-level-3-routines.rst | 56 + docs/domains/blas/blas-like-extensions.rst | 47 + docs/domains/blas/blas.rst | 13 +- docs/domains/blas/copy.rst | 123 +- docs/domains/blas/dot.rst | 131 +- docs/domains/blas/dotc.rst | 120 +- docs/domains/blas/dotu.rst | 124 +- docs/domains/blas/gbmv.rst | 166 +- docs/domains/blas/gemm.rst | 195 +- docs/domains/blas/gemm_batch.rst | 514 ++- docs/domains/blas/gemm_ext.rst | 348 +- docs/domains/blas/gemmt.rst | 222 +- docs/domains/blas/gemv.rst | 166 +- docs/domains/blas/ger.rst | 144 +- docs/domains/blas/gerc.rst | 144 +- docs/domains/blas/geru.rst | 145 +- docs/domains/blas/hbmv.rst | 161 +- docs/domains/blas/hemm.rst | 209 +- docs/domains/blas/hemv.rst | 155 +- docs/domains/blas/her.rst | 151 +- docs/domains/blas/her2.rst | 163 +- docs/domains/blas/her2k.rst | 183 +- docs/domains/blas/herk.rst | 172 +- docs/domains/blas/hpmv.rst | 155 +- docs/domains/blas/hpr.rst | 153 +- docs/domains/blas/hpr2.rst | 166 +- docs/domains/blas/iamax.rst | 113 +- docs/domains/blas/iamin.rst | 107 +- docs/domains/blas/nrm2.rst | 116 +- docs/domains/blas/rot.rst | 138 +- docs/domains/blas/rotg.rst | 118 +- docs/domains/blas/rotm.rst | 181 +- docs/domains/blas/rotmg.rst | 168 +- docs/domains/blas/sbmv.rst | 159 +- docs/domains/blas/scal.rst | 113 +- docs/domains/blas/sdsdot.rst | 129 +- docs/domains/blas/spmv.rst | 149 +- docs/domains/blas/spr.rst | 147 +- 
docs/domains/blas/spr2.rst | 153 +- docs/domains/blas/swap.rst | 135 +- docs/domains/blas/symm.rst | 200 +- docs/domains/blas/symv.rst | 151 +- docs/domains/blas/syr.rst | 147 +- docs/domains/blas/syr2.rst | 159 +- docs/domains/blas/syr2k.rst | 187 +- docs/domains/blas/syrk.rst | 164 +- docs/domains/blas/tbmv.rst | 157 +- docs/domains/blas/tbsv.rst | 156 +- docs/domains/blas/tpmv.rst | 147 +- docs/domains/blas/tpsv.rst | 147 +- docs/domains/blas/trmm.rst | 196 +- docs/domains/blas/trmv.rst | 151 +- docs/domains/blas/trsm.rst | 193 +- docs/domains/blas/trsm_batch.rst | 335 +- docs/domains/blas/trsv.rst | 151 +- include/onemkl/blas/blas.hpp | 2071 +++++++++- .../onemkl/blas/detail/blas_ct_templates.hpp | 2089 ++++++++++ include/onemkl/blas/detail/blas_loader.hpp | 861 ++++- include/onemkl/blas/detail/cublas/blas_ct.hpp | 3127 +++++++++------ .../blas/detail/cublas/onemkl_blas_cublas.hpp | 847 ++++- include/onemkl/blas/detail/mklcpu/blas_ct.hpp | 3127 +++++++++------ .../blas/detail/mklcpu/onemkl_blas_mklcpu.hpp | 960 ++++- include/onemkl/blas/detail/mklgpu/blas_ct.hpp | 3125 +++++++++------ .../blas/detail/mklgpu/onemkl_blas_mklgpu.hpp | 984 ++++- include/onemkl/blas/predicates.hpp | 3355 +++++++++++++++-- src/blas/backends/cublas/cublas_batch.cpp | 213 +- .../backends/cublas/cublas_extensions.cpp | 61 +- src/blas/backends/cublas/cublas_helper.hpp | 2 +- src/blas/backends/cublas/cublas_level1.cpp | 268 ++ src/blas/backends/cublas/cublas_level2.cpp | 505 +++ src/blas/backends/cublas/cublas_level3.cpp | 219 ++ .../cublas/mkl_blas_cublas_wrappers.cpp | 174 +- src/blas/backends/mklcpu/cpu_batch.cpp | 1244 +++--- src/blas/backends/mklcpu/cpu_extensions.cpp | 102 + src/blas/backends/mklcpu/cpu_level1.cpp | 820 ++++ src/blas/backends/mklcpu/cpu_level2.cpp | 1299 +++++++ src/blas/backends/mklcpu/cpu_level3.cpp | 692 ++++ .../backends/mklcpu/mkl_blas_cpu_wrappers.cpp | 174 +- src/blas/backends/mklgpu/CMakeLists.txt | 2 +- .../backends/mklgpu/mkl_blas_gpu_wrappers.cpp | 174 +- 
.../backends/mklgpu/mkl_blas_sycl_buffer.cpp | 102 - .../backends/mklgpu/mkl_blas_sycl_usm.cpp | 1332 +++++++ .../mklgpu/mkl_internal_blas_gpu_wrappers.cpp | 2590 +++++++++---- .../mklgpu/mkl_internal_blas_gpu_wrappers.hpp | 1099 +++++- .../mklgpu/mkl_internal_blas_sycl_gpu.hpp | 976 ++++- src/blas/blas_loader.cpp | 1419 ++++++- src/blas/function_table.hpp | 910 ++++- src/include/exceptions_helper.hpp | 34 + tests/unit_tests/CMakeLists.txt | 2 + tests/unit_tests/blas/batch/CMakeLists.txt | 2 +- .../unit_tests/blas/batch/axpy_batch_usm.cpp | 239 ++ tests/unit_tests/blas/batch/gemm_batch.cpp | 308 -- .../blas/batch/gemm_batch_stride.cpp | 23 +- .../blas/batch/gemm_batch_stride_usm.cpp | 228 ++ .../unit_tests/blas/batch/gemm_batch_usm.cpp | 370 ++ tests/unit_tests/blas/batch/trsm_batch.cpp | 297 -- .../blas/batch/trsm_batch_stride.cpp | 23 +- .../unit_tests/blas/extensions/CMakeLists.txt | 2 +- tests/unit_tests/blas/extensions/gemm_ext.cpp | 100 +- .../blas/extensions/gemm_ext_off.cpp | 54 +- tests/unit_tests/blas/extensions/gemmt.cpp | 269 +- .../unit_tests/blas/extensions/gemmt_usm.cpp | 289 ++ tests/unit_tests/blas/include/test_common.hpp | 37 + tests/unit_tests/blas/level1/CMakeLists.txt | 2 +- tests/unit_tests/blas/level1/asum.cpp | 39 +- tests/unit_tests/blas/level1/asum_usm.cpp | 144 + tests/unit_tests/blas/level1/axpy.cpp | 36 +- tests/unit_tests/blas/level1/axpy_usm.cpp | 145 + tests/unit_tests/blas/level1/copy.cpp | 36 +- tests/unit_tests/blas/level1/copy_usm.cpp | 139 + tests/unit_tests/blas/level1/dot.cpp | 30 +- tests/unit_tests/blas/level1/dot_usm.cpp | 136 + tests/unit_tests/blas/level1/dotc.cpp | 24 +- tests/unit_tests/blas/level1/dotc_usm.cpp | 134 + tests/unit_tests/blas/level1/dotu.cpp | 24 +- tests/unit_tests/blas/level1/dotu_usm.cpp | 133 + tests/unit_tests/blas/level1/iamax.cpp | 36 +- tests/unit_tests/blas/level1/iamax_usm.cpp | 139 + tests/unit_tests/blas/level1/iamin.cpp | 36 +- tests/unit_tests/blas/level1/iamin_usm.cpp | 139 + 
tests/unit_tests/blas/level1/nrm2.cpp | 36 +- tests/unit_tests/blas/level1/nrm2_usm.cpp | 140 + tests/unit_tests/blas/level1/rot.cpp | 36 +- tests/unit_tests/blas/level1/rot_usm.cpp | 150 + tests/unit_tests/blas/level1/rotg.cpp | 57 +- tests/unit_tests/blas/level1/rotg_usm.cpp | 158 + tests/unit_tests/blas/level1/rotm.cpp | 60 +- tests/unit_tests/blas/level1/rotm_usm.cpp | 161 + tests/unit_tests/blas/level1/rotmg.cpp | 16 +- tests/unit_tests/blas/level1/rotmg_usm.cpp | 141 + tests/unit_tests/blas/level1/scal.cpp | 39 +- tests/unit_tests/blas/level1/scal_usm.cpp | 153 + tests/unit_tests/blas/level1/sdsdot.cpp | 18 +- tests/unit_tests/blas/level1/sdsdot_usm.cpp | 126 + tests/unit_tests/blas/level1/swap.cpp | 36 +- tests/unit_tests/blas/level1/swap_usm.cpp | 141 + tests/unit_tests/blas/level2/CMakeLists.txt | 2 +- tests/unit_tests/blas/level2/gbmv.cpp | 114 +- tests/unit_tests/blas/level2/gbmv_usm.cpp | 205 + tests/unit_tests/blas/level2/gemv.cpp | 115 +- tests/unit_tests/blas/level2/gemv_usm.cpp | 204 + tests/unit_tests/blas/level2/ger.cpp | 24 +- tests/unit_tests/blas/level2/ger_usm.cpp | 136 + tests/unit_tests/blas/level2/gerc.cpp | 24 +- tests/unit_tests/blas/level2/gerc_usm.cpp | 136 + tests/unit_tests/blas/level2/geru.cpp | 24 +- tests/unit_tests/blas/level2/geru_usm.cpp | 136 + tests/unit_tests/blas/level2/hbmv.cpp | 42 +- tests/unit_tests/blas/level2/hbmv_usm.cpp | 159 + tests/unit_tests/blas/level2/hemv.cpp | 38 +- tests/unit_tests/blas/level2/hemv_usm.cpp | 158 + tests/unit_tests/blas/level2/her.cpp | 36 +- tests/unit_tests/blas/level2/her2.cpp | 50 +- tests/unit_tests/blas/level2/her2_usm.cpp | 155 + tests/unit_tests/blas/level2/her_usm.cpp | 153 + tests/unit_tests/blas/level2/hpmv.cpp | 46 +- tests/unit_tests/blas/level2/hpmv_usm.cpp | 156 + tests/unit_tests/blas/level2/hpr.cpp | 42 +- tests/unit_tests/blas/level2/hpr2.cpp | 40 +- tests/unit_tests/blas/level2/hpr2_usm.cpp | 145 + tests/unit_tests/blas/level2/hpr_usm.cpp | 153 + 
tests/unit_tests/blas/level2/sbmv.cpp | 40 +- tests/unit_tests/blas/level2/sbmv_usm.cpp | 148 + tests/unit_tests/blas/level2/spmv.cpp | 38 +- tests/unit_tests/blas/level2/spmv_usm.cpp | 144 + tests/unit_tests/blas/level2/spr.cpp | 36 +- tests/unit_tests/blas/level2/spr2.cpp | 36 +- tests/unit_tests/blas/level2/spr2_usm.cpp | 141 + tests/unit_tests/blas/level2/spr_usm.cpp | 139 + tests/unit_tests/blas/level2/symv.cpp | 38 +- tests/unit_tests/blas/level2/symv_usm.cpp | 145 + tests/unit_tests/blas/level2/syr.cpp | 36 +- tests/unit_tests/blas/level2/syr2.cpp | 38 +- tests/unit_tests/blas/level2/syr2_usm.cpp | 142 + tests/unit_tests/blas/level2/syr_usm.cpp | 140 + tests/unit_tests/blas/level2/tbmv.cpp | 218 +- tests/unit_tests/blas/level2/tbmv_usm.cpp | 237 ++ tests/unit_tests/blas/level2/tbsv.cpp | 218 +- tests/unit_tests/blas/level2/tbsv_usm.cpp | 237 ++ tests/unit_tests/blas/level2/tpmv.cpp | 186 +- tests/unit_tests/blas/level2/tpmv_usm.cpp | 220 ++ tests/unit_tests/blas/level2/tpsv.cpp | 186 +- tests/unit_tests/blas/level2/tpsv_usm.cpp | 220 ++ tests/unit_tests/blas/level2/trmv.cpp | 208 +- tests/unit_tests/blas/level2/trmv_usm.cpp | 232 ++ tests/unit_tests/blas/level2/trsv.cpp | 208 +- tests/unit_tests/blas/level2/trsv_usm.cpp | 232 ++ tests/unit_tests/blas/level3/CMakeLists.txt | 2 +- tests/unit_tests/blas/level3/gemm.cpp | 158 +- tests/unit_tests/blas/level3/gemm_usm.cpp | 220 ++ tests/unit_tests/blas/level3/hemm.cpp | 46 +- tests/unit_tests/blas/level3/hemm_usm.cpp | 154 + tests/unit_tests/blas/level3/her2k.cpp | 62 +- tests/unit_tests/blas/level3/her2k_usm.cpp | 160 + tests/unit_tests/blas/level3/herk.cpp | 62 +- tests/unit_tests/blas/level3/herk_usm.cpp | 158 + tests/unit_tests/blas/level3/symm.cpp | 78 +- tests/unit_tests/blas/level3/symm_usm.cpp | 178 + tests/unit_tests/blas/level3/syr2k.cpp | 92 +- tests/unit_tests/blas/level3/syr2k_usm.cpp | 183 + tests/unit_tests/blas/level3/syrk.cpp | 82 +- tests/unit_tests/blas/level3/syrk_usm.cpp | 178 + 
tests/unit_tests/blas/level3/trmm.cpp | 304 +- tests/unit_tests/blas/level3/trmm_usm.cpp | 287 ++ tests/unit_tests/blas/level3/trsm.cpp | 492 +-- tests/unit_tests/blas/level3/trsm_usm.cpp | 383 ++ tests/unit_tests/include/test_helper.hpp | 40 + 216 files changed, 48420 insertions(+), 11089 deletions(-) create mode 100644 docs/domains/blas/axpy_batch.rst delete mode 100644 docs/domains/blas/blas-level-1-routines.inc.rst create mode 100644 docs/domains/blas/blas-level-1-routines.rst delete mode 100644 docs/domains/blas/blas-level-2-routines.inc.rst create mode 100644 docs/domains/blas/blas-level-2-routines.rst delete mode 100644 docs/domains/blas/blas-level-3-routines.inc.rst create mode 100644 docs/domains/blas/blas-level-3-routines.rst create mode 100644 docs/domains/blas/blas-like-extensions.rst create mode 100644 include/onemkl/blas/detail/blas_ct_templates.hpp create mode 100644 src/blas/backends/mklgpu/mkl_blas_sycl_usm.cpp create mode 100644 src/include/exceptions_helper.hpp create mode 100644 tests/unit_tests/blas/batch/axpy_batch_usm.cpp delete mode 100644 tests/unit_tests/blas/batch/gemm_batch.cpp create mode 100644 tests/unit_tests/blas/batch/gemm_batch_stride_usm.cpp create mode 100644 tests/unit_tests/blas/batch/gemm_batch_usm.cpp delete mode 100644 tests/unit_tests/blas/batch/trsm_batch.cpp create mode 100644 tests/unit_tests/blas/extensions/gemmt_usm.cpp create mode 100644 tests/unit_tests/blas/level1/asum_usm.cpp create mode 100644 tests/unit_tests/blas/level1/axpy_usm.cpp create mode 100644 tests/unit_tests/blas/level1/copy_usm.cpp create mode 100644 tests/unit_tests/blas/level1/dot_usm.cpp create mode 100644 tests/unit_tests/blas/level1/dotc_usm.cpp create mode 100644 tests/unit_tests/blas/level1/dotu_usm.cpp create mode 100644 tests/unit_tests/blas/level1/iamax_usm.cpp create mode 100644 tests/unit_tests/blas/level1/iamin_usm.cpp create mode 100644 tests/unit_tests/blas/level1/nrm2_usm.cpp create mode 100644 tests/unit_tests/blas/level1/rot_usm.cpp 
create mode 100644 tests/unit_tests/blas/level1/rotg_usm.cpp create mode 100644 tests/unit_tests/blas/level1/rotm_usm.cpp create mode 100644 tests/unit_tests/blas/level1/rotmg_usm.cpp create mode 100644 tests/unit_tests/blas/level1/scal_usm.cpp create mode 100644 tests/unit_tests/blas/level1/sdsdot_usm.cpp create mode 100644 tests/unit_tests/blas/level1/swap_usm.cpp create mode 100644 tests/unit_tests/blas/level2/gbmv_usm.cpp create mode 100644 tests/unit_tests/blas/level2/gemv_usm.cpp create mode 100644 tests/unit_tests/blas/level2/ger_usm.cpp create mode 100644 tests/unit_tests/blas/level2/gerc_usm.cpp create mode 100644 tests/unit_tests/blas/level2/geru_usm.cpp create mode 100644 tests/unit_tests/blas/level2/hbmv_usm.cpp create mode 100644 tests/unit_tests/blas/level2/hemv_usm.cpp create mode 100644 tests/unit_tests/blas/level2/her2_usm.cpp create mode 100644 tests/unit_tests/blas/level2/her_usm.cpp create mode 100644 tests/unit_tests/blas/level2/hpmv_usm.cpp create mode 100644 tests/unit_tests/blas/level2/hpr2_usm.cpp create mode 100644 tests/unit_tests/blas/level2/hpr_usm.cpp create mode 100644 tests/unit_tests/blas/level2/sbmv_usm.cpp create mode 100644 tests/unit_tests/blas/level2/spmv_usm.cpp create mode 100644 tests/unit_tests/blas/level2/spr2_usm.cpp create mode 100644 tests/unit_tests/blas/level2/spr_usm.cpp create mode 100644 tests/unit_tests/blas/level2/symv_usm.cpp create mode 100644 tests/unit_tests/blas/level2/syr2_usm.cpp create mode 100644 tests/unit_tests/blas/level2/syr_usm.cpp create mode 100644 tests/unit_tests/blas/level2/tbmv_usm.cpp create mode 100644 tests/unit_tests/blas/level2/tbsv_usm.cpp create mode 100644 tests/unit_tests/blas/level2/tpmv_usm.cpp create mode 100644 tests/unit_tests/blas/level2/tpsv_usm.cpp create mode 100644 tests/unit_tests/blas/level2/trmv_usm.cpp create mode 100644 tests/unit_tests/blas/level2/trsv_usm.cpp create mode 100644 tests/unit_tests/blas/level3/gemm_usm.cpp create mode 100644 
tests/unit_tests/blas/level3/hemm_usm.cpp create mode 100644 tests/unit_tests/blas/level3/her2k_usm.cpp create mode 100644 tests/unit_tests/blas/level3/herk_usm.cpp create mode 100644 tests/unit_tests/blas/level3/symm_usm.cpp create mode 100644 tests/unit_tests/blas/level3/syr2k_usm.cpp create mode 100644 tests/unit_tests/blas/level3/syrk_usm.cpp create mode 100644 tests/unit_tests/blas/level3/trmm_usm.cpp create mode 100644 tests/unit_tests/blas/level3/trsm_usm.cpp diff --git a/cmake/FindMKL.cmake b/cmake/FindMKL.cmake index c5656ab91..9a210fba7 100644 --- a/cmake/FindMKL.cmake +++ b/cmake/FindMKL.cmake @@ -66,6 +66,7 @@ if(UNIX) list(APPEND MKL_LINK_PREFIX "-L${MKL_LIB_DIR}") set(LIB_PREFIX "-l") set(OPENCL_LIBNAME "OpenCL") + set(SYCL_LINK_FLAGS "-fsycl") else() if(${BUILD_SHARED_LIBS}) set(MKL_COPT ${MKL_COPT} "-Donemkl_EXPORTS") @@ -84,7 +85,7 @@ if (ENABLE_MKLCPU_BACKEND OR ENABLE_MKLGPU_BACKEND) list(APPEND MKL_LINK_C ${TBB_LINK}) endif() if(ENABLE_MKLGPU_BACKEND) - set(MKL_LINK_SYCL ${MKL_LINK_PREFIX} ${LIB_PREFIX}${MKL_SYCL} ${MKL_LINK_C} ${LIB_PREFIX}${OPENCL_LIBNAME}) + set(MKL_LINK_SYCL ${MKL_LINK_PREFIX} ${LIB_PREFIX}${MKL_SYCL} ${MKL_LINK_C} ${LIB_PREFIX}${OPENCL_LIBNAME} ${SYCL_LINK_FLAGS}) endif() endif() diff --git a/docs/domains/blas/asum.rst b/docs/domains/blas/asum.rst index 8e7f53092..4a6092971 100644 --- a/docs/domains/blas/asum.rst +++ b/docs/domains/blas/asum.rst @@ -1,4 +1,4 @@ -.. _asum: +.. _onemkl_blas_asum: asum ==== @@ -10,16 +10,6 @@ asum Computes the sum of magnitudes of the vector elements. - .. container:: section - :name: GUID-C135E117-8018-473E-BE83-8833C95BB3B5 - - - .. rubric:: Syntax - :name: syntax - :class: sectiontitle - - - .. cpp:function:: void asum(queue &exec_queue, std::int64_t n, buffer &x, std::int64_t incx, buffer &result) ``asum`` supports the following precisions. @@ -42,11 +32,9 @@ asum .. container:: section - :name: GUID-6AFCECB5-6614-46AC-B921-AB5DED0D22B2 .. 
rubric:: Description - :name: description :class: sectiontitle @@ -61,16 +49,27 @@ asum where ``x`` is a vector with ``n`` elements. +asum (Buffer Version) +--------------------- + +.. container:: + + .. container:: section + + + .. rubric:: Syntax + :class: sectiontitle + + + .. cpp:function:: void onemkl::blas::asum(sycl::queue &queue, std::int64_t n, sycl::buffer &x, std::int64_t incx, sycl::buffer &result) .. container:: section - :name: GUID-A615800D-734E-4997-BB91-1C76AEEE9EC2 .. rubric:: Input Parameters - :name: input-parameters :class: sectiontitle - exec_queue + queue The queue where the routine should be executed. @@ -86,15 +85,13 @@ asum incx - Stride of vector x. + Stride of vector ``x``. .. container:: section - :name: GUID-2B160DEB-ADBB-4044-8078-4B613A0DA4E1 .. rubric:: Output Parameters - :name: output-parameters :class: sectiontitle @@ -103,19 +100,84 @@ asum the real and imaginary parts of all elements of the vector). -.. container:: familylinks +asum (USM Version) +------------------ +.. container:: - .. container:: parentlink + .. container:: section - **Parent topic:** :ref:`blas-level-1-routines` - + .. rubric:: Syntax + :class: sectiontitle -.. container:: + .. container:: dlsyntaxpara + + + .. cpp:function:: sycl::event onemkl::blas::asum(sycl::queue &queue, std::int64_t n, const T *x, std::int64_t incx, T_res *result, const sycl::vector_class &dependencies = {}) + .. container:: section + + + .. rubric:: Input Parameters + :class: sectiontitle + + queue + The queue where the routine should be executed. -.. |image0| image:: ../equations/GUID-684BB993-83CA-4605-BD49-E493806C1ee1.png + + n + Number of elements in vector ``x``. + + + x + Pointer to input vector ``x``. The array holding the vector + ``x`` must be of size at least (1 + (``n`` - 1)*abs(``incx``)). + See `Matrix and Vector + Storage <../matrix-storage.html>`__ for + more details. + + + incx + Stride of vector ``x``. 
+ + + dependencies + List of events to wait for before starting computation, if any. + If omitted, defaults to no dependencies. + + + .. container:: section + + + .. rubric:: Output Parameters + :class: sectiontitle + + + result + Pointer to the output matrix where the scalar result is stored + (the sum of magnitudes of the real and imaginary parts of all + elements of the vector). + + + .. container:: section + + + .. rubric:: Return Values + :class: sectiontitle + + + Output event to wait on to ensure computation is complete. + + +.. container:: familylinks + + + .. container:: parentlink + + + **Parent topic:** :ref:`blas-level-1-routines` +.. |image0| image:: ../equations/GUID-4F76F5A1-251F-4AC0-A2E0-A3B4B6F39ee1.png :class: img-middle diff --git a/docs/domains/blas/axpy.rst b/docs/domains/blas/axpy.rst index 50f2ed986..b88309698 100644 --- a/docs/domains/blas/axpy.rst +++ b/docs/domains/blas/axpy.rst @@ -1,4 +1,4 @@ -.. _axpy: +.. _onemkl_blas_axpy: axpy ==== @@ -10,16 +10,6 @@ axpy Computes a vector-scalar product and adds the result to a vector. - .. container:: section - :name: GUID-17ADB23B-C9B0-44B4-89F9-B7199DA9E872 - - - .. rubric:: Syntax - :name: syntax - :class: sectiontitle - - - .. cpp:function:: void axpy(queue &exec_queue, std::int64_t n, T alpha, buffer &x, std::int64_t incx, buffer &y, std::int64_t incy) ``axpy`` supports the following precisions. @@ -37,11 +27,9 @@ axpy .. container:: section - :name: GUID-4BC6BF9A-BAB9-4078-A6B5-9C7ECB9D4821 .. rubric:: Description - :name: description :class: sectiontitle @@ -64,21 +52,32 @@ axpy ``alpha`` is a scalar. +axpy (Buffer Version) +--------------------- + +.. container:: + + .. container:: section + + + .. rubric:: Syntax + :class: sectiontitle + + + .. cpp:function:: void onemkl::blas::axpy(sycl::queue &queue, std::int64_t n, T alpha, sycl::buffer &x, std::int64_t incx, sycl::buffer &y, std::int64_t incy) .. container:: section - :name: GUID-6F86EF6A-8FFE-4C6A-8B71-23B95C1F1365 .. 
rubric:: Input Parameters - :name: input-parameters :class: sectiontitle - exec_queue + queue The queue where the routine should be executed. n - Number of elements in vector x. + Number of elements in vector ``x``. alpha @@ -86,50 +85,127 @@ axpy x - Buffer holding input vector x. The buffer must be of size at least + Buffer holding input vector ``x``. The buffer must be of size at least ``(1 + (n – 1)*abs(incx))``. See `Matrix and Vector Storage <../matrix-storage.html>`__ for more details. incx - Stride of vector x. + Stride of vector ``x``. y - Buffer holding input vector y. The buffer must be of size at least + Buffer holding input vector ``y``. The buffer must be of size at least ``(1 + (n – 1)*abs(incy))``. See `Matrix and Vector Storage <../matrix-storage.html>`__ for more details. incy - Stride of vector y. + Stride of vector ``y``. .. container:: section - :name: GUID-A0926D96-B673-48A4-986A-033719589288 .. rubric:: Output Parameters - :name: output-parameters :class: sectiontitle y - Buffer holding the updated vector y. + Buffer holding the updated vector ``y``. +axpy (USM Version) +------------------ -.. container:: familylinks +.. container:: + .. container:: section - .. container:: parentlink + .. rubric:: Syntax + :class: sectiontitle - **Parent topic:** :ref:`blas-level-1-routines` - + .. container:: dlsyntaxpara + + + .. cpp:function:: sycl::event onemkl::blas::axpy(sycl::queue &queue, std::int64_t n, T alpha, const T *x, std::int64_t incx, T *y, std::int64_t incy, const sycl::vector_class &dependencies = {}) + .. container:: section + + + .. rubric:: Input Parameters + :class: sectiontitle + + + queue + The queue where the routine should be executed. + + + n + Number of elements in vector ``x``. -.. container:: + alpha + Specifies the scalar alpha. + + + x + Pointer to the input vector ``x``. The array holding the vector + ``x`` must be of size at least ``(1 + (n – 1)*abs(incx))``. 
See + `Matrix and Vector + Storage <../matrix-storage.html>`__ for + more details. + + + incx + Stride of vector ``x``. + + + y + Pointer to the input vector ``y``. The array holding the vector + ``y`` must be of size at least ``(1 + (n – 1)*abs(incy))``. See + `Matrix and Vector + Storage <../matrix-storage.html>`__ for + more details. + + + incy + Stride of vector ``y``. + + + dependencies + List of events to wait for before starting computation, if any. + If omitted, defaults to no dependencies. + + + .. container:: section + + + .. rubric:: Output Parameters + :class: sectiontitle + + + y + Pointer to the updated vector ``y``. + + + .. container:: section + + + .. rubric:: Return Values + :class: sectiontitle + + + Output event to wait on to ensure computation is complete. + + +.. container:: familylinks + + + .. container:: parentlink + + + **Parent topic:** :ref:`blas-level-1-routines` diff --git a/docs/domains/blas/axpy_batch.rst b/docs/domains/blas/axpy_batch.rst new file mode 100644 index 000000000..4dd9fb86e --- /dev/null +++ b/docs/domains/blas/axpy_batch.rst @@ -0,0 +1,160 @@ +.. _onemkl_blas_axpy_batch: + +axpy_batch +========== + +.. container:: + + + The ``axpy_batch`` routines are batched versions of `axpy `__, performing + multiple ``axpy`` operations in a single call. Each ``axpy`` + operation adds a scalar-vector product to a vector. + + + ``axpy_batch`` supports the following precisions. + + + .. list-table:: + :header-rows: 1 + + * - T + * - ``float`` + * - ``double`` + * - ``std::complex`` + * - ``std::complex`` + + + +axpy_batch (USM Version) +------------------------ + +.. container:: section + + + .. rubric:: Description + :class: sectiontitle + + + The USM version of ``axpy_batch`` supports group API. 
+ + The group API operation is defined as + + :: + + idx = 0 + for i = 0 … group_count – 1 + for j = 0 … group_size – 1 + X and Y are vectors in x[idx] and y[idx] + Y := alpha[i] * X + Y + idx := idx + 1 + end for + end for + + + where: + + ``alpha`` is scalar + + ``X`` and ``Y`` are vectors. + + + For group API, ``x`` and ``y`` arrays contain the pointers for all the input vectors. + The total number of vectors in ``x`` and ``y`` are given by: + + total_batch_count = sum of all of the group_size entries + + + **Group API** + +.. container:: section + + + .. rubric:: Syntax + :class: sectiontitle + + + .. container:: dlsyntaxpara + + + .. cpp:function:: sycl::event onemkl::blas::axpy_batch(sycl::queue &queue, std::int64_t *n, T *alpha, const T **x, std::int64_t *incx, T **y, std::int64_t *incy, std::int64_t group_count, std::int64_t *group_size, const sycl::vector_class &dependencies = {}) + + +.. container:: section + + + .. rubric:: Input Parameters + :class: sectiontitle + + queue + The queue where the routine should be executed. + + n + Array of ``group_count`` integers. ``n[i]`` specifies the number of elements in vectors ``X`` and ``Y`` for every vector in group ``i``. + + + alpha + Array of ``group_count`` scalar elements. ``alpha[i]`` specifies the scaling factor for vector ``X`` in group ``i``. + + + x + Array of pointers to input vectors ``X`` with size ``total_batch_count``. + The size of array allocated for the ``X`` vector of the group ``i`` must be at least ``(1 + (n[i] – 1)*abs(incx[i]))``. + See `Matrix and Vector Storage <../matrix-storage.html>`__ for more details. + + incx + Array of ``group_count`` integers. ``incx[i]`` specifies the stride of vector ``X`` in group ``i``. + + y + Array of pointers to input/output vectors ``Y`` with size ``total_batch_count``. + The size of array allocated for the ``Y`` vector of the group ``i`` must be at least ``(1 + (n[i] – 1)*abs(incy[i]))``. 
+ See `Matrix and Vector Storage <../matrix-storage.html>`__ for more details. + + incy + Array of ``group_count`` integers. ``incy[i]`` specifies the stride of vector ``Y`` in group ``i``. + + + group_count + Number of groups. Must be at least 0. + + + group_size + Array of ``group_count`` integers. ``group_size[i]`` specifies the number of ``axpy`` operations in group ``i``. + Each element in ``group_size`` must be at least 0. + + dependencies + List of events to wait for before starting computation, if any. + If omitted, defaults to no dependencies. + + +.. container:: section + + + .. rubric:: Output Parameters + :class: sectiontitle + + + y + Array of pointers holding the ``Y`` vectors, overwritten by ``total_batch_count`` ``axpy`` operations of the form + ``alpha*X + Y``. + + +.. container:: section + + + .. rubric:: Return Values + :class: sectiontitle + + + Output event to wait on to ensure computation is complete. + + +.. container:: familylinks + + + .. container:: parentlink + + + **Parent topic:**:ref:`blas-like-extensions` + + + diff --git a/docs/domains/blas/blas-level-1-routines.inc.rst b/docs/domains/blas/blas-level-1-routines.inc.rst deleted file mode 100644 index 7798d13a4..000000000 --- a/docs/domains/blas/blas-level-1-routines.inc.rst +++ /dev/null @@ -1,92 +0,0 @@ -.. _blas-level-1-routines: - -BLAS Level 1 Routines -===================== - - -.. container:: - - - BLAS Level 1 includes routines and functions, which perform - vector-vector operations. The following table lists the BLAS Level 1 - routine and function groups and the data types associated with them. - - - .. container:: tablenoborder - - - .. 
list-table:: - :header-rows: 1 - - * - Routine or Function Group with SYCL Buffer - - Data Types - - Description - * - \ `asum `__\ - - float, double, mixed float and std::complex, mixed double and std::complex - - Sum of vector magnitudes (functions) - * - \ `axpy `__\ - - float, double, std::complex, std::complex - - Scalar-vector product (routines) - * - \ `copy `__\ - - float, double, std::complex, std::complex - - Copy vector (routines) - * - \ `dot `__\ - - float, double, mixed float and double - - Dot product (functions) - * - \ `sdsdot `__\ - - mixed float and double - - Dot product with double precision (functions) - * - \ `dotc `__\ - - std::complex, std::complex - - Dot product conjugated (functions) - * - \ `dotu `__\ - - std::complex, std::complex - - Dot product unconjugated (functions) - * - \ `nrm2 `__\ - - float, double, mixed float and std::complex, mixed double and std::complex - - Vector 2-norm (Euclidean norm) (functions) - * - \ `rot `__\ - - float, double, mixed float and std::complex, mixed double and std::complex - - Plane rotation of points (routines) - * - \ `rotg `__\ - - float, double, std::complex, std::complex - - Generate Givens rotation of points (routines) - * - \ `rotm `__\ - - float, double - - Modified Givens plane rotation of points (routines) - * - \ `rotmg `__\ - - float, double - - Generate modified Givens plane rotation of points (routines) - * - \ `scal `__\ - - float, double, std::complex, std::complex, mixed float and std::complex, mixed double and std::complex - - Vector-scalar product (routines) - * - \ `swap `__\ - - float, double, std::complex, std::complex - - Vector-vector swap (routines) - * - \ `iamax `__\ - - float, double, std::complex, std::complex - - Index of the maximum absolute value element of a vector (functions) - * - \ `iamin `__\ - - float, double, std::complex, std::complex - - Index of the minimum absolute value element of a vector (functions) - -.. 
toctree:: - :hidden: - - asum - axpy - copy - dot - dotc - dotu - iamax - iamin - nrm2 - rot - rotg - rotm - rotmg - scal - sdsdot - swap - diff --git a/docs/domains/blas/blas-level-1-routines.rst b/docs/domains/blas/blas-level-1-routines.rst new file mode 100644 index 000000000..569749ee4 --- /dev/null +++ b/docs/domains/blas/blas-level-1-routines.rst @@ -0,0 +1,76 @@ +.. _blas-level-1-routines: + +BLAS Level 1 Routines +===================== + + +.. container:: + + + BLAS Level 1 includes routines which perform + vector-vector operations as described in the following table. + + + .. container:: tablenoborder + + + .. list-table:: + :header-rows: 1 + + * - Routines + - Description + * - \ `asum `__\ + - Sum of vector magnitudes + * - \ `axpy `__\ + - Scalar-vector product + * - \ `copy `__\ + - Copy vector + * - \ `dot `__\ + - Dot product + * - \ `sdsdot `__\ + - Dot product with double precision + * - \ `dotc `__\ + - Dot product conjugated + * - \ `dotu `__\ + - Dot product unconjugated + * - \ `nrm2 `__\ + - Vector 2-norm (Euclidean norm) + * - \ `rot `__\ + - Plane rotation of points + * - \ `rotg `__\ + - Generate Givens rotation of points + * - \ `rotm `__\ + - Modified Givens plane rotation of points + * - \ `rotmg `__\ + - Generate modified Givens plane rotation of points + * - \ `scal `__\ + - Vector-scalar product + * - \ `swap `__\ + - Vector-vector swap + * - \ `iamax `__\ + - Index of the maximum absolute value element of a vector + * - \ `iamin `__\ + - Index of the minimum absolute value element of a vector + +.. toctree:: + :hidden: + + asum + axpy + copy + dot + sdsdot + dotc + dotu + nrm2 + rot + rotg + rotm + rotmg + scal + swap + iamax + iamin + + +**Parent topic:** :ref:`onemkl_blas` diff --git a/docs/domains/blas/blas-level-2-routines.inc.rst b/docs/domains/blas/blas-level-2-routines.inc.rst deleted file mode 100644 index dbe97bbae..000000000 --- a/docs/domains/blas/blas-level-2-routines.inc.rst +++ /dev/null @@ -1,130 +0,0 @@ -.. 
_blas-level-2-routines: - -BLAS Level 2 Routines -===================== - - -.. container:: - - - This section describes BLAS Level 2 routines, which perform - matrix-vector operations. The following table lists the BLAS Level 2 - routine groups and the data types associated with them. - - - .. container:: tablenoborder - - - .. list-table:: - :header-rows: 1 - - * - Routine or Function Group with SYCL Buffer - - Data Types - - Description - * - \ `gbmv `__\ - - float, double, std::complex, std::complex - - Matrix-vector product using a general band matrix - * - \ `gemv `__\ - - float, double, std::complex, std::complex - - Matrix-vector product using a general matrix - * - \ `ger `__\ - - float, double - - Rank-1 update of a general matrix - * - \ `gerc `__\ - - std::complex, std::complex - - Rank-1 update of a conjugated general matrix - * - \ `geru `__\ - - std::complex, std::complex - - Rank-1 update of a general matrix, unconjugated - * - \ `hbmv `__\ - - std::complex, std::complex - - Matrix-vector product using a Hermitian band matrix - * - \ `hemv `__\ - - std::complex, std::complex - - Matrix-vector product using a Hermitian matrix - * - \ `her `__\ - - std::complex, std::complex - - Rank-1 update of a Hermitian matrix - * - \ `her2 `__\ - - std::complex, std::complex - - Rank-2 update of a Hermitian matrix - * - \ `hpmv `__\ - - std::complex, std::complex - - Matrix-vector product using a Hermitian packed matrix - * - \ `hpr `__\ - - std::complex, std::complex - - Rank-1 update of a Hermitian packed matrix - * - \ `hpr2 `__\ - - std::complex, std::complex - - Rank-2 update of a Hermitian packed matrix - * - \ `sbmv `__\ - - float, double - - Matrix-vector product using symmetric band matrix - * - \ `spmv `__\ - - float, double - - Matrix-vector product using a symmetric packed matrix - * - \ `spr `__\ - - float, double - - Rank-1 update of a symmetric packed matrix - * - \ `spr2 `__\ - - float, double - - Rank-2 update of a symmetric packed matrix - * - \ 
`symv `__\ - - float, double - - Matrix-vector product using a symmetric matrix - * - \ `syr `__\ - - float, double - - Rank-1 update of a symmetric matrix - * - \ `syr2 `__\ - - float, double - - Rank-2 update of a symmetric matrix - * - \ `tbmv `__\ - - float, double, std::complex, std::complex - - Matrix-vector product using a triangular band matrix - * - \ `tbsv `__\ - - float, double, std::complex, std::complex - - Solution of a linear system of equations with a triangular band matrix - * - \ `tpmv `__\ - - float, double, std::complex, std::complex - - Matrix-vector product using a triangular packed matrix - * - \ `tpsv `__\ - - float, double, std::complex, std::complex - - Solution of a linear system of equations with a triangular packed matrix - * - \ `trmv `__\ - - float, double, std::complex, std::complex - - Matrix-vector product using a triangular matrix - * - \ `trsv `__\ - - float, double, std::complex, std::complex - - Solution of a linear system of equations with a triangular matrix - - - - -.. toctree:: - :hidden: - - gbmv - gemv - ger - gerc - geru - hbmv - hemv - her - her2 - hpmv - hpr - hpr2 - sbmv - spmv - spr - spr2 - symv - syr - syr2 - tbmv - tbsv - tpmv - tpsv - trmv - trsv diff --git a/docs/domains/blas/blas-level-2-routines.rst b/docs/domains/blas/blas-level-2-routines.rst new file mode 100644 index 000000000..1ff643beb --- /dev/null +++ b/docs/domains/blas/blas-level-2-routines.rst @@ -0,0 +1,104 @@ +.. _blas-level-2-routines: + +BLAS Level 2 Routines +===================== + + +.. container:: + + BLAS Level 2 includes routines which perform + matrix-vector operations as described in the following table. + + + .. container:: tablenoborder + + + .. 
list-table:: + :header-rows: 1 + + * - Routines + - Description + * - \ `gbmv `__\ + - Matrix-vector product using a general band matrix + * - \ `gemv `__\ + - Matrix-vector product using a general matrix + * - \ `ger `__\ + - Rank-1 update of a general matrix + * - \ `gerc `__\ + - Rank-1 update of a conjugated general matrix + * - \ `geru `__\ + - Rank-1 update of a general matrix, unconjugated + * - \ `hbmv `__\ + - Matrix-vector product using a Hermitian band matrix + * - \ `hemv `__\ + - Matrix-vector product using a Hermitian matrix + * - \ `her `__\ + - Rank-1 update of a Hermitian matrix + * - \ `her2 `__\ + - Rank-2 update of a Hermitian matrix + * - \ `hpmv `__\ + - Matrix-vector product using a Hermitian packed matrix + * - \ `hpr `__\ + - Rank-1 update of a Hermitian packed matrix + * - \ `hpr2 `__\ + - Rank-2 update of a Hermitian packed matrix + * - \ `sbmv `__\ + - Matrix-vector product using a symmetric band matrix + * - \ `spmv `__\ + - Matrix-vector product using a symmetric packed matrix + * - \ `spr `__\ + - Rank-1 update of a symmetric packed matrix + * - \ `spr2 `__\ + - Rank-2 update of a symmetric packed matrix + * - \ `symv `__\ + - Matrix-vector product using a symmetric matrix + * - \ `syr `__\ + - Rank-1 update of a symmetric matrix + * - \ `syr2 `__\ + - Rank-2 update of a symmetric matrix + * - \ `tbmv `__\ + - Matrix-vector product using a triangular band matrix + * - \ `tbsv `__\ + - Solution of a linear system of equations with a triangular band matrix + * - \ `tpmv `__\ + - Matrix-vector product using a triangular packed matrix + * - \ `tpsv `__\ + - Solution of a linear system of equations with a triangular packed matrix + * - \ `trmv `__\ + - Matrix-vector product using a triangular matrix + * - \ `trsv `__\ + - Solution of a linear system of equations with a triangular matrix + + + + +.. 
toctree:: + :hidden: + + gbmv + gemv + ger + gerc + geru + hbmv + hemv + her + her2 + hpmv + hpr + hpr2 + sbmv + spmv + spr + spr2 + symv + syr + syr2 + tbmv + tbsv + tpmv + tpsv + trmv + trsv + +**Parent topic:** :ref:`onemkl_blas` diff --git a/docs/domains/blas/blas-level-3-routines.inc.rst b/docs/domains/blas/blas-level-3-routines.inc.rst deleted file mode 100644 index a80c18bfc..000000000 --- a/docs/domains/blas/blas-level-3-routines.inc.rst +++ /dev/null @@ -1,90 +0,0 @@ -.. _blas-level-3-routines: - -BLAS Level 3 Routines -===================== - - -.. container:: - - - BLAS Level 3 routines perform matrix-matrix operations. The following - table lists the BLAS Level 3 routine groups and the data types - associated with them. - - - .. container:: tablenoborder - - - .. list-table:: - :header-rows: 1 - - * - Routine or Function Group with SYCL Buffer - - Data Types - - Description - * - \ `gemm `__\ - - float, double, std::complex, std::complex - - Computes a matrix-matrix product with general matrices. - * - \ `hemm `__\ - - std::complex, std::complex - - Computes a matrix-matrix product where one input matrix is Hermitian and one is general. - * - \ `herk `__\ - - std::complex, std::complex - - Performs a Hermitian rank-k update. - * - \ `her2k `__\ - - std::complex, std::complex - - Performs a Hermitian rank-2k update. - * - \ `symm `__\ - - float, double, std::complex, std::complex - - Computes a matrix-matrix product where one input matrix is symmetric and one matrix is general. - * - \ `syrk `__\ - - float, double, std::complex, std::complex - - Performs a symmetric rank-k update. - * - \ `syr2k `__\ - - float, double, std::complex, std::complex - - Performs a symmetric rank-2k update. - * - \ `trmm `__\ - - float, double, std::complex, std::complex - - Computes a matrix-matrix product where one input matrix is triangular and one input matrix is general. 
- * - \ `trsm `__\ - - float, double, std::complex, std::complex - - Solves a triangular matrix equation (forward or backward solve). - - - - - - - - - .. container:: - :name: LI_21BA86AC0A4942A79BA0C7DC4ABC50C4 - - - The BLAS functions are blocked where possible to restructure - the code in a way that increases the localization of data - reference, enhances cache memory use, and reduces the - dependency on the memory bus. - - - - - - - .. container:: - :name: LI_9D82DEDFA672416D9B3EA8C9C2B6F0A3 - - - The code is distributed across the processors to maximize - parallelism. - - -.. toctree:: - :hidden: - - gemm - hemm - her2k - herk - symm - syr2k - syrk - trmm - trsm diff --git a/docs/domains/blas/blas-level-3-routines.rst b/docs/domains/blas/blas-level-3-routines.rst new file mode 100644 index 000000000..ffdf0b45e --- /dev/null +++ b/docs/domains/blas/blas-level-3-routines.rst @@ -0,0 +1,56 @@ +.. _blas-level-3-routines: + +BLAS Level 3 Routines +===================== + + +.. container:: + + + BLAS Level 3 includes routines which perform + matrix-matrix operations as described in the following table. + + + .. container:: tablenoborder + + + .. list-table:: + :header-rows: 1 + + * - Routines + - Description + * - \ `gemm `__\ + - Computes a matrix-matrix product with general matrices. + * - \ `hemm `__\ + - Computes a matrix-matrix product where one input matrix is Hermitian and one is general. + * - \ `herk `__\ + - Performs a Hermitian rank-k update. + * - \ `her2k `__\ + - Performs a Hermitian rank-2k update. + * - \ `symm `__\ + - Computes a matrix-matrix product where one input matrix is symmetric and one matrix is general. + * - \ `syrk `__\ + - Performs a symmetric rank-k update. + * - \ `syr2k `__\ + - Performs a symmetric rank-2k update. + * - \ `trmm `__\ + - Computes a matrix-matrix product where one input matrix is triangular and one input matrix is general. + * - \ `trsm `__\ + - Solves a triangular matrix equation (forward or backward solve). 
+ + + +.. toctree:: + :hidden: + + gemm + hemm + herk + her2k + symm + syrk + syr2k + trmm + trsm + +**Parent topic:** :ref:`onemkl_blas` diff --git a/docs/domains/blas/blas-like-extensions.rst b/docs/domains/blas/blas-like-extensions.rst new file mode 100644 index 000000000..296ceb522 --- /dev/null +++ b/docs/domains/blas/blas-like-extensions.rst @@ -0,0 +1,47 @@ +.. _blas-like-extensions: + +BLAS-like Extensions +==================== + + +.. container:: + + + oneAPI Math Kernel Library DPC++ provides additional routines to + extend the functionality of the BLAS routines. These include routines + to compute many independent matrix-matrix products. + + The following table lists the BLAS-like Extensions with their descriptions. + + + .. container:: tablenoborder + + + .. list-table:: + :header-rows: 1 + + * - Routines + - Description + * - \ `axpy_batch `__\ + - Computes groups of vector-scalar product added to a vector. + * - \ `gemm_batch `__\ + - Computes groups of matrix-matrix products with general matrices. + * - \ `trsm_batch `__\ + - Solves a triangular matrix equation for a group of matrices. + * - \ `gemmt `__\ + - Computes a matrix-matrix product with general matrices, but updates + only the upper or lower triangular part of the result matrix. + * - \ `gemm_ext `__\ + - Computes a matrix-matrix product with general matrices + + +.. toctree:: + :hidden: + + axpy_batch + gemm_batch + trsm_batch + gemmt + gemm_ext + +**Parent topic:** :ref:`onemkl_blas` diff --git a/docs/domains/blas/blas.rst b/docs/domains/blas/blas.rst index 313673bf4..c6124fce7 100644 --- a/docs/domains/blas/blas.rst +++ b/docs/domains/blas/blas.rst @@ -3,10 +3,15 @@ BLAS Routines +++++++++++++ -oneMKL provides a DPC++ interface to the Basic Linear Algebra Subprograms (BLAS) routines. +oneMKL provides a DPC++ interface to the Basic Linear Algebra Subprograms (BLAS) routines, as well as several BLAS-like extension routines. + +.. 
toctree:: + :maxdepth: 1 + + blas-level-1-routines.rst + blas-level-2-routines.rst + blas-level-3-routines.rst + blas-like-extensions.rst -.. include:: blas-level-1-routines.inc.rst -.. include:: blas-level-2-routines.inc.rst -.. include:: blas-level-3-routines.inc.rst **Parent topic:** :ref:`onemkl` diff --git a/docs/domains/blas/copy.rst b/docs/domains/blas/copy.rst index df47419aa..e2a5a3230 100644 --- a/docs/domains/blas/copy.rst +++ b/docs/domains/blas/copy.rst @@ -1,4 +1,4 @@ -.. _copy: +.. _onemkl_blas_copy: copy ==== @@ -10,16 +10,6 @@ copy Copies a vector to another vector. - .. container:: section - :name: GUID-D6B6C72E-9516-40C9-B034-9F344C41AAF3 - - - .. rubric:: Syntax - :name: syntax - :class: sectiontitle - - - .. cpp:function:: void copy(queue &exec_queue, std::int64_t n, buffer &x, std::int64_t incx, buffer &y, std::int64_t incy) ``copy`` supports the following precisions. @@ -37,11 +27,9 @@ copy .. container:: section - :name: GUID-5E0A9C5F-BDD5-41E6-97CD-4316FD58C347 .. rubric:: Description - :name: description :class: sectiontitle @@ -54,63 +42,140 @@ copy y ←x - where x and y are vectors of n elements. + where ``x`` and ``y`` are vectors of n elements. + + +copy (Buffer Version) +--------------------- + +.. container:: + + .. container:: section + + + .. rubric:: Syntax + :class: sectiontitle + .. cpp:function:: void onemkl::blas::copy(sycl::queue &queue, std::int64_t n, sycl::buffer &x, std::int64_t incx, sycl::buffer &y, std::int64_t incy) .. container:: section - :name: GUID-6F86EF6A-8FFE-4C6A-8B71-23B95C1F1365 .. rubric:: Input Parameters - :name: input-parameters :class: sectiontitle - exec_queue + queue The queue where the routine should be executed. n - Number of elements in vector x. + Number of elements in vector ``x``. x - Buffer holding input vector x. The buffer must be of size at least + Buffer holding input vector ``x``. The buffer must be of size at least ``(1 + (n – 1)*abs(incx))``. 
See `Matrix and Vector Storage <../matrix-storage.html>`__ for more details. incx - Stride of vector x. + Stride of vector ``x``. incy - Stride of vector y. + Stride of vector ``y``. .. container:: section - :name: GUID-4ABB603B-835C-428B-B880-2F088BAB5456 .. rubric:: Output Parameters - :name: output-parameters :class: sectiontitle y - Buffer holding the updated vector y. + Buffer holding the updated vector ``y``. -.. container:: familylinks +copy (USM Version) +------------------ +.. container:: - .. container:: parentlink + .. container:: section - **Parent topic:** :ref:`blas-level-1-routines` - + .. rubric:: Syntax + :class: sectiontitle -.. container:: + .. container:: dlsyntaxpara + + + .. cpp:function:: sycl::event onemkl::blas::copy(sycl::queue &queue, std::int64_t n, const T *x, std::int64_t incx, T *y, std::int64_t incy, const sycl::vector_class &dependencies = {}) + .. container:: section + + + .. rubric:: Input Parameters + :class: sectiontitle + + + queue + The queue where the routine should be executed. + + n + Number of elements in vector ``x``. + + + x + Pointer to the input vector ``x``. The array holding the vector + ``x`` must be of size at least ``(1 + (n – 1)*abs(incx))``. See + `Matrix and Vector + Storage <../matrix-storage.html>`__ for + more details. + + + incx + Stride of vector ``x``. + + + incy + Stride of vector ``y``. + + + dependencies + List of events to wait for before starting computation, if any. + If omitted, defaults to no dependencies. + + + .. container:: section + + + .. rubric:: Output Parameters + :class: sectiontitle + + + y + Pointer to the updated vector ``y``. + + + .. container:: section + + + .. rubric:: Return Values + :class: sectiontitle + + + Output event to wait on to ensure computation is complete. + + +.. container:: familylinks + + + .. 
container:: parentlink + + + **Parent topic:** :ref:`blas-level-1-routines` diff --git a/docs/domains/blas/dot.rst b/docs/domains/blas/dot.rst index 7388eed8c..7a6a5d7c4 100644 --- a/docs/domains/blas/dot.rst +++ b/docs/domains/blas/dot.rst @@ -1,4 +1,4 @@ -.. _dot: +.. _onemkl_blas_dot: dot === @@ -10,16 +10,6 @@ dot Computes the dot product of two real vectors. - .. container:: section - :name: GUID-13355B56-0278-45E5-B310-3B0AC541C675 - - - .. rubric:: Syntax - :name: syntax - :class: sectiontitle - - - .. cpp:function:: void dot(queue &exec_queue, std::int64_t n, buffer &x, std::int64_t incx, buffer &y, std::int64_t incy, buffer &result) ``dot`` supports the following precisions. @@ -40,11 +30,9 @@ dot .. container:: section - :name: GUID-4BC6BF9A-BAB9-4078-A6B5-9C7ECB9D4821 .. rubric:: Description - :name: description :class: sectiontitle @@ -58,7 +46,6 @@ dot .. rubric:: Note - :name: note :class: NoteTipHead @@ -66,51 +53,60 @@ dot double), the dot product is computed with double precision. +dot (Buffer Version) +-------------------- + +.. container:: + + .. container:: section + + + .. rubric:: Syntax + :class: sectiontitle + + + .. cpp:function:: void onemkl::blas::dot(sycl::queue &queue, std::int64_t n, sycl::buffer &x, std::int64_t incx, sycl::buffer &y, std::int64_t incy, sycl::buffer &result) .. container:: section - :name: GUID-6F86EF6A-8FFE-4C6A-8B71-23B95C1F1365 .. rubric:: Input Parameters - :name: input-parameters :class: sectiontitle - exec_queue + queue The queue where the routine should be executed. n - Number of elements in vectors x and y. + Number of elements in vectors ``x`` and ``y``. x - Buffer holding input vector x. The buffer must be of size at least + Buffer holding input vector ``x``. The buffer must be of size at least ``(1 + (n – 1)*abs(incx))``. See `Matrix and Vector Storage <../matrix-storage.html>`__ for more details. incx - Stride of vector x. + Stride of vector ``x``. y - Buffer holding input vector y. 
The buffer must be of size at least + Buffer holding input vector ``y``. The buffer must be of size at least ``(1 + (n – 1)*abs(incy))``. See `Matrix and Vector Storage <../matrix-storage.html>`__ for more details. incy - Stride of vector y. + Stride of vector ``y``. .. container:: section - :name: GUID-CAAFE234-AF82-4B61-8406-D57EC527BED5 .. rubric:: Output Parameters - :name: output-parameters :class: sectiontitle @@ -118,19 +114,94 @@ dot Buffer where the result (a scalar) will be stored. -.. container:: familylinks +dot (USM Version) +----------------- +.. container:: - .. container:: parentlink + .. container:: section - **Parent topic:** :ref:`blas-level-1-routines` - + .. rubric:: Syntax + :class: sectiontitle -.. container:: + .. container:: dlsyntaxpara + + + .. cpp:function:: sycl::event onemkl::blas::dot(sycl::queue &queue, std::int64_t n, const T *x, std::int64_t incx, const T *y, std::int64_t incy, T_res *result, const sycl::vector_class &dependencies = {}) + .. container:: section + + + .. rubric:: Input Parameters + :class: sectiontitle + + + queue + The queue where the routine should be executed. + + n + Number of elements in vectors ``x`` and ``y``. -.. |image0| image:: ../equations/GUID-93DA36DC-40CA-4C01-B883-DABAB0D37ee1.png + + x + Pointer to the input vector ``x``. The array holding the vector ``x`` + must be of size at least ``(1 + (n – 1)*abs(incx))``. See + `Matrix and Vector + Storage <../matrix-storage.html>`__ for + more details. + + + incx + Stride of vector ``x``. + + + y + Pointer to the input vector ``y``. The array holding the vector ``y`` + must be of size at least ``(1 + (n – 1)*abs(incy))``. See + `Matrix and Vector + Storage <../matrix-storage.html>`__ for + more details. + + + incy + Stride of vector ``y``. + + + dependencies + List of events to wait for before starting computation, if any. + If omitted, defaults to no dependencies. + + + .. container:: section + + + .. 
rubric:: Output Parameters + :class: sectiontitle + + + result + Pointer to where the result (a scalar) will be stored. + + + .. container:: section + + + .. rubric:: Return Values + :class: sectiontitle + + + Output event to wait on to ensure computation is complete. + + +.. container:: familylinks + + + .. container:: parentlink + + + **Parent topic:** :ref:`blas-level-1-routines` +.. |image0| image:: ../equations/GUID-75532DED-BE44-4D85-B9C0-99C825778ee1.png :class: img-middle diff --git a/docs/domains/blas/dotc.rst b/docs/domains/blas/dotc.rst index 08e07d1d3..dde06cbf8 100644 --- a/docs/domains/blas/dotc.rst +++ b/docs/domains/blas/dotc.rst @@ -1,4 +1,4 @@ -.. _dotc: +.. _onemkl_blas_dotc: dotc ==== @@ -11,16 +11,6 @@ dotc first vector. - .. container:: section - :name: GUID-9D36611B-564D-475B-8D98-5F53A4F698F5 - - - .. rubric:: Syntax - :name: syntax - :class: sectiontitle - - - .. cpp:function:: void dotc(queue &exec_queue, std::int64_t n, buffer &x, std::int64_t incx, buffer &y, std::int64_t incy, buffer &result) ``dotc`` supports the following precisions. @@ -36,11 +26,9 @@ dotc .. container:: section - :name: GUID-3E4588D2-5FDE-43F1-955E-85173AE62252 .. rubric:: Description - :name: description :class: sectiontitle @@ -51,16 +39,27 @@ dotc |image0| +dotc (Buffer Version) +--------------------- + +.. container:: + + .. container:: section + + + .. rubric:: Syntax + :class: sectiontitle + + + .. cpp:function:: void onemkl::blas::dotc(sycl::queue &queue, std::int64_t n, sycl::buffer &x, std::int64_t incx, sycl::buffer &y, std::int64_t incy, sycl::buffer &result) .. container:: section - :name: GUID-38675523-DEDD-4314-8486-7C66614ED2C7 .. rubric:: Input Parameters - :name: input-parameters :class: sectiontitle - exec_queue + queue The queue where the routine should be executed. @@ -91,11 +90,9 @@ dotc .. container:: section - :name: GUID-B84A5D05-6B61-4D13-8185-2A349C41CE46 .. 
rubric:: Output Parameters - :name: output-parameters :class: sectiontitle @@ -103,19 +100,94 @@ dotc The buffer where the result (a scalar) is stored. -.. container:: familylinks +dotc (USM Version) +------------------ +.. container:: - .. container:: parentlink + .. container:: section - **Parent topic:** :ref:`blas-level-1-routines` - + .. rubric:: Syntax + :class: sectiontitle -.. container:: + .. container:: dlsyntaxpara + + + .. cpp:function:: sycl::event onemkl::blas::dotc(sycl::queue &queue, std::int64_t n, const T *x, std::int64_t incx, const T *y, std::int64_t incy, T *result, const sycl::vector_class &dependencies = {}) + .. container:: section + + + .. rubric:: Input Parameters + :class: sectiontitle + + + queue + The queue where the routine should be executed. + + n + The number of elements in vectors ``x`` and ``y``. -.. |image0| image:: ../equations/GUID-AED001B6-9056-491F-ACBE-E06C82D17ee1.png + + x + Pointer to input vector ``x``. The array holding the input + vector ``x`` must be of size at least (1 + (``n`` - + 1)*abs(``incx``)). See `Matrix and Vector + Storage <../matrix-storage.html>`__ for + more details. + + + incx + The stride of vector ``x``. + + + y + Pointer to input vector ``y``. The array holding the input + vector ``y`` must be of size at least (1 + (``n`` - + 1)*abs(``incy``)). See `Matrix and Vector + Storage <../matrix-storage.html>`__ for + more details. + + + incy + The stride of vector ``y``. + + + dependencies + List of events to wait for before starting computation, if any. + If omitted, defaults to no dependencies. + + + .. container:: section + + + .. rubric:: Output Parameters + :class: sectiontitle + + + result + The pointer to where the result (a scalar) is stored. + + + .. container:: section + + + .. rubric:: Return Values + :class: sectiontitle + + + Output event to wait on to ensure computation is complete. + + +.. container:: familylinks + + + .. 
container:: parentlink + + + **Parent topic:** :ref:`blas-level-1-routines` +.. |image0| image:: ../equations/GUID-B2211D34-A472-4FB8-9CFB-1E11AF4F0ee1.png :class: img-middle diff --git a/docs/domains/blas/dotu.rst b/docs/domains/blas/dotu.rst index 15cb71e7d..607989837 100644 --- a/docs/domains/blas/dotu.rst +++ b/docs/domains/blas/dotu.rst @@ -1,4 +1,4 @@ -.. _dotu: +.. _onemkl_blas_dotu: dotu ==== @@ -10,16 +10,6 @@ dotu Computes the dot product of two complex vectors. - .. container:: section - :name: GUID-27A695AE-7ED5-4CFF-9783-0E50D111BED2 - - - .. rubric:: Syntax - :name: syntax - :class: sectiontitle - - - .. cpp:function:: void dotu(queue &exec_queue, std::int64_t n, buffer &x, std::int64_t incx, buffer &y, std::int64_t incy, buffer &result) ``dotu`` supports the following precisions. @@ -35,11 +25,9 @@ dotu .. container:: section - :name: GUID-7E67CFC6-917F-41A3-A664-F99EE4E04E43 .. rubric:: Description - :name: description :class: sectiontitle @@ -49,16 +37,27 @@ dotu |image0| +dotu (Buffer Version) +--------------------- + +.. container:: + + .. container:: section + + + .. rubric:: Syntax + :class: sectiontitle + + + .. cpp:function:: void onemkl::blas::dotu(sycl::queue &queue, std::int64_t n, sycl::buffer &x, std::int64_t incx, sycl::buffer &y, std::int64_t incy, sycl::buffer &result) .. container:: section - :name: GUID-A615800D-734E-4997-BB91-1C76AEEE9EC2 .. rubric:: Input Parameters - :name: input-parameters :class: sectiontitle - exec_queue + queue The queue where the routine should be executed. @@ -74,7 +73,7 @@ dotu incx - Stride of vector x. + Stride of vector ``x``. y @@ -85,15 +84,13 @@ dotu incy - Stride of vector y. + Stride of vector ``y``. .. container:: section - :name: GUID-2B160DEB-ADBB-4044-8078-4B613A0DA4E1 .. rubric:: Output Parameters - :name: output-parameters :class: sectiontitle @@ -101,19 +98,94 @@ dotu Buffer where the result (a scalar) is stored. -.. container:: familylinks +dotu (USM Version) +------------------ +.. 
container:: - .. container:: parentlink + .. container:: section - **Parent topic:** :ref:`blas-level-1-routines` - + .. rubric:: Syntax + :class: sectiontitle -.. container:: + .. container:: dlsyntaxpara + + + .. cpp:function:: sycl::event onemkl::blas::dotu(sycl::queue &queue, std::int64_t n, const T *x, std::int64_t incx, const T *y, std::int64_t incy, T *result, const sycl::vector_class &dependencies = {}) + .. container:: section + + + .. rubric:: Input Parameters + :class: sectiontitle + + + queue + The queue where the routine should be executed. + + n + Number of elements in vectors ``x`` and ``y``. -.. |image0| image:: ../equations/GUID-3605ACD9-02D1-46D7-B791-F2F76F0D9ee1.png + + x + Pointer to the input vector ``x``. The array holding input + vector ``x`` must be of size at least (1 + (``n`` - + 1)*abs(``incx``)). See `Matrix and Vector + Storage <../matrix-storage.html>`__ for + more details. + + + incx + Stride of vector ``x``. + + + y + Pointer to input vector ``y``. The array holding input vector + ``y`` must be of size at least (1 + (``n`` - 1)*abs(``incy``)). + See `Matrix and Vector + Storage <../matrix-storage.html>`__ for + more details. + + + incy + Stride of vector ``y``. + + + dependencies + List of events to wait for before starting computation, if any. + If omitted, defaults to no dependencies. + + + .. container:: section + + + .. rubric:: Output Parameters + :class: sectiontitle + + + result + Pointer to where the result (a scalar) is stored. + + + .. container:: section + + + .. rubric:: Return Values + :class: sectiontitle + + + Output event to wait on to ensure computation is complete. + + +.. container:: familylinks + + + .. container:: parentlink + + + **Parent topic:** :ref:`blas-level-1-routines` +.. 
|image0| image:: ../equations/GUID-42AF2BFE-F8F1-4F96-A4E0-05D4FB5A7ee1.png :class: img-middle diff --git a/docs/domains/blas/gbmv.rst b/docs/domains/blas/gbmv.rst index 524d52972..f07fee183 100644 --- a/docs/domains/blas/gbmv.rst +++ b/docs/domains/blas/gbmv.rst @@ -1,4 +1,4 @@ -.. _gbmv: +.. _onemkl_blas_gbmv: gbmv ==== @@ -10,16 +10,6 @@ gbmv Computes a matrix-vector product with a general band matrix. - .. container:: section - :name: GUID-870EA7B0-09B5-43FF-90A4-6378B5D94B55 - - - .. rubric:: Syntax - :name: syntax - :class: sectiontitle - - - .. cpp:function:: void gbmv(queue &exec_queue, transpose trans, std::int64_t m, std::int64_t n, std::int64_t kl, std::int64_t ku, T alpha, buffer &a, std::int64_t lda, buffer &x, std::int64_t incx, T beta, buffer &y, std::int64_t incy) ``gbmv`` supports the following precisions. @@ -37,11 +27,9 @@ gbmv .. container:: section - :name: GUID-71614419-BC91-4A1A-B743-FE52767C4926 .. rubric:: Description - :name: description :class: sectiontitle @@ -73,16 +61,27 @@ gbmv - ``x`` and ``y`` are vectors. +gbmv (Buffer Version) +--------------------- + +.. container:: + + .. container:: section + + + .. rubric:: Syntax + :class: sectiontitle + + + .. cpp:function:: void onemkl::blas::gbmv(sycl::queue &queue, transpose trans, std::int64_t m, std::int64_t n, std::int64_t kl, std::int64_t ku, T alpha, sycl::buffer &a, std::int64_t lda, sycl::buffer &x, std::int64_t incx, T beta, sycl::buffer &y, std::int64_t incy) .. container:: section - :name: GUID-E1436726-01FE-4206-871E-B905F59A96B4 .. rubric:: Input Parameters - :name: input-parameters :class: sectiontitle - exec_queue + queue The queue where the routine should be executed. @@ -159,11 +158,9 @@ gbmv .. container:: section - :name: GUID-4B31584D-BC63-4032-A4A7-61BF3F163165 .. rubric:: Output Parameters - :name: output-parameters :class: sectiontitle @@ -171,15 +168,138 @@ gbmv Buffer holding the updated vector ``y``. -.. 
container:: familylinks +gbmv (USM Version) +------------------ +.. container:: - .. container:: parentlink + .. container:: section - **Parent topic:** :ref:`blas-level-2-routines` - + .. rubric:: Syntax + :class: sectiontitle -.. container:: + .. container:: dlsyntaxpara + + + .. cpp:function:: sycl::event onemkl::blas::gbmv(sycl::queue &queue, transpose trans, std::int64_t m, std::int64_t n, std::int64_t kl, std::int64_t ku, T alpha, const T *a, std::int64_t lda, const T *x, std::int64_t incx, T beta, T *y, std::int64_t incy, const sycl::vector_class &dependencies = {}) + .. container:: section + + + .. rubric:: Input Parameters + :class: sectiontitle + + + queue + The queue where the routine should be executed. + + + trans + Specifies op(``A``), the transposition operation applied to + ``A``. See + :ref:`onemkl_datatypes` for + more details. + + + m + Number of rows of ``A``. Must be at least zero. + + + n + Number of columns of ``A``. Must be at least zero. + + + kl + Number of sub-diagonals of the matrix ``A``. Must be at least + zero. + + + ku + Number of super-diagonals of the matrix ``A``. Must be at least + zero. + + alpha + Scaling factor for the matrix-vector product. + + + a + Pointer to input matrix ``A``. The array holding input matrix + ``A`` must have size at least ``lda``\ \*\ ``n``. See `Matrix + and Vector + Storage <../matrix-storage.html>`__ for + more details. + + + lda + Leading dimension of matrix ``A``. Must be at least (``kl`` + + ``ku`` + 1), and positive. + + + x + Pointer to input vector ``x``. The length ``len`` of vector + ``x`` is ``n`` if ``A`` is not transposed, and ``m`` if ``A`` + is transposed. The array holding input vector ``x`` must be of + size at least (1 + (``len`` - 1)*abs(``incx``)). See `Matrix + and Vector + Storage <../matrix-storage.html>`__ for + more details. + + + incx + Stride of vector ``x``. + + + beta + Scaling factor for vector ``y``. + + + y + Pointer to input/output vector ``y``. 
The length ``len`` of + vector ``y`` is ``m``, if ``A`` is not transposed, and ``n`` if + ``A`` is transposed. The array holding input/output vector + ``y`` must be of size at least (1 + (``len`` - + 1)*abs(``incy``)) where ``len`` is this length. See `Matrix and + Vector + Storage <../matrix-storage.html>`__ for + more details. + + + incy + Stride of vector ``y``. + + + dependencies + List of events to wait for before starting computation, if any. + If omitted, defaults to no dependencies. + + + .. container:: section + + + .. rubric:: Output Parameters + :class: sectiontitle + + + y + Pointer to the updated vector ``y``. + + + .. container:: section + + + .. rubric:: Return Values + :class: sectiontitle + + + Output event to wait on to ensure computation is complete. + + +.. container:: familylinks + + + .. container:: parentlink + + + **Parent topic:** :ref:`blas-level-2-routines` diff --git a/docs/domains/blas/gemm.rst b/docs/domains/blas/gemm.rst index 8e529b1e8..9670344b2 100644 --- a/docs/domains/blas/gemm.rst +++ b/docs/domains/blas/gemm.rst @@ -1,4 +1,4 @@ -.. _gemm: +.. _onemkl_blas_gemm: gemm ==== @@ -10,16 +10,6 @@ gemm Computes a matrix-matrix product with general matrices. - .. container:: section - :name: GUID-7885D940-FAC1-4F37-9E1C-A022DED99EBD - - - .. rubric:: Syntax - :name: syntax - :class: sectiontitle - - - .. cpp:function:: void gemm(queue &exec_queue, transpose transa, transpose transb, std::int64_t m, std::int64_t n, std::int64_t k, T alpha, buffer &a, std::int64_t lda, buffer &b, std::int64_t ldb, T beta, buffer &c, std::int64_t ldc) ``gemm`` supports the following precisions. @@ -38,15 +28,13 @@ gemm .. container:: section - :name: GUID-14237C95-6322-47A4-BC11-D3CDD2118C42 .. rubric:: Description - :name: description :class: sectiontitle - The gemm routines compute a scalar-matrix-matrix product and add the + The ``gemm`` routines compute a scalar-matrix-matrix product and add the result to a scalar-matrix product, with general matrices. 
The operation is defined as @@ -79,31 +67,37 @@ gemm ``C`` is an ``m``-by-``n`` matrix. +gemm (Buffer Version) +--------------------- + +.. container:: + + .. container:: section + + + .. rubric:: Syntax + :class: sectiontitle + + + .. cpp:function:: void onemkl::blas::gemm(sycl::queue &queue, transpose transa, transpose transb, std::int64_t m, std::int64_t n, std::int64_t k, T alpha, sycl::buffer &a, std::int64_t lda, sycl::buffer &b, std::int64_t ldb, T beta, sycl::buffer &c, std::int64_t ldc) .. container:: section - :name: GUID-D89C4959-F0C2-4E91-8853-9225F0772DF0 .. rubric:: Input Parameters - :name: input-parameters :class: sectiontitle - exec_queue + queue The queue where the routine should be executed. transa Specifies the form of ``op(A)``, the transposition operation - applied to ``A``. See - :ref:`onemkl_datatypes` - for more details. - + applied to ``A``. transb Specifies the form of ``op(B)``, the transposition operation - applied to ``B``. See - :ref:`onemkl_datatypes` - for more details. + applied to ``B``. m @@ -176,11 +170,9 @@ gemm .. container:: section - :name: GUID-EEF5C7D0-D206-4961-809F-55DCA3E93F68 .. rubric:: Output Parameters - :name: output-parameters :class: sectiontitle @@ -190,11 +182,9 @@ gemm .. container:: section - :name: GUID-AC72653A-4AC8-4B9D-B7A9-13A725AA19BF .. rubric:: Notes - :name: notes :class: sectiontitle @@ -202,15 +192,154 @@ gemm calling ``gemm``. -.. container:: familylinks +gemm (USM Version) +------------------ +.. container:: - .. container:: parentlink + .. container:: section - **Parent topic:** :ref:`blas-level-3-routines` - + .. rubric:: Syntax + :class: sectiontitle -.. container:: + .. container:: dlsyntaxpara + + + .. 
cpp:function:: sycl::event onemkl::blas::gemm(sycl::queue &queue, transpose transa, transpose transb, std::int64_t m, std::int64_t n, std::int64_t k, T alpha, const T *a, std::int64_t lda, const T *b, std::int64_t ldb, T beta, T *c, std::int64_t ldc, const sycl::vector_class &dependencies = {}) + .. container:: section + + + .. rubric:: Input Parameters + :class: sectiontitle + + + queue + The queue where the routine should be executed. + + + transa + Specifies the form of ``op(A)``, the transposition operation + applied to ``A``. + + + transb + Specifies the form of ``op(B)``, the transposition operation + applied to ``B``. + + + m + Specifies the number of rows of the matrix ``op(A)`` and of the + matrix ``C``. The value of m must be at least zero. + + + n + Specifies the number of columns of the matrix ``op(B)`` and the + number of columns of the matrix ``C``. The value of n must be + at least zero. + + + k + Specifies the number of columns of the matrix ``op(A)`` and the + number of rows of the matrix ``op(B)``. The value of k must be + at least zero. + + + alpha + Scaling factor for the matrix-matrix product. + + a + Pointer to input matrix ``A``. If ``A`` is not transposed, + ``A`` is an ``m``-by-``k`` matrix so the array ``a`` must have + size at least ``lda``\ \*\ ``k``. If ``A`` is transposed, ``A`` + is a ``k``-by-``m`` matrix so the array ``a`` must have size + at least ``lda``\ \*\ ``m``. See `Matrix and Vector + Storage <../matrix-storage.html>`__ for + more details. + + + lda + The leading dimension of ``A``. Must be at least m if ``A`` is + not transposed, and at least k if ``A`` is transposed. It must + be positive. + + + b + Pointer to input matrix ``B``. If ``B`` is not transposed, + ``B`` is a ``k``-by-``n`` matrix so the array ``b`` must have + size at least ``ldb``\ \*\ ``n``. If ``B`` is transposed, ``B`` + is an ``n``-by-``k`` matrix so the array ``b`` must have size + at least ``ldb``\ \*\ ``k``.
See `Matrix and Vector + Storage <../matrix-storage.html>`__ for + more details. + + + ldb + The leading dimension of ``B``. Must be at least k if ``B`` is + not transposed, and at least n if ``B`` is transposed. It must + be positive. + + + beta + Scaling factor for matrix ``C``. + + + c + The pointer to input/output matrix ``C``. It must have a size + of at least ldc\*n. See `Matrix and Vector + Storage <../matrix-storage.html>`__ for + more details. + + + ldc + The leading dimension of ``C``. It must be positive and at + least the size of m. + + + dependencies + List of events to wait for before starting computation, if any. + If omitted, defaults to no dependencies. + + + .. container:: section + + + .. rubric:: Output Parameters + :class: sectiontitle + + + c + Pointer to the output matrix, overwritten by + ``alpha*op(A)*op(B) + beta*C``. + + + .. container:: section + + + .. rubric:: Notes + :class: sectiontitle + + + If ``beta`` = 0, matrix ``C`` does not need to be initialized + before calling ``gemm``. + + + .. container:: section + + + .. rubric:: Return Values + :class: sectiontitle + + + Output event to wait on to ensure computation is complete. + + +.. container:: familylinks + + + .. container:: parentlink + + + **Parent topic:** :ref:`blas-level-3-routines` diff --git a/docs/domains/blas/gemm_batch.rst b/docs/domains/blas/gemm_batch.rst index 11034ab68..b4c940afc 100644 --- a/docs/domains/blas/gemm_batch.rst +++ b/docs/domains/blas/gemm_batch.rst @@ -1,4 +1,4 @@ -.. _gemm_batch: +.. _onemkl_blas_gemm_batch: gemm_batch ========== @@ -6,81 +6,214 @@ gemm_batch .. container:: + The ``gemm_batch`` routines are batched versions of `gemm <gemm.html>`__, performing + multiple ``gemm`` operations in a single call. Each ``gemm`` + operation performs a matrix-matrix product with general matrices. + + + ``gemm_batch`` supports the following precisions. - Computes groups of matrix-matrix product with general matrices. + .. list-table:: + :header-rows: 1 - ..
container:: section - :name: GUID-7885D940-FAC1-4F37-9E1C-A022DED99EBD + * - T + * - ``float`` + * - ``double`` + * - ``std::complex`` + * - ``std::complex`` - .. rubric:: Syntax - :name: syntax - :class: sectiontitle +gemm_batch (Buffer Version) +--------------------------- +.. container:: section - **Group API** + .. rubric:: Description + :class: sectiontitle - .. cpp:function:: void gemm_batch(queue &exec_queue, buffer &transa_array, buffer &transb_array, buffer &m_array, buffer &n_array, buffer &k_array, buffer alpha_array, buffer &a_array, buffer &lda_array, buffer &b_array, buffer ldb_array, buffer &beta_array, buffer &c, buffer &ldc_array, std::int64_t group_count, buffer &group_size_array) - **Strided API** + The buffer version of ``gemm_batch`` supports only the strided API. + + The strided API operation is defined as - .. cpp:function:: void gemm_batch(queue &exec_queue, transpose transa, transpose transb, std::int64_t m, std::int64_t n, std::int64_t k, T alpha, buffer &a, std::int64_t &lda, std::int64_t stridea, buffer &b, std::int64_t ldb, std::int64_t strideb, T beta, buffer &c, std::int64_t ldc, std::int64_t stridec, std::int64_t batch_size) + :: - ``gemm_batch`` supports the following precisions. + for i = 0 … batch_size – 1 + A, B and C are matrices at offset i * stridea, i * strideb, i * stridec in a, b and c. + C := alpha * op(A) * op(B) + beta * C + end for - .. list-table:: - :header-rows: 1 - * - T - * - ``float`` - * - ``double`` - * - ``std::complex`` - * - ``std::complex`` + where: + + op(X) is one of op(X) = X, or op(X) = X\ :sup:`T`, or op(X) = X\ :sup:`H` + ``alpha`` and ``beta`` are scalars + + + ``A``, ``B``, and ``C`` are matrices + + op(``A``) is ``m``\ ``x``\ ``k``, op(``B``) is + ``k``\ ``x``\ ``n``, and ``C`` is ``m``\ ``x``\ ``n``. + + The a, b and c buffers contain all the input matrices. The stride + between matrices is given by the stride parameter. 
The total number + of matrices in a, b and c buffers is given by the ``batch_size`` parameter. + + **Strided API** .. container:: section - :name: GUID-14237C95-6322-47A4-BC11-D3CDD2118C42 - .. rubric:: Description - :name: description + .. rubric:: Syntax + :class: sectiontitle + + + .. cpp:function:: void onemkl::blas::gemm_batch(sycl::queue &queue, transpose transa, transpose transb, std::int64_t m, std::int64_t n, std::int64_t k, T alpha, sycl::buffer &a, std::int64_t lda, std::int64_t stridea, sycl::buffer &b, std::int64_t ldb, std::int64_t strideb, T beta, sycl::buffer &c, std::int64_t ldc, std::int64_t stridec, std::int64_t batch_size) + + +.. container:: section + + + .. rubric:: Input Parameters + :class: sectiontitle + + + queue + The queue where the routine should be executed. + + + transa + Specifies ``op(A)`` the transposition operation applied to the + matrices ``A``. See :ref:`onemkl_datatypes` for more details. + + transb + Specifies ``op(B)`` the transposition operation applied to the + matrices ``B``. See :ref:`onemkl_datatypes` for more details. + + m + Number of rows of ``op(A)`` and ``C``. Must be at least zero. + + + n + Number of columns of ``op(B)`` and ``C``. Must be at least zero. + + + k + Number of columns of ``op(A)`` and rows of ``op(B)``. Must be at + least zero. + + + alpha + Scaling factor for the matrix-matrix products. + + + a + Buffer holding the input matrices ``A`` with size ``stridea*batch_size``. + + + lda + Leading dimension of the matrices ``A``. Must be at least ``m`` if + the matrices ``A`` are not transposed, and at least ``k`` if the + matrices ``A`` are transposed. Must be positive. + + + stridea + Stride between different ``A`` matrices. + + + b + Buffer holding the input matrices ``B`` with size ``strideb*batch_size``. + + + ldb + Leading dimension of the matrices ``B``. Must be at least ``k`` if + the matrices ``B`` are not transposed, and at least ``n`` if the + matrices ``B`` are transposed. Must be positive. 
+ + + strideb + Stride between different ``B`` matrices. + + + beta + Scaling factor for the matrices ``C``. + + + c + Buffer holding input/output matrices ``C`` with size ``stridec*batch_size``. + + + ldc + Leading dimension of ``C``. Must be positive and at least ``m``. + + + stridec + Stride between different ``C`` matrices. Must be at least + ``ldc*n``. + + + batch_size + Specifies the number of matrix multiply operations to perform. + + +.. container:: section + + + .. rubric:: Output Parameters :class: sectiontitle - The gemm_batch routines perform a series of matrix-matrix operations - with general matrices. They are similar to the gemm routine - counterparts, but the gemm_batch routines perform matrix-matrix - operations with groups of matrices. The groups contain matrices with - the same parameters. + c + Output buffer, overwritten by ``batch_size`` matrix multiply + operations of the form ``alpha*op(A)*op(B) + beta*C``. + + +.. container:: section + + + .. rubric:: Notes + :class: sectiontitle + + + If ``beta`` = 0, matrix ``C`` does not need to be initialized before + calling ``gemm_batch``. + + +gemm_batch (USM Version) +--------------------------- + +.. container:: section + + .. rubric:: Description + :class: sectiontitle + + The USM version of ``gemm_batch`` supports the group API and strided API. - For the group API, the operation is defined as + The group API operation is defined as :: - offa = 0, offb = 0, offc = 0 + idx = 0 for i = 0 … group_count – 1 - transa, transb, m, n, k, lda, ldb, ldc, alpha, beta and group_size at position i in transa_array, transb_array, m_array, n_array, k_array, lda_array, ldb_array, ldc_array, alpha_array, beta_array and group_size_array - sizea = transa == onemkl::transpose::N ? lda * k : lda * m; - sizeb = transb == onemkl::transpose::N ? ldb * n : ldb * k; - sizec = ldc * n; for j = 0 … group_size – 1 - A, B, and C are matrices of size sizea, sizeb and sizec at offset offa, offb and offc in a, b and c.
- C := alpha * op(A) * op(B) + beta * C - offa += sizea, offb += sizeb, offc += sizec + A, B, and C are matrices in a[idx], b[idx] and c[idx] + C := alpha[i] * op(A) * op(B) + beta[i] * C + idx = idx + 1 end for end for - For the strided API, the operation is defined as + The strided API operation is defined as :: @@ -88,220 +221,214 @@ gemm_batch for i = 0 … batch_size – 1 A, B and C are matrices at offset i * stridea, i * strideb, i * stridec in a, b and c. - C = alpha * op(A) * op(B) + beta * C + C := alpha * op(A) * op(B) + beta * C end for where: - - op(X) is one of op(X) = X, or op(X) = X\ :sup:`T`, or op(X) = - X\ :sup:`H` - + op(X) is one of op(X) = X, or op(X) = X\ :sup:`T`, or op(X) = X\ :sup:`H` - - ``alpha`` and ``beta`` are scalars + ``alpha`` and ``beta`` are scalars - - ``A``, ``B``, and ``C`` are matrices + ``A``, ``B``, and ``C`` are matrices + + op(``A``) is ``m``\ ``x``\ ``k``, op(``B``) is ``k``\ ``x``\ ``n``, and ``C`` is ``m``\ ``x``\ ``n``. - - The a, b and c buffers contains all the input matrices. The stride - between matrices is either given by the exact size of the matrix - (for the group API) or by the stride parameter. The total number - of matrices in a, b and c buffers is given by the + + For group API, a, b and c arrays contain the pointers for all the input matrices. + The total number of matrices in a, b and c are given by: + + total_batch_count = sum of all of the group_size entries + + + For strided API, a, b, c arrays contain all the input matrices. The total number of matrices + in a, b and c are given by the ``batch_size`` parameter. - |image0| - - for the - group API or by the ``batch_size`` parameter for the strided API. + **Group API** +.. container:: section - Here, op(``A``) is ``m``\ ``x``\ ``k``, op(``B``) is - ``k``\ ``x``\ ``n``, and ``C`` is ``m``\ ``x``\ ``n``. + .. rubric:: Syntax + :class: sectiontitle -.. container:: section - :name: GUID-863264A0-4CE9-495F-A617-102E46D7A41A + .. container:: dlsyntaxpara + + .. 
cpp:function:: sycl::event onemkl::blas::gemm_batch(sycl::queue &queue, transpose *transa, transpose *transb, std::int64_t *m, std::int64_t *n, std::int64_t *k, T *alpha, const T **a, std::int64_t *lda, const T **b, std::int64_t *ldb, T *beta, T **c, std::int64_t *ldc, std::int64_t group_count, std::int64_t *group_size, const sycl::vector_class &dependencies = {}) - .. rubric:: Input Parameters - Group API - :name: input-parameters---group-api - :class: sectiontitle +.. container:: section - transa_array - Buffer holding ``group_count onemkl::transpose`` value. + .. rubric:: Input Parameters + :class: sectiontitle - For the group ``i``, ``transa`` is the ``i``\ th element in the - transa_array buffer and specifies the form of ``op(A)`` used in - the matrix multiplication. See - :ref:`onemkl_datatypes` for more - details. + queue + The queue where the routine should be executed. - transb_array - Buffer holding ``group_count onemkl::transpose`` value. + transa + Array of ``group_count`` ``onemkl::transpose`` values. ``transa[i]`` specifies the form of ``op(A)`` used in + the matrix multiplication in group ``i``. See :ref:`onemkl_datatypes` for more details. - For the group ``i``, ``transb`` is the ``i``\ th element in the - transb_array buffer and specifies the form of ``op(B)`` used in - the matrix multiplication. See - :ref:`onemkl_datatypes` for more - details. + transb + Array of ``group_count`` ``onemkl::transpose`` values. ``transb[i]`` specifies the form of ``op(B)`` used in + the matrix multiplication in group ``i``. See :ref:`onemkl_datatypes` for more details. - m_array - Buffer holding ``group_count`` integer. For the group ``i``, ``m`` - is the ``i``\ th element in the m_array buffer and specifies the - number of rows of ``op(A)`` and ``C``. Must be at least zero. + + m + Array of ``group_count`` integers. ``m[i]`` specifies the + number of rows of ``op(A)`` and ``C`` for every matrix in group ``i``. All entries must be at least zero. 
- n_array - Buffer holding ``group_count`` integer. For the group ``i``, ``n`` - is the ``i``\ th element in the n_array buffer and specifies the - number of columns of ``op(B)`` and ``C``. Must be at least zero. + n + Array of ``group_count`` integers. ``n[i]`` specifies the + number of columns of ``op(B)`` and ``C`` for every matrix in group ``i``. All entries must be at least zero. - k_array - Buffer holding ``group_count`` integer. For the group ``i``, ``k`` - is the ``i``\ th element in the k_array buffer and specifies the - number of columns of ``op(A)`` and rows of ``op(B)``. Must be at + k + Array of ``group_count`` integers. ``k[i]`` specifies the + number of columns of ``op(A)`` and rows of ``op(B)`` for every matrix in group ``i``. All entries must be at least zero. - alpha_array - Buffer holding ``group_count`` scalar element. For the group - ``i``, ``alpha`` is the ``i``\ th element in the alpha_array - buffer and specifies the scaling factor for the matrix-matrix - product. + alpha + Array of ``group_count`` scalar elements. ``alpha[i]`` specifies the scaling factor for every matrix-matrix + product in group ``i``. a - Buffer holding the input matrices ``A``. The total size of the - buffer ``a`` must be at least the sum of the sizes of all the - matricies ``A``. That is, + Array of pointers to input matrices ``A`` with size ``total_batch_count``. + + See `Matrix Storage <../matrix-storage.html>`__ for more details. - |image1| + lda + Array of ``group_count`` integers. ``lda[i]`` specifies the leading dimension of ``A`` for every matrix in group ``i``. + All entries must be at least ``m`` + if ``A`` is not transposed, and at least ``k`` if ``A`` is + transposed. All entries must be positive. - where - ``sizeai = lda_array[i] * (transa == onemkl::transpose::N ? k : m)`` + b + Array of pointers to input matrices ``B`` with size ``total_batch_count``. + + See `Matrix Storage <../matrix-storage.html>`__ for more details. 
- See `Matrix - Storage <../matrix-storage.html>`__ for - more details. + ldb + Array of ``group_count`` integers. ``ldb[i]`` specifies the leading dimension of ``B`` for every matrix in group ``i``. + All entries must be at least ``k`` + if ``B`` is not transposed, and at least ``n`` if ``B`` is + transposed. All entries must be positive. - lda_array - Buffer holding ``group_count`` integer. For the group ``i``, - ``lda`` is the ``i``\ th element in the lda_array buffer and - specifies the leading dimension of ``A``. Must be at least ``m`` - if ``A`` is not transposed, and at least ``k`` if ``A`` is - transposed. Must be positive. + beta + Array of ``group_count`` scalar elements. ``beta[i]`` specifies the scaling factor for matrix ``C`` + for every matrix in group ``i``. - b - Buffer holding the input matrices ``B``. The total size of the - buffer ``b`` must be at least the sum of the sizes of all the - matricies ``B``. That is, + c + Array of pointers to input/output matrices ``C`` with size ``total_batch_count``. + + See `Matrix Storage <../matrix-storage.html>`__ for more details. + + + ldc + Array of ``group_count`` integers. ``ldc[i]`` specifies the leading dimension of ``C`` for every matrix in group ``i``. + All entries must be positive and at least ``m``. - |image2| + group_count + Specifies the number of groups. Must be at least 0. - where - ``sizebi = ldb_array[i] * (transb == onemkl::transpose::N ? n : k)`` + group_size + Array of ``group_count`` integers. ``group_size[i]`` specifies the + number of matrix multiply products in group ``i``. All entries must be at least 0. - See `Matrix - Storage <../matrix-storage.html>`__ for - more details. + dependencies + List of events to wait for before starting computation, if any. + If omitted, defaults to no dependencies. - ldb_array - Buffer holding ``group_count`` integer. For the group ``i``, - ``ldb`` is the ``i``\ th element in the ldb_array buffer and - specifies the leading dimension of ``B``. 
Must be at least ``k`` - if ``B`` is not transposed, and at least ``n`` if ``B`` is - transposed. Must be positive. +.. container:: section - beta_array - Buffer holding ``group_count`` scalar element. For the group - ``i``, ``beta`` is the ``i``\ th element in the beta_array buffer - and specifies the scaling factor for matrix C. + .. rubric:: Output Parameters + :class: sectiontitle c - Buffer holding the input/output matrices ``C``. The total size of - the buffer ``c`` must be at least the sum of the sizes of all the - matricies ``C``. That is, + Overwritten by the ``m[i]``-by-``n[i]`` matrix calculated by + ``(alpha[i]*op(A)*op(B) + beta[i]*C)`` for group ``i``. - |image3| + .. container:: section - See `Matrix - Storage <../matrix-storage.html>`__ for - more details. + .. rubric:: Notes + :class: sectiontitle - ldc_array - Buffer holding ``group_count`` integer. For the group ``i``, - ``ldc`` is the ``i``\ th element in the ldc_array buffer and - specifies the leading dimension of ``C``. Must be positive and at - least ``m``. + If ``beta`` = 0, matrix ``C`` does not need to be initialized + before calling ``gemm_batch``. - group_count - Specifies the number of groups. Must be at least 0. + + .. container:: section + + + .. rubric:: Return Values + :class: sectiontitle - group_size_array - Buffer holding ``group_count`` integer. For the group ``i``, the - ``i``\ th element in the group_size_array buffer specifies the - number of matrix multiply operations in group ``i``. Each element - in ``group_size_array`` must be at least 0. + Output event to wait on to ensure computation is complete. + + + **Strided API** + .. container:: section - :name: GUID-1E4953E6-F7B1-4FEE-BA5A-8C4BD51DC700 - .. rubric:: Output Parameters - Group API - :name: output-parameters---group-api + .. rubric:: Syntax :class: sectiontitle + .. container:: dlsyntaxpara - c - Overwritten by the ``m``\ :sub:`i`-by-``n``\ :sub:`i` matrix - ``(alphai*op(A)*op(B) + betai*C)`` for group ``i``. + .. 
cpp:function:: sycl::event onemkl::blas::gemm_batch(sycl::queue &queue, transpose transa, transpose transb, std::int64_t m, std::int64_t n, std::int64_t k, T alpha, const T *a, std::int64_t lda, std::int64_t stridea, const T *b, std::int64_t ldb, std::int64_t strideb, T beta, T *c, std::int64_t ldc, std::int64_t stridec, std::int64_t batch_size, const sycl::vector_class &dependencies = {}) .. container:: section - :name: GUID-D067773A-45A3-4D24-B10A-46E27834947E - .. rubric:: Input Parameters - Strided API - :name: input-parameters---strided-api + .. rubric:: Input Parameters :class: sectiontitle + queue + The queue where the routine should be executed. + + transa Specifies ``op(A)`` the transposition operation applied to the - matrices A. See - :ref:`onemkl_datatypes` for more - details. + matrices ``A``. See :ref:`onemkl_datatypes` for more details. + transb Specifies ``op(B)`` the transposition operation applied to the - matrices B. See - :ref:`onemkl_datatypes` for more - details. + matrices ``B``. See :ref:`onemkl_datatypes` for more details. m @@ -322,8 +449,7 @@ gemm_batch a - Buffer holding the input matrices ``A``. Must have size at least - ``stridea*batch_size``. + Pointer to input matrices ``A`` with size ``stridea*batch_size``. lda @@ -333,20 +459,11 @@ gemm_batch stridea - Stride between the different ``A`` matrices. - - - If ``A`` are not transposed, the matrices ``A`` are ``m``-by-``k`` - matrices so stridea must be at least ``lda*k``. - - - If ``A`` are transposed, the matrices ``A`` are ``k``-by-``m`` - matrices so stridea must be at least ``lda*m``. + Stride between different ``A`` matrices. b - Buffer holding the input matrices ``B``. Must have size at least - ``strideb*batch_size``. + Pointer to input matrices ``B`` with size ``strideb*batch_size``. ldb @@ -356,24 +473,16 @@ gemm_batch strideb - Stride between the different ``B`` matrices. 
- - - If ``B`` are not transposed, the matrices ``B`` are ``k``-by-``n`` - matrices so strideb must be at least ``ldb*n``. + Stride between different ``B`` matrices. - If ``B`` are transposed, the matrices ``B`` are ``n``-by-``k`` - matrices so strideb must be at least ``ldb*k``. - beta Scaling factor for the matrices ``C``. c - Buffer holding input/output matrices ``C``. Must have size at - least ``stridec*batch_size``. + Pointer to input/output matrices ``C`` with size ``stridec*batch_size``. ldc @@ -381,60 +490,57 @@ gemm_batch stridec - Stride between the different ``C`` matrices. Must be at least - ``ldc*n``. + Stride between different ``C`` matrices. batch_size Specifies the number of matrix multiply operations to perform. + dependencies + List of events to wait for before starting computation, if any. + If omitted, defaults to no dependencies. + + .. container:: section - :name: GUID-98C3DE17-4F5F-41A1-B431-48148153ABBA - .. rubric:: Output Parameters - Strided API - :name: output-parameters---strided-api + .. rubric:: Output Parameters :class: sectiontitle c - Output buffer, overwritten by ``batch_size`` matrix multiply - operations of the form\ ``alpha*op(A)*op(B) + beta*C``. + Output matrices, overwritten by ``batch_size`` matrix multiply + operations of the form ``alpha*op(A)*op(B) + beta*C``. .. container:: section - :name: GUID-AC72653A-4AC8-4B9D-B7A9-13A725AA19BF .. rubric:: Notes - :name: notes :class: sectiontitle If ``beta`` = 0, matrix ``C`` does not need to be initialized before - calling gemm_batch. + calling ``gemm_batch``. -.. container:: familylinks +.. container:: section - .. container:: parentlink + .. rubric:: Return Values + :class: sectiontitle - **Parent topic:** :ref:`blas-like-extensions` - + Output event to wait on to ensure computation is complete. -.. container:: +.. container:: familylinks + + + .. container:: parentlink -.. |image0| image:: ../equations/GUID-D797E8FA-B0CE-417C-98F1-896CDFB4Fee1.png - :class: img-middle -.. 
|image1| image:: ../equations/GUID-D797E8FA-B0CE-417C-98F1-896CDFB4Fee2.png - :class: img-middle -.. |image2| image:: ../equations/GUID-D797E8FA-B0CE-417C-98F1-896CDFB4Fee3.png - :class: img-middle -.. |image3| image:: ../equations/GUID-D797E8FA-B0CE-417C-98F1-896CDFB4Fee4.png - :class: img-middle + **Parent topic:** :ref:`blas-like-extensions` + diff --git a/docs/domains/blas/gemm_ext.rst b/docs/domains/blas/gemm_ext.rst index a49fabe8d..0d2d3c042 100644 --- a/docs/domains/blas/gemm_ext.rst +++ b/docs/domains/blas/gemm_ext.rst @@ -1,326 +1,316 @@ -.. _gemm_ext: +.. _onemkl_blas_gemm_ext: gemm_ext ======== - .. container:: Computes a matrix-matrix product with general matrices. - .. container:: section - :name: GUID-7885D940-FAC1-4F37-9E1C-A022DED99EBD - - - .. rubric:: Syntax - :name: syntax - :class: sectiontitle - - - **Standard API** - - - .. container:: dlsyntaxpara - - - .. cpp:function:: void gemm_ext(queue &exec_queue, transpose transa, transpose transb, std::int64_t m, std::int64_t n, std::int64_t k, Ts alpha, buffer &a, std::int64_t lda, buffer &b, std::int64_t ldb, Ts beta, buffer &c, std::int64_t ldc) - - ``gemm_ext`` supports the following precisions and devices. - - - .. list-table:: - :header-rows: 1 - - * - Ts - - Ta - - Tb - - Tc - * - ``float`` - - ``half`` - - ``half`` - - ``float`` - * - ``half`` - - ``half`` - - ``half`` - - ``half`` - * - ``float`` - - ``float`` - - ``float`` - - ``float`` - * - ``double`` - - ``double`` - - ``double`` - - ``double`` - * - ``std::complex`` - - ``std::complex`` - - ``std::complex`` - - ``std::complex`` - * - ``std::complex`` - - ``std::complex`` - - ``std::complex`` - - ``std::complex`` - + **Standard API** + + ``gemm_ext`` supports the following precisions and devices. - **Offset API** + .. 
list-table:: + :header-rows: 1 + * - Ts + - Ta + - Tb + - Tc + * - ``float`` + - ``half`` + - ``half`` + - ``float`` + * - ``half`` + - ``half`` + - ``half`` + - ``half`` + * - ``float`` + - ``float`` + - ``float`` + - ``float`` + * - ``double`` + - ``double`` + - ``double`` + - ``double`` + * - ``std::complex`` + - ``std::complex`` + - ``std::complex`` + - ``std::complex`` + * - ``std::complex`` + - ``std::complex`` + - ``std::complex`` + - ``std::complex`` - .. container:: dlsyntaxpara + **Offset API** - .. cpp:function:: void gemm_ext(queue &exec_queue, transpose transa, transpose transb, offset offset_type, std::int64_t m, std::int64_t n, std::int64_t k, Ts alpha, buffer &a, std::int64_t lda, Ta ao, buffer &b, std::int64_t ldb, Tb bo, Ts beta, buffer &c, std::int64_t ldc, buffer &co) - ``gemm_ext`` supports the following precisions. + ``gemm_ext`` supports the following precisions. - .. list-table:: - :header-rows: 1 + .. list-table:: + :header-rows: 1 - * - Ts - - Ta - - Tb - - Tc - * - ``float`` - - ``int8_t`` - - ``uint8_t`` - - ``int32_t`` + * - Ts + - Ta + - Tb + - Tc + * - ``float`` + - ``int8_t`` + - ``uint8_t`` + - ``int32_t`` +.. container:: section - .. container:: section - :name: GUID-14237C95-6322-47A4-BC11-D3CDD2118C42 + .. rubric:: Description + :class: sectiontitle - .. rubric:: Description - :name: description - :class: sectiontitle + The gemm_ext routines compute a scalar-matrix-matrix product and + add the result to a scalar-matrix product, with general matrices. + + For Standard API, the operation is defined as: + :: - The gemm_ext routines compute a scalar-matrix-matrix product and - add the result to a scalar-matrix product, with general matrices. 
- The operation is defined as: + C ← alpha*op(A)*op(B) + beta*C - :: + For Offset API, the operation is defined as: - C ← alpha*op(A)*op(B) + beta*C + :: - for the standard API and - :: + C ← alpha*(op(A) - A_offset)*(op(B) - B_offset) + beta*C + C_offset + where: - C ← alpha*(op(A) - A_offset)*(op(B) - B_offset) + beta*C + C_offset + op(X) is one of op(X) = X, or op(X) = X\ :sup:`T`, or op(X) = X\ :sup:`H` - for the offset API - where: + ``alpha`` and ``beta`` are scalars - - op(X) is one of op(X) = X, or op(X) = X\ :sup:`T`, or op(X) = - X\ :sup:`H` + ``A_offset`` is an ``m``-by-``k`` matrix with every element equal to the value ao - - ``alpha`` and ``beta`` are scalars + ``B_offset`` is a ``k``-by-``n`` matrix with every element equal to the value bo - - ``A_offset`` is an ``m``-by-``k`` matrix with every element - equal to the value ao + ``C_offset`` is an ``m``-by-``n`` matrix defined by the + co buffer as described below. - - ``B_offset`` is a ``k``-by-``n`` matrix with every element - equal to the value bo + ``A``, ``B``, and ``C`` are matrices - - ``C_offset`` is an ``m``-by-``n`` matrix defined by the co - buffer as described in - :ref:`onemkl_datatypes` + op(``A``) is ``m`` x ``k``, op(``B``) is ``k`` x ``n``, and + ``C`` is ``m`` x ``n``. - - ``A``, ``B``, and ``C`` are matrices +gemm_ext (Buffer Version) +------------------------- - Here, op(``A``) is ``m`` x ``k``, op(``B``) is ``k`` x ``n``, and - ``C`` is ``m`` x ``n``. - +.. container:: .. container:: section - :name: GUID-863264A0-4CE9-495F-A617-102E46D7A41A - + .. rubric:: Syntax + :class: sectiontitle + + + **Standard API** + + + .. container:: dlsyntaxpara + + + .. cpp:function:: void onemkl::blas::gemm_ext(sycl::queue &queue, transpose transa, transpose transb, std::int64_t m, std::int64_t n, std::int64_t k, Ts alpha, sycl::buffer &a, std::int64_t lda, sycl::buffer &b, std::int64_t ldb, Ts beta, sycl::buffer &c, std::int64_t ldc) + + + **Offset API** + + + .. container:: dlsyntaxpara + + + .. 
cpp:function:: void onemkl::blas::gemm_ext(sycl::queue &queue, transpose transa, transpose transb, offset offset_type, std::int64_t m, std::int64_t n, std::int64_t k, Ts alpha, sycl::buffer &a, std::int64_t lda, Ta ao, sycl::buffer &b, std::int64_t ldb, Tb bo, Ts beta, sycl::buffer &c, std::int64_t ldc, sycl::buffer &co) + + + .. container:: section + + .. rubric:: Input Parameters - :name: input-parameters :class: sectiontitle - - - exec_queue + + + queue The queue where the routine should be executed. - - + + transa Specifies op(``A``), the transposition operation applied to ``A``. See :ref:`onemkl_datatypes` for more details. - - + + + transb Specifies op(``B``), the transposition operation applied to ``B``. See :ref:`onemkl_datatypes` for more details. - - + + + offset_type (offset API only) Specifies the form of ``C_offset`` used in the matrix multiplication. See :ref:`onemkl_datatypes` for more details. - - + + m Number of rows of op(``A``) and ``C``. Must be at least zero. - - + + n Number of columns of op(``B``) and ``C``. Must be at least zero. - - + + k Number of columns of op(``A``) and rows of op(``B``). Must be at least zero. - - + + alpha Scaling factor for the matrix-matrix product. - - + + a Buffer holding the input matrix ``A``. - - + + If ``A`` is not transposed, ``A`` is an ``m``-by-``k`` matrix so the array ``a`` must have size at least ``lda``\ \*\ ``k``. - - + + If ``A`` is transposed, ``A`` is a ``k``-by-``m`` matrix so the array ``a`` must have size at least ``lda``\ \*\ ``m``. - - + + See `Matrix Storage <../matrix-storage.html>`__ for more details. - - + + lda Leading dimension of ``A``. Must be at least ``m`` if ``A`` is not transposed, and at least ``k`` if ``A`` is transposed. Must be positive. - - + + ao (offset API only) Specifies the scalar offset value for matrix ``A``. - - + + b Buffer holding the input matrix ``B``. 
- - + + If ``B`` is not transposed, ``B`` is a ``k``-by-``n`` matrix so the array ``b`` must have size at least ``ldb``\ \*\ ``n``. - - + + If ``B`` is transposed, ``B`` is an ``n``-by-``k`` matrix so the array ``b`` must have size at least ``ldb``\ \*\ ``k``. - - + + See `Matrix Storage <../matrix-storage.html>`__ for more details. - - + + ldb Leading dimension of ``B``. Must be at least ``k`` if ``B`` is not transposed, and at least ``n`` if ``B`` is transposed. Must be positive. - - + + bo (offset API only) Specifies the scalar offset value for matrix ``B``. - - + + beta Scaling factor for matrix ``C``. - - + + c - Buffer holding the input matrix ``C``. Must have size at least + Buffer holding the input/output matrix ``C``. Must have size at least ``ldc`` \* ``n``. See `Matrix Storage <../matrix-storage.html>`__ for more details. - - + + ldc Leading dimension of ``C``. Must be positive and at least ``m``. - - + + co (offset API only) Buffer holding the offset values for matrix ``C``. - - + + If ``offset_type = offset::fix``, the ``co`` array must have size at least 1. - - + + If ``offset_type = offset::col``, the ``co`` array must have size at least ``max(1,m)``. - - + + If ``offset_type = offset::row``, the ``co`` array must have - size at least ``max(1,n)``. - - - See - :ref:`onemkl_datatypes` for - more details. - - + size at least ``max(1,n)``. + + .. container:: section - :name: GUID-1E4953E6-F7B1-4FEE-BA5A-8C4BD51DC700 - - + + .. rubric:: Output Parameters - :name: output-parameters :class: sectiontitle - - + + c Output buffer, overwritten by alpha\*op(``A``)*op(``B``) + beta\*\ ``C`` for the standard API and alpha\*(op(``A``) - ``A_offset``)*(op(``B``) - ``B_offset``) + beta\*\ ``C`` + ``C_offset`` for the offset API. - - + + .. container:: section - :name: GUID-AC72653A-4AC8-4B9D-B7A9-13A725AA19BF - - + + .. rubric:: Notes - :name: notes :class: sectiontitle - - + + If ``beta`` = 0, matrix ``C`` does not need to be initialized before calling gemm_ext. 
+ .. container:: familylinks @@ -331,5 +321,3 @@ gemm_ext -.. container:: - diff --git a/docs/domains/blas/gemmt.rst b/docs/domains/blas/gemmt.rst index e2ceb077d..7954218fd 100644 --- a/docs/domains/blas/gemmt.rst +++ b/docs/domains/blas/gemmt.rst @@ -1,4 +1,4 @@ -.. _gemmt: +.. _onemkl_blas_gemmt: gemmt ===== @@ -11,19 +11,6 @@ gemmt only the upper or lower triangular part of the result matrix. - .. container:: section - :name: GUID-7885D940-FAC1-4F37-9E1C-A022DED99EBD - - - .. rubric:: Syntax - :name: syntax - :class: sectiontitle - - - .. container:: dlsyntaxpara - - - .. cpp:function:: void gemmt(queue &exec_queue, uplo upper_lower, transpose transa, transpose transb, std::int64_t n, std::int64_t k, T alpha, buffer &a, std::int64_t lda, buffer &b, std::int64_t ldb, T beta, buffer &c, std::int64_t ldc) ``gemmt`` supports the following precisions. @@ -41,11 +28,9 @@ gemmt .. container:: section - :name: GUID-14237C95-6322-47A4-BC11-D3CDD2118C42 .. rubric:: Description - :name: description :class: sectiontitle @@ -63,30 +48,180 @@ gemmt where: - - op(X) is one of op(X) = X, or op(X) = X\ :sup:`T`, or op(X) = - X\ :sup:`H` + op(X) is one of op(X) = X, or op(X) = X\ :sup:`T`, or op(X) = X\ :sup:`H` - - ``alpha`` and ``beta`` are scalars + ``alpha`` and ``beta`` are scalars - - ``A``, ``B``, and ``C`` are matrices + ``A``, ``B``, and ``C`` are matrices - Here, op(``A``) is ``n`` x ``k``, op(``B``) is ``k`` x ``n``, and + op(``A``) is ``n`` x ``k``, op(``B``) is ``k`` x ``n``, and ``C`` is ``n`` x ``n``. +gemmt (Buffer Version) +---------------------- + +.. container:: + + .. container:: section + + + .. rubric:: Syntax + :class: sectiontitle + + + .. container:: dlsyntaxpara + + + .. cpp:function:: void onemkl::blas::gemmt(sycl::queue &queue, uplo upper_lower, transpose transa, transpose transb, std::int64_t n, std::int64_t k, T alpha, sycl::buffer &a, std::int64_t lda, sycl::buffer &b, std::int64_t ldb, T beta, sycl::buffer &c, std::int64_t ldc) + .. 
container:: section + + + .. rubric:: Input Parameters + :class: sectiontitle + + + queue + The queue where the routine should be executed. + + + upper_lower + Specifies whether ``C``\ ’s data is stored in its upper or + lower triangle. See :ref:`onemkl_datatypes` for more details. + + + transa + Specifies op(``A``), the transposition operation applied to + ``A``. See :ref:`onemkl_datatypes` for more details. + + + transb + Specifies op(``B``), the transposition operation applied to + ``B``. See :ref:`onemkl_datatypes` for more details. + + + n + Number of rows of op(``A``), columns of op(``B``), and + columns of ``C``. Must be at least zero. + + + k + Number of columns of op(``A``) and rows of op(``B``). Must be + at least zero. + + + alpha + Scaling factor for the matrix-matrix product. + + + a + Buffer holding the input matrix ``A``. + + + If ``A`` is not transposed, ``A`` is an ``n``-by-``k`` matrix + so the array ``a`` must have size at least ``lda``\ \*\ ``k``. + + + If ``A`` is transposed, ``A`` is a ``k``-by-``n`` matrix so the + array ``a`` must have size at least ``lda``\ \*\ ``n``. + + + See `Matrix Storage <../matrix-storage.html>`__ for more details. + + + lda + Leading dimension of ``A``. Must be at least ``n`` if ``A`` is + not transposed, and at least ``k`` if ``A`` is transposed. Must + be positive. + + + b + Buffer holding the input matrix ``B``. + + + If ``B`` is not transposed, ``B`` is a ``k``-by-``n`` matrix so + the array ``b`` must have size at least ``ldb``\ \*\ ``n``. + + + If ``B`` is transposed, ``B`` is an ``n``-by-``k`` matrix so + the array ``b`` must have size at least ``ldb``\ \*\ ``k``. + + + See `Matrix Storage <../matrix-storage.html>`__ for more details. + + + ldb + Leading dimension of ``B``. Must be at least ``k`` if ``B`` is + not transposed, and at least ``n`` if ``B`` is transposed. Must + be positive. + + + beta + Scaling factor for matrix ``C``. + + + c + Buffer holding the input/output matrix ``C``.
Must have size at + least ``ldc`` \* ``n``. See `Matrix + Storage <../matrix-storage.html>`__ for + more details. + + + ldc + Leading dimension of ``C``. Must be positive and at least + ``n``. + + + .. container:: section + + + .. rubric:: Output Parameters + :class: sectiontitle + + + c + Output buffer, overwritten by the upper or lower triangular + part of alpha\*op(``A``)*op(``B``) + beta\*\ ``C``. + + + .. container:: section + + + .. rubric:: Notes + :class: sectiontitle + + + If ``beta`` = 0, matrix ``C`` does not need to be initialized + before calling gemmt. + + +gemmt (USM Version) +------------------- + +.. container:: + + .. container:: section + + + .. rubric:: Syntax + :class: sectiontitle + + + .. container:: dlsyntaxpara + + + .. cpp:function:: sycl::event onemkl::blas::gemmt(sycl::queue &queue, uplo upper_lower, transpose transa, transpose transb, std::int64_t n, std::int64_t k, T alpha, const T* a, std::int64_t lda, const T* b, std::int64_t ldb, T beta, T* c, std::int64_t ldc, const sycl::vector_class &dependencies = {}) .. container:: section - :name: GUID-863264A0-4CE9-495F-A617-102E46D7A41A .. rubric:: Input Parameters - :name: input-parameters :class: sectiontitle - exec_queue + queue The queue where the routine should be executed. @@ -97,6 +232,7 @@ gemmt more details. + transa Specifies op(``A``), the transposition operation applied to ``A``. See @@ -104,12 +240,14 @@ gemmt more details. + transb Specifies op(``B``), the transposition operation applied to ``B``. See :ref:`onemkl_datatypes` for more details. + n Number of columns of op(``A``), columns of op(``B``), and @@ -126,7 +264,7 @@ gemmt a - Buffer holding the input matrix ``A``. + Pointer to input matrix ``A``. If ``A`` is not transposed, ``A`` is an ``n``-by-``k`` matrix @@ -149,7 +287,7 @@ gemmt b - Buffer holding the input matrix ``B``. + Pointer to input matrix ``B``.
If ``B`` is not transposed, ``B`` is a ``k``-by-``n`` matrix so @@ -176,8 +314,8 @@ gemmt c - Buffer holding the input/output matrix ``C``. Must have size at - least ``ldc`` \* ``n``. See `Matrix + Pointer to input/output matrix ``C``. Must have size at least + ``ldc`` \* ``n``. See `Matrix Storage <../matrix-storage.html>`__ for more details. @@ -187,26 +325,27 @@ gemmt ``m``. + dependencies + List of events to wait for before starting computation, if any. + If omitted, defaults to no dependencies. + + .. container:: section - :name: GUID-1E4953E6-F7B1-4FEE-BA5A-8C4BD51DC700 .. rubric:: Output Parameters - :name: output-parameters :class: sectiontitle c - Output buffer, overwritten by the upper or lower triangular - part ofalpha\*op(``A``)*op(``B``) + beta\*\ ``C``. + Pointer to the output matrix, overwritten by the upper or lower + triangular part of alpha\*op(``A``)*op(``B``) + beta\*\ ``C``. .. container:: section - :name: GUID-AC72653A-4AC8-4B9D-B7A9-13A725AA19BF .. rubric:: Notes - :name: notes :class: sectiontitle @@ -214,15 +353,20 @@ gemmt before calling gemmt. -.. container:: familylinks + .. container:: section - .. container:: parentlink + .. rubric:: Return Values + :class: sectiontitle - **Parent topic:** :ref:`blas-like-extensions` - + Output event to wait on to ensure computation is complete. -.. container:: +.. container:: familylinks + + .. container:: parentlink + + + **Parent topic:** :ref:`blas-like-extensions` diff --git a/docs/domains/blas/gemv.rst b/docs/domains/blas/gemv.rst index 1345bcdf0..48a1b439c 100644 --- a/docs/domains/blas/gemv.rst +++ b/docs/domains/blas/gemv.rst @@ -1,4 +1,4 @@ -.. _gemv: +.. _onemkl_blas_gemv: gemv ==== @@ -10,18 +10,8 @@ gemv Computes a matrix-vector product using a general matrix. - .. container:: section - :name: GUID-EA8D6705-E7C2-42E2-BE80-D9AD83645FCC - - - .. rubric:: Syntax - :name: syntax - :class: sectiontitle - - .. 
cpp:function:: void gemv(queue &exec_queue, transpose trans, std::int64_t m, std::int64_t n, T alpha, buffer &a, std::int64_t lda, buffer &x, std::int64_t incx, T beta, buffer &y, std::int64_t incy) - - gemv supports the following precisions. + ``gemv`` supports the following precisions. .. list-table:: @@ -37,15 +27,13 @@ gemv .. container:: section - :name: GUID-AE220EED-6066-4881-8B3C-35207BAB0105 .. rubric:: Description - :name: description :class: sectiontitle - The gemv routines compute a scalar-matrix-vector product and add the + The ``gemv`` routines compute a scalar-matrix-vector product and add the result to a scalar-vector product, with a general matrix. The operation is defined as @@ -69,24 +57,32 @@ gemv ``A`` is an ``m``-by-``n`` matrix, and ``x``, ``y`` are vectors. +gemv (Buffer Version) +--------------------- + +.. container:: + + .. container:: section + + + .. rubric:: Syntax + :class: sectiontitle + + + .. cpp:function:: void onemkl::blas::gemv(sycl::queue &queue, transpose trans, std::int64_t m, std::int64_t n, T alpha, sycl::buffer &a, std::int64_t lda, sycl::buffer &x, std::int64_t incx, T beta, sycl::buffer &y, std::int64_t incy) .. container:: section - :name: GUID-F3E8F201-6033-45A1-A326-CA4CFB631C3A .. rubric:: Input Parameters - :name: input-parameters :class: sectiontitle - exec_queue + queue The queue where the routine should be executed. trans Specifies ``op(A)``, the transposition operation applied to ``A``. - See - :ref:`onemkl_datatypes` for more - details. m @@ -105,7 +101,7 @@ gemv a The buffer holding the input matrix ``A``. Must have a size of at - least ``lda``\ \*n. See `Matrix and Vector + least ``lda``\ \*\ ``n``. See `Matrix and Vector Storage <../matrix-storage.html>`__ for more details. @@ -147,11 +143,9 @@ gemv .. container:: section - :name: GUID-1533BCA6-E652-4A08-A82D-162F3CEBDD29 ..
rubric:: Output Parameters - :name: output-parameters :class: sectiontitle @@ -159,16 +153,130 @@ gemv The buffer holding updated vector ``y``. +gemv (USM Version) +------------------ -.. container:: familylinks +.. container:: + .. container:: section - .. container:: parentlink + .. rubric:: Syntax + :class: sectiontitle - **Parent topic:** :ref:`blas-level-2-routines` - + .. container:: dlsyntaxpara -.. container:: + .. cpp:function:: sycl::event onemkl::blas::gemv(sycl::queue &queue, transpose trans, std::int64_t m, std::int64_t n, T alpha, const T *a, std::int64_t lda, const T *x, std::int64_t incx, T beta, T *y, std::int64_t incy, const sycl::vector_class &dependencies = {}) + .. container:: section + + + .. rubric:: Input Parameters + :class: sectiontitle + + + queue + The queue where the routine should be executed. + + + trans + Specifies ``op(A)``, the transposition operation applied to + ``A``. See + :ref:`onemkl_datatypes` for + more details. + + + + m + Specifies the number of rows of the matrix ``A``. The value of + ``m`` must be at least zero. + + + n + Specifies the number of columns of the matrix ``A``. The value + of ``n`` must be at least zero. + + + alpha + Scaling factor for the matrix-vector product. + + + a + The pointer to the input matrix ``A``. Must have a size of at + least ``lda``\ \*\ ``n``. See `Matrix and Vector + Storage <../matrix-storage.html>`__ for + more details. + + + lda + The leading dimension of matrix ``A``. It must be at least ``m``, + and positive. + + + x + Pointer to the input vector ``x``. The length ``len`` of vector + ``x`` is ``n`` if ``A`` is not transposed, and ``m`` if ``A`` + is transposed. The array holding vector ``x`` must be of size + at least (1 + (``len`` - 1)*abs(``incx``)). See `Matrix and + Vector + Storage <../matrix-storage.html>`__ for + more details. + + + incx + The stride of vector ``x``. + + + beta + The scaling factor for vector ``y``. + + + y + Pointer to input/output vector ``y``.
The length ``len`` of + vector ``y`` is ``m``, if ``A`` is not transposed, and ``n`` if + ``A`` is transposed. The array holding input/output vector + ``y`` must be of size at least (1 + (``len`` - + 1)*abs(``incy``)) where ``len`` is this length. See `Matrix and + Vector + Storage <../matrix-storage.html>`__ for + more details. + + + incy + The stride of vector ``y``. + + + dependencies + List of events to wait for before starting computation, if any. + If omitted, defaults to no dependencies. + + + .. container:: section + + + .. rubric:: Output Parameters + :class: sectiontitle + + + y + The pointer to updated vector ``y``. + + + .. container:: section + + + .. rubric:: Return Values + :class: sectiontitle + + + Output event to wait on to ensure computation is complete. + + +.. container:: familylinks + + + .. container:: parentlink + + + **Parent topic:** :ref:`blas-level-2-routines` diff --git a/docs/domains/blas/ger.rst b/docs/domains/blas/ger.rst index 1a122ce84..1877a5e9d 100644 --- a/docs/domains/blas/ger.rst +++ b/docs/domains/blas/ger.rst @@ -1,4 +1,4 @@ -.. _ger: +.. _onemkl_blas_ger: ger === @@ -10,16 +10,6 @@ ger Computes a rank-1 update of a general matrix. - .. container:: section - :name: GUID-0DA23698-EB19-4AAF-A5FD-9BB530A9EFE0 - - - .. rubric:: Syntax - :name: syntax - :class: sectiontitle - - - .. cpp:function:: void ger(queue &exec_queue, std::int64_t m, std::int64_t n, T alpha, buffer &x, std::int64_t incx, buffer &y, std::int64_t incy, buffer &a, std::int64_t lda) ``ger`` supports the following precisions. @@ -35,21 +25,16 @@ ger .. container:: section - :name: GUID-72E035B0-E1C2-442B-AE9D-2CB873E90FAF .. rubric:: Description - :name: description :class: sectiontitle - The ger routines compute a scalar-vector-vector product and add the + The ``ger`` routines compute a scalar-vector-vector product and add the result to a general matrix. 
The operation is defined as - - - A <- alpha*x*y :sup:`T` + A @@ -68,16 +53,27 @@ ger ``y`` is a vector length ``n``. +ger (Buffer Version) +-------------------- + +.. container:: + + .. container:: section + + + .. rubric:: Syntax + :class: sectiontitle + + + .. cpp:function:: void onemkl::blas::ger(sycl::queue &queue, std::int64_t m, std::int64_t n, T alpha, sycl::buffer &x, std::int64_t incx, sycl::buffer &y, std::int64_t incy, sycl::buffer &a, std::int64_t lda) .. container:: section - :name: GUID-6953A2E5-0065-425C-986B-15966C793067 .. rubric:: Input Parameters - :name: input-parameters :class: sectiontitle - exec_queue + queue The queue where the routine should be executed. @@ -128,11 +124,9 @@ ger .. container:: section - :name: GUID-E2A13688-1D12-4DD0-9752-3557E980ACC0 .. rubric:: Output Parameters - :name: output-parameters :class: sectiontitle @@ -140,15 +134,111 @@ ger Buffer holding the updated matrix ``A``. -.. container:: familylinks +ger (USM Version) +----------------- +.. container:: - .. container:: parentlink + .. container:: section - **Parent topic:** :ref:`blas-level-2-routines` - + .. rubric:: Syntax + :class: sectiontitle -.. container:: + .. container:: dlsyntaxpara + + + .. cpp:function:: sycl::event onemkl::blas::ger(sycl::queue &queue, std::int64_t m, std::int64_t n, T alpha, const T *x, std::int64_t incx, const T *y, std::int64_t incy, T *a, std::int64_t lda, const sycl::vector_class &dependencies = {}) + .. container:: section + + + .. rubric:: Input Parameters + :class: sectiontitle + + + queue + The queue where the routine should be executed. + + + m + Number of rows of ``A``. Must be at least zero. + + + n + Number of columns of ``A``. Must be at least zero. + + alpha + Scaling factor for the matrix-vector product. + + + x + Pointer to input vector ``x``. The array holding input vector + ``x`` must be of size at least (1 + (``m`` - 1)*abs(``incx``)). + See `Matrix and Vector + Storage <../matrix-storage.html>`__ for + more details. 
+ + + incx + Stride of vector ``x``. + + + y + Pointer to input/output vector ``y``. The array holding + input/output vector ``y`` must be of size at least (1 + (``n`` + - 1)*abs(``incy``)). See `Matrix and Vector + Storage <../matrix-storage.html>`__ for + more details. + + + incy + Stride of vector ``y``. + + + a + Pointer to input matrix ``A``. Must have size at least + ``lda``\ \*\ ``n``. See `Matrix and Vector + Storage <../matrix-storage.html>`__ for + more details. + + + lda + Leading dimension of matrix ``A``. Must be at least ``m``, and + positive. + + + dependencies + List of events to wait for before starting computation, if any. + If omitted, defaults to no dependencies. + + + .. container:: section + + + .. rubric:: Output Parameters + :class: sectiontitle + + + a + Pointer to the updated matrix ``A``. + + + .. container:: section + + + .. rubric:: Return Values + :class: sectiontitle + + + Output event to wait on to ensure computation is complete. + + +.. container:: familylinks + + + .. container:: parentlink + + + **Parent topic:** :ref:`blas-level-2-routines` diff --git a/docs/domains/blas/gerc.rst b/docs/domains/blas/gerc.rst index 5a9c772ac..577da3c89 100644 --- a/docs/domains/blas/gerc.rst +++ b/docs/domains/blas/gerc.rst @@ -1,4 +1,4 @@ -.. _gerc: +.. _onemkl_blas_gerc: gerc ==== @@ -10,16 +10,6 @@ gerc Computes a rank-1 update (conjugated) of a general complex matrix. - .. container:: section - :name: GUID-5A1B0292-28F6-45EB-95C4-FDA03D8D5062 - - - .. rubric:: Syntax - :name: syntax - :class: sectiontitle - - - .. cpp:function:: void gerc(queue &exec_queue, std::int64_t m, std::int64_t n, T alpha, buffer &x, std::int64_t incx, buffer &y, std::int64_t incy, buffer &a, std::int64_t lda) ``gerc`` supports the following precisions. @@ -35,15 +25,13 @@ gerc .. container:: section - :name: GUID-6CB627E5-A9C7-488D-8366-E7944A5C889E .. 
rubric:: Description - :name: description :class: sectiontitle - The gerc routines compute a scalar-vector-vector product and add the + The ``gerc`` routines compute a scalar-vector-vector product and add the result to a general matrix. The operation is defined as @@ -68,16 +56,27 @@ gerc ``y`` is vector of length ``n``. +gerc (Buffer Version) +--------------------- + +.. container:: + + .. container:: section + + + .. rubric:: Syntax + :class: sectiontitle + + + .. cpp:function:: void onemkl::blas::gerc(sycl::queue &queue, std::int64_t m, std::int64_t n, T alpha, sycl::buffer &x, std::int64_t incx, sycl::buffer &y, std::int64_t incy, sycl::buffer &a, std::int64_t lda) .. container:: section - :name: GUID-E1436726-01FE-4206-871E-B905F59A96B4 .. rubric:: Input Parameters - :name: input-parameters :class: sectiontitle - exec_queue + queue The queue where the routine should be executed. @@ -128,27 +127,122 @@ gerc .. container:: section - :name: GUID-48944ED2-C10F-4B64-A91A-C9050AD24A92 .. rubric:: Output Parameters - :name: output-parameters :class: sectiontitle a - Buffer holding the updated matrix *A*. + Buffer holding the updated matrix ``A``. -.. container:: familylinks +gerc (USM Version) +------------------ +.. container:: - .. container:: parentlink + .. container:: section - **Parent topic:** :ref:`blas-level-2-routines` - + .. rubric:: Syntax + :class: sectiontitle -.. container:: + .. container:: dlsyntaxpara + + + .. cpp:function:: sycl::event onemkl::blas::gerc(sycl::queue &queue, std::int64_t m, std::int64_t n, T alpha, const T *x, std::int64_t incx, const T *y, std::int64_t incy, T *a, std::int64_t lda, const sycl::vector_class &dependencies = {}) + .. container:: section + + + .. rubric:: Input Parameters + :class: sectiontitle + + + queue + The queue where the routine should be executed. + + + m + Number of rows of ``A``. Must be at least zero. + + + n + Number of columns of ``A``. Must be at least zero. 
+ + + alpha + Scaling factor for the matrix-vector product. + + x + Pointer to the input vector ``x``. The array holding input + vector ``x`` must be of size at least (1 + (``m`` - + 1)*abs(``incx``)). See `Matrix and Vector + Storage <../matrix-storage.html>`__ for + more details. + + + incx + Stride of vector ``x``. + + + y + Pointer to the input/output vector ``y``. The array holding the + input/output vector ``y`` must be of size at least (1 + (``n`` + - 1)*abs(``incy``)). See `Matrix and Vector + Storage <../matrix-storage.html>`__ for + more details. + + + incy + Stride of vector ``y``. + + + a + Pointer to input matrix ``A``. The array holding input matrix + ``A`` must have size at least ``lda``\ \*\ ``n``. See `Matrix + and Vector + Storage <../matrix-storage.html>`__ for + more details. + + + lda + Leading dimension of matrix ``A``. Must be at least ``m``, and + positive. + + + dependencies + List of events to wait for before starting computation, if any. + If omitted, defaults to no dependencies. + + + .. container:: section + + + .. rubric:: Output Parameters + :class: sectiontitle + + + a + Pointer to the updated matrix ``A``. + + + .. container:: section + + + .. rubric:: Return Values + :class: sectiontitle + + + Output event to wait on to ensure computation is complete. + + +.. container:: familylinks + + + .. container:: parentlink + + + **Parent topic:** :ref:`blas-level-2-routines` diff --git a/docs/domains/blas/geru.rst b/docs/domains/blas/geru.rst index 121e2d13c..2bce5cfce 100644 --- a/docs/domains/blas/geru.rst +++ b/docs/domains/blas/geru.rst @@ -1,4 +1,4 @@ -.. _geru: +.. _onemkl_blas_geru: geru ==== @@ -10,16 +10,6 @@ geru Computes a rank-1 update (unconjugated) of a general complex matrix. - .. container:: section - :name: GUID-5942D28E-EDD6-4759-B19E-FBB51F35125B - - - .. rubric:: Syntax - :name: syntax - :class: sectiontitle - - - ..
cpp:function:: void geru(queue &exec_queue, std::int64_t m, std::int64_t n, T alpha, buffer &x, std::int64_t incx, buffer &y, std::int64_t incy, buffer &a, std::int64_t lda) ``geru`` supports the following precisions. @@ -35,21 +25,16 @@ geru .. container:: section - :name: GUID-75ECE219-BA77-48E8-B13B-FB504DD60CD4 .. rubric:: Description - :name: description :class: sectiontitle - The geru routines routines compute a scalar-vector-vector product and + The ``geru`` routines compute a scalar-vector-vector product and add the result to a general matrix. The operation is defined as - - - A <- alpha*x*y :sup:`T` + A @@ -68,16 +53,27 @@ geru ``y`` is a vector of length ``n``. +geru (Buffer Version) +--------------------- + +.. container:: + + .. container:: section + + + .. rubric:: Syntax + :class: sectiontitle + + + .. cpp:function:: void onemkl::blas::geru(sycl::queue &queue, std::int64_t m, std::int64_t n, T alpha, sycl::buffer &x, std::int64_t incx, sycl::buffer &y, std::int64_t incy, sycl::buffer &a, std::int64_t lda) .. container:: section - :name: GUID-E1436726-01FE-4206-871E-B905F59A96B4 .. rubric:: Input Parameters - :name: input-parameters :class: sectiontitle - exec_queue + queue The queue where the routine should be executed. @@ -128,11 +124,9 @@ geru .. container:: section - :name: GUID-6E9315E9-DDCF-485D-8BDF-AB4BF8448BE1 .. rubric:: Output Parameters - :name: output-parameters :class: sectiontitle @@ -140,15 +134,112 @@ geru Buffer holding the updated matrix ``A``. -.. container:: familylinks +geru (USM Version) +------------------ +.. container:: - .. container:: parentlink + .. container:: section - **Parent topic:** :ref:`blas-level-2-routines` - + .. rubric:: Syntax + :class: sectiontitle -.. container:: + .. container:: dlsyntaxpara + + + ..
cpp:function:: sycl::event onemkl::blas::geru(sycl::queue &queue, std::int64_t m, std::int64_t n, T alpha, const T *x, std::int64_t incx, const T *y, std::int64_t incy, T *a, std::int64_t lda, const sycl::vector_class &dependencies = {}) + .. container:: section + + + .. rubric:: Input Parameters + :class: sectiontitle + + + queue + The queue where the routine should be executed. + + + m + Number of rows of ``A``. Must be at least zero. + + + n + Number of columns of ``A``. Must be at least zero. + + alpha + Scaling factor for the matrix-vector product. + + + x + Pointer to the input vector ``x``. The array holding input + vector ``x`` must be of size at least (1 + (``m`` - + 1)*abs(``incx``)). See `Matrix and Vector + Storage <../matrix-storage.html>`__ for + more details. + + + incx + Stride of vector ``x``. + + + y + Pointer to input/output vector ``y``. The array holding + input/output vector ``y`` must be of size at least (1 + (``n`` + - 1)*abs(``incy``)). See `Matrix and Vector + Storage <../matrix-storage.html>`__ for + more details. + + + incy + Stride of vector ``y``. + + + a + Pointer to input matrix ``A``. The array holding input matrix + ``A`` must have size at least ``lda``\ \*\ ``n``. See `Matrix + and Vector + Storage <../matrix-storage.html>`__ for + more details. + + + lda + Leading dimension of matrix ``A``. Must be at least ``m``, and + positive. + + + dependencies + List of events to wait for before starting computation, if any. + If omitted, defaults to no dependencies. + + + .. container:: section + + + .. rubric:: Output Parameters + :class: sectiontitle + + + a + Pointer to the updated matrix ``A``. + + + .. container:: section + + + .. rubric:: Return Values + :class: sectiontitle + + + Output event to wait on to ensure computation is complete. + + +.. container:: familylinks + + + .. 
container:: parentlink + + + **Parent topic:** :ref:`blas-level-2-routines` diff --git a/docs/domains/blas/hbmv.rst b/docs/domains/blas/hbmv.rst index e23481a2a..48b60d504 100644 --- a/docs/domains/blas/hbmv.rst +++ b/docs/domains/blas/hbmv.rst @@ -1,4 +1,4 @@ -.. _hbmv: +.. _onemkl_blas_hbmv: hbmv ==== @@ -10,17 +10,6 @@ hbmv Computes a matrix-vector product using a Hermitian band matrix. - .. container:: section - :name: GUID-F5FF420B-922B-4552-8F55-6EBCA7177881 - - - .. rubric:: Syntax - :name: syntax - :class: sectiontitle - - - .. cpp:function:: void hbmv(queue &exec_queue, uplo upper_lower, std::int64_t n, std::int64_t k, T alpha, buffer &a, std::int64_t lda, buffer &x, std::int64_t incx, T beta, buffer &y, std::int64_t incy) - ``hbmv`` supports the following precisions. @@ -35,22 +24,17 @@ hbmv .. container:: section - :name: GUID-8AB4BAC9-8124-4B52-8C15-1BC673820EB9 .. rubric:: Description - :name: description :class: sectiontitle - The hbmv routines compute a scalar-matrix-vector product and add the + The ``hbmv`` routines compute a scalar-matrix-vector product and add the result to a scalar-vector product, with a Hermitian band matrix. The operation is defined as - - - y <- alpha*A*x + beta*y @@ -67,23 +51,33 @@ hbmv ``x`` and ``y`` are vectors of length ``n``. +hbmv (Buffer Version) +--------------------- + +.. container:: + + .. container:: section + + + .. rubric:: Syntax + :class: sectiontitle + + + .. cpp:function:: void onemkl::blas::hbmv(sycl::queue &queue, uplo upper_lower, std::int64_t n, std::int64_t k, T alpha, sycl::buffer &a, std::int64_t lda, sycl::buffer &x, std::int64_t incx, T beta, sycl::buffer &y, std::int64_t incy) .. container:: section - :name: GUID-E1436726-01FE-4206-871E-B905F59A96B4 .. rubric:: Input Parameters - :name: input-parameters :class: sectiontitle - exec_queue + queue The queue where the routine should be executed. upper_lower - Specifies whether ``A`` is upper or lower triangular. 
See - :ref:`onemkl_datatypes` for more - details. + Specifies whether ``A`` is upper or lower triangular. See :ref:`onemkl_datatypes` for more details. + n @@ -138,11 +132,9 @@ hbmv .. container:: section - :name: GUID-7261182A-450B-46F5-8C61-7133597D3530 .. rubric:: Output Parameters - :name: output-parameters :class: sectiontitle @@ -150,15 +142,122 @@ hbmv Buffer holding the updated vector ``y``. -.. container:: familylinks +hbmv (USM Version) +------------------ +.. container:: - .. container:: parentlink + .. container:: section - **Parent topic:** :ref:`blas-level-2-routines` - + .. rubric:: Syntax + :class: sectiontitle -.. container:: + .. container:: dlsyntaxpara + + + .. cpp:function:: sycl::event onemkl::blas::hbmv(sycl::queue &queue, uplo upper_lower, std::int64_t n, std::int64_t k, T alpha, const T *a, std::int64_t lda, const T *x, std::int64_t incx, T beta, T *y, std::int64_t incy, const sycl::vector_class &dependencies = {}) + .. container:: section + + + .. rubric:: Input Parameters + :class: sectiontitle + + + queue + The queue where the routine should be executed. + + + upper_lower + Specifies whether ``A`` is upper or lower triangular. See :ref:`onemkl_datatypes` for more details. + + + + n + Number of rows and columns of ``A``. Must be at least zero. + + k + Number of super-diagonals of the matrix ``A``. Must be at least + zero. + + + alpha + Scaling factor for the matrix-vector product. + + + a + Pointer to the input matrix ``A``. The array holding input + matrix ``A`` must have size at least ``lda``\ \*\ ``n``. See + `Matrix and Vector + Storage <../matrix-storage.html>`__ for + more details. + + + lda + Leading dimension of matrix ``A``. Must be at least (``k`` + + 1), and positive. + + + x + Pointer to input vector ``x``. The array holding input vector + ``x`` must be of size at least (1 + (``n`` - 1)*abs(``incx``)). + See `Matrix and Vector + Storage <../matrix-storage.html>`__ for + more details. + + + incx + Stride of vector ``x``.
+ + + beta + Scaling factor for vector ``y``. + + + y + Pointer to input/output vector ``y``. The array holding + input/output vector ``y`` must be of size at least (1 + (``n`` + - 1)*abs(``incy``)). See `Matrix and Vector + Storage <../matrix-storage.html>`__ for + more details. + + + incy + Stride of vector ``y``. + + + dependencies + List of events to wait for before starting computation, if any. + If omitted, defaults to no dependencies. + + + .. container:: section + + + .. rubric:: Output Parameters + :class: sectiontitle + + + y + Pointer to the updated vector ``y``. + + + .. container:: section + + + .. rubric:: Return Values + :class: sectiontitle + + + Output event to wait on to ensure computation is complete. + + +.. container:: familylinks + + + .. container:: parentlink + + + **Parent topic:** :ref:`blas-level-2-routines` diff --git a/docs/domains/blas/hemm.rst b/docs/domains/blas/hemm.rst index bc597e228..6b9729998 100644 --- a/docs/domains/blas/hemm.rst +++ b/docs/domains/blas/hemm.rst @@ -1,4 +1,4 @@ -.. _hemm: +.. _onemkl_blas_hemm: hemm ==== @@ -11,18 +11,8 @@ hemm and one is general. - .. container:: section - :name: GUID-F06C86BA-4F57-4608-B0D7-F7B920F867D7 - - - .. rubric:: Syntax - :name: syntax - :class: sectiontitle - - - .. cpp:function:: void hemm(queue &exec_queue, side left_right, uplo upper_lower, std::int64_t m, std::int64_t n, T alpha, buffer &a, std::int64_t lda, buffer &b, std::int64_t ldb, T beta, buffer &c, std::int64_t ldc) - hemm supports the following precisions: + ``hemm`` supports the following precisions: .. list-table:: @@ -36,15 +26,13 @@ hemm .. container:: section - :name: GUID-835E7F58-406E-444F-9DFD-121B84C22284 .. rubric:: Description - :name: description :class: sectiontitle - The hemm routines compute a scalar-matrix-matrix product and add the + The ``hemm`` routines compute a scalar-matrix-matrix product and add the result to a scalar-matrix product, where one of the matrices in the multiplication is Hermitian. 
The argument ``left_right`` determines if the Hermitian matrix, ``A``, is on the left of the multiplication @@ -53,18 +41,11 @@ hemm defined as - - - C <- alpha*A*B + beta*C - or - - - C <- alpha*B*A + beta*C @@ -81,31 +62,40 @@ hemm ``B`` and ``C`` are ``m``-by-``n`` matrices. +hemm (Buffer Version) +--------------------- + +.. container:: + + .. container:: section + + + .. rubric:: Syntax + :class: sectiontitle + + + .. cpp:function:: void onemkl::blas::hemm(sycl::queue &queue, side left_right, uplo upper_lower, std::int64_t m, std::int64_t n, T alpha, sycl::buffer &a, std::int64_t lda, sycl::buffer &b, std::int64_t ldb, T beta, sycl::buffer &c, std::int64_t ldc) .. container:: section - :name: GUID-922C5F92-38B2-457B-B6C7-3CDD0531F97D .. rubric:: Input Parameters - :name: input-parameters :class: sectiontitle - exec_queue + queue The queue where the routine should be executed. left_right Specifies whether ``A`` is on the left side of the multiplication - (``side::left``) or on the right side (``side::right``). See - :ref:`onemkl_datatypes` for more - details. + (``side::left``) or on the right side (``side::right``). See :ref:`onemkl_datatypes` for more details. + uplo Specifies whether ``A``'s data is stored in its upper or lower - triangle. See - :ref:`onemkl_datatypes` for more - details. + triangle. See :ref:`onemkl_datatypes` for more details. + m @@ -167,19 +157,11 @@ hemm .. container:: section - :name: GUID-94385C78-968D-4C03-AA5C-7379D5607800 .. rubric:: Output Parameters - :name: output-parameters :class: sectiontitle - - -   - - - c Output buffer, overwritten by ``alpha``\ \*\ ``A``\ \*\ ``B`` + ``beta``\ \*\ ``C`` (``left_right`` = ``side::left``) or @@ -188,11 +170,9 @@ hemm .. container:: section - :name: EXAMPLE_5EF48B8A07D849EA84A74FE22F0D5B24 .. rubric:: Notes - :name: notes :class: sectiontitle @@ -200,15 +180,152 @@ hemm calling ``hemm``. -.. container:: familylinks +hemm (USM Version) +------------------ +.. container:: - .. 
container:: parentlink + .. container:: section - **Parent topic:** :ref:`blas-level-3-routines` - + .. rubric:: Syntax + :class: sectiontitle -.. container:: + .. container:: dlsyntaxpara + + + .. cpp:function:: sycl::event onemkl::blas::hemm(sycl::queue &queue, side left_right, uplo upper_lower, std::int64_t m, std::int64_t n, T alpha, const T* a, std::int64_t lda, const T* b, std::int64_t ldb, T beta, T* c, std::int64_t ldc, const sycl::vector_class<sycl::event> &dependencies = {}) + .. container:: section + + + .. rubric:: Input Parameters + :class: sectiontitle + + + queue + The queue where the routine should be executed. + + + left_right + Specifies whether ``A`` is on the left side of the + multiplication (``side::left``) or on the right side + (``side::right``). See :ref:`onemkl_datatypes` for more details. + + + + upper_lower + Specifies whether ``A``'s data is stored in its upper or lower + triangle. See :ref:`onemkl_datatypes` for more details. + + + + m + Specifies the number of rows of the matrices ``B`` and ``C``. + + + The value of ``m`` must be at least zero. + + n + Specifies the number of columns of the matrices ``B`` and ``C``. + + + The value of ``n`` must be at least zero. + + + alpha + Scaling factor for the matrix-matrix product. + + + a + Pointer to input matrix ``A``. Must have size at least + ``lda``\ \*\ ``m`` if ``A`` is on the left of the + multiplication, or ``lda``\ \*\ ``n`` if ``A`` is on the right. + See `Matrix and Vector + Storage <../matrix-storage.html>`__ for + more details. + + + lda + Leading dimension of ``A``. Must be at least ``m`` if ``A`` is + on the left of the multiplication, or at least ``n`` if ``A`` + is on the right. Must be positive. + + + b + Pointer to input matrix ``B``. Must have size at least + ``ldb``\ \*\ ``n``. See `Matrix and Vector + Storage <../matrix-storage.html>`__ for + more details. + + + ldb + Leading dimension of ``B``. Must be positive and at least + ``m``. + + + beta + Scaling factor for matrix ``C``.
+ + + c + Pointer to input/output matrix ``C``. Must have size at least + ``ldc``\ \*\ ``n``. See `Matrix and Vector + Storage <../matrix-storage.html>`__ for + more details. + + + ldc + Leading dimension of ``C``. Must be positive and at least + ``m``. + + + dependencies + List of events to wait for before starting computation, if any. + If omitted, defaults to no dependencies. + + + .. container:: section + + + .. rubric:: Output Parameters + :class: sectiontitle + + + c + Pointer to the output matrix, overwritten by + ``alpha``\ \*\ ``A``\ \*\ ``B`` + ``beta``\ \*\ ``C`` + (``left_right`` = ``side::left``) or + ``alpha``\ \*\ ``B``\ \*\ ``A`` + ``beta``\ \*\ ``C`` + (``left_right`` = ``side::right``). + + + .. container:: section + + + .. rubric:: Notes + :class: sectiontitle + + + If ``beta`` = 0, matrix ``C`` does not need to be initialized + before calling ``hemm``. + + + .. container:: section + + + .. rubric:: Return Values + :class: sectiontitle + + + Output event to wait on to ensure computation is complete. + + +.. container:: familylinks + + + .. container:: parentlink + + + **Parent topic:** :ref:`blas-level-3-routines` diff --git a/docs/domains/blas/hemv.rst b/docs/domains/blas/hemv.rst index 289cdc0a8..b701d91af 100644 --- a/docs/domains/blas/hemv.rst +++ b/docs/domains/blas/hemv.rst @@ -1,4 +1,4 @@ -.. _hemv: +.. _onemkl_blas_hemv: hemv ==== @@ -10,16 +10,6 @@ hemv Computes a matrix-vector product using a Hermitian matrix. - .. container:: section - :name: GUID-152B72DC-F67F-4D7D-96DA-67AE6AD41718 - - - .. rubric:: Syntax - :name: syntax - :class: sectiontitle - - - .. cpp:function:: void hemv(queue &exec_queue, uplo upper_lower, std::int64_t n, T alpha, buffer &a, std::int64_t lda, buffer &x, std::int64_t incx, T beta, buffer &y, std::int64_t incy) ``hemv`` supports the following precisions. @@ -35,22 +25,17 @@ hemv .. container:: section - :name: GUID-0E4AE01A-4FE8-42AC-B236-409F4DD48F88 .. 
rubric:: Description - :name: description :class: sectiontitle - The hemv routines compute a scalar-matrix-vector product and add the + The ``hemv`` routines compute a scalar-matrix-vector product and add the result to a scalar-vector product, with a Hermitian matrix. The operation is defined as - - - y <- alpha*A*x + beta*y @@ -66,23 +51,33 @@ hemv ``x`` and ``y`` are vectors of length ``n``. +hemv (Buffer Version) +--------------------- + +.. container:: + + .. container:: section + + + .. rubric:: Syntax + :class: sectiontitle + + + .. cpp:function:: void onemkl::blas::hemv(sycl::queue &queue, uplo upper_lower, std::int64_t n, T alpha, sycl::buffer &a, std::int64_t lda, sycl::buffer &x, std::int64_t incx, T beta, sycl::buffer &y, std::int64_t incy) .. container:: section - :name: GUID-E1436726-01FE-4206-871E-B905F59A96B4 .. rubric:: Input Parameters - :name: input-parameters :class: sectiontitle - exec_queue + queue The queue where the routine should be executed. upper_lower - Specifies whether *A* is upper or lower triangular. See - :ref:`onemkl_datatypes` for more - details. + Specifies whether *A* is upper or lower triangular. See :ref:`onemkl_datatypes` for more details. + n @@ -132,11 +127,9 @@ hemv .. container:: section - :name: GUID-66566E59-9A52-4207-B123-AF45FA3A0FBC .. rubric:: Output Parameters - :name: output-parameters :class: sectiontitle @@ -144,15 +137,117 @@ hemv Buffer holding the updated vector ``y``. -.. container:: familylinks +hemv (USM Version) +------------------ +.. container:: - .. container:: parentlink + .. container:: section - **Parent topic:** :ref:`blas-level-2-routines` - + .. rubric:: Syntax + :class: sectiontitle -.. container:: + .. container:: dlsyntaxpara + + + .. cpp:function:: sycl::event onemkl::blas::hemv(sycl::queue &queue, uplo upper_lower, std::int64_t n, T alpha, const T *a, std::int64_t lda, const T *x, std::int64_t incx, T beta, T *y, std::int64_t incy, const sycl::vector_class &dependencies = {}) + .. 
container:: section + + + .. rubric:: Input Parameters + :class: sectiontitle + + + queue + The queue where the routine should be executed. + + + upper_lower + Specifies whether *A* is upper or lower triangular. See :ref:`onemkl_datatypes` for more details. + + + + n + Number of rows and columns of ``A``. Must be at least zero. + + alpha + Scaling factor for the matrix-vector product. + + + a + Pointer to input matrix ``A``. The array holding input matrix + ``A`` must have size at least ``lda``\ \*\ ``n``. See `Matrix + and Vector + Storage <../matrix-storage.html>`__ for + more details. + + + lda + Leading dimension of matrix ``A``. Must be at least ``n``, and + positive. + + + x + Pointer to input vector ``x``. The array holding input vector + ``x`` must be of size at least (1 + (``n`` - 1)*abs(``incx``)). + See `Matrix and Vector + Storage <../matrix-storage.html>`__ for + more details. + + + incx + Stride of vector ``x``. + + + beta + Scaling factor for vector ``y``. + + + y + Pointer to input/output vector ``y``. The array holding + input/output vector ``y`` must be of size at least (1 + (``n`` + - 1)*abs(``incy``)). See `Matrix and Vector + Storage <../matrix-storage.html>`__ for + more details. + + + incy + Stride of vector ``y``. + + + dependencies + List of events to wait for before starting computation, if any. + If omitted, defaults to no dependencies. + + + .. container:: section + + + .. rubric:: Output Parameters + :class: sectiontitle + + + y + Pointer to the updated vector ``y``. + + + .. container:: section + + + .. rubric:: Return Values + :class: sectiontitle + + + Output event to wait on to ensure computation is complete. + + +.. container:: familylinks + + + .. container:: parentlink + + + **Parent topic:** :ref:`blas-level-2-routines` diff --git a/docs/domains/blas/her.rst b/docs/domains/blas/her.rst index a13196fb6..5fc0f438d 100644 --- a/docs/domains/blas/her.rst +++ b/docs/domains/blas/her.rst @@ -1,4 +1,4 @@ -.. _her: +..
_onemkl_blas_her: her === @@ -10,16 +10,6 @@ her Computes a rank-1 update of a Hermitian matrix. - .. container:: section - :name: GUID-252B1D4A-30C7-4678-9793-6A0C90DEB04A - - - .. rubric:: Syntax - :name: syntax - :class: sectiontitle - - - .. cpp:function:: void her(queue &exec_queue, uplo upper_lower, std::int64_t n, T alpha, buffer &x, std::int64_t incx, buffer &a, std::int64_t lda) ``her`` supports the following precisions. @@ -35,21 +25,16 @@ her .. container:: section - :name: GUID-A06B7C00-CFD6-4A01-9739-19093823B58E .. rubric:: Description - :name: description :class: sectiontitle - The her routines compute a scalar-vector-vector product and add the + The ``her`` routines compute a scalar-vector-vector product and add the result to a Hermitian matrix. The operation is defined as - - - A <- alpha*x*x :sup:`H` + A @@ -65,23 +50,33 @@ her ``x`` is a vector of length ``n``. +her (Buffer Version) +-------------------- + +.. container:: + + .. container:: section + + + .. rubric:: Syntax + :class: sectiontitle + + + .. cpp:function:: void onemkl::blas::her(sycl::queue &queue, uplo upper_lower, std::int64_t n, T alpha, sycl::buffer &x, std::int64_t incx, sycl::buffer &a, std::int64_t lda) .. container:: section - :name: GUID-E1436726-01FE-4206-871E-B905F59A96B4 .. rubric:: Input Parameters - :name: input-parameters :class: sectiontitle - exec_queue + queue The queue where the routine should be executed. upper_lower - Specifies whether *A* is upper or lower triangular. See - :ref:`onemkl_datatypes` for more - details. + Specifies whether *A* is upper or lower triangular. See :ref:`onemkl_datatypes` for more details. + n @@ -116,33 +111,123 @@ her .. container:: section - :name: GUID-89A60481-0763-4608-B346-3CC746467F28 .. 
rubric:: Output Parameters - :name: output-parameters :class: sectiontitle a - Buffer holding the updated upper triangular part of theHermitian + Buffer holding the updated upper triangular part of the Hermitian matrix ``A`` if ``upper_lower = upper`` or the updated - lowertriangular part of the Hermitian matrix ``A`` if + lower triangular part of the Hermitian matrix ``A`` if ``upper_lower = lower``. - The imaginary parts of the diagonal elementsare set to zero. + The imaginary parts of the diagonal elements are set to zero. -.. container:: familylinks +her (USM Version) +----------------- +.. container:: - .. container:: parentlink + .. container:: section - **Parent topic:** :ref:`blas-level-2-routines` - + .. rubric:: Syntax + :class: sectiontitle + + + .. container:: dlsyntaxpara + + + .. cpp:function:: sycl::event onemkl::blas::her(sycl::queue &queue, uplo upper_lower, std::int64_t n, T alpha, const T *x, std::int64_t incx, T *a, std::int64_t lda, const sycl::vector_class &dependencies = {}) + .. container:: section + + + .. rubric:: Input Parameters + :class: sectiontitle + + + queue + The queue where the routine should be executed. + + + upper_lower + Specifies whether *A* is upper or lower triangular. See :ref:`onemkl_datatypes` for more details. -.. container:: + n + Number of rows and columns of ``A``. Must be at least zero. + + + alpha + Scaling factor for the matrix-vector product. + + + x + Pointer to input vector ``x``. The array holding input vector + ``x`` must be of size at least (1 + (``n`` - 1)*abs(``incx``)). + See `Matrix and Vector + Storage <../matrix-storage.html>`__ for + more details. + + + incx + Stride of vector ``x``. + + + a + Pointer to input matrix ``A``. The array holding input matrix + ``A`` must have size at least ``lda``\ \*\ ``n``. See `Matrix + and Vector + Storage <../matrix-storage.html>`__ for + more details. + + + lda + Leading dimension of matrix ``A``. Must be at least ``n``, and + positive. 
+ + + dependencies + List of events to wait for before starting computation, if any. + If omitted, defaults to no dependencies. + + + .. container:: section + + + .. rubric:: Output Parameters + :class: sectiontitle + + + a + Pointer to the updated upper triangular part of the Hermitian + matrix ``A`` if ``upper_lower = upper`` or the updated + lower triangular part of the Hermitian matrix ``A`` if + ``upper_lower = lower``. + + + The imaginary parts of the diagonal elements are set to zero. + + + .. container:: section + + + .. rubric:: Return Values + :class: sectiontitle + + + Output event to wait on to ensure computation is complete. + + +.. container:: familylinks + + + .. container:: parentlink + + + **Parent topic:** :ref:`blas-level-2-routines` diff --git a/docs/domains/blas/her2.rst b/docs/domains/blas/her2.rst index 0c100195c..ae8762e87 100644 --- a/docs/domains/blas/her2.rst +++ b/docs/domains/blas/her2.rst @@ -1,4 +1,4 @@ -.. _her2: +.. _onemkl_blas_her2: her2 ==== @@ -10,16 +10,6 @@ her2 Computes a rank-2 update of a Hermitian matrix. - .. container:: section - :name: GUID-4BED3537-E900-4260-A6EB-2F42CB1D3AFB - - - .. rubric:: Syntax - :name: syntax - :class: sectiontitle - - - .. cpp:function:: void her2(queue &exec_queue, uplo upper_lower, std::int64_t n, T alpha, buffer &x, std::int64_t incx, buffer &y, std::int64_t incy, buffer &a, std::int64_t lda) ``her2`` supports the following precisions. @@ -35,21 +25,16 @@ her2 .. container:: section - :name: GUID-2B939041-9BCC-4AE8-A31D-2CFCA67B9B6A .. rubric:: Description - :name: description :class: sectiontitle - The her2 routines compute two scalar-vector-vector products and add + The ``her2`` routines compute two scalar-vector-vector products and add them to a Hermitian matrix. The operation is defined as - - - A <- alpha*x*y :sup:`H` + conjg(alpha)*y*x :sup:`H` + A @@ -65,23 +50,33 @@ her2 ``x`` and ``y`` are vectors or length ``n``. +her2 (Buffer Version) +--------------------- + +.. 
container:: + + .. container:: section + + + .. rubric:: Syntax + :class: sectiontitle + + + .. cpp:function:: void onemkl::blas::her2(sycl::queue &queue, uplo upper_lower, std::int64_t n, T alpha, sycl::buffer &x, std::int64_t incx, sycl::buffer &y, std::int64_t incy, sycl::buffer &a, std::int64_t lda) .. container:: section - :name: GUID-E1436726-01FE-4206-871E-B905F59A96B4 .. rubric:: Input Parameters - :name: input-parameters :class: sectiontitle - exec_queue + queue The queue where the routine should be executed. upper_lower - Specifies whether *A* is upper or lower triangular. See - :ref:`onemkl_datatypes` for more - details. + Specifies whether ``A`` is upper or lower triangular. See :ref:`onemkl_datatypes` for more details. + n @@ -127,33 +122,135 @@ her2 .. container:: section - :name: GUID-34B3837B-4980-458B-AC3A-EEE5F635834C .. rubric:: Output Parameters - :name: output-parameters :class: sectiontitle a - Buffer holding the updated upper triangular part of theHermitian + Buffer holding the updated upper triangular part of the Hermitian matrix ``A`` if ``upper_lower = upper``, or the updated - lowertriangular part of the Hermitian matrix ``A`` if + lower triangular part of the Hermitian matrix ``A`` if ``upper_lower = lower``. - The imaginary parts of the diagonal elementsare set to zero. + The imaginary parts of the diagonal elements are set to zero. -.. container:: familylinks +her2 (USM Version) +------------------ +.. container:: - .. container:: parentlink + .. container:: section - **Parent topic:** :ref:`blas-level-2-routines` - + .. rubric:: Syntax + :class: sectiontitle -.. container:: + .. container:: dlsyntaxpara + + + .. cpp:function:: sycl::event onemkl::blas::her2(sycl::queue &queue, uplo upper_lower, std::int64_t n, T alpha, const T *x, std::int64_t incx, const T *y, std::int64_t incy, T *a, std::int64_t lda, const sycl::vector_class &dependencies = {}) + .. container:: section + + + .. 
rubric:: Input Parameters + :class: sectiontitle + + + queue + The queue where the routine should be executed. + + + upper_lower + Specifies whether ``A`` is upper or lower triangular. See :ref:`onemkl_datatypes` for more details. + + + + n + Number of rows and columns of ``A``. Must be at least zero. + + alpha + Scaling factor for the matrix-vector product. + + + x + Pointer to input vector ``x``. The array holding input vector + ``x`` must be of size at least (1 + (``n`` - 1)*abs(``incx``)). + See `Matrix and Vector + Storage <../matrix-storage.html>`__ for + more details. + + + incx + Stride of vector ``x``. + + + y + Pointer to input vector ``y``. The array holding + input vector ``y`` must be of size at least (1 + (``n`` + - 1)*abs(``incy``)). See `Matrix and Vector + Storage <../matrix-storage.html>`__ for + more details. + + + incy + Stride of vector ``y``. + + + a + Pointer to input matrix ``A``. The array holding input matrix + ``A`` must have size at least ``lda``\ \*\ ``n``. See `Matrix + and Vector + Storage <../matrix-storage.html>`__ for + more details. + + + lda + Leading dimension of matrix ``A``. Must be at least ``n``, and + positive. + + + dependencies + List of events to wait for before starting computation, if any. + If omitted, defaults to no dependencies. + + + .. container:: section + + + .. rubric:: Output Parameters + :class: sectiontitle + + + a + Pointer to the updated upper triangular part of the Hermitian + matrix ``A`` if ``upper_lower = upper``, or the updated + lower triangular part of the Hermitian matrix ``A`` if + ``upper_lower = lower``. + + + The imaginary parts of the diagonal elements are set to zero. + + + .. container:: section + + + .. rubric:: Return Values + :class: sectiontitle + + + Output event to wait on to ensure computation is complete. + + +.. container:: familylinks + + + ..
container:: parentlink + + + **Parent topic:** :ref:`blas-level-2-routines` diff --git a/docs/domains/blas/her2k.rst b/docs/domains/blas/her2k.rst index 98e9cac4f..1a510145f 100644 --- a/docs/domains/blas/her2k.rst +++ b/docs/domains/blas/her2k.rst @@ -1,4 +1,4 @@ -.. _her2k: +.. _onemkl_blas_her2k: her2k ===== @@ -10,18 +10,8 @@ her2k Performs a Hermitian rank-2k update. - .. container:: section - :name: GUID-1839F1B0-EFE0-40A4-901E-53E7F9B395C2 - - - .. rubric:: Syntax - :name: syntax - :class: sectiontitle - - .. cpp:function:: void her2k(queue &exec_queue, uplo upper_lower, transpose trans, std::int64_t n, std::int64_t k, T alpha, buffer &a, std::int64_t lda, buffer &b, std::int64_t ldb, T_real beta, buffer &c, std::int64_t ldc) - - her2k supports the following precisions: + ``her2k`` supports the following precisions: .. list-table:: @@ -38,22 +28,17 @@ her2k .. container:: section - :name: GUID-6DDD93FE-028E-400C-BBD0-CA13132FAC35 .. rubric:: Description - :name: description :class: sectiontitle - The her2k routines perform a rank-2k update of an ``n`` x ``n`` + The ``her2k`` routines perform a rank-2k update of an ``n`` x ``n`` Hermitian matrix ``C`` by general matrices ``A`` and ``B``. If ``trans`` = ``transpose::nontrans``. The operation is defined as - - - C <- alpha*A*B :sup:`H` + conjg(alpha)*B*A :sup:`H` + beta*C @@ -63,9 +48,6 @@ her2k If ``trans`` = ``transpose::conjtrans``, the operation is defined as: - - - C <- alpha*B*A :sup:`H` + conjg(alpha)*A*B :sup:`H` + beta*C @@ -84,24 +66,34 @@ her2k The inner dimension of both matrix multiplications is ``k``. +her2k (Buffer Version) +---------------------- + +.. container:: + + .. container:: section + + + .. rubric:: Syntax + :class: sectiontitle + + + .. 
cpp:function:: void onemkl::blas::her2k(sycl::queue &queue, uplo upper_lower, transpose trans, std::int64_t n, std::int64_t k, T alpha, sycl::buffer &a, std::int64_t lda, sycl::buffer &b, std::int64_t ldb, T_real beta, sycl::buffer &c, std::int64_t ldc) .. container:: section - :name: GUID-54538396-B04D-4A2A-8A7D-E503A6F815AD .. rubric:: Input Parameters - :name: input-parameters :class: sectiontitle - exec_queue + queue The queue where the routine should be executed. upper_lower Specifies whether ``A``'s data is stored in its upper or lower - triangle. See - :ref:`onemkl_datatypes` for more - details. + triangle. See :ref:`onemkl_datatypes` for more details. + trans @@ -172,11 +164,9 @@ her2k .. container:: section - :name: GUID-48D39D42-B29F-4428-A588-9058570B5D5E .. rubric:: Output Parameters - :name: output-parameters :class: sectiontitle @@ -184,15 +174,140 @@ her2k Output buffer, overwritten by the updated ``C`` matrix. -.. container:: familylinks +her2k (USM Version) +------------------- +.. container:: - .. container:: parentlink + .. container:: section - **Parent topic:** :ref:`blas-level-3-routines` - + .. rubric:: Syntax + :class: sectiontitle -.. container:: + .. container:: dlsyntaxpara + + + .. cpp:function:: sycl::event onemkl::blas::her2k(sycl::queue &queue, uplo upper_lower, transpose trans, std::int64_t n, std::int64_t k, T alpha, const T* a, std::int64_t lda, const T* b, std::int64_t ldb, T_real beta, T* c, std::int64_t ldc, const sycl::vector_class &dependencies = {}) + .. container:: section + + + .. rubric:: Input Parameters + :class: sectiontitle + + + queue + The queue where the routine should be executed. + + + upper_lower + Specifies whether ``A``'s data is stored in its upper or lower + triangle. See :ref:`onemkl_datatypes` for more details. + + + + trans + Specifies the operation to apply, as described above. Supported + operations are ``transpose::nontrans`` and + ``transpose::conjtrans``. 
+ + + n + The number of rows and columns in ``C``. The value of ``n`` + must be at least zero. + + k + The inner dimension of matrix multiplications. The value of + ``k`` must be at least equal to zero. + + + alpha + Complex scaling factor for the rank-2\ ``k`` update. + + + a + Pointer to input matrix ``A``. If ``trans`` = + ``transpose::nontrans``, ``A`` is an ``n``-by-``k`` matrix so + the array ``a`` must have size at least ``lda``\ \*\ ``k``. + Otherwise, ``A`` is an ``k``-by-``n`` matrix so the array ``a`` + must have size at least ``lda``\ \*\ ``n``. See `Matrix and + Vector + Storage <../matrix-storage.html>`__ for + more details. + + + lda + Leading dimension of ``A``. Must be at least ``n`` if ``trans`` + = ``transpose::nontrans``, and at least ``k`` otherwise. Must + be positive. + + + beta + Real scaling factor for matrix ``C``. + + + b + Pointer to input matrix ``B``. If ``trans`` = + ``transpose::nontrans``, ``B`` is an ``k``-by-``n`` matrix so + the array ``b`` must have size at least ``ldb``\ \*\ ``n``. + Otherwise, ``B`` is an ``n``-by-``k`` matrix so the array ``b`` + must have size at least ``ldb``\ \*\ ``k``. See `Matrix and + Vector + Storage <../matrix-storage.html>`__ for + more details. + + + ldb + Leading dimension of ``B``. Must be at least ``k`` if ``trans`` + = ``transpose::nontrans``, and at least ``n`` otherwise. Must + be positive. + + + c + Pointer to input/output matrix ``C``. Must have size at least + ``ldc``\ \*\ ``n``. See `Matrix and Vector + Storage <../matrix-storage.html>`__ for + more details. + + + ldc + Leading dimension of ``C``. Must be positive and at least + ``n``. + + + dependencies + List of events to wait for before starting computation, if any. + If omitted, defaults to no dependencies. + + + .. container:: section + + + .. rubric:: Output Parameters + :class: sectiontitle + + + c + Pointer to the output matrix, overwritten by the updated ``C`` + matrix. + + + .. container:: section + + + .. 
rubric:: Return Values + :class: sectiontitle + + + Output event to wait on to ensure computation is complete. + + +.. container:: familylinks + + + .. container:: parentlink + + + **Parent topic:** :ref:`blas-level-3-routines` diff --git a/docs/domains/blas/herk.rst b/docs/domains/blas/herk.rst index 03b510a1f..1b1e806c3 100644 --- a/docs/domains/blas/herk.rst +++ b/docs/domains/blas/herk.rst @@ -1,4 +1,4 @@ -.. _herk: +.. _onemkl_blas_herk: herk ==== @@ -10,18 +10,8 @@ herk Performs a Hermitian rank-k update. - .. container:: section - :name: GUID-407B8203-A28D-468B-BA79-87FA865E75A2 - - - .. rubric:: Syntax - :name: syntax - :class: sectiontitle - - .. cpp:function:: void herk(queue &exec_queue, uplo upper_lower, transpose trans, std::int64_t n, std::int64_t k, T_real alpha, buffer &a, std::int64_t lda, T_real beta, buffer &c, std::int64_t ldc) - - herk supports the following precisions: + ``herk`` supports the following precisions: .. list-table:: @@ -38,19 +28,14 @@ herk .. container:: section - :name: GUID-539B4E63-9CDF-4834-999A-4133CE5DE1E5 .. rubric:: Description - :name: description :class: sectiontitle - The herk routines compute a rank-``k`` update of a Hermitian matrix - *C* by a general matrix ``A``. The operation is defined as: - - - + The ``herk`` routines compute a rank-``k`` update of a Hermitian matrix + ``C`` by a general matrix ``A``. The operation is defined as: C <- alpha*op(A)*op(A) :sup:`H` + beta*C @@ -71,29 +56,38 @@ herk Here op(``A``) is ``n`` x ``k``, and ``C`` is ``n`` x ``n``. +herk (Buffer Version) +--------------------- + +.. container:: + + .. container:: section + + + .. rubric:: Syntax + :class: sectiontitle + + + .. cpp:function:: void onemkl::blas::herk(sycl::queue &queue, uplo upper_lower, transpose trans, std::int64_t n, std::int64_t k, T_real alpha, sycl::buffer &a, std::int64_t lda, T_real beta, sycl::buffer &c, std::int64_t ldc) + .. container:: section - :name: GUID-7B880A06-4E53-4DE9-B0E6-D70673CF2638 .. 
rubric:: Input Parameters - :name: input-parameters :class: sectiontitle - exec_queue + queue The queue where the routine should be executed. upper_lower Specifies whether ``A``'s data is stored in its upper or lower - triangle. See - :ref:`onemkl_datatypes` for more - details. + triangle. See :ref:`onemkl_datatypes` for more details. trans - Specifies op(``A``), the transposition operation applied to ``A``. - See + Specifies op(``A``), the transposition operation applied to ``A``. See :ref:`onemkl_datatypes` for more details. Supported operations are ``transpose::nontrans`` and ``transpose::conjtrans``. @@ -147,11 +141,9 @@ herk .. container:: section - :name: GUID-05309970-DEC8-4D87-90AA-958FC101E119 .. rubric:: Output Parameters - :name: output-parameters :class: sectiontitle @@ -161,15 +153,127 @@ herk The imaginary parts of the diagonal elements are set to zero. -.. container:: familylinks +herk (USM Version) +------------------ +.. container:: - .. container:: parentlink + .. container:: section - **Parent topic:** :ref:`blas-level-3-routines` - + .. rubric:: Syntax + :class: sectiontitle -.. container:: + .. container:: dlsyntaxpara + + + .. cpp:function:: sycl::event onemkl::blas::herk(sycl::queue &queue, uplo upper_lower, transpose trans, std::int64_t n, std::int64_t k, T_real alpha, const T* a, std::int64_t lda, T_real beta, T* c, std::int64_t ldc, const sycl::vector_class &dependencies = {}) + .. container:: section + + + .. rubric:: Input Parameters + :class: sectiontitle + + + queue + The queue where the routine should be executed. + + + upper_lower + Specifies whether ``A``'s data is stored in its upper or lower + triangle. See :ref:`onemkl_datatypes` for more details. + + + + trans + Specifies op(``A``), the transposition operation applied to + ``A``. See :ref:`onemkl_datatypes` for more details. Supported operations are ``transpose::nontrans`` + and ``transpose::conjtrans``. 
+ + n + The number of rows and columns in ``C``. The value of ``n`` must + be at least zero. + + + k + Number of columns in op(``A``). + + + The value of ``k`` must be at least zero. + + + alpha + Real scaling factor for the rank-``k`` update. + + + a + Pointer to input matrix ``A``. If ``trans`` = + ``transpose::nontrans``, ``A`` is an ``n``-by-``k`` matrix so + the array ``a`` must have size at least ``lda``\ \*\ ``k``. + Otherwise, ``A`` is a ``k``-by-``n`` matrix so the array ``a`` + must have size at least ``lda``\ \*\ ``n``. See `Matrix and + Vector + Storage <../matrix-storage.html>`__ for + more details. + + + lda + Leading dimension of ``A``. Must be at least ``n`` if ``A`` is + not transposed, and at least ``k`` if ``A`` is transposed. Must + be positive. + + + beta + Real scaling factor for matrix ``C``. + + + c + Pointer to input/output matrix ``C``. Must have size at least + ``ldc``\ \*\ ``n``. See `Matrix and Vector + Storage <../matrix-storage.html>`__ for + more details. + + + ldc + Leading dimension of ``C``. Must be positive and at least + ``n``. + + + dependencies + List of events to wait for before starting computation, if any. + If omitted, defaults to no dependencies. + + + .. container:: section + + + .. rubric:: Output Parameters + :class: sectiontitle + + + c + Pointer to the output matrix, overwritten by + ``alpha``\ \*op(``A``)*op(``A``)\ :sup:`H` + + ``beta``\ \*\ ``C``. The imaginary parts of the diagonal + elements are set to zero. + + + .. container:: section + + + .. rubric:: Return Values + :class: sectiontitle + + + Output event to wait on to ensure computation is complete. + + +.. container:: familylinks + + + .. container:: parentlink + + + **Parent topic:** :ref:`blas-level-3-routines` diff --git a/docs/domains/blas/hpmv.rst b/docs/domains/blas/hpmv.rst index f71e4b392..f591a36fe 100644 --- a/docs/domains/blas/hpmv.rst +++ b/docs/domains/blas/hpmv.rst @@ -1,4 +1,4 @@ -.. _hpmv: +..
_onemkl_blas_hpmv: hpmv ==== @@ -10,16 +10,6 @@ hpmv Computes a matrix-vector product using a Hermitian packed matrix. - .. container:: section - :name: GUID-C6E4A4A7-5CBE-46ED-A021-8FEAABAA2E93 - - - .. rubric:: Syntax - :name: syntax - :class: sectiontitle - - - .. cpp:function:: void hpmv(queue &exec_queue, uplo upper_lower, std::int64_t n, T alpha, buffer &a, buffer &x, std::int64_t incx, T beta, buffer &y, std::int64_t incy) ``hpmv`` supports the following precisions. @@ -35,22 +25,17 @@ hpmv .. container:: section - :name: GUID-A95C32C5-0371-429B-847C-4EE29FD9C480 .. rubric:: Description - :name: description :class: sectiontitle - The hpmv routines compute a scalar-matrix-vector product and add the + The ``hpmv`` routines compute a scalar-matrix-vector product and add the result to a scalar-vector product, with a Hermitian packed matrix. The operation is defined as - - - y <- alpha*A*x + beta*y @@ -66,23 +51,34 @@ hpmv ``x`` and ``y`` are vectors of length ``n``. +hpmv (Buffer Version) +--------------------- + +.. container:: + + .. container:: section + + + .. rubric:: Syntax + :class: sectiontitle + + + .. cpp:function:: void onemkl::blas::hpmv(sycl::queue &queue, uplo upper_lower, std::int64_t n, T alpha, sycl::buffer &a, sycl::buffer &x, std::int64_t incx, T beta, sycl::buffer &y, std::int64_t incy) + .. container:: section - :name: GUID-E1436726-01FE-4206-871E-B905F59A96B4 .. rubric:: Input Parameters - :name: input-parameters :class: sectiontitle - exec_queue + queue The queue where the routine should be executed. upper_lower - Specifies whether *A* is upper or lower triangular. See - :ref:`onemkl_datatypes` for more - details. + Specifies whether ``A`` is upper or lower triangular. See :ref:`onemkl_datatypes` for more details. + n @@ -131,11 +127,9 @@ hpmv .. container:: section - :name: GUID-416B82CD-C5B8-472A-8347-04997EA6D6E6 .. 
rubric:: Output Parameters - :name: output-parameters :class: sectiontitle @@ -143,15 +137,116 @@ hpmv Buffer holding the updated vector ``y``. -.. container:: familylinks +hpmv (USM Version) +------------------ +.. container:: - .. container:: parentlink + .. container:: section - **Parent topic:** :ref:`blas-level-2-routines` - + .. rubric:: Syntax + :class: sectiontitle -.. container:: + .. container:: dlsyntaxpara + + + .. cpp:function:: sycl::event onemkl::blas::hpmv(sycl::queue &queue, uplo upper_lower, std::int64_t n, T alpha, const T *a, const T *x, std::int64_t incx, T beta, T *y, std::int64_t incy, const sycl::vector_class &dependencies = {}) + .. container:: section + + + .. rubric:: Input Parameters + :class: sectiontitle + + + queue + The queue where the routine should be executed. + + + upper_lower + Specifies whether ``A`` is upper or lower triangular. See :ref:`onemkl_datatypes` for more details. + + + + n + Number of rows and columns of ``A``. Must be at least zero. + + alpha + Scaling factor for the matrix-vector product. + + + a + Pointer to input matrix ``A``. The array holding input matrix + ``A`` must have size at least (``n``\ \*(``n``\ +1))/2. See + `Matrix and Vector + Storage <../matrix-storage.html>`__ for + more details. + + + The imaginary parts of the diagonal elements need not be set + and are assumed to be zero. + + + x + Pointer to input vector ``x``. The array holding input vector + ``x`` must be of size at least (1 + (``n`` - 1)*abs(``incx``)). + See `Matrix and Vector + Storage <../matrix-storage.html>`__ for + more details. + + + incx + Stride of vector ``x``. + + + beta + Scaling factor for vector ``y``. + + + y + Pointer to input/output vector ``y``. The array holding + input/output vector ``y`` must be of size at least (1 + (``n`` + - 1)*abs(``incy``)). See `Matrix and Vector + Storage <../matrix-storage.html>`__ for + more details. + + + incy + Stride of vector ``y``. 
+ + + dependencies + List of events to wait for before starting computation, if any. + If omitted, defaults to no dependencies. + + + .. container:: section + + + .. rubric:: Output Parameters + :class: sectiontitle + + + y + Pointer to the updated vector ``y``. + + + .. container:: section + + + .. rubric:: Return Values + :class: sectiontitle + + + Output event to wait on to ensure computation is complete. + + +.. container:: familylinks + + + .. container:: parentlink + + + **Parent topic:** :ref:`blas-level-2-routines` diff --git a/docs/domains/blas/hpr.rst b/docs/domains/blas/hpr.rst index 1c0f38a0c..f4cb47c0b 100644 --- a/docs/domains/blas/hpr.rst +++ b/docs/domains/blas/hpr.rst @@ -1,4 +1,4 @@ -.. _hpr: +.. _onemkl_blas_hpr: hpr === @@ -10,16 +10,6 @@ hpr Computes a rank-1 update of a Hermitian packed matrix. - .. container:: section - :name: GUID-61DC4DBA-9357-4129-B8A3-931E2E7335D4 - - - .. rubric:: Syntax - :name: syntax - :class: sectiontitle - - - .. cpp:function:: void hpr(queue &exec_queue, uplo upper_lower, std::int64_t n, T alpha, buffer &x, std::int64_t incx, buffer &a) ``hpr`` supports the following precisions. @@ -32,24 +22,17 @@ hpr * - ``std::complex`` - - .. container:: section - :name: GUID-02B8128C-02CE-4D5C-BE5D-DFD088C90475 .. rubric:: Description - :name: description :class: sectiontitle - The hpr routines compute a scalar-vector-vector product and add the + The ``hpr`` routines compute a scalar-vector-vector product and add the result to a Hermitian packed matrix. The operation is defined as - - - A <- alpha*x*x :sup:`H` + A @@ -65,23 +48,33 @@ hpr ``x`` is a vector of length ``n``. +hpr (Buffer Version) +-------------------- + +.. container:: + + .. container:: section + + + .. rubric:: Syntax + :class: sectiontitle + + + .. cpp:function:: void onemkl::blas::hpr(sycl::queue &queue, uplo upper_lower, std::int64_t n, T alpha, sycl::buffer &x, std::int64_t incx, sycl::buffer &a) + .. 
container:: section - :name: GUID-E1436726-01FE-4206-871E-B905F59A96B4 .. rubric:: Input Parameters - :name: input-parameters :class: sectiontitle - exec_queue + queue The queue where the routine should be executed. upper_lower - Specifies whether ``A`` is upper or lower triangular. See - :ref:`onemkl_datatypes` for more - details. + Specifies whether ``A`` is upper or lower triangular. See :ref:`onemkl_datatypes` for more details. n @@ -111,37 +104,125 @@ hpr The imaginary part of the diagonal elements need not be set and - are assumed to be zero + are assumed to be zero. .. container:: section - :name: GUID-7261182A-450B-46F5-8C61-7133597D3530 .. rubric:: Output Parameters - :name: output-parameters :class: sectiontitle a - Buffer holding the updated upper triangularpart of the Hermitian + Buffer holding the updated upper triangular part of the Hermitian matrix ``A`` if ``upper_lower =upper``, or the updated lower - triangular part of theHermitian matrix ``A`` if + triangular part of the Hermitian matrix ``A`` if ``upper_lower =lower``. - The imaginary parts of the diagonal elements are set tozero. + The imaginary parts of the diagonal elements are set to zero. -.. container:: familylinks +hpr (USM Version) +----------------- +.. container:: - .. container:: parentlink + .. container:: section - **Parent topic:** :ref:`blas-level-2-routines` - + .. rubric:: Syntax + :class: sectiontitle -.. container:: + .. container:: dlsyntaxpara + + + .. cpp:function:: sycl::event onemkl::blas::hpr(sycl::queue &queue, uplo upper_lower, std::int64_t n, T alpha, const T *x, std::int64_t incx, T *a, const sycl::vector_class &dependencies = {}) + .. container:: section + + + .. rubric:: Input Parameters + :class: sectiontitle + + + queue + The queue where the routine should be executed. + + + upper_lower + Specifies whether ``A`` is upper or lower triangular. See :ref:`onemkl_datatypes` for more details. + + n + Number of rows and columns of ``A``. Must be at least zero. 
+ + + alpha + Scaling factor for the scalar-vector-vector product. + + + x + Pointer to input vector ``x``. The array holding input vector + ``x`` must be of size at least (1 + (``n`` - 1)*abs(``incx``)). + See `Matrix and Vector + Storage <../matrix-storage.html>`__ for + more details. + + + incx + Stride of vector ``x``. + + + a + Pointer to input matrix ``A``. The array holding input matrix + ``A`` must have size at least (``n``\ \*(``n``\ +1))/2. See + `Matrix and Vector + Storage <../matrix-storage.html>`__ for + more details. + + + The imaginary parts of the diagonal elements need not be set and + are assumed to be zero. + + + dependencies + List of events to wait for before starting computation, if any. + If omitted, defaults to no dependencies. + + + .. container:: section + + + .. rubric:: Output Parameters + :class: sectiontitle + + + a + Pointer to the updated upper triangular part of the Hermitian + matrix ``A`` if ``upper_lower =upper``, or the updated lower + triangular part of the Hermitian matrix ``A`` if + ``upper_lower =lower``. + + + The imaginary parts of the diagonal elements are set to zero. + + + .. container:: section + + + .. rubric:: Return Values + :class: sectiontitle + + + Output event to wait on to ensure computation is complete. + + +.. container:: familylinks + + + .. container:: parentlink + + + **Parent topic:** :ref:`blas-level-2-routines` diff --git a/docs/domains/blas/hpr2.rst b/docs/domains/blas/hpr2.rst index bfe83d4b6..4537428cb 100644 --- a/docs/domains/blas/hpr2.rst +++ b/docs/domains/blas/hpr2.rst @@ -1,4 +1,4 @@ -.. _hpr2: +.. _onemkl_blas_hpr2: hpr2 ==== @@ -10,16 +10,6 @@ hpr2 Performs a rank-2 update of a Hermitian packed matrix. - .. container:: section - :name: GUID-9F8EB534-6520-4470-85AC-6AD8F2467AD4 - - - .. rubric:: Syntax - :name: syntax - :class: sectiontitle - - - .. 
cpp:function:: void hpr2(queue &exec_queue, uplo upper_lower, std::int64_t n, T alpha, buffer &x, std::int64_t incx, buffer &y, std::int64_t incy, buffer &a) ``hpr2`` supports the following precisions. @@ -35,21 +25,16 @@ hpr2 .. container:: section - :name: GUID-16FE1EDC-1A72-4BAB-8AFF-C316C4CE5838 .. rubric:: Description - :name: description :class: sectiontitle - The hpr2 routines compute two scalar-vector-vector products and add + The ``hpr2`` routines compute two scalar-vector-vector products and add them to a Hermitian packed matrix. The operation is defined as - - - A <- alpha*x*y :sup:`H` + conjg(alpha)*y*x :sup:`H` + A @@ -65,23 +50,33 @@ hpr2 ``x`` and ``y`` are vectors of length ``n``. +hpr2 (Buffer Version) +--------------------- + +.. container:: + + .. container:: section + + + .. rubric:: Syntax + :class: sectiontitle + + + .. cpp:function:: void onemkl::blas::hpr2(sycl::queue &queue, uplo upper_lower, std::int64_t n, T alpha, sycl::buffer &x, std::int64_t incx, sycl::buffer &y, std::int64_t incy, sycl::buffer &a) + .. container:: section - :name: GUID-E1436726-01FE-4206-871E-B905F59A96B4 .. rubric:: Input Parameters - :name: input-parameters :class: sectiontitle - exec_queue + queue The queue where the routine should be executed. upper_lower - Specifies whether *A* is upper or lower triangular. See - :ref:`onemkl_datatypes` for more - details. + Specifies whether ``A`` is upper or lower triangular. See :ref:`onemkl_datatypes` for more details. n @@ -126,38 +121,133 @@ hpr2 .. container:: section - :name: GUID-9A77A2E0-F610-44EE-A3EE-81327B90A3FD .. 
rubric:: Output Parameters - :name: output-parameters :class: sectiontitle - **sycl:** -   - - - a - Buffer holding the updated upper triangularpart of the Hermitian + Buffer holding the updated upper triangular part of the Hermitian matrix ``A`` if ``upper_lower =upper``, or the updated lower - triangular part of theHermitian matrix ``A`` if + triangular part of the Hermitian matrix ``A`` if ``upper_lower =lower``. - The imaginary parts of the diagonal elements are set tozero. + The imaginary parts of the diagonal elements are set to zero. -.. container:: familylinks +hpr2 (USM Version) +------------------ +.. container:: - .. container:: parentlink + .. container:: section - **Parent topic:** :ref:`blas-level-2-routines` - + .. rubric:: Syntax + :class: sectiontitle -.. container:: + .. container:: dlsyntaxpara + + + .. cpp:function:: sycl::event onemkl::blas::hpr2(sycl::queue &queue, uplo upper_lower, std::int64_t n, T alpha, const T *x, std::int64_t incx, const T *y, std::int64_t incy, T *a, const sycl::vector_class &dependencies = {}) + .. container:: section + + + .. rubric:: Input Parameters + :class: sectiontitle + + + queue + The queue where the routine should be executed. + + + upper_lower + Specifies whether ``A`` is upper or lower triangular. See :ref:`onemkl_datatypes` for more details. + + + n + Number of rows and columns of ``A``. Must be at least zero. + + alpha + Scaling factor for the scalar-vector-vector products. + + + x + Pointer to input vector ``x``. The array holding input vector + ``x`` must be of size at least (1 + (``n`` - 1)*abs(``incx``)). + See `Matrix and Vector + Storage <../matrix-storage.html>`__ for + more details. + + + incx + Stride of vector ``x``. + + + y + Pointer to input vector ``y``. The array holding + input vector ``y`` must be of size at least (1 + (``n`` + - 1)*abs(``incy``)). See `Matrix and Vector + Storage <../matrix-storage.html>`__ for + more details. + + + incy + Stride of vector ``y``. 
+ + + a + Pointer to input matrix ``A``. The array holding input matrix + ``A`` must have size at least (``n``\ \*(``n``\ +1))/2. See + `Matrix and Vector + Storage <../matrix-storage.html>`__ for + more details. + + + The imaginary parts of the diagonal elements need not be set + and are assumed to be zero. + + + dependencies + List of events to wait for before starting computation, if any. + If omitted, defaults to no dependencies. + + + .. container:: section + + + .. rubric:: Output Parameters + :class: sectiontitle + + + a + Pointer to the updated upper triangular part of the Hermitian + matrix ``A`` if ``upper_lower =upper``, or the updated lower + triangular part of the Hermitian matrix ``A`` if + ``upper_lower =lower``. + + + The imaginary parts of the diagonal elements are set to zero. + + + .. container:: section + + + .. rubric:: Return Values + :class: sectiontitle + + + Output event to wait on to ensure computation is complete. + + +.. container:: familylinks + + + .. container:: parentlink + + + **Parent topic:** :ref:`blas-level-2-routines` diff --git a/docs/domains/blas/iamax.rst b/docs/domains/blas/iamax.rst index 1678b7279..5ab2d436d 100644 --- a/docs/domains/blas/iamax.rst +++ b/docs/domains/blas/iamax.rst @@ -1,4 +1,4 @@ -.. _iamax: +.. _onemkl_blas_iamax: iamax ===== @@ -11,18 +11,8 @@ iamax vector. - .. container:: section - :name: GUID-D1ABF76D-DB39-4C23-A217-EA2C7C6D1325 - - - .. rubric:: Syntax - :name: syntax - :class: sectiontitle - - .. cpp:function:: void iamax(queue &exec_queue, std::int64_t n, buffer &x, std::int64_t incx, buffer &result) - - iamax supports the following precisions. + ``iamax`` supports the following precisions. .. list-table:: @@ -38,15 +28,13 @@ iamax .. container:: section - :name: GUID-822D7950-256E-406D-9305-61F761080E69 .. 
rubric:: Description - :name: description :class: sectiontitle - The iamax routines return an index ``i``\ such that ``x``\ [``i``] + The ``iamax`` routines return an index ``i``\ such that ``x``\ [``i``] has the maximum absolute value of all elements in vector ``x`` (real variants), or such that ``|Re(x[i])| + |Im(x[i])|`` is maximal (complex variants). @@ -56,7 +44,6 @@ iamax .. rubric:: Note - :name: note :class: NoteTipHead @@ -75,16 +62,28 @@ iamax index of the first ``NaN``. +iamax (Buffer Version) +---------------------- + +.. container:: + + .. container:: section + + + .. rubric:: Syntax + :class: sectiontitle + + + .. cpp:function:: void onemkl::blas::iamax(sycl::queue &queue, std::int64_t n, sycl::buffer &x, std::int64_t incx, sycl::buffer &result) + .. container:: section - :name: GUID-CE43FE84-2066-4095-BB7E-0691CD045443 .. rubric:: Input Parameters - :name: input-parameters :class: sectiontitle - exec_queue + queue The queue where the routine should be executed. @@ -104,11 +103,9 @@ iamax .. container:: section - :name: ARGUMENTS_EC9F05BE9B09443F8BC59207D5EA40F1 .. rubric:: Output Parameters - :name: output-parameters :class: sectiontitle @@ -117,16 +114,80 @@ iamax is stored. +iamax (USM Version) +------------------- -.. container:: familylinks +.. container:: + .. container:: section - .. container:: parentlink + .. rubric:: Syntax + :class: sectiontitle - **Parent topic:** :ref:`blas-level-1-routines` - + .. container:: dlsyntaxpara -.. container:: + .. cpp:function:: sycl::event onemkl::blas::iamax(sycl::queue &queue, std::int64_t n, const T *x, std::int64_t incx, T_res *result, const sycl::vector_class &dependencies = {}) + .. container:: section + + + .. rubric:: Input Parameters + :class: sectiontitle + + + queue + The queue where the routine should be executed. + + + n + The number of elements in vector ``x``. + + + x + The pointer to the input vector ``x``. 
The array holding the + input vector ``x`` must be of size at least (1 + (``n`` - + 1)*abs(``incx``)). See `Matrix and Vector + Storage <../matrix-storage.html>`__ for + more details. + + + incx + The stride of vector ``x``. + + + dependencies + List of events to wait for before starting computation, if any. + If omitted, defaults to no dependencies. + + + .. container:: section + + + .. rubric:: Output Parameters + :class: sectiontitle + + + result + The pointer to where the zero-based index ``i`` of the maximal + element is stored. + + + .. container:: section + + + .. rubric:: Return Values + :class: sectiontitle + + + Output event to wait on to ensure computation is complete. + + +.. container:: familylinks + + + .. container:: parentlink + + + **Parent topic:** :ref:`blas-level-1-routines` diff --git a/docs/domains/blas/iamin.rst b/docs/domains/blas/iamin.rst index ca5ea696f..509b575ef 100644 --- a/docs/domains/blas/iamin.rst +++ b/docs/domains/blas/iamin.rst @@ -1,4 +1,4 @@ -.. _iamin: +.. _onemkl_blas_iamin: iamin ===== @@ -10,16 +10,6 @@ iamin Finds the index of the element with the smallest absolute value. - .. container:: section - :name: GUID-5D077B60-17B5-4961-AFF7-20D78BFB2A07 - - - .. rubric:: Syntax - :name: syntax - :class: sectiontitle - - - .. cpp:function:: void iamin(queue &exec_queue, std::int64_t n, buffer &x, std::int64_t incx, buffer &result) ``iamin`` supports the following precisions. @@ -37,15 +27,13 @@ iamin .. container:: section - :name: GUID-A820CE7B-E983-4D8F-A73A-753FD95BD507 .. rubric:: Description - :name: description :class: sectiontitle - The iamin routines return an index ``i`` such that ``x``\ [``i``] has + The ``iamin`` routines return an index ``i`` such that ``x``\ [``i``] has the minimum absolute value of all elements in vector ``x`` (real variants), or such that \|Re(``x``\ [``i``])\| + \|Im(``x``\ [``i``])\| is maximal (complex variants). @@ -55,7 +43,6 @@ iamin .. 
rubric:: Note - :name: note :class: NoteTipHead @@ -74,16 +61,28 @@ iamin index of the first ``NaN``. +iamin (Buffer Version) +---------------------- + +.. container:: + + .. container:: section + + + .. rubric:: Syntax + :class: sectiontitle + + + .. cpp:function:: void onemkl::blas::iamin(sycl::queue &queue, std::int64_t n, sycl::buffer &x, std::int64_t incx, sycl::buffer &result) + .. container:: section - :name: GUID-A615800D-734E-4997-BB91-1C76AEEE9EC2 .. rubric:: Input Parameters - :name: input-parameters :class: sectiontitle - exec_queue + queue The queue where the routine should be executed. @@ -103,11 +102,9 @@ iamin .. container:: section - :name: GUID-2B160DEB-ADBB-4044-8078-4B613A0DA4E1 .. rubric:: Output Parameters - :name: output-parameters :class: sectiontitle @@ -116,15 +113,80 @@ iamin will be stored. -.. container:: familylinks +iamin (USM Version) +------------------- +.. container:: - .. container:: parentlink + .. container:: section - **Parent topic:** :ref:`blas-level-1-routines` - + .. rubric:: Syntax + :class: sectiontitle -.. container:: + .. container:: dlsyntaxpara + + + .. cpp:function:: sycl::event onemkl::blas::iamin(sycl::queue &queue, std::int64_t n, const T *x, std::int64_t incx, T_res *result, const sycl::vector_class &dependencies = {}) + .. container:: section + + + .. rubric:: Input Parameters + :class: sectiontitle + + queue + The queue where the routine should be executed. + + + n + Number of elements in vector ``x``. + + + x + The pointer to input vector ``x``. The array holding input + vector ``x`` must be of size at least (1 + (``n`` - + 1)*abs(``incx``)). See `Matrix and Vector + Storage <../matrix-storage.html>`__ for + more details. + + + incx + Stride of vector ``x``. + + + dependencies + List of events to wait for before starting computation, if any. + If omitted, defaults to no dependencies. + + + .. container:: section + + + .. 
rubric:: Output Parameters + :class: sectiontitle + + + result + Pointer to where the zero-based index ``i`` of the minimum + element will be stored. + + + .. container:: section + + + .. 
rubric:: Return Values + :class: sectiontitle + + + Output event to wait on to ensure computation is complete. + + +.. container:: familylinks + + + .. container:: parentlink + + + **Parent topic:** :ref:`blas-level-1-routines` diff --git a/docs/domains/blas/nrm2.rst b/docs/domains/blas/nrm2.rst index dfbf2265c..55d8fcccd 100644 --- a/docs/domains/blas/nrm2.rst +++ b/docs/domains/blas/nrm2.rst @@ -1,4 +1,4 @@ -.. _nrm2: +.. _onemkl_blas_nrm2: nrm2 ==== @@ -10,16 +10,6 @@ nrm2 Computes the Euclidean norm of a vector. - .. container:: section - :name: GUID-F55A15D5-CCDA-4C44-B86F-C9A5FB36725E - - - .. rubric:: Syntax - :name: syntax - :class: sectiontitle - - - .. cpp:function:: void nrm2(queue &exec_queue, std::int64_t n, buffer &x, std::int64_t incx, buffer &result) ``nrm2`` supports the following precisions. @@ -42,19 +32,13 @@ nrm2 .. container:: section - :name: GUID-2BF2C965-5A8C-47F1-9C73-FB0E485CE32A .. rubric:: Description - :name: description :class: sectiontitle - The nrm2 routines computes Euclidean norm of a vector - - - - + The ``nrm2`` routines compute the Euclidean norm of a vector result = ||x||, @@ -65,16 +49,27 @@ nrm2 ``x`` is a vector of ``n`` elements. +nrm2 (Buffer Version) +--------------------- + +.. container:: + + .. container:: section + + + .. rubric:: Syntax + :class: sectiontitle + + + .. cpp:function:: void onemkl::blas::nrm2(sycl::queue &queue, std::int64_t n, sycl::buffer &x, std::int64_t incx, sycl::buffer &result) .. container:: section - :name: GUID-A615800D-734E-4997-BB91-1C76AEEE9EC2 .. rubric:: Input Parameters - :name: input-parameters :class: sectiontitle - exec_queue + queue The queue where the routine should be executed. @@ -90,15 +85,13 @@ nrm2 incx - Stride of vector x. + Stride of vector ``x``. .. container:: section - :name: GUID-2B160DEB-ADBB-4044-8078-4B613A0DA4E1 .. rubric:: Output Parameters - :name: output-parameters :class: sectiontitle @@ -107,15 +100,80 @@ nrm2 stored. -.. 
container:: familylinks +nrm2 (USM Version) +------------------ +.. container:: - .. container:: parentlink + .. container:: section - **Parent topic:** :ref:`blas-level-1-routines` - + .. rubric:: Syntax + :class: sectiontitle -.. container:: + .. container:: dlsyntaxpara + + + .. cpp:function:: sycl::event onemkl::blas::nrm2(sycl::queue &queue, std::int64_t n, const T *x, std::int64_t incx, T_res *result, const sycl::vector_class &dependencies = {}) + .. container:: section + + + .. rubric:: Input Parameters + :class: sectiontitle + + queue + The queue where the routine should be executed. + + + n + Number of elements in vector ``x``. + + + x + Pointer to input vector ``x``. The array holding input vector + ``x`` must be of size at least (1 + (``n`` - 1)*abs(``incx``)). + See `Matrix and Vector + Storage <../matrix-storage.html>`__ for + more details. + + + incx + Stride of vector ``x``. + + + dependencies + List of events to wait for before starting computation, if any. + If omitted, defaults to no dependencies. + + + .. container:: section + + + .. rubric:: Output Parameters + :class: sectiontitle + + + result + Pointer to where the Euclidean norm of the vector ``x`` will be + stored. + + + .. container:: section + + + .. rubric:: Return Values + :class: sectiontitle + + + Output event to wait on to ensure computation is complete. + + +.. container:: familylinks + + + .. container:: parentlink + + + **Parent topic:** :ref:`blas-level-1-routines` diff --git a/docs/domains/blas/rot.rst b/docs/domains/blas/rot.rst index 2dc20bdce..c7534ee3f 100644 --- a/docs/domains/blas/rot.rst +++ b/docs/domains/blas/rot.rst @@ -1,4 +1,4 @@ -.. _rot: +.. _onemkl_blas_rot: rot === @@ -10,16 +10,6 @@ rot Performs rotation of points in the plane. - .. container:: section - :name: GUID-9DD44991-6A55-49EE-BD0C-F13406FFBE52 - - - .. rubric:: Syntax - :name: syntax - :class: sectiontitle - - - .. 
cpp:function:: void rot(queue &exec_queue, std::int64_t n, buffer &x, std::int64_t incx, buffer &y, std::int64_t incy, T_scalar c, T_scalar s) ``rot`` supports the following precisions. @@ -42,15 +32,13 @@ rot .. container:: section - :name: GUID-8B7F46D1-5047-4D4C-AF66-F0A3E4AC2BA5 .. rubric:: Description - :name: description :class: sectiontitle - Given two vectors ``x`` and ``y`` of ``n`` elements, the rot routines + Given two vectors ``x`` and ``y`` of ``n`` elements, the ``rot`` routines compute four scalar-vector products and update the input vectors with the sum of two of these scalar-vector products as follow: @@ -61,16 +49,28 @@ rot +rot (Buffer Version) +-------------------- + +.. container:: + + .. container:: section + + + .. rubric:: Syntax + :class: sectiontitle + + + .. cpp:function:: void onemkl::blas::rot(sycl::queue &queue, std::int64_t n, sycl::buffer &x, std::int64_t incx, sycl::buffer &y, std::int64_t incy, T_scalar c, T_scalar s) + .. container:: section - :name: GUID-A615800D-734E-4997-BB91-1C76AEEE9EC2 .. rubric:: Input Parameters - :name: input-parameters :class: sectiontitle - exec_queue + queue The queue where the routine should be executed. @@ -86,7 +86,7 @@ rot incx - Stride of vector x. + Stride of vector ``x``. y @@ -97,7 +97,7 @@ rot incy - Stride of vector y. + Stride of vector ``y``. c @@ -109,11 +109,9 @@ rot .. container:: section - :name: GUID-2B160DEB-ADBB-4044-8078-4B613A0DA4E1 .. rubric:: Output Parameters - :name: output-parameters :class: sectiontitle @@ -125,15 +123,103 @@ rot Buffer holding updated buffer ``y``. -.. container:: familylinks +rot (USM Version) +----------------- +.. container:: - .. container:: parentlink + .. container:: section - **Parent topic:** :ref:`blas-level-1-routines` - + .. rubric:: Syntax + :class: sectiontitle -.. container:: + .. container:: dlsyntaxpara + + + .. 
cpp:function:: sycl::event onemkl::blas::rot(sycl::queue &queue, std::int64_t n, T *x, std::int64_t incx, T *y, std::int64_t incy, T_scalar c, T_scalar s, const sycl::vector_class &dependencies = {}) + .. container:: section + + + .. rubric:: Input Parameters + :class: sectiontitle + + + queue + The queue where the routine should be executed. + + + n + Number of elements in vector ``x``. + + + x + Pointer to input vector ``x``. The array holding input vector + ``x`` must be of size at least (1 + (``n`` - 1)*abs(``incx``)). + See `Matrix and Vector + Storage <../matrix-storage.html>`__ for + more details. + + incx + Stride of vector ``x``. + + + y + Pointer to input vector ``y``. The array holding input vector + ``y`` must be of size at least (1 + (``n`` - 1)*abs(``incy``)). + See `Matrix and Vector + Storage <../matrix-storage.html>`__ for + more details. + + + incy + Stride of vector ``y``. + + + c + Scaling factor. + + + s + Scaling factor. + + + dependencies + List of events to wait for before starting computation, if any. + If omitted, defaults to no dependencies. + + + .. container:: section + + + .. rubric:: Output Parameters + :class: sectiontitle + + + x + Pointer to the updated vector ``x``. + + + y + Pointer to the updated vector ``y``. + + + .. container:: section + + + .. rubric:: Return Values + :class: sectiontitle + + + Output event to wait on to ensure computation is complete. + + +.. container:: familylinks + + + .. container:: parentlink + + + **Parent topic:** :ref:`blas-level-1-routines` diff --git a/docs/domains/blas/rotg.rst b/docs/domains/blas/rotg.rst index 3110a60a8..df3f8396c 100644 --- a/docs/domains/blas/rotg.rst +++ b/docs/domains/blas/rotg.rst @@ -1,4 +1,4 @@ -.. _rotg: +.. _onemkl_blas_rotg: rotg ==== @@ -10,16 +10,6 @@ rotg Computes the parameters for a Givens rotation. - .. container:: section - :name: GUID-E4B6E693-AC8C-4BB3-A197-3EB9E905B925 - - - .. rubric:: Syntax - :name: syntax - :class: sectiontitle - - - .. 
cpp:function:: void rotg(queue &exec_queue, buffer &a, buffer &b, buffer &c, buffer &s) ``rotg`` supports the following precisions. @@ -42,15 +32,13 @@ rotg .. container:: section - :name: GUID-5614B81D-C736-4714-88AB-29B38F9B3589 .. rubric:: Description - :name: description :class: sectiontitle - Given the Cartesian coordinates ``(a, b)`` of a point, the rotg + Given the Cartesian coordinates ``(a, b)`` of a point, the ``rotg`` routines return the parameters ``c``, ``s``, ``r``, and ``z`` associated with the Givens rotation. The parameters ``c`` and ``s`` define a unitary matrix such that: @@ -61,16 +49,28 @@ rotg 1/``c``; otherwise ``z`` is 1. +rotg (Buffer Version) +--------------------- + +.. container:: + + .. container:: section + + + .. rubric:: Syntax + :class: sectiontitle + + + .. cpp:function:: void onemkl::blas::rotg(sycl::queue &queue, sycl::buffer &a, sycl::buffer &b, sycl::buffer &c, sycl::buffer &s) + .. container:: section - :name: GUID-C2003328-15AA-4DF0-A417-40BECCA7DEA3 .. rubric:: Input Parameters - :name: input-parameters :class: sectiontitle - exec_queue + queue The queue where the routine should be executed @@ -83,11 +83,9 @@ rotg .. container:: section - :name: GUID-3B7937E3-2DF7-49A3-8F1E-2C9406BB4E88 .. rubric:: Output Parameters - :name: output-parameters :class: sectiontitle @@ -111,15 +109,87 @@ rotg rotation. -.. container:: familylinks +rotg (USM Version) +------------------ +.. container:: - .. container:: parentlink + .. container:: section - **Parent topic:** :ref:`blas-level-1-routines` - + .. rubric:: Syntax + :class: sectiontitle -.. container:: + .. container:: dlsyntaxpara + + + .. cpp:function:: sycl::event onemkl::blas::rotg(sycl::queue &queue, T *a, T *b, T_real *c, T *s, const sycl::vector_class &dependencies = {}) + .. container:: section + + + .. rubric:: Input Parameters + :class: sectiontitle + + + queue + The queue where the routine should be executed + + a + Pointer to the ``x``-coordinate of the point. 
+ + + b + Pointer to the ``y``-coordinate of the point. + + + dependencies + List of events to wait for before starting computation, if any. + If omitted, defaults to no dependencies. + + + .. container:: section + + + .. rubric:: Output Parameters + :class: sectiontitle + + + a + Pointer to the parameter ``r`` associated with the Givens + rotation. + + + b + Pointer to the parameter ``z`` associated with the Givens + rotation. + + + c + Pointer to the parameter ``c`` associated with the Givens + rotation. + + + s + Pointer to the parameter ``s`` associated with the Givens + rotation. + + + .. container:: section + + + .. rubric:: Return Values + :class: sectiontitle + + + Output event to wait on to ensure computation is complete. + + +.. container:: familylinks + + + .. container:: parentlink + + + **Parent topic:** :ref:`blas-level-1-routines` diff --git a/docs/domains/blas/rotm.rst b/docs/domains/blas/rotm.rst index 4d025c4ea..237abb96c 100644 --- a/docs/domains/blas/rotm.rst +++ b/docs/domains/blas/rotm.rst @@ -1,4 +1,4 @@ -.. _rotm: +.. _onemkl_blas_rotm: rotm ==== @@ -10,16 +10,6 @@ rotm Performs modified Givens rotation of points in the plane. - .. container:: section - :name: GUID-F8F2E2EB-1704-454D-BE45-C055D6F4E7D6 - - - .. rubric:: Syntax - :name: syntax - :class: sectiontitle - - - .. cpp:function:: void rotm(queue &exec_queue, std::int64_t n, buffer &x, std::int64_t incx, buffer &y, std::int64_t incy, buffer ¶m) ``rotm`` supports the following precisions. @@ -35,11 +25,9 @@ rotm .. container:: section - :name: GUID-856650C6-2998-4452-A34A-DF6CB801087D .. rubric:: Description - :name: description :class: sectiontitle @@ -55,16 +43,28 @@ rotm transformation matrix. +rotm (Buffer Version) +--------------------- + +.. container:: + + .. container:: section + + + .. rubric:: Syntax + :class: sectiontitle + + + .. 
cpp:function:: void onemkl::blas::rotm(sycl::queue &queue, std::int64_t n, sycl::buffer &x, std::int64_t incx, sycl::buffer &y, std::int64_t incy, sycl::buffer &param) + .. container:: section - :name: GUID-A615800D-734E-4997-BB91-1C76AEEE9EC2 .. rubric:: Input Parameters - :name: input-parameters :class: sectiontitle - exec_queue + queue The queue where the routine should be executed. @@ -80,7 +80,7 @@ rotm incx - Stride of vector x. + Stride of vector ``x``. y @@ -91,7 +91,7 @@ rotm incy - Stride of vector y. + Stride of vector ``y``. param @@ -102,12 +102,12 @@ rotm ``param``\ [0] contains a switch, ``flag``, - ``param``\ [1-4] contain *h\ 11*,\ *h\ 21*, *h\ 12*,\ *h\ 22* - respectively, the components ofthe modified Givens transformation + ``param``\ [1-4] contain *h\ 11*, \ *h\ 21*, *h\ 12*, and \ *h\ 22* + respectively, the components of the modified Givens transformation matrix ``H``. - Depending on the values of ``flag``, thecomponents of ``H`` are + Depending on the values of ``flag``, the components of ``H`` are set as follows: @@ -133,11 +133,9 @@ rotm .. container:: section - :name: GUID-062D805B-68FF-41F6-8D9A-329C92A77EA3 .. rubric:: Output Parameters - :name: output-parameters :class: sectiontitle @@ -149,22 +147,139 @@ rotm Buffer holding updated buffer ``y``. -.. container:: familylinks +rotm (USM Version) +------------------ +.. container:: - .. container:: parentlink + .. container:: section - **Parent topic:** :ref:`blas-level-1-routines` - + .. rubric:: Syntax + :class: sectiontitle -.. container:: + .. container:: dlsyntaxpara + + + .. cpp:function:: sycl::event onemkl::blas::rotm(sycl::queue &queue, std::int64_t n, T *x, std::int64_t incx, T *y, std::int64_t incy, T *param, const sycl::vector_class &dependencies = {}) + .. container:: section + + + .. rubric:: Input Parameters + :class: sectiontitle + + + queue + The queue where the routine should be executed. + + + n + Number of elements in vector ``x``. 
+ + + x + Pointer to the input vector ``x``. The array holding the vector + ``x`` must be of size at least (1 + (``n`` - 1)*abs(``incx``)). + See `Matrix and Vector + Storage <../matrix-storage.html>`__ for + more details. + + + incx + Stride of vector ``x``. + + + y + Pointer to the input vector ``y``. The array holding the vector + ``y`` must be of size at least (1 + (``n`` - 1)*abs(``incy``)). + See `Matrix and Vector + Storage <../matrix-storage.html>`__ for + more details. + + + incy + Stride of vector ``y``. + + param + Pointer to an array of size 5. The elements of the ``param`` + array are: -.. |image0| image:: ../equations/GUID-608D9BA6-827F-48DE-A01F-0EE5991F7ee1.png -.. |image1| image:: ../equations/GUID-608D9BA6-827F-48DE-A01F-0EE5991F7ee2.png -.. |image2| image:: ../equations/GUID-608D9BA6-827F-48DE-A01F-0EE5991F7ee3.png -.. |image3| image:: ../equations/GUID-608D9BA6-827F-48DE-A01F-0EE5991F7ee4.png -.. |image4| image:: ../equations/GUID-608D9BA6-827F-48DE-A01F-0EE5991F7ee5.png + + ``param``\ [0] contains a switch, ``flag``, + + + ``param``\ [1-4] contain *h\ 11*, \ *h\ 21*, *h\ 12*, and \ *h\ 22* + respectively, the components of the modified Givens + transformation matrix ``H``. + + + Depending on the values of ``flag``, the components of ``H`` are + set as follows: + + + | ``flag =``\ ``-1.0``: + | |image1| + + + | ``flag =``\ ``0.0``: + | |image2| + + + | ``flag =``\ ``1.0``: + | |image3| + + + | ``flag =``\ ``-2.0``: + | |image4| + + + In the last three cases, the matrix entries of 1.0, -1.0, 0.0 + are assumed based on the value of ``flag`` and are not required + to be set in the ``param`` vector. + + + dependencies + List of events to wait for before starting computation, if any. + If omitted, defaults to no dependencies. + + + .. container:: section + + + .. rubric:: Output Parameters + :class: sectiontitle + + + x + Pointer to the updated array ``x``. + + + y + Pointer to the updated array ``y``. + + + .. container:: section + + + .. 
rubric:: Return Values + :class: sectiontitle + + + Output event to wait on to ensure computation is complete. + + +.. container:: familylinks + + + .. container:: parentlink + + + **Parent topic:** :ref:`blas-level-1-routines` +.. |image0| image:: ../equations/GUID-67FC4AB3-40CB-441F-BA9F-88BAAC78Cee1.png +.. |image1| image:: ../equations/GUID-67FC4AB3-40CB-441F-BA9F-88BAAC78Cee2.png +.. |image2| image:: ../equations/GUID-67FC4AB3-40CB-441F-BA9F-88BAAC78Cee3.png +.. |image3| image:: ../equations/GUID-67FC4AB3-40CB-441F-BA9F-88BAAC78Cee4.png +.. |image4| image:: ../equations/GUID-67FC4AB3-40CB-441F-BA9F-88BAAC78Cee5.png diff --git a/docs/domains/blas/rotmg.rst b/docs/domains/blas/rotmg.rst index 64d6543ea..e89e64cc4 100644 --- a/docs/domains/blas/rotmg.rst +++ b/docs/domains/blas/rotmg.rst @@ -1,4 +1,4 @@ -.. _rotmg: +.. _onemkl_blas_rotmg: rotmg ===== @@ -10,16 +10,6 @@ rotmg Computes the parameters for a modified Givens rotation. - .. container:: section - :name: GUID-DF41021D-C145-495B-A717-45FB5F36E676 - - - .. rubric:: Syntax - :name: syntax - :class: sectiontitle - - - .. cpp:function:: void rotmg(queue &exec_queue, buffer &d1, buffer &d2, buffer &x1, buffer &y1, buffer &param) ``rotmg`` supports the following precisions. @@ -35,11 +25,9 @@ rotmg .. container:: section - :name: GUID-5525F11C-A739-487E-A7CC-6886A088035D .. rubric:: Description - :name: description :class: sectiontitle @@ -53,16 +41,28 @@ rotmg | |image0| +rotmg (Buffer Version) +---------------------- + +.. container:: + + .. container:: section + + + .. rubric:: Syntax + :class: sectiontitle + + + .. cpp:function:: void onemkl::blas::rotmg(sycl::queue &queue, sycl::buffer &d1, sycl::buffer &d2, sycl::buffer &x1, sycl::buffer &y1, sycl::buffer &param) + .. container:: section - :name: GUID-21946B3A-A859-4293-8EE7-965328AA6717 .. rubric:: Input Parameters - :name: input-parameters :class: sectiontitle - exec_queue + queue The queue where the routine should be executed. @@ -85,11 +85,9 @@ rotmg ..
container:: section - :name: GUID-1C0481DB-BB35-4DB7-941F-649EDAA77C6F .. rubric:: Output Parameters - :name: output-parameters :class: sectiontitle @@ -102,7 +100,7 @@ rotmg x1 - Buffer holding the *x*-coordinate of the rotated vector before + Buffer holding the ``x``-coordinate of the rotated vector before scaling @@ -144,22 +142,134 @@ rotmg be set in the ``param`` vector. -.. container:: familylinks +rotmg (USM Version) +------------------- +.. container:: - .. container:: parentlink + .. container:: section - **Parent topic:** :ref:`blas-level-1-routines` - + .. rubric:: Syntax + :class: sectiontitle -.. container:: + .. container:: dlsyntaxpara + + + .. cpp:function:: sycl::event onemkl::blas::rotmg(sycl::queue &queue, T *d1, T *d2, T *x1, T y1, T *param, const sycl::vector_class &dependencies = {}) + .. container:: section + + + .. rubric:: Input Parameters + :class: sectiontitle + + + queue + The queue where the routine should be executed. -.. |image0| image:: ../equations/GUID-D6A2FFBB-116D-4A37-A278-47F163915ee1.png -.. |image1| image:: ../equations/GUID-D6A2FFBB-116D-4A37-A278-47F163915ee2.png -.. |image2| image:: ../equations/GUID-D6A2FFBB-116D-4A37-A278-47F163915ee3.png -.. |image3| image:: ../equations/GUID-D6A2FFBB-116D-4A37-A278-47F163915ee4.png -.. |image4| image:: ../equations/GUID-D6A2FFBB-116D-4A37-A278-47F163915ee5.png + d1 + Pointer to the scaling factor for the ``x``-coordinate of the + input vector. + + + d2 + Pointer to the scaling factor for the ``y``-coordinate of the + input vector. + + + x1 + Pointer to the ``x``-coordinate of the input vector. + + + y1 + Scalar specifying the ``y``-coordinate of the input vector. + + + dependencies + List of events to wait for before starting computation, if any. + If omitted, defaults to no dependencies. + + + .. container:: section + + + .. rubric:: Output Parameters + :class: sectiontitle + + + d1 + Pointer to the first diagonal element of the updated matrix.
+ + + d2 + Pointer to the second diagonal element of the updated matrix. + + + x1 + Pointer to the ``x``-coordinate of the rotated vector before + scaling + + + param + Pointer to an array of size 5. + + + The elements of the ``param`` array are: + + + ``param[0]`` contains a switch, ``flag``. The other array + elements ``param[1-4]`` contain the components of the array + ``H``: ``h``\ :sub:`11`, ``h``\ :sub:`21`, ``h``\ :sub:`12`, + and ``h``\ :sub:`22`, respectively. + + + Depending on the values of ``flag``, the components of ``H`` + are set as follows: + + + | ``flag =``\ ``-1.0``: + | |image1| + + + | ``flag =``\ ``0.0``: + | |image2| + + + | ``flag =``\ ``1.0``: + | |image3| + + + | ``flag =``\ ``-2.0``: + | |image4| + + + In the last three cases, the matrix entries of 1.0, -1.0, and + 0.0 are assumed based on the value of ``flag`` and are not + required to be set in the ``param`` vector. + + + .. container:: section + + + .. rubric:: Return Values + :class: sectiontitle + + + Output event to wait on to ensure computation is complete. + + +.. container:: familylinks + + + .. container:: parentlink + + + **Parent topic:** :ref:`blas-level-1-routines` +.. |image0| image:: ../equations/GUID-DA21ECDC-F63E-4971-BA3F-492E69335ee1.png +.. |image1| image:: ../equations/GUID-DA21ECDC-F63E-4971-BA3F-492E69335ee2.png +.. |image2| image:: ../equations/GUID-DA21ECDC-F63E-4971-BA3F-492E69335ee3.png +.. |image3| image:: ../equations/GUID-DA21ECDC-F63E-4971-BA3F-492E69335ee4.png +.. |image4| image:: ../equations/GUID-DA21ECDC-F63E-4971-BA3F-492E69335ee5.png diff --git a/docs/domains/blas/sbmv.rst b/docs/domains/blas/sbmv.rst index b28b9b027..e376818b0 100644 --- a/docs/domains/blas/sbmv.rst +++ b/docs/domains/blas/sbmv.rst @@ -1,4 +1,4 @@ -.. _sbmv: +.. _onemkl_blas_sbmv: sbmv ==== @@ -10,16 +10,6 @@ sbmv Computes a matrix-vector product with a symmetric band matrix. - .. container:: section - :name: GUID-BEDE7E82-C168-498D-BF65-085BBCEF9A27 - - - .. 
rubric:: Syntax - :name: syntax - :class: sectiontitle - - - .. cpp:function:: void sbmv(queue &exec_queue, uplo upper_lower, std::int64_t n, std::int64_t k, T alpha, buffer &a, std::int64_t lda, buffer &x, std::int64_t incx, T beta, buffer &y, std::int64_t incy) ``sbmv`` supports the following precisions. @@ -35,22 +25,17 @@ sbmv .. container:: section - :name: GUID-4F227157-1724-4D1F-AFAB-58C722CA8D08 .. rubric:: Description - :name: description :class: sectiontitle - The sbmv routines compute a scalar-matrix-vector product and add the + The ``sbmv`` routines compute a scalar-matrix-vector product and add the result to a scalar-vector product, with a symmetric band matrix. The operation is defined as - - - y <- alpha*A*x + beta*y @@ -67,23 +52,33 @@ sbmv ``x`` and ``y`` are vectors of length ``n``. +sbmv (Buffer Version) +--------------------- + +.. container:: + + .. container:: section + + + .. rubric:: Syntax + :class: sectiontitle + + + .. cpp:function:: void onemkl::blas::sbmv(sycl::queue &queue, uplo upper_lower, std::int64_t n, std::int64_t k, T alpha, sycl::buffer &a, std::int64_t lda, sycl::buffer &x, std::int64_t incx, T beta, sycl::buffer &y, std::int64_t incy) + .. container:: section - :name: GUID-E1436726-01FE-4206-871E-B905F59A96B4 .. rubric:: Input Parameters - :name: input-parameters :class: sectiontitle - exec_queue + queue The queue where the routine should be executed. upper_lower - Specifies whether ``A`` is upper or lower triangular. See - :ref:`onemkl_datatypes` for more - details. + Specifies whether ``A`` is upper or lower triangular. See :ref:`onemkl_datatypes` for more details. n @@ -138,11 +133,9 @@ sbmv .. container:: section - :name: GUID-ABBEA4DA-7B4C-489A-8063-BDC09FBB1ADD .. rubric:: Output Parameters - :name: output-parameters :class: sectiontitle @@ -150,15 +143,121 @@ sbmv Buffer holding the updated vector ``y``. -.. container:: familylinks +sbmv (USM Version) +------------------ +.. container:: - .. container:: parentlink + .. 
container:: section - **Parent topic:** :ref:`blas-level-2-routines` - + .. rubric:: Syntax + :class: sectiontitle -.. container:: + .. container:: dlsyntaxpara + + + .. cpp:function:: sycl::event onemkl::blas::sbmv(sycl::queue &queue, uplo upper_lower, std::int64_t n, std::int64_t k, T alpha, const T *a, std::int64_t lda, const T *x, std::int64_t incx, T beta, T *y, std::int64_t incy, const sycl::vector_class &dependencies = {}) + .. container:: section + + + .. rubric:: Input Parameters + :class: sectiontitle + + + queue + The queue where the routine should be executed. + + + upper_lower + Specifies whether ``A`` is upper or lower triangular. See :ref:`onemkl_datatypes` for more details. + + + n + Number of rows and columns of ``A``. Must be at least zero. + + + k + Number of super-diagonals of the matrix ``A``. Must be at least + zero. + + alpha + Scaling factor for the matrix-vector product. + + + a + Pointer to input matrix ``A``. The array holding input matrix + ``A`` must have size at least ``lda``\ \*\ ``n``. See `Matrix + and Vector + Storage <../matrix-storage.html>`__ for + more details. + + + lda + Leading dimension of matrix ``A``. Must be at least (``k`` + + 1), and positive. + + + x + Pointer to input vector ``x``. The array holding input vector + ``x`` must be of size at least (1 + (``n`` - 1)*abs(``incx``)). + See `Matrix and Vector + Storage <../matrix-storage.html>`__ for + more details. + + + incx + Stride of vector ``x``. + + + beta + Scaling factor for vector ``y``. + + + y + Pointer to input/output vector ``y``. The array holding + input/output vector ``y`` must be of size at least (1 + (``n`` + - 1)*abs(``incy``)). See `Matrix and Vector + Storage <../matrix-storage.html>`__ for + more details. + + + incy + Stride of vector ``y``. + + + dependencies + List of events to wait for before starting computation, if any. + If omitted, defaults to no dependencies. + + + .. container:: section + + + .. 
rubric:: Output Parameters + :class: sectiontitle + + + y + Pointer to the updated vector ``y``. + + + .. container:: section + + + .. rubric:: Return Values + :class: sectiontitle + + + Output event to wait on to ensure computation is complete. + + +.. container:: familylinks + + + .. container:: parentlink + + + **Parent topic:** :ref:`blas-level-2-routines` diff --git a/docs/domains/blas/scal.rst b/docs/domains/blas/scal.rst index 97075eac6..ad4e9e753 100644 --- a/docs/domains/blas/scal.rst +++ b/docs/domains/blas/scal.rst @@ -1,4 +1,4 @@ -.. _scal: +.. _onemkl_blas_scal: scal ==== @@ -10,16 +10,6 @@ scal Computes the product of a vector by a scalar. - .. container:: section - :name: GUID-178A4C6A-3BA5-40F7-A3D6-4B6590B75EB4 - - - .. rubric:: Syntax - :name: syntax - :class: sectiontitle - - - .. cpp:function:: void scal(queue &exec_queue, std::int64_t n, T_scalar alpha, buffer &x, std::int64_t incx) ``scal`` supports the following precisions. @@ -46,18 +36,13 @@ scal .. container:: section - :name: GUID-8DDCA613-2750-43D0-A89B-13866F2DDE8C .. rubric:: Description - :name: description :class: sectiontitle - The scal routines computes a scalar-vector product: - - - + The ``scal`` routines compute a scalar-vector product: x <- alpha*x @@ -72,16 +57,28 @@ scal ``alpha`` is a scalar. +scal (Buffer Version) +--------------------- + +.. container:: + + .. container:: section + + + .. rubric:: Syntax + :class: sectiontitle + + + .. cpp:function:: void onemkl::blas::scal(sycl::queue &queue, std::int64_t n, T_scalar alpha, sycl::buffer &x, std::int64_t incx) + .. container:: section - :name: GUID-A615800D-734E-4997-BB91-1C76AEEE9EC2 .. rubric:: Input Parameters - :name: input-parameters :class: sectiontitle - exec_queue + queue The queue where the routine should be executed. @@ -101,15 +98,13 @@ scal incx - Stride of vector x. + Stride of vector ``x``. .. container:: section - :name: GUID-B36EBB3E-C79B-49F8-9F47-7B19BD6BE105 ..
rubric:: Output Parameters - :name: output-parameters :class: sectiontitle @@ -117,15 +112,77 @@ scal Buffer holding updated buffer ``x``. -.. container:: familylinks +scal (USM Version) +------------------ +.. container:: - .. container:: parentlink + .. container:: section - **Parent topic:** :ref:`blas-level-1-routines` - + .. rubric:: Syntax + :class: sectiontitle -.. container:: + .. container:: dlsyntaxpara + + + .. cpp:function:: sycl::event onemkl::blas::scal(sycl::queue &queue, std::int64_t n, T_scalar alpha, T *x, std::int64_t incx, const sycl::vector_class &dependencies = {}) + .. container:: section + + + .. rubric:: Input Parameters + :class: sectiontitle + + queue + The queue where the routine should be executed. + + + n + Number of elements in vector ``x``. + + + alpha + Specifies the scalar ``alpha``. + + + x + Pointer to the input vector ``x``. The array must be of size at + least (1 + (``n`` - 1)*abs(``incx``)). See `Matrix and Vector + Storage <../matrix-storage.html>`__ for + more details. + + + incx + Stride of vector ``x``. + + + .. container:: section + + + .. rubric:: Output Parameters + :class: sectiontitle + + + x + Pointer to the updated array ``x``. + + + .. container:: section + + + .. rubric:: Return Values + :class: sectiontitle + + + Output event to wait on to ensure computation is complete. + + +.. container:: familylinks + + + .. container:: parentlink + + + **Parent topic:** :ref:`blas-level-1-routines` diff --git a/docs/domains/blas/sdsdot.rst b/docs/domains/blas/sdsdot.rst index 11414fd5b..76ba70039 100644 --- a/docs/domains/blas/sdsdot.rst +++ b/docs/domains/blas/sdsdot.rst @@ -1,4 +1,4 @@ -.. _sdsdot: +.. _onemkl_blas_sdsdot: sdsdot ====== @@ -10,35 +10,41 @@ sdsdot Computes a vector-vector dot product with double precision. - .. container:: section - :name: GUID-2DDFDC38-65FA-40F5-AACB-8E383623EF4A - - - .. rubric:: Syntax - :name: syntax - :class: sectiontitle + .. container:: section - .. 
cpp:function:: void sdsdot(queue &exec_queue, std::int64_t n, float sb, buffer &x, std::int64_t incx, buffer &y, std::int64_t incy, buffer &result) .. rubric:: Description - :name: description :class: sectiontitle - The sdsdot routines perform a dot product between two vectors with + The ``sdsdot`` routines perform a dot product between two vectors with double precision: |image0| +sdsdot (Buffer Version) +----------------------- + +.. container:: + + .. container:: section + + + .. rubric:: Syntax + :class: sectiontitle + + .. cpp:function:: void onemkl::blas::sdsdot(sycl::queue &queue, std::int64_t n, float sb, sycl::buffer &x, std::int64_t incx, sycl::buffer &y, std::int64_t incy, sycl::buffer &result) + .. container:: section + + .. rubric:: Input Parameters - :name: input-parameters :class: sectiontitle - exec_queue + queue The queue where the routine should be executed. @@ -59,7 +65,7 @@ sdsdot incx - Stride of vector x. + Stride of vector ``x``. y @@ -71,11 +77,12 @@ sdsdot incy - Stride of vector y. - + Stride of vector ``y``. + .. container:: section + + .. rubric:: Output Parameters - :name: output-parameters :class: sectiontitle @@ -83,11 +90,95 @@ sdsdot Buffer where the result (a scalar) will be stored. If ``n`` < 0 the result is ``sb``. +sdsdot (USM Version) +-------------------- + +.. container:: + + .. container:: section + + + .. rubric:: Syntax + :class: sectiontitle + + + .. container:: dlsyntaxpara - **Parent topic:** :ref:`blas-level-1-routines` + + .. cpp:function:: sycl::event onemkl::blas::sdsdot(sycl::queue &queue, std::int64_t n, float sb, const float *x, std::int64_t incx, const float *y, std::int64_t incy, float *result, const sycl::vector_class &dependencies = {}) + .. container:: section + + + .. rubric:: Input Parameters + :class: sectiontitle + + + queue + The queue where the routine should be executed. + + + n + Number of elements in vectors ``x`` and ``y``. + + + sb + Single precision scalar to be added to the dot product. 
+ + + x + Pointer to the input vector ``x``. The array must be of size + at least (1 + (``n`` - 1)*abs(``incx``)). See `Matrix and + Vector + Storage <../matrix-storage.html>`__ + for more details. + + + incx + Stride of vector ``x``. + + + y + Pointer to the input vector ``y``. The array must be of size + at least (1 + (``n`` - 1)*abs(``incy``)). See `Matrix and + Vector + Storage <../matrix-storage.html>`__ + for more details. + + + incy + Stride of vector ``y``. + + + dependencies + List of events to wait for before starting computation, if + any. If omitted, defaults to no dependencies. + + .. container:: section + + .. rubric:: Output Parameters + :class: sectiontitle + + + result + Pointer to where the result (a scalar) will be stored. If + ``n`` < 0 the result is ``sb``. + + .. container:: section + + + .. rubric:: Return Values + :class: sectiontitle + + + Output event to wait on to ensure computation is complete. + +.. container:: familylinks + + .. container:: parentlink -.. |image0| image:: ../equations/GUID-9DB212E1-03E2-430C-8B1F-8F5CBD4F2ee1.png + **Parent topic:** :ref:`blas-level-1-routines` +.. |image0| image:: ../equations/GUID-9B91DAAE-72DD-4799-9983-12B021993ee1.png :class: img-middle diff --git a/docs/domains/blas/spmv.rst b/docs/domains/blas/spmv.rst index 0b1690df5..c744be625 100644 --- a/docs/domains/blas/spmv.rst +++ b/docs/domains/blas/spmv.rst @@ -1,4 +1,4 @@ -.. _spmv: +.. _onemkl_blas_spmv: spmv ==== @@ -10,16 +10,6 @@ spmv Computes a matrix-vector product with a symmetric packed matrix. - .. container:: section - :name: GUID-BCC82B03-92EB-4D73-B69C-8AE8646FBEAC - - - .. rubric:: Syntax - :name: syntax - :class: sectiontitle - - - .. cpp:function:: void spmv(queue &exec_queue, uplo upper_lower, std::int64_t n, T alpha, buffer &a, buffer &x, std::int64_t incx, T beta, buffer &y, std::int64_t incy) ``spmv`` supports the following precisions. @@ -35,22 +25,17 @@ spmv ..
container:: section - :name: GUID-D27BBFFF-79F4-4236-96A6-B305FA1858B0 .. rubric:: Description - :name: description :class: sectiontitle - The spmv routines compute a scalar-matrix-vector product and add the + The ``spmv`` routines compute a scalar-matrix-vector product and add the result to a scalar-vector product, with a symmetric packed matrix. The operation is defined as - - - y <- alpha*A*x + beta*y @@ -66,23 +51,33 @@ spmv ``x`` and ``y`` are vectors of length ``n``. +spmv (Buffer Version) +--------------------- + +.. container:: + + .. container:: section + + + .. rubric:: Syntax + :class: sectiontitle + + + .. cpp:function:: void onemkl::blas::spmv(sycl::queue &queue, uplo upper_lower, std::int64_t n, T alpha, sycl::buffer &a, sycl::buffer &x, std::int64_t incx, T beta, sycl::buffer &y, std::int64_t incy) + .. container:: section - :name: GUID-E1436726-01FE-4206-871E-B905F59A96B4 .. rubric:: Input Parameters - :name: input-parameters :class: sectiontitle - exec_queue + queue The queue where the routine should be executed. upper_lower - Specifies whether ``A`` is upper or lower triangular. See - :ref:`onemkl_datatypes` for more - details. + Specifies whether ``A`` is upper or lower triangular. See :ref:`onemkl_datatypes` for more details. n @@ -127,11 +122,9 @@ spmv .. container:: section - :name: GUID-23FF1F5C-5560-40B6-807D-B6352FA320D6 .. rubric:: Output Parameters - :name: output-parameters :class: sectiontitle @@ -139,15 +132,111 @@ spmv Buffer holding the updated vector ``y``. -.. container:: familylinks +spmv (USM Version) +------------------ +.. container:: - .. container:: parentlink + .. container:: section - **Parent topic:** :ref:`blas-level-2-routines` - + .. rubric:: Syntax + :class: sectiontitle -.. container:: + .. container:: dlsyntaxpara + + + .. 
cpp:function:: sycl::event onemkl::blas::spmv(sycl::queue &queue, uplo upper_lower, std::int64_t n, T alpha, const T *a, const T *x, std::int64_t incx, T beta, T *y, std::int64_t incy, const sycl::vector_class &dependencies = {}) + .. container:: section + + + .. rubric:: Input Parameters + :class: sectiontitle + + + queue + The queue where the routine should be executed. + + + upper_lower + Specifies whether ``A`` is upper or lower triangular. See :ref:`onemkl_datatypes` for more details. + + + n + Number of rows and columns of ``A``. Must be at least zero. + + alpha + Scaling factor for the matrix-vector product. + + + a + Pointer to input matrix ``A``. The array holding input matrix + ``A`` must have size at least (``n``\ \*(``n``\ +1))/2. See + `Matrix and Vector + Storage <../matrix-storage.html>`__ for + more details. + + + x + Pointer to input vector ``x``. The array holding input vector + ``x`` must be of size at least (1 + (``n`` - 1)*abs(``incx``)). + See `Matrix and Vector + Storage <../matrix-storage.html>`__ for + more details. + + + incx + Stride of vector ``x``. + + + beta + Scaling factor for vector ``y``. + + + y + Pointer to input/output vector ``y``. The array holding + input/output vector ``y`` must be of size at least (1 + (``n`` + - 1)*abs(``incy``)). See `Matrix and Vector + Storage <../matrix-storage.html>`__ for + more details. + + + incy + Stride of vector ``y``. + + + dependencies + List of events to wait for before starting computation, if any. + If omitted, defaults to no dependencies. + + + .. container:: section + + + .. rubric:: Output Parameters + :class: sectiontitle + + + y + Pointer to the updated vector ``y``. + + + .. container:: section + + + .. rubric:: Return Values + :class: sectiontitle + + + Output event to wait on to ensure computation is complete. + + +.. container:: familylinks + + + .. 
container:: parentlink + + + **Parent topic:** :ref:`blas-level-2-routines` diff --git a/docs/domains/blas/spr.rst b/docs/domains/blas/spr.rst index 0112706b6..4b3c97e4d 100644 --- a/docs/domains/blas/spr.rst +++ b/docs/domains/blas/spr.rst @@ -1,4 +1,4 @@ -.. _spr: +.. _onemkl_blas_spr: spr === @@ -10,16 +10,6 @@ spr Performs a rank-1 update of a symmetric packed matrix. - .. container:: section - :name: GUID-34904813-AFD9-4349-9DAC-A7221FBE9F97 - - - .. rubric:: Syntax - :name: syntax - :class: sectiontitle - - - .. cpp:function:: void spr(queue &exec_queue, uplo upper_lower, std::std::int64_t n, T alpha, buffer &x, std::int64_t incx, buffer &a) ``spr`` supports the following precisions. @@ -32,24 +22,17 @@ spr * - ``double`` - - .. container:: section - :name: GUID-E387B33A-CA59-45D8-BB01-31DF76C82A0D .. rubric:: Description - :name: description :class: sectiontitle - The spr routines compute a scalar-vector-vector product and add the + The ``spr`` routines compute a scalar-vector-vector product and add the result to a symmetric packed matrix. The operation is defined as - - - A <- alpha*x*x :sup:`T` + A @@ -65,23 +48,33 @@ spr ``x`` is a vector of length ``n``. +spr (Buffer Version) +-------------------- + +.. container:: + + .. container:: section + + + .. rubric:: Syntax + :class: sectiontitle + + + .. cpp:function:: void onemkl::blas::spr(sycl::queue &queue, uplo upper_lower, std::int64_t n, T alpha, sycl::buffer &x, std::int64_t incx, sycl::buffer &a) + .. container:: section - :name: GUID-E1436726-01FE-4206-871E-B905F59A96B4 .. rubric:: Input Parameters - :name: input-parameters :class: sectiontitle - exec_queue + queue The queue where the routine should be executed. upper_lower - Specifies whether ``A`` is upper or lower triangular. See - :ref:`onemkl_datatypes` for more - details. + Specifies whether ``A`` is upper or lower triangular. See :ref:`onemkl_datatypes` for more details. n @@ -111,35 +104,111 @@ spr ..
container:: section - :name: GUID-9FBC2F3B-EB8F-4733-ABBA-08D5685A761B .. rubric:: Output Parameters - :name: output-parameters :class: sectiontitle - **sycl:** -   - - - a - Buffer holding the updated upper triangularpart of the symmetric + Buffer holding the updated upper triangular part of the symmetric matrix ``A`` if ``upper_lower =upper``, or the updated lower - triangular part of thesymmetric matrix ``A`` if + triangular part of the symmetric matrix ``A`` if ``upper_lower =lower``. -.. container:: familylinks +spr (USM Version) +----------------- +.. container:: - .. container:: parentlink + .. container:: section - **Parent topic:** :ref:`blas-level-2-routines` - + .. rubric:: Syntax + :class: sectiontitle -.. container:: + .. container:: dlsyntaxpara + + + .. cpp:function:: sycl::event onemkl::blas::spr(sycl::queue &queue, uplo upper_lower, std::int64_t n, T alpha, const T *x, std::int64_t incx, T *a, const sycl::vector_class &dependencies = {}) + .. container:: section + + + .. rubric:: Input Parameters + :class: sectiontitle + + queue + The queue where the routine should be executed. + + + upper_lower + Specifies whether ``A`` is upper or lower triangular. See :ref:`onemkl_datatypes` for more details. + + + n + Number of rows and columns of ``A``. Must be at least zero. + + + alpha + Scaling factor for the matrix-vector product. + + + x + Pointer to input vector ``x``. The array holding input vector + ``x`` must be of size at least (1 + (``n`` - 1)*abs(``incx``)). + See `Matrix and Vector + Storage <../matrix-storage.html>`__ for + more details. + + + incx + Stride of vector ``x``. + + + a + Pointer to input matrix ``A``. The array holding input matrix + ``A`` must have size at least (``n``\ \*(``n``\ +1))/2. See + `Matrix and Vector + Storage <../matrix-storage.html>`__ for + more details. + + + dependencies + List of events to wait for before starting computation, if any. + If omitted, defaults to no dependencies. + + + .. container:: section + + + ..
rubric:: Output Parameters + :class: sectiontitle + + + a + Pointer to the updated upper triangular part of the symmetric + matrix ``A`` if ``upper_lower =upper``, or the updated lower + triangular part of the symmetric matrix ``A`` if + ``upper_lower =lower``. + + + .. container:: section + + + .. rubric:: Return Values + :class: sectiontitle + + + Output event to wait on to ensure computation is complete. + + +.. container:: familylinks + + + .. container:: parentlink + + + **Parent topic:** :ref:`blas-level-2-routines` diff --git a/docs/domains/blas/spr2.rst b/docs/domains/blas/spr2.rst index ca78f30ac..dd013a716 100644 --- a/docs/domains/blas/spr2.rst +++ b/docs/domains/blas/spr2.rst @@ -1,4 +1,4 @@ -.. _spr2: +.. _onemkl_blas_spr2: spr2 ==== @@ -10,16 +10,6 @@ spr2 Computes a rank-2 update of a symmetric packed matrix. - .. container:: section - :name: GUID-44B72132-1EC0-41FA-9189-4596CFD651B0 - - - .. rubric:: Syntax - :name: syntax - :class: sectiontitle - - - .. cpp:function:: void spr2(queue &exec_queue, uplo upper_lower, std::int64_t n, T alpha, buffer &x, std::int64_t incx, buffer &y, std::int64_t incy, buffer &a) ``spr`` supports the following precisions. @@ -35,15 +25,13 @@ spr2 .. container:: section - :name: GUID-3AF7EB4D-B3FE-4C0A-B7A0-6E286D4C642F .. rubric:: Description - :name: description :class: sectiontitle - The spr2 routines compute two scalar-vector-vector products and add + The ``spr2`` routines compute two scalar-vector-vector products and add them to a symmetric packed matrix. The operation is defined as @@ -65,23 +53,32 @@ spr2 ``x`` and ``y`` are vectors of length ``n``. +spr2 (Buffer Version) +--------------------- + +.. container:: + + .. container:: section + + + .. rubric:: Syntax + :class: sectiontitle + + + .. cpp:function:: void onemkl::blas::spr2(sycl::queue &queue, uplo upper_lower, std::int64_t n, T alpha, sycl::buffer &x, std::int64_t incx, sycl::buffer &y, std::int64_t incy, sycl::buffer &a) .. 
container:: section - :name: GUID-E1436726-01FE-4206-871E-B905F59A96B4 .. rubric:: Input Parameters - :name: input-parameters :class: sectiontitle - exec_queue + queue The queue where the routine should be executed. upper_lower - Specifies whether ``A`` is upper or lower triangular. See - :ref:`onemkl_datatypes` for more - details. + Specifies whether ``A`` is upper or lower triangular. See :ref:`onemkl_datatypes` for more details. n @@ -122,35 +119,123 @@ spr2 .. container:: section - :name: GUID-9796BA93-31FB-40B9-B139-219905913736 .. rubric:: Output Parameters - :name: output-parameters :class: sectiontitle - **sycl:** -   - - - a - Buffer holding the updated upper triangularpart of the symmetric + Buffer holding the updated upper triangular part of the symmetric matrix ``A`` if ``upper_lower =upper`` or the updated lower - triangular part of thesymmetric matrix ``A`` if + triangular part of the symmetric matrix ``A`` if ``upper_lower =lower``. -.. container:: familylinks +spr2 (USM Version) +------------------ +.. container:: - .. container:: parentlink + .. container:: section - **Parent topic:** :ref:`blas-level-2-routines` - + .. rubric:: Syntax + :class: sectiontitle -.. container:: + .. container:: dlsyntaxpara + + + .. cpp:function:: sycl::event onemkl::blas::spr2(sycl::queue &queue, uplo upper_lower, std::int64_t n, T alpha, const T *x, std::int64_t incx, const T *y, std::int64_t incy, T *a, const sycl::vector_class &dependencies = {}) + .. container:: section + + + .. rubric:: Input Parameters + :class: sectiontitle + + + queue + The queue where the routine should be executed. + + + upper_lower + Specifies whether ``A`` is upper or lower triangular. See :ref:`onemkl_datatypes` for more details. + + n + Number of rows and columns of ``A``. Must be at least zero. + + + alpha + Scaling factor for the matrix-vector product. + + + x + Pointer to input vector ``x``. The array holding input vector + ``x`` must be of size at least (1 + (``n`` - 1)*abs(``incx``)).
+ See `Matrix and Vector + Storage <../matrix-storage.html>`__ for + more details. + + + incx + Stride of vector ``x``. + + + y + Pointer to input vector ``y``. The array holding + input vector ``y`` must be of size at least (1 + (``n`` + - 1)*abs(``incy``)). See `Matrix and Vector + Storage <../matrix-storage.html>`__ for + more details. + + + incy + Stride of vector ``y``. + + + a + Pointer to input matrix ``A``. The array holding input matrix + ``A`` must have size at least (``n``\ \*(``n``\ +1))/2. See + `Matrix and Vector + Storage <../matrix-storage.html>`__ for + more details. + + + dependencies + List of events to wait for before starting computation, if any. + If omitted, defaults to no dependencies. + + + .. container:: section + + + .. rubric:: Output Parameters + :class: sectiontitle + + + a + Pointer to the updated upper triangular part of the symmetric + matrix ``A`` if ``upper_lower =upper`` or the updated lower + triangular part of the symmetric matrix ``A`` if + ``upper_lower =lower``. + + + .. container:: section + + + .. rubric:: Return Values + :class: sectiontitle + + + Output event to wait on to ensure computation is complete. + + +.. container:: familylinks + + + .. container:: parentlink + + + **Parent topic:** :ref:`blas-level-2-routines` diff --git a/docs/domains/blas/swap.rst b/docs/domains/blas/swap.rst index 3d4542779..18e22a5b5 100644 --- a/docs/domains/blas/swap.rst +++ b/docs/domains/blas/swap.rst @@ -1,4 +1,4 @@ -.. _swap: +.. _onemkl_blas_swap: swap ==== @@ -10,18 +10,8 @@ swap Swaps a vector with another vector. - .. container:: section - :name: GUID-F0DF0055-DF25-4EC7-8FF2-48D4FA91E42E - - - .. rubric:: Syntax - :name: syntax - :class: sectiontitle - - .. cpp:function:: void swap(queue &exec_queue, std::int64_t n, buffer &x, std::int64_t incx, buffer &y, std::int64_t incy) - - swap supports the following precisions. + ``swap`` supports the following precisions. ..
list-table:: @@ -34,38 +24,43 @@ swap * - ``std::complex`` - - .. container:: section - :name: GUID-FE88C4B7-4C74-41F8-94DE-E62888DD3BA4 .. rubric:: Description - :name: description :class: sectiontitle - Given two vectors of ``n`` elements, ``x`` and ``y``, the swap + Given two vectors of ``n`` elements, ``x`` and ``y``, the ``swap`` routines return vectors ``y`` and ``x`` swapped, each replacing the other. - + y <- x, x <- y - y <- x, x <- y +swap (Buffer Version) +--------------------- + +.. container:: + + .. container:: section + + + .. rubric:: Syntax + :class: sectiontitle + .. cpp:function:: void onemkl::blas::swap(sycl::queue &queue, std::int64_t n, sycl::buffer &x, std::int64_t incx, sycl::buffer &y, std::int64_t incy) + .. container:: section - :name: GUID-A615800D-734E-4997-BB91-1C76AEEE9EC2 .. rubric:: Input Parameters - :name: input-parameters :class: sectiontitle - exec_queue + queue The queue where the routine should be executed. @@ -81,7 +76,7 @@ swap incx - Stride of vector x. + Stride of vector ``x``. y @@ -92,15 +87,13 @@ swap incy - Stride of vector y. + Stride of vector ``y``. .. container:: section - :name: GUID-106AC665-DCBA-40ED-8779-0D9017064855 .. rubric:: Output Parameters - :name: output-parameters :class: sectiontitle @@ -114,15 +107,95 @@ swap ``x``. -.. container:: familylinks +swap (USM Version) +------------------ +.. container:: - .. container:: parentlink + .. container:: section - **Parent topic:** :ref:`blas-level-1-routines` - + .. rubric:: Syntax + :class: sectiontitle -.. container:: + .. container:: dlsyntaxpara + + + .. cpp:function:: sycl::event onemkl::blas::swap(sycl::queue &queue, std::int64_t n, T *x, std::int64_t incx, T *y, std::int64_t incy, const sycl::vector_class &dependencies = {}) + .. container:: section + + + .. rubric:: Input Parameters + :class: sectiontitle + + + queue + The queue where the routine should be executed. + + n + Number of elements in vector ``x``. + + + x + Pointer to the input vector ``x``. 
The array must be of size at + least (1 + (``n`` - 1)*abs(``incx``)). See `Matrix and Vector + Storage <../matrix-storage.html>`__ for + more details. + + + incx + Stride of vector ``x``. + + + y + Pointer to the input vector ``y``. The array must be of size at + least (1 + (``n`` - 1)*abs(``incy``)). See `Matrix and Vector + Storage <../matrix-storage.html>`__ for + more details. + + + incy + Stride of vector ``y``. + + + dependencies + List of events to wait for before starting computation, if any. + If omitted, defaults to no dependencies. + + + .. container:: section + + + .. rubric:: Output Parameters + :class: sectiontitle + + + x + Pointer to the updated array ``x``, that is, the input vector + ``y``. + + + y + Pointer to the updated array ``y``, that is, the input vector + ``x``. + + + .. container:: section + + + .. rubric:: Return Values + :class: sectiontitle + + + Output event to wait on to ensure computation is complete. + + +.. container:: familylinks + + + .. container:: parentlink + + + **Parent topic:** :ref:`blas-level-1-routines` diff --git a/docs/domains/blas/symm.rst b/docs/domains/blas/symm.rst index c14d9d2bf..87b1252de 100644 --- a/docs/domains/blas/symm.rst +++ b/docs/domains/blas/symm.rst @@ -1,4 +1,4 @@ -.. _symm: +.. _onemkl_blas_symm: symm ==== @@ -11,18 +11,8 @@ symm and one matrix is general. - .. container:: section - :name: GUID-BFE36A6B-941E-4B49-AB0E-CFB687B1AD64 - - - .. rubric:: Syntax - :name: syntax - :class: sectiontitle - - - .. cpp:function:: void symm(queue &exec_queue, side left_right, uplo upper_lower, std::int64_t m, std::int64_t n, T alpha, buffer &a, std::int64_t lda, buffer &b, std::int64_t ldb, T beta, buffer &c, std::int64_t ldc) - symm supports the following precisions. + ``symm`` supports the following precisions. .. list-table:: @@ -35,18 +25,14 @@ symm * - ``std::complex`` - - .. container:: section - :name: GUID-E8FE37B0-C527-4AA6-B57F-AE3F4843F23A .. 
rubric:: Description - :name: description :class: sectiontitle - The symm routines compute a scalar-matrix-matrix product and add the + The ``symm`` routines compute a scalar-matrix-matrix product and add the result to a scalar-matrix product, where one of the matrices in the multiplication is symmetric. The argument ``left_right`` determines if the symmetric matrix, ``A``, is on the left of the multiplication @@ -55,18 +41,11 @@ symm defined as - - - C <- alpha*A*B + beta*C, - or - - - C <- alpha*B*A + beta*C, @@ -82,31 +61,39 @@ symm ``B`` and ``C`` are ``m``-by-``n`` matrices. +symm (Buffer Version) +--------------------- + +.. container:: + + .. container:: section + + + .. rubric:: Syntax + :class: sectiontitle + + + .. cpp:function:: void onemkl::blas::symm(sycl::queue &queue, side left_right, uplo upper_lower, std::int64_t m, std::int64_t n, T alpha, sycl::buffer &a, std::int64_t lda, sycl::buffer &b, std::int64_t ldb, T beta, sycl::buffer &c, std::int64_t ldc) + .. container:: section - :name: GUID-70716375-C54E-4AA6-94DC-65AF79D46BB2 .. rubric:: Input Parameters - :name: input-parameters :class: sectiontitle - exec_queue + queue The queue where the routine should be executed. left_right Specifies whether ``A`` is on the left side of the multiplication - (``side::left``) or on the right side (``side::right``). See - :ref:`onemkl_datatypes` for more - details. + (``side::left``) or on the right side (``side::right``). See :ref:`onemkl_datatypes` for more details. upper_lower - Specifies whether *A*'s data is stored in its upper or lower - triangle. See - :ref:`onemkl_datatypes` for more - details. + Specifies whether ``A``'s data is stored in its upper or lower + triangle. See :ref:`onemkl_datatypes` for more details. m @@ -164,11 +151,9 @@ symm .. container:: section - :name: GUID-DD569858-5D3C-4565-8BAB-FE548427DCF2 .. rubric:: Output Parameters - :name: output-parameters :class: sectiontitle @@ -180,11 +165,9 @@ symm .. 
container:: section - :name: EXAMPLE_5EF48B8A07D849EA84A74FE22F0D5B24 .. rubric:: Notes - :name: notes :class: sectiontitle @@ -192,15 +175,146 @@ symm calling ``symm``. -.. container:: familylinks +symm (USM Version) +------------------ +.. container:: - .. container:: parentlink + .. container:: section - **Parent topic:** :ref:`blas-level-3-routines` - + .. rubric:: Syntax + :class: sectiontitle -.. container:: + .. container:: dlsyntaxpara + + + .. cpp:function:: sycl::event onemkl::blas::symm(sycl::queue &queue, side left_right, uplo upper_lower, std::int64_t m, std::int64_t n, T alpha, const T* a, std::int64_t lda, const T* b, std::int64_t ldb, T beta, T* c, std::int64_t ldc, const sycl::vector_class &dependencies = {}) + .. container:: section + + + .. rubric:: Input Parameters + :class: sectiontitle + + + queue + The queue where the routine should be executed. + + + left_right + Specifies whether ``A`` is on the left side of the + multiplication (``side::left``) or on the right side + (``side::right``). See :ref:`onemkl_datatypes` for more details. + + + upper_lower + Specifies whether ``A``'s data is stored in its upper or lower + triangle. See :ref:`onemkl_datatypes` for more details. + + + m + Number of rows of ``B`` and ``C``. The value of ``m`` must be + at least zero. + + + n + Number of columns of ``B`` and ``C``. The value of ``n`` must + be at least zero. + + alpha + Scaling factor for the matrix-matrix product. + + + a + Pointer to input matrix ``A``. Must have size at least + ``lda``\ \*\ ``m`` if ``A`` is on the left of the + multiplication, or ``lda``\ \*\ ``n`` if ``A`` is on the right. + See `Matrix and Vector + Storage <../matrix-storage.html>`__ for + more details. + + + lda + Leading dimension of ``A``. Must be at least ``m`` if ``A`` is + on the left of the multiplication, or at least ``n`` if ``A`` + is on the right. Must be positive. + + + b + Pointer to input matrix ``B``. Must have size at least + ``ldb``\ \*\ ``n``. 
See `Matrix and Vector + Storage <../matrix-storage.html>`__ for + more details. + + + ldb + Leading dimension of ``B``. Must be positive and at least + ``m``. + + + beta + Scaling factor for matrix ``C``. + + + c + Pointer to input/output matrix ``C``. Must have size at least + ``ldc``\ \*\ ``n``. See `Matrix and Vector + Storage <../matrix-storage.html>`__ for + more details. + + + ldc + Leading dimension of ``C``. Must be positive and at least + ``m``. + + + dependencies + List of events to wait for before starting computation, if any. + If omitted, defaults to no dependencies. + + + .. container:: section + + + .. rubric:: Output Parameters + :class: sectiontitle + + + c + Pointer to the output matrix, overwritten by + ``alpha``\ \*\ ``A``\ \*\ ``B`` + ``beta``\ \*\ ``C`` + (``left_right`` = ``side::left``) or + ``alpha``\ \*\ ``B``\ \*\ ``A`` + ``beta``\ \*\ ``C`` + (``left_right`` = ``side::right``). + + + .. container:: section + + + .. rubric:: Notes + :class: sectiontitle + + + If ``beta`` = 0, matrix ``C`` does not need to be initialized + before calling ``symm``. + + + .. container:: section + + + .. rubric:: Return Values + :class: sectiontitle + + + Output event to wait on to ensure computation is complete. + + +.. container:: familylinks + + + .. container:: parentlink + + + **Parent topic:** :ref:`blas-level-3-routines` diff --git a/docs/domains/blas/symv.rst b/docs/domains/blas/symv.rst index 8d59ed90a..d7abca890 100644 --- a/docs/domains/blas/symv.rst +++ b/docs/domains/blas/symv.rst @@ -1,4 +1,4 @@ -.. _symv: +.. _onemkl_blas_symv: symv ==== @@ -10,16 +10,6 @@ symv Computes a matrix-vector product for a symmetric matrix. - .. container:: section - :name: GUID-1E9C9EA9-0366-420E-A704-AB605C8ED92A - - - .. rubric:: Syntax - :name: syntax - :class: sectiontitle - - - .. 
cpp:function:: void symv(queue &exec_queue, uplo upper_lower, std::int64_t n, T alpha, buffer &a, std::int64_t lda, buffer &x, std::int64_t incx, T beta, buffer &y, std::int64_t incy) ``symv`` supports the following precisions. @@ -32,25 +22,18 @@ symv * - ``double`` - - .. container:: section - :name: GUID-DE8D8321-D53D-4226-A940-CDE0E720EC95 .. rubric:: Description - :name: description :class: sectiontitle - The symv routines routines compute a scalar-matrix-vector product and + The ``symv`` routines routines compute a scalar-matrix-vector product and add the result to a scalar-vector product, with a symmetric matrix. The operation is defined as - - - y <- alpha*A*x + beta*y @@ -66,23 +49,32 @@ symv ``x`` and ``y`` are vectors of length ``n``. +symv (Buffer Version) +--------------------- + +.. container:: + + .. container:: section + + + .. rubric:: Syntax + :class: sectiontitle + + + .. cpp:function:: void onemkl::blas::symv(sycl::queue &queue, uplo upper_lower, std::int64_t n, T alpha, sycl::buffer &a, std::int64_t lda, sycl::buffer &x, std::int64_t incx, T beta, sycl::buffer &y, std::int64_t incy) .. container:: section - :name: GUID-E1436726-01FE-4206-871E-B905F59A96B4 .. rubric:: Input Parameters - :name: input-parameters :class: sectiontitle - exec_queue + queue The queue where the routine should be executed. upper_lower - Specifies whether ``A`` is upper or lower triangular. See - :ref:`onemkl_datatypes` for more - details. + Specifies whether ``A`` is upper or lower triangular. See :ref:`onemkl_datatypes` for more details. n @@ -128,11 +120,9 @@ symv .. container:: section - :name: GUID-E16C8443-A2A4-483C-9D46-FF428E80FEB0 .. rubric:: Output Parameters - :name: output-parameters :class: sectiontitle @@ -140,15 +130,112 @@ symv Buffer holding the updated vector ``y``. -.. container:: familylinks +symv (USM Version) +------------------ +.. container:: - .. container:: parentlink + .. 
container:: section - **Parent topic:** :ref:`blas-level-2-routines` - + .. rubric:: Syntax + :class: sectiontitle -.. container:: + .. container:: dlsyntaxpara + + + .. cpp:function:: sycl::event onemkl::blas::symv(sycl::queue &queue, uplo upper_lower, std::int64_t n, T alpha, const T *a, std::int64_t lda, const T *x, std::int64_t incx, T beta, T *y, std::int64_t incy, const sycl::vector_class &dependencies = {}) + .. container:: section + + + .. rubric:: Input Parameters + :class: sectiontitle + + + queue + The queue where the routine should be executed. + + + upper_lower + Specifies whether ``A`` is upper or lower triangular. See :ref:`onemkl_datatypes` for more details. + + + n + Number of rows and columns of ``A``. Must be at least zero. + + alpha + Scaling factor for the matrix-vector product. + + + a + Pointer to input matrix ``A``. The array holding input matrix + ``A`` must have size at least ``lda``\ \*\ ``n``. See `Matrix + and Vector + Storage <../matrix-storage.html>`__ for + more details. + + + lda + Leading dimension of matrix ``A``. Must be at least ``m``, and + positive. + + + x + Pointer to input vector ``x``. The array holding input vector + ``x`` must be of size at least (1 + (``n`` - 1)*abs(``incx``)). + See `Matrix and Vector + Storage <../matrix-storage.html>`__ for + more details. + + + incx + Stride of vector ``x``. + + + y + Pointer to input/output vector ``y``. The array holding + input/output vector ``y`` must be of size at least (1 + (``n`` + - 1)*abs(``incy``)). See `Matrix and Vector + Storage <../matrix-storage.html>`__ for + more details. + + + incy + Stride of vector ``y``. + + + dependencies + List of events to wait for before starting computation, if any. + If omitted, defaults to no dependencies. + + + .. container:: section + + + .. rubric:: Output Parameters + :class: sectiontitle + + + y + Pointer to the updated vector ``y``. + + + .. container:: section + + + .. 
rubric:: Return Values + :class: sectiontitle + + + Output event to wait on to ensure computation is complete. + + +.. container:: familylinks + + + .. container:: parentlink + + + **Parent topic:** :ref:`blas-level-2-routines` diff --git a/docs/domains/blas/syr.rst b/docs/domains/blas/syr.rst index e3ff12e6b..8eca1cfe3 100644 --- a/docs/domains/blas/syr.rst +++ b/docs/domains/blas/syr.rst @@ -1,4 +1,4 @@ -.. _syr: +.. _onemkl_blas_syr: syr === @@ -10,16 +10,6 @@ syr Computes a rank-1 update of a symmetric matrix. - .. container:: section - :name: GUID-E620D36F-6B4E-40A6-8BDA-3D625DEF55A8 - - - .. rubric:: Syntax - :name: syntax - :class: sectiontitle - - - .. cpp:function:: void syr(queue &exec_queue, uplo upper_lower, std::int64_t n, T alpha, buffer &x, std::int64_t incx, buffer &a, std::int64_t lda) ``syr`` supports the following precisions. @@ -32,25 +22,18 @@ syr * - ``double`` - - .. container:: section - :name: GUID-E154DE4B-4559-4471-B92B-46AF8777AC97 .. rubric:: Description - :name: description :class: sectiontitle - The syr routines compute a scalar-vector-vector product add them and + The ``syr`` routines compute a scalar-vector-vector product add them and add the result to a matrix, with a symmetric matrix. The operation is defined as - - - A <- alpha*x*x :sup:`T` + A @@ -66,23 +49,33 @@ syr ``x`` is a vector of length ``n``. +syr (Buffer Version) +-------------------- + +.. container:: + + .. container:: section + + + .. rubric:: Syntax + :class: sectiontitle + + + .. cpp:function:: void onemkl::blas::syr(sycl::queue &queue, uplo upper_lower, std::int64_t n, T alpha, sycl::buffer &x, std::int64_t incx, sycl::buffer &a, std::int64_t lda) + .. container:: section - :name: GUID-E1436726-01FE-4206-871E-B905F59A96B4 .. rubric:: Input Parameters - :name: input-parameters :class: sectiontitle - exec_queue + queue The queue where the routine should be executed. upper_lower - Specifies whether ``A`` is upper or lower triangular. 
See - :ref:`onemkl_datatypes` for more - details. + Specifies whether ``A`` is upper or lower triangular. See :ref:`onemkl_datatypes` for more details. n @@ -117,30 +110,116 @@ syr .. container:: section - :name: GUID-C03D1215-FD77-4AD8-8FA2-C48A5D8B938C .. rubric:: Output Parameters - :name: output-parameters :class: sectiontitle a - Buffer holding the updated upper triangularpart of the symmetric + Buffer holding the updated upper triangular part of the symmetric matrix ``A`` if ``upper_lower =upper`` or the updated lower - triangular part of thesymmetric matrix ``A`` if + triangular part of the symmetric matrix ``A`` if ``upper_lower =lower``. -.. container:: familylinks +syr (USM Version) +----------------- +.. container:: - .. container:: parentlink + .. container:: section - **Parent topic:** :ref:`blas-level-2-routines` - + .. rubric:: Syntax + :class: sectiontitle -.. container:: + .. container:: dlsyntaxpara + + + .. cpp:function:: sycl::event onemkl::blas::syr(sycl::queue &queue, uplo upper_lower, std::int64_t n, T alpha, const T *x, std::int64_t incx, T *a, std::int64_t lda, const sycl::vector_class &dependencies = {}) + .. container:: section + + + .. rubric:: Input Parameters + :class: sectiontitle + + + queue + The queue where the routine should be executed. + + upper_lower + Specifies whether ``A`` is upper or lower triangular. See :ref:`onemkl_datatypes` for more details. + + + n + Number of columns of ``A``. Must be at least zero. + + + alpha + Scaling factor for the matrix-vector product. + + + x + Pointer to input vector ``x``. The array holding input vector + ``x`` must be of size at least (1 + (``n`` - 1)*abs(``incx``)). + See `Matrix and Vector + Storage <../matrix-storage.html>`__ for + more details. + + + incx + Stride of vector ``x``. + + + a + Pointer to input matrix ``A``. The array holding input matrix + ``A`` must have size at least ``lda``\ \*\ ``n``. See `Matrix + and Vector + Storage <../matrix-storage.html>`__ for + more details. 
+ + + lda + Leading dimension of matrix ``A``. Must be at least ``n``, and + positive. + + + dependencies + List of events to wait for before starting computation, if any. + If omitted, defaults to no dependencies. + + + .. container:: section + + + .. rubric:: Output Parameters + :class: sectiontitle + + + a + Pointer to the updated upper triangular part of the symmetric + matrix ``A`` if ``upper_lower =upper`` or the updated lower + triangular part of the symmetric matrix ``A`` if + ``upper_lower =lower``. + + + .. container:: section + + + .. rubric:: Return Values + :class: sectiontitle + + + Output event to wait on to ensure computation is complete. + + +.. container:: familylinks + + + .. container:: parentlink + + + **Parent topic:** :ref:`blas-level-2-routines` diff --git a/docs/domains/blas/syr2.rst b/docs/domains/blas/syr2.rst index 6459801cf..acded7ff8 100644 --- a/docs/domains/blas/syr2.rst +++ b/docs/domains/blas/syr2.rst @@ -1,4 +1,4 @@ -.. _syr2: +.. _onemkl_blas_syr2: syr2 ==== @@ -10,16 +10,6 @@ syr2 Computes a rank-2 update of a symmetric matrix. - .. container:: section - :name: GUID-580F2222-D47E-43A3-B9A2-037F353825D5 - - - .. rubric:: Syntax - :name: syntax - :class: sectiontitle - - - .. cpp:function:: void syr2(queue &exec_queue, uplo upper_lower, std::int64_t n, T alpha, buffer &x, std::int64_t incx, buffer &y, std::int64_t incy, buffer &a, std::int64_t lda) ``syr2`` supports the following precisions. @@ -32,25 +22,18 @@ syr2 * - ``double`` - - .. container:: section - :name: GUID-CDA05459-F2FE-4933-A552-D6E52EC46D13 .. rubric:: Description - :name: description :class: sectiontitle - The syr2 routines compute two scalar-vector-vector product add them + The ``syr2`` routines compute two scalar-vector-vector product add them and add the result to a matrix, with a symmetric matrix. The operation is defined as - - - A <- alpha*x*y :sup:`T` + alpha*y*x :sup:`T` + A @@ -66,23 +49,33 @@ syr2 ``x`` and ``y`` are vectors of length ``n``. 
+syr2 (Buffer Version) +--------------------- + +.. container:: + + .. container:: section + + + .. rubric:: Syntax + :class: sectiontitle + + + .. cpp:function:: void onemkl::blas::syr2(sycl::queue &queue, uplo upper_lower, std::int64_t n, T alpha, sycl::buffer &x, std::int64_t incx, sycl::buffer &y, std::int64_t incy, sycl::buffer &a, std::int64_t lda) + .. container:: section - :name: GUID-E1436726-01FE-4206-871E-B905F59A96B4 .. rubric:: Input Parameters - :name: input-parameters :class: sectiontitle - exec_queue + queue The queue where the routine should be executed. upper_lower - Specifies whether ``A`` is upper or lower triangular. See - :ref:`onemkl_datatypes` for more - details. + Specifies whether ``A`` is upper or lower triangular. See :ref:`onemkl_datatypes` for more details. n @@ -128,30 +121,128 @@ syr2 .. container:: section - :name: GUID-6992A39F-8AB7-42D9-B126-4F8ECF9C1ECE .. rubric:: Output Parameters - :name: output-parameters :class: sectiontitle a - Buffer holding the updated upper triangularpart of the symmetric + Buffer holding the updated upper triangular part of the symmetric matrix ``A`` if ``upper_lower =upper``, or the updated lower - triangular part of thesymmetric matrix ``A`` if + triangular part of the symmetric matrix ``A`` if ``upper_lower =lower``. -.. container:: familylinks +syr2 (USM Version) +------------------ +.. container:: - .. container:: parentlink + .. container:: section - **Parent topic:** :ref:`blas-level-2-routines` - + .. rubric:: Syntax + :class: sectiontitle -.. container:: + .. container:: dlsyntaxpara + + + .. cpp:function:: sycl::event onemkl::blas::syr2(sycl::queue &queue, uplo upper_lower, std::int64_t n, T alpha, const T *x, std::int64_t incx, const T *y, std::int64_t incy, T *a, std::int64_t lda, const sycl::vector_class &dependencies = {}) + .. container:: section + + + .. rubric:: Input Parameters + :class: sectiontitle + + + queue + The queue where the routine should be executed. 
+ + + upper_lower + Specifies whether ``A`` is upper or lower triangular. See :ref:`onemkl_datatypes` for more details. + + n + Number of columns of ``A``. Must be at least zero. + + + alpha + Scaling factor for the matrix-vector product. + + + x + Pointer to input vector ``x``. The array holding input vector + ``x`` must be of size at least (1 + (``n`` - 1)*abs(``incx``)). + See `Matrix and Vector + Storage <../matrix-storage.html>`__ for + more details. + + + incx + Stride of vector ``x``. + + + y + Pointer to input/output vector ``y``. The array holding + input/output vector ``y`` must be of size at least (1 + (``n`` + - 1)*abs(``incy``)). See `Matrix and Vector + Storage <../matrix-storage.html>`__ for + more details. + + + incy + Stride of vector ``y``. + + + a + Pointer to input matrix ``A``. The array holding input matrix + ``A`` must have size at least ``lda``\ \*\ ``n``. See `Matrix + and Vector + Storage <../matrix-storage.html>`__ for + more details. + + + lda + Leading dimension of matrix ``A``. Must be at least ``n``, and + positive. + + + dependencies + List of events to wait for before starting computation, if any. + If omitted, defaults to no dependencies. + + + .. container:: section + + + .. rubric:: Output Parameters + :class: sectiontitle + + + a + Pointer to the updated upper triangular part of the symmetric + matrix ``A`` if ``upper_lower =upper``, or the updated lower + triangular part of the symmetric matrix ``A`` if + ``upper_lower =lower``. + + + .. container:: section + + + .. rubric:: Return Values + :class: sectiontitle + + + Output event to wait on to ensure computation is complete. + + +.. container:: familylinks + + + .. container:: parentlink + + + **Parent topic:** :ref:`blas-level-2-routines` diff --git a/docs/domains/blas/syr2k.rst b/docs/domains/blas/syr2k.rst index e5687c856..3299e5da8 100644 --- a/docs/domains/blas/syr2k.rst +++ b/docs/domains/blas/syr2k.rst @@ -1,4 +1,4 @@ -.. _syr2k: +.. 
_onemkl_blas_syr2k: syr2k ===== @@ -10,18 +10,8 @@ syr2k Performs a symmetric rank-2k update. - .. container:: section - :name: GUID-EED2648B-6435-4DD1-AC36-21039DFC61DD - - - .. rubric:: Syntax - :name: syntax - :class: sectiontitle - - .. cpp:function:: void syr2k(queue &exec_queue, uplo upper_lower, transpose trans, std::int64_t n, std::int64_t k, T alpha, buffer &a, std::int64_t lda, buffer &b, std::int64_t ldb, T beta, buffer &c, std::int64_t ldc) - - syr2k supports the following precisions: + ``syr2k`` supports the following precisions: .. list-table:: @@ -37,35 +27,27 @@ syr2k .. container:: section - :name: GUID-1FB46B8F-1B13-4A6B-A3A5-0A5B34049068 .. rubric:: Description - :name: description :class: sectiontitle - The syr2k routines perform a rank-2k update of an ``n`` x ``n`` + The ``syr2k`` routines perform a rank-2k update of an ``n`` x ``n`` symmetric matrix ``C`` by general matrices ``A`` and ``B``. If ``trans`` = ``transpose::nontrans``, the operation is defined as: - - - C <- alpha*(A*B :sup:`T` + B*A :sup:`T`) + beta*C where ``A`` is ``n`` x ``k`` and ``B`` is ``k`` x ``n``. - If ``trans`` = ``transpose::trans``, the operationis defined as: - - - + If ``trans`` = ``transpose::trans``, the operation is defined as: - C <- alpha*(A :sup:`T`*B + B :sup:`T`*A) + beta*C + C <- alpha*(A :sup:`T` * B + B :sup:`T` * A) + beta * C where ``A`` is ``k`` x ``n`` and ``B`` is ``n`` x ``k``. @@ -83,24 +65,34 @@ syr2k The inner dimension of both matrix multiplications is ``k``. +syr2k (Buffer Version) +---------------------- + +.. container:: + + .. container:: section + + + .. rubric:: Syntax + :class: sectiontitle + + + .. cpp:function:: void onemkl::blas::syr2k(sycl::queue &queue, uplo upper_lower, transpose trans, std::int64_t n, std::int64_t k, T alpha, sycl::buffer &a, std::int64_t lda, sycl::buffer &b, std::int64_t ldb, T beta, sycl::buffer &c, std::int64_t ldc) + .. container:: section - :name: GUID-3EBEFBDD-93AF-4376-9BA2-A7042179BF13 .. 
rubric:: Input Parameters - :name: input-parameters :class: sectiontitle - exec_queue + queue The queue where the routine should be executed. upper_lower Specifies whether ``A``'s data is stored in its upper or lower - triangle. See - :ref:`onemkl_datatypes` for more - details. + triangle. See :ref:`onemkl_datatypes` for more details. trans @@ -170,27 +162,148 @@ syr2k .. container:: section - :name: GUID-5779F783-54BC-4887-9CBB-96B8EC9F00E9 .. rubric:: Output Parameters - :name: output-parameters :class: sectiontitle c - Output buffer, overwritten by the updated C matrix. + Output buffer, overwritten by the updated ``C`` matrix. -.. container:: familylinks +syr2k (USM Version) +------------------- +.. container:: - .. container:: parentlink + .. container:: section - **Parent topic:** :ref:`blas-level-3-routines` - + .. rubric:: Syntax + :class: sectiontitle -.. container:: + .. container:: dlsyntaxpara + + + .. cpp:function:: sycl::event onemkl::blas::syr2k(sycl::queue &queue, uplo upper_lower, transpose trans, std::int64_t n, std::int64_t k, T alpha, const T* a, std::int64_t lda, const T* b, std::int64_t ldb, T beta, T* c, std::int64_t ldc, const sycl::vector_class &dependencies = {}) + .. container:: section + + + .. rubric:: Input Parameters + :class: sectiontitle + + + queue + The queue where the routine should be executed. + + + upper_lower + Specifies whether ``A``'s data is stored in its upper or lower + triangle. See :ref:`onemkl_datatypes` for more details. + + + trans + Specifies the operation to apply, as described above. + Conjugation is never performed, even if ``trans`` = + ``transpose::conjtrans``. + + + n + Number of rows and columns in ``C``. The value of ``n`` must be + at least zero. + + k + Inner dimension of matrix multiplications.The value of ``k`` + must be at least zero. + + + alpha + Scaling factor for the rank-2\ ``k`` update. + + + a + Pointer to input matrix ``A``. 
If ``A`` is not transposed, + ``A`` is an ``m``-by-``k`` matrix so the array ``a`` must have + size at least ``lda``\ \*\ ``k``. If ``A`` is transposed, ``A`` + is an ``k``-by-``m`` matrix so the array ``a`` must have size + at least ``lda``\ \*\ ``m``. See `Matrix and Vector + Storage <../matrix-storage.html>`__ for + more details. + + + lda + Leading dimension of ``A``. Must be at least ``n`` if ``trans`` + = ``transpose::nontrans``, and at least ``k`` otherwise. Must + be positive. + + + b + Pointer to input matrix ``B``. If ``trans`` = + ``transpose::nontrans``, ``B`` is an ``k``-by-``n`` matrix so + the array ``b`` must have size at least ``ldb``\ \*\ ``n``. + Otherwise, ``B`` is an ``n``-by-``k`` matrix so the array ``b`` + must have size at least ``ldb``\ \*\ ``k``. See `Matrix and + Vector + Storage <../matrix-storage.html>`__ for + more details. + + + ldb + Leading dimension of ``B``. Must be at least ``k`` if ``trans`` + = ``transpose::nontrans``, and at least ``n`` otherwise. Must + be positive. + + + beta + Scaling factor for matrix ``C``. + + + c + Pointer to input/output matrix ``C``. Must have size at least + ``ldc``\ \*\ ``n``. See `Matrix and Vector + Storage <../matrix-storage.html>`__ for + more details + + + ldc + Leading dimension of ``C``. Must be positive and at least + ``n``. + + + dependencies + List of events to wait for before starting computation, if any. + If omitted, defaults to no dependencies. + + + .. container:: section + + + .. rubric:: Output Parameters + :class: sectiontitle + + + c + Pointer to the output matrix, overwritten by the updated ``C`` + matrix. + + + .. container:: section + + + .. rubric:: Return Values + :class: sectiontitle + + + Output event to wait on to ensure computation is complete. + + +.. container:: familylinks + + + .. 
container:: parentlink + + + **Parent topic:** :ref:`blas-level-3-routines` diff --git a/docs/domains/blas/syrk.rst b/docs/domains/blas/syrk.rst index d097db2dd..aa94d0767 100644 --- a/docs/domains/blas/syrk.rst +++ b/docs/domains/blas/syrk.rst @@ -1,4 +1,4 @@ -.. _syrk: +.. _onemkl_blas_syrk: syrk ==== @@ -10,18 +10,8 @@ syrk Performs a symmetric rank-k update. - .. container:: section - :name: GUID-F8123F9B-A182-4BDB-A1A3-90FEC4F56231 - - - .. rubric:: Syntax - :name: syntax - :class: sectiontitle - - .. cpp:function:: void syrk(queue &exec_queue, uplo upper_lower, transpose trans, std::int64_t n, std::int64_t k, T alpha, buffer &a, std::int64_t lda, T beta, buffer &c, std::int64_t ldc) - - syrk supports the following precisions. + ``syrk`` supports the following precisions. .. list-table:: @@ -34,23 +24,17 @@ syrk * - ``std::complex`` - - .. container:: section - :name: GUID-8E133139-EE58-44B8-A507-2263BDD1399B .. rubric:: Description - :name: description :class: sectiontitle - The syrk routines perform a rank-k update of a symmetric matrix ``C`` + The ``syrk`` routines perform a rank-k update of a symmetric matrix ``C`` by a general matrix ``A``. The operation is defined as: - - C <- alpha*op(A)*op(A)T + beta*C @@ -71,31 +55,38 @@ syrk Here op(``A``) is ``n``-by-``k``, and ``C`` is ``n``-by-``n``. +syrk (Buffer Version) +--------------------- + +.. container:: + + .. container:: section + + + .. rubric:: Syntax + :class: sectiontitle + + + .. cpp:function:: void onemkl::blas::syrk(sycl::queue &queue, uplo upper_lower, transpose trans, std::int64_t n, std::int64_t k, T alpha, sycl::buffer &a, std::int64_t lda, T beta, sycl::buffer &c, std::int64_t ldc) + .. container:: section - :name: GUID-96D007CC-23F0-46FA-9085-6DBFC5BB30E6 .. rubric:: Input Parameters - :name: input-parameters :class: sectiontitle - exec_queue + queue The queue where the routine should be executed. upper_lower Specifies whether ``A``'s data is stored in its upper or lower - triangle. 
See - :ref:`onemkl_datatypes` for more - details. + triangle. See :ref:`onemkl_datatypes` for more details. trans - Specifies op(``A``), the transposition operation applied to ``A`` - (See - :ref:`onemkl_datatypes` for more - details). Conjugation is never performed, even if ``trans`` = + Specifies op(``A``), the transposition operation applied to ``A`` (See :ref:`onemkl_datatypes` for more details). Conjugation is never performed, even if ``trans`` = ``transpose::conjtrans``. @@ -145,11 +136,9 @@ syrk .. container:: section - :name: GUID-E14CE68E-2E28-48BB-8FD7-B84A21563BDA .. rubric:: Output Parameters - :name: output-parameters :class: sectiontitle @@ -158,15 +147,118 @@ syrk ``alpha``\ \*op(``A``)*op(``A``)\ :sup:`T` + ``beta``\ \*\ ``C``. -.. container:: familylinks +syrk (USM Version) +------------------ +.. container:: - .. container:: parentlink + .. container:: section - **Parent topic:** :ref:`blas-level-3-routines` - + .. rubric:: Syntax + :class: sectiontitle -.. container:: + .. container:: dlsyntaxpara + + + .. cpp:function:: sycl::event onemkl::blas::syrk(sycl::queue &queue, uplo upper_lower, transpose trans, std::int64_t n, std::int64_t k, T alpha, const T* a, std::int64_t lda, T beta, T* c, std::int64_t ldc, const sycl::vector_class &dependencies = {}) + .. container:: section + + + .. rubric:: Input Parameters + :class: sectiontitle + + + queue + The queue where the routine should be executed. + + + upper_lower + Specifies whether ``A``'s data is stored in its upper or lower + triangle. See :ref:`onemkl_datatypes` for more details. + + + trans + Specifies op(``A``), the transposition operation applied to + ``A`` (See :ref:`onemkl_datatypes` for more details). Conjugation is never performed, even if + ``trans`` = ``transpose::conjtrans``. + + n + Number of rows and columns in ``C``. The value of ``n`` must be + at least zero. + + + k + Number of columns in op(``A``). The value of ``k`` must be at + least zero. 
+ + + alpha + Scaling factor for the rank-``k`` update. + + + a + Pointer to input matrix ``A``. If ``trans`` = + ``transpose::nontrans``, ``A`` is an ``n``-by-``k`` matrix so + the array ``a`` must have size at least ``lda``\ \*\ ``k``. + Otherwise, ``A`` is an ``k``-by-``n`` matrix so the array ``a`` + must have size at least ``lda``\ \*\ ``n``. See `Matrix and + Vector + Storage <../matrix-storage.html>`__ for + more details. + + + lda + Leading dimension of ``A``. Must be at least ``n`` if ``A`` is + not transposed, and at least ``k`` if ``A`` is transposed. Must + be positive. + + + beta + Scaling factor for matrix ``C``. + + + c + Pointer to input/output matrix ``C``. Must have size at least + ``ldc``\ \*\ ``n``. See `Matrix and Vector + Storage <../matrix-storage.html>`__ for + more details. + + + ldc + Leading dimension of ``C``. Must be positive and at least + ``n``. + + + .. container:: section + + + .. rubric:: Output Parameters + :class: sectiontitle + + + c + Pointer to the output matrix, overwritten by + ``alpha``\ \*op(``A``)*op(``A``)\ :sup:`T` + + ``beta``\ \*\ ``C``. + + + .. container:: section + + + .. rubric:: Return Values + :class: sectiontitle + + + Output event to wait on to ensure computation is complete. + + +.. container:: familylinks + + + .. container:: parentlink + + + **Parent topic:** :ref:`blas-level-3-routines` diff --git a/docs/domains/blas/tbmv.rst b/docs/domains/blas/tbmv.rst index eb5f7acf0..cb570220a 100644 --- a/docs/domains/blas/tbmv.rst +++ b/docs/domains/blas/tbmv.rst @@ -1,4 +1,4 @@ -.. _tbmv: +.. _onemkl_blas_tbmv: tbmv ==== @@ -10,16 +10,6 @@ tbmv Computes a matrix-vector product using a triangular band matrix. - .. container:: section - :name: GUID-BAC06253-0516-4F7F-97E6-C4CBA2DBB1A2 - - - .. rubric:: Syntax - :name: syntax - :class: sectiontitle - - - .. 
cpp:function:: void tbmv(queue &exec_queue, uplo upper_lower, transpose trans, diag unit_nonunit, std::int64_t n, std::int64_t k, buffer &a, std::int64_t lda, buffer &x, std::int64_t incx) ``tbmv`` supports the following precisions. @@ -37,21 +27,16 @@ tbmv .. container:: section - :name: GUID-4279E883-09A1-48F0-B9DA-8A1E86886B17 .. rubric:: Description - :name: description :class: sectiontitle - The tbmv routines compute a matrix-vector product with a triangular + The ``tbmv`` routines compute a matrix-vector product with a triangular band matrix. The operation is defined as - - - x <- op(A)*x @@ -69,36 +54,41 @@ tbmv ``x`` is a vector of length ``n``. +tbmv (Buffer Version) +--------------------- + +.. container:: + + .. container:: section + + + .. rubric:: Syntax + :class: sectiontitle + + + .. cpp:function:: void onemkl::blas::tbmv(sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_nonunit, std::int64_t n, std::int64_t k, sycl::buffer &a, std::int64_t lda, sycl::buffer &x, std::int64_t incx) .. container:: section - :name: GUID-E1436726-01FE-4206-871E-B905F59A96B4 .. rubric:: Input Parameters - :name: input-parameters :class: sectiontitle - exec_queue + queue The queue where the routine should be executed. upper_lower - Specifies whether ``A`` is upper or lower triangular. See - :ref:`onemkl_datatypes` for more - details. + Specifies whether ``A`` is upper or lower triangular. See :ref:`onemkl_datatypes` for more details. trans - Specifies op(``A``), the transposition operation applied to ``A``. - See - :ref:`onemkl_datatypes` for more - details. + Specifies op(``A``), the transposition operation applied to ``A``. See :ref:`onemkl_datatypes` for more details. + unit_nonunit - Specifies whether the matrix ``A`` is unit triangular or not. See - :ref:`onemkl_datatypes` - for more details. + Specifies whether the matrix ``A`` is unit triangular or not. See :ref:`onemkl_datatypes` for more details. n @@ -134,11 +124,9 @@ tbmv .. 
container:: section - :name: GUID-0B96A584-2EC7-484C-9FB0-C632053F0461 .. rubric:: Output Parameters - :name: output-parameters :class: sectiontitle @@ -146,15 +134,110 @@ tbmv Buffer holding the updated vector ``x``. -.. container:: familylinks +tbmv (USM Version) +------------------ +.. container:: - .. container:: parentlink + .. container:: section - **Parent topic:** :ref:`blas-level-2-routines` - + .. rubric:: Syntax + :class: sectiontitle -.. container:: + .. container:: dlsyntaxpara + + + .. cpp:function:: sycl::event onemkl::blas::tbmv(sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_nonunit, std::int64_t n, std::int64_t k, const T *a, std::int64_t lda, T *x, std::int64_t incx, const sycl::vector_class &dependencies = {}) + .. container:: section + + + .. rubric:: Input Parameters + :class: sectiontitle + + + queue + The queue where the routine should be executed. + + + upper_lower + Specifies whether ``A`` is upper or lower triangular. See :ref:`onemkl_datatypes` for more details. + + + trans + Specifies op(``A``), the transposition operation applied to + ``A``. See :ref:`onemkl_datatypes` for more details. + + unit_nonunit + Specifies whether the matrix ``A`` is unit triangular or not. See :ref:`onemkl_datatypes` for more details. + + + n + Number of rows and columns of ``A``. Must be at least zero. + + + k + Number of sub/super-diagonals of the matrix ``A``. Must be at + least zero. + + + a + Pointer to input matrix ``A``. The array holding input matrix + ``A`` must have size at least ``lda``\ \*\ ``n``. See `Matrix + and Vector + Storage <../matrix-storage.html>`__ for + more details. + + + lda + Leading dimension of matrix ``A``. Must be at least (``k`` + + 1), and positive. + + + x + Pointer to input vector ``x``. The array holding input vector + ``x`` must be of size at least (1 + (``n`` - 1)*abs(``incx``)). + See `Matrix and Vector + Storage <../matrix-storage.html>`__ for + more details. + + + incx + Stride of vector ``x``.
+ + + dependencies + List of events to wait for before starting computation, if any. + If omitted, defaults to no dependencies. + + + .. container:: section + + + .. rubric:: Output Parameters + :class: sectiontitle + + + x + Pointer to the updated vector ``x``. + + + .. container:: section + + + .. rubric:: Return Values + :class: sectiontitle + + + Output event to wait on to ensure computation is complete. + + +.. container:: familylinks + + + .. container:: parentlink + + + **Parent topic:** :ref:`blas-level-2-routines` diff --git a/docs/domains/blas/tbsv.rst b/docs/domains/blas/tbsv.rst index 73aab67bd..03156b461 100644 --- a/docs/domains/blas/tbsv.rst +++ b/docs/domains/blas/tbsv.rst @@ -1,4 +1,4 @@ -.. _tbsv: +.. _onemkl_blas_tbsv: tbsv ==== @@ -11,16 +11,6 @@ tbsv triangular band matrix. - .. container:: section - :name: GUID-4AC7186F-2D61-44C2-95BC-5981E750A021 - - - .. rubric:: Syntax - :name: syntax - :class: sectiontitle - - - .. cpp:function:: void tbsv(queue &exec_queue, uplo upper_lower, transpose trans, diag unit_nonunit, std::int64_t n, std::int64_t k, buffer &a, std::int64_t lda, buffer &x, std::int64_t incx) ``tbsv`` supports the following precisions. @@ -38,22 +28,17 @@ tbsv .. container:: section - :name: GUID-5AF4221C-AB14-4F9B-97A8-CAA78DF05E36 .. rubric:: Description - :name: description :class: sectiontitle - The tbsv routines solve a system of linear equations whose + The ``tbsv`` routines solve a system of linear equations whose coefficients are in a triangular band matrix. The operation is defined as - - - op(A)*x = b @@ -71,36 +56,40 @@ tbsv ``b`` and ``x`` are vectors of length ``n``. +tbsv (Buffer Version) +--------------------- + +.. container:: + + .. container:: section + + + .. rubric:: Syntax + :class: sectiontitle + + + .. 
cpp:function:: void onemkl::blas::tbsv(sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_nonunit, std::int64_t n, std::int64_t k, sycl::buffer &a, std::int64_t lda, sycl::buffer &x, std::int64_t incx) .. container:: section - :name: GUID-E1436726-01FE-4206-871E-B905F59A96B4 .. rubric:: Input Parameters - :name: input-parameters :class: sectiontitle - exec_queue + queue The queue where the routine should be executed. upper_lower - Specifies whether ``A`` is upper or lower triangular. See - :ref:`onemkl_datatypes` for more - details. + Specifies whether ``A`` is upper or lower triangular. See :ref:`onemkl_datatypes` for more details. trans - Specifies op(``A``), the transposition operation applied to ``A``. - See - :ref:`onemkl_datatypes` for more - details. + Specifies op(``A``), the transposition operation applied to ``A``. See :ref:`onemkl_datatypes` for more details. unit_nonunit - Specifies whether the matrix ``A`` is unit triangular or not. See - :ref:`onemkl_datatypes` - for more details. + Specifies whether the matrix ``A`` is unit triangular or not. See :ref:`onemkl_datatypes` for more details. n @@ -136,11 +125,9 @@ tbsv .. container:: section - :name: GUID-24B3C6B8-7FBD-4B24-84F2-242635B3026E .. rubric:: Output Parameters - :name: output-parameters :class: sectiontitle @@ -148,15 +135,110 @@ tbsv Buffer holding the solution vector ``x``. -.. container:: familylinks +tbsv (USM Version) +------------------ +.. container:: - .. container:: parentlink + .. container:: section - **Parent topic:** :ref:`blas-level-2-routines` - + .. rubric:: Syntax + :class: sectiontitle -.. container:: + .. container:: dlsyntaxpara + + + .. cpp:function:: sycl::event onemkl::blas::tbsv(sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_nonunit, std::int64_t n, std::int64_t k, const T *a, std::int64_t lda, T *x, std::int64_t incx, const sycl::vector_class &dependencies = {}) + .. container:: section + + + .. 
rubric:: Input Parameters + :class: sectiontitle + + + queue + The queue where the routine should be executed. + + + upper_lower + Specifies whether ``A`` is upper or lower triangular. See :ref:`onemkl_datatypes` for more details. + + + trans + Specifies op(``A``), the transposition operation applied to + ``A``. See :ref:`onemkl_datatypes` for more details. + + unit_nonunit + Specifies whether the matrix ``A`` is unit triangular or not. See :ref:`onemkl_datatypes` for more details. + + + n + Number of rows and columns of ``A``. Must be at least zero. + + + k + Number of sub/super-diagonals of the matrix ``A``. Must be at + least zero. + + + a + Pointer to input matrix ``A``. The array holding input matrix + ``A`` must have size at least ``lda``\ \*\ ``n``. See `Matrix + and Vector + Storage <../matrix-storage.html>`__ for + more details. + + + lda + Leading dimension of matrix ``A``. Must be at least (``k`` + + 1), and positive. + + + x + Pointer to input vector ``x``. The array holding input vector + ``x`` must be of size at least (1 + (``n`` - 1)*abs(``incx``)). + See `Matrix and Vector + Storage <../matrix-storage.html>`__ for + more details. + + + incx + Stride of vector ``x``. + + + dependencies + List of events to wait for before starting computation, if any. + If omitted, defaults to no dependencies. + + + .. container:: section + + + .. rubric:: Output Parameters + :class: sectiontitle + + + x + Pointer to the solution vector ``x``. + + + .. container:: section + + + .. rubric:: Return Values + :class: sectiontitle + + + Output event to wait on to ensure computation is complete. + + +.. container:: familylinks + + + .. container:: parentlink + + + **Parent topic:** :ref:`blas-level-2-routines` diff --git a/docs/domains/blas/tpmv.rst b/docs/domains/blas/tpmv.rst index f8bd2b136..ac49c38f0 100644 --- a/docs/domains/blas/tpmv.rst +++ b/docs/domains/blas/tpmv.rst @@ -1,4 +1,4 @@ -.. _tpmv: +.. 
_onemkl_blas_tpmv: tpmv ==== @@ -10,16 +10,6 @@ tpmv Computes a matrix-vector product using a triangular packed matrix. - .. container:: section - :name: GUID-5785B6D6-DB9C-43FA-B98A-009D5E077A9D - - - .. rubric:: Syntax - :name: syntax - :class: sectiontitle - - - .. cpp:function:: void tpmv(queue &exec_queue, uplo upper_lower, transpose trans, diag unit_nonunit, std::int64_t n, buffer &a, buffer &x, std::int64_t incx) ``tpmv`` supports the following precisions. @@ -37,21 +27,16 @@ tpmv .. container:: section - :name: GUID-A045480A-2EC1-4C73-A836-468324FCC85A .. rubric:: Description - :name: description :class: sectiontitle - The tpmv routines compute a matrix-vector product with a triangular + The ``tpmv`` routines compute a matrix-vector product with a triangular packed matrix. The operation is defined as - - - x <- op(A)*x @@ -69,36 +54,41 @@ tpmv ``x`` is a vector of length ``n``. +tpmv (Buffer Version) +--------------------- + +.. container:: + + .. container:: section + + + .. rubric:: Syntax + :class: sectiontitle + + + .. cpp:function:: void onemkl::blas::tpmv(sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_nonunit, std::int64_t n, sycl::buffer &a, sycl::buffer &x, std::int64_t incx) + .. container:: section - :name: GUID-E1436726-01FE-4206-871E-B905F59A96B4 .. rubric:: Input Parameters - :name: input-parameters :class: sectiontitle - exec_queue + queue The queue where the routine should be executed. upper_lower - Specifies whether ``A`` is upper or lower triangular. See - :ref:`onemkl_datatypes` for more - details. + Specifies whether ``A`` is upper or lower triangular. See :ref:`onemkl_datatypes` for more details. trans - Specifies op(``A``), the transposition operation applied to ``A``. - See - :ref:`onemkl_datatypes` for more - details. + Specifies op(``A``), the transposition operation applied to ``A``. See :ref:`onemkl_datatypes` for more details. unit_nonunit - Specifies whether the matrix ``A`` is unit triangular or not. 
See - :ref:`onemkl_datatypes` - for more details. + Specifies whether the matrix ``A`` is unit triangular or not. See :ref:`onemkl_datatypes` for more details. n @@ -124,11 +114,9 @@ tpmv .. container:: section - :name: GUID-180038D9-902F-4B20-AB6B-E38F2A6C83E4 .. rubric:: Output Parameters - :name: output-parameters :class: sectiontitle @@ -136,15 +124,100 @@ tpmv Buffer holding the updated vector ``x``. -.. container:: familylinks +tpmv (USM Version) +------------------ +.. container:: - .. container:: parentlink + .. container:: section - **Parent topic:** :ref:`blas-level-2-routines` - + .. rubric:: Syntax + :class: sectiontitle -.. container:: + .. container:: dlsyntaxpara + + + .. cpp:function:: sycl::event onemkl::blas::tpmv(sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_nonunit, std::int64_t n, const T *a, T *x, std::int64_t incx, const sycl::vector_class &dependencies = {}) + .. container:: section + + + .. rubric:: Input Parameters + :class: sectiontitle + + + queue + The queue where the routine should be executed. + + + upper_lower + Specifies whether ``A`` is upper or lower triangular. See :ref:`onemkl_datatypes` for more details. + + trans + Specifies op(``A``), the transposition operation applied to + ``A``. See :ref:`onemkl_datatypes` for more details. + + + unit_nonunit + Specifies whether the matrix ``A`` is unit triangular or not. See :ref:`onemkl_datatypes` for more details. + + + n + Number of rows and columns of ``A``. Must be at least zero. + + + a + Pointer to input matrix ``A``. The array holding input matrix + ``A`` must have size at least (``n``\ \*(``n``\ +1))/2. See + `Matrix and Vector + Storage <../matrix-storage.html>`__ for + more details. + + + x + Pointer to input vector ``x``. The array holding input vector + ``x`` must be of size at least (1 + (``n`` - 1)*abs(``incx``)). + See `Matrix and Vector + Storage <../matrix-storage.html>`__ for + more details. + + + incx + Stride of vector ``x``.
+ + + dependencies + List of events to wait for before starting computation, if any. + If omitted, defaults to no dependencies. + + + .. container:: section + + + .. rubric:: Output Parameters + :class: sectiontitle + + + x + Pointer to the updated vector ``x``. + + + .. container:: section + + + .. rubric:: Return Values + :class: sectiontitle + + + Output event to wait on to ensure computation is complete. + + +.. container:: familylinks + + + .. container:: parentlink + + + **Parent topic:** :ref:`blas-level-2-routines` diff --git a/docs/domains/blas/tpsv.rst b/docs/domains/blas/tpsv.rst index 0ec419bd0..3bd98ad73 100644 --- a/docs/domains/blas/tpsv.rst +++ b/docs/domains/blas/tpsv.rst @@ -1,4 +1,4 @@ -.. _tpsv: +.. _onemkl_blas_tpsv: tpsv ==== @@ -11,16 +11,6 @@ tpsv triangular packed matrix. - .. container:: section - :name: GUID-230CF8CA-B38D-4CB6-9917-029FEF53EBED - - - .. rubric:: Syntax - :name: syntax - :class: sectiontitle - - - .. cpp:function:: void tpsv(queue &exec_queue, uplo upper_lower, transpose trans, diag unit_nonunit, std::int64_t n, std::int64_t k, buffer &a, buffer &x, std::int64_t incx) ``tpsv`` supports the following precisions. @@ -38,22 +28,17 @@ tpsv .. container:: section - :name: GUID-7AD9F8E2-1343-4A6D-8C6A-F68D934292B7 .. rubric:: Description - :name: description :class: sectiontitle - The tpsv routines solve a system of linear equations whose + The ``tpsv`` routines solve a system of linear equations whose coefficients are in a triangular packed matrix. The operation is defined as - - - op(A)*x = b @@ -71,36 +56,40 @@ tpsv ``b`` and ``x`` are vectors of length ``n``. +tpsv (Buffer Version) +--------------------- + +.. container:: + + .. container:: section + + + .. rubric:: Syntax + :class: sectiontitle + + + .. cpp:function:: void onemkl::blas::tpsv(sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_nonunit, std::int64_t n, sycl::buffer &a, sycl::buffer &x, std::int64_t incx) ..
container:: section - :name: GUID-E1436726-01FE-4206-871E-B905F59A96B4 .. rubric:: Input Parameters - :name: input-parameters :class: sectiontitle - exec_queue + queue The queue where the routine should be executed. upper_lower - Specifies whether ``A`` is upper or lower triangular. See - :ref:`onemkl_datatypes` for more - details. + Specifies whether ``A`` is upper or lower triangular. See :ref:`onemkl_datatypes` for more details. trans - Specifies op(``A``), the transposition operation applied to ``A``. - See - :ref:`onemkl_datatypes` for more - details. + Specifies op(``A``), the transposition operation applied to ``A``. See :ref:`onemkl_datatypes` for more details. unit_nonunit - Specifies whether the matrix ``A`` is unit triangular or not. See - :ref:`onemkl_datatypes` - for more details. + Specifies whether the matrix ``A`` is unit triangular or not. See :ref:`onemkl_datatypes` for more details. n @@ -127,11 +116,9 @@ tpsv .. container:: section - :name: GUID-F515C77C-1E84-424B-A00A-874ACBEFBF9E .. rubric:: Output Parameters - :name: output-parameters :class: sectiontitle @@ -139,15 +126,101 @@ tpsv Buffer holding the solution vector ``x``. -.. container:: familylinks +tpsv (USM Version) +------------------ +.. container:: - .. container:: parentlink + .. container:: section - **Parent topic:** :ref:`blas-level-2-routines` - + .. rubric:: Syntax + :class: sectiontitle -.. container:: + .. container:: dlsyntaxpara + + + .. cpp:function:: sycl::event onemkl::blas::tpsv(sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_nonunit, std::int64_t n, const T *a, T *x, std::int64_t incx, const sycl::vector_class &dependencies = {}) + .. container:: section + + + .. rubric:: Input Parameters + :class: sectiontitle + + + queue + The queue where the routine should be executed. + + + upper_lower + Specifies whether ``A`` is upper or lower triangular. See :ref:`onemkl_datatypes` for more details.
+ + trans + Specifies op(``A``), the transposition operation applied to + ``A``. See :ref:`onemkl_datatypes` for more details. + + + unit_nonunit + Specifies whether the matrix ``A`` is unit triangular or not. See :ref:`onemkl_datatypes` for more details. + + + n + Number of rows and columns of ``A``. Must be at least zero. + + + a + Pointer to input matrix ``A``. The array holding input matrix + ``A`` must have size at least (``n``\ \*(``n``\ +1))/2. See + `Matrix and Vector + Storage <../matrix-storage.html>`__ for + more details. + + + x + Pointer to the ``n``-element right-hand side vector ``b``. The + array holding the ``n``-element right-hand side vector ``b`` + must be of size at least (1 + (``n`` - 1)*abs(``incx``)). See + `Matrix and Vector + Storage <../matrix-storage.html>`__ for + more details. + + + incx + Stride of vector ``x``. + + + dependencies + List of events to wait for before starting computation, if any. + If omitted, defaults to no dependencies. + + + .. container:: section + + + .. rubric:: Output Parameters + :class: sectiontitle + + + x + Pointer to the solution vector ``x``. + + + .. container:: section + + + .. rubric:: Return Values + :class: sectiontitle + + + Output event to wait on to ensure computation is complete. + + +.. container:: familylinks + + + .. container:: parentlink + + + **Parent topic:** :ref:`blas-level-2-routines` diff --git a/docs/domains/blas/trmm.rst b/docs/domains/blas/trmm.rst index 2dbbc85b0..0c61ec772 100644 --- a/docs/domains/blas/trmm.rst +++ b/docs/domains/blas/trmm.rst @@ -1,4 +1,4 @@ -.. _trmm: +.. _onemkl_blas_trmm: trmm ==== @@ -11,18 +11,8 @@ trmm and one input matrix is general. - .. container:: section - :name: GUID-15B16EFC-8B31-4459-88DC-A8C5EF6C9932 - - - .. rubric:: Syntax - :name: syntax - :class: sectiontitle - - ..
cpp:function:: void trmm(queue &exec_queue, uplo upper_lower, transpose transa, diag unit_diag, std::int64_t m, std::int64_t n, T alpha, buffer &a, std::int64_t lda, buffer &b, std::int64_t ldb) - - trmm supports the following precisions. + ``trmm`` supports the following precisions. .. list-table:: @@ -35,18 +25,14 @@ trmm * - ``std::complex`` - - .. container:: section - :name: GUID-E1AAECF3-E29D-411F-B052-2F2E8080F3A1 .. rubric:: Description - :name: description :class: sectiontitle - The trmm routines compute a scalar-matrix-matrix product where one of + The ``trmm`` routines compute a scalar-matrix-matrix product where one of the matrices in the multiplication is triangular. The argument ``left_right`` determines if the triangular matrix, ``A``, is on the left of the multiplication (``left_right`` = ``side::left``) or on @@ -54,8 +40,6 @@ trmm ``left_right``. The operation is defined as - - B <- alpha*op(A)*B @@ -63,9 +47,6 @@ trmm or - - - B <- alpha*B*op(A) @@ -86,45 +67,46 @@ trmm ``n`` x ``n``, depending on ``left_right``. +trmm (Buffer Version) +--------------------- + +.. container:: + + .. container:: section + + + .. rubric:: Syntax + :class: sectiontitle + + + .. cpp:function:: void onemkl::blas::trmm(sycl::queue &queue, side left_right, uplo upper_lower, transpose transa, diag unit_diag, std::int64_t m, std::int64_t n, T alpha, sycl::buffer &a, std::int64_t lda, sycl::buffer &b, std::int64_t ldb) .. container:: section - :name: GUID-DE8B0FD7-11E3-42BC-99ED-3A07040FA6CB .. rubric:: Input Parameters - :name: input-parameters :class: sectiontitle - exec_queue + queue The queue where the routine should be executed. left_right Specifies whether ``A`` is on the left side of the multiplication - (``side::left``) or on the right side (``side::right``). See - :ref:`onemkl_datatypes` for more - details. + (``side::left``) or on the right side (``side::right``). See :ref:`onemkl_datatypes` for more details. uplo - Specifies whether the matrix ``A`` is upper or lower triangular.
- See - :ref:`onemkl_datatypes` for more - details. + Specifies whether the matrix ``A`` is upper or lower triangular. See :ref:`onemkl_datatypes` for more details. trans - Specifies op(``A``), the transposition operation applied to ``A``. - See - :ref:`onemkl_datatypes` for more - details. + Specifies op(``A``), the transposition operation applied to ``A``. See :ref:`onemkl_datatypes` for more details. unit_diag Specifies whether ``A`` is assumed to be unit triangular (all - diagonal elements are 1). See - :ref:`onemkl_datatypes` for more - details. + diagonal elements are 1). See :ref:`onemkl_datatypes` for more details. m @@ -168,11 +150,9 @@ trmm .. container:: section - :name: GUID-1F1FF9D8-3833-4C9E-9CAC-53BA1791DCF1 .. rubric:: Output Parameters - :name: output-parameters :class: sectiontitle @@ -182,11 +162,9 @@ trmm .. container:: section - :name: EXAMPLE_5EF48B8A07D849EA84A74FE22F0D5B24 .. rubric:: Notes - :name: notes :class: sectiontitle @@ -194,15 +172,137 @@ trmm not need to be initialized at entry. -.. container:: familylinks +trmm (USM Version) +------------------ +.. container:: - .. container:: parentlink + .. container:: section - **Parent topic:** :ref:`blas-level-3-routines` - + .. rubric:: Syntax + :class: sectiontitle -.. container:: + .. container:: dlsyntaxpara + + + .. cpp:function:: sycl::event onemkl::blas::trmm(sycl::queue &queue, side left_right, uplo upper_lower, transpose transa, diag unit_diag, std::int64_t m, std::int64_t n, T alpha, const T* a, std::int64_t lda, T* b, std::int64_t ldb, const sycl::vector_class &dependencies = {}) + .. container:: section + + + .. rubric:: Input Parameters + :class: sectiontitle + + + queue + The queue where the routine should be executed. + + + left_right + Specifies whether ``A`` is on the left side of the + multiplication (``side::left``) or on the right side + (``side::right``). See :ref:`onemkl_datatypes` for more details. + + + uplo + Specifies whether the matrix ``A`` is upper or lower + triangular.
See :ref:`onemkl_datatypes` for more details. + + + trans + Specifies op(``A``), the transposition operation applied to + ``A``. See :ref:`onemkl_datatypes` for more details. + + unit_diag + Specifies whether ``A`` is assumed to be unit triangular (all + diagonal elements are 1). See :ref:`onemkl_datatypes` for more details. + + + m + Specifies the number of rows of ``B``. The value of ``m`` must + be at least zero. + + n + Specifies the number of columns of ``B``. The value of ``n`` + must be at least zero. + + + alpha + Scaling factor for the matrix-matrix product. + + + a + Pointer to input matrix ``A``. Must have size at least + ``lda``\ \*\ ``m`` if ``left_right`` = ``side::left``, or + ``lda``\ \*\ ``n`` if ``left_right`` = ``side::right``. See + `Matrix and Vector + Storage <../matrix-storage.html>`__ for + more details. + + + lda + Leading dimension of ``A``. Must be at least ``m`` if + ``left_right`` = ``side::left``, and at least ``n`` if + ``left_right`` = ``side::right``. Must be positive. + + + b + Pointer to input/output matrix ``B``. Must have size at least + ``ldb``\ \*\ ``n``. See `Matrix and Vector + Storage <../matrix-storage.html>`__ for + more details. + + + ldb + Leading dimension of ``B``. Must be at least ``m`` and + positive. + + + dependencies + List of events to wait for before starting computation, if any. + If omitted, defaults to no dependencies. + + + .. container:: section + + + .. rubric:: Output Parameters + :class: sectiontitle + + + b + Pointer to the output matrix, overwritten by + ``alpha``\ \*op(``A``)\*\ ``B`` or + ``alpha``\ \*\ ``B``\ \*op(``A``). + + + .. container:: section + + + .. rubric:: Notes + :class: sectiontitle + + + If ``alpha`` = 0, matrix ``B`` is set to zero, and ``A`` and ``B`` + do not need to be initialized at entry. + + + .. container:: section + + + .. rubric:: Return Values + :class: sectiontitle + + + Output event to wait on to ensure computation is complete. + + +.. container:: familylinks + + + .. 
container:: parentlink + + + **Parent topic:** :ref:`blas-level-3-routines` diff --git a/docs/domains/blas/trmv.rst b/docs/domains/blas/trmv.rst index 14476e1e8..f015007ee 100644 --- a/docs/domains/blas/trmv.rst +++ b/docs/domains/blas/trmv.rst @@ -1,4 +1,4 @@ -.. _trmv: +.. _onemkl_blas_trmv: trmv ==== @@ -10,16 +10,6 @@ trmv Computes a matrix-vector product using a triangular matrix. - .. container:: section - :name: GUID-15041079-C2F5-4D3C-85C2-262E184F7FFE - - - .. rubric:: Syntax - :name: syntax - :class: sectiontitle - - - .. cpp:function:: void trmv(queue &exec_queue, uplo upper_lower, transpose trans, diag unit_nonunit, std::int64_t n, buffer &a, std::int64_t lda, buffer &x, std::int64_t incx) ``trmv`` supports the following precisions. @@ -37,21 +27,16 @@ trmv .. container:: section - :name: GUID-420DC613-E11B-48A8-B73F-55B55EBFC3B7 .. rubric:: Description - :name: description :class: sectiontitle - The trmv routines compute a matrix-vector product with a triangular + The ``trmv`` routines compute a matrix-vector product with a triangular matrix. The operation is defined - - - x <- op(A)*x @@ -69,36 +54,40 @@ trmv ``x`` is a vector of length ``n``. +trmv (Buffer Version) +--------------------- + +.. container:: + + .. container:: section + + + .. rubric:: Syntax + :class: sectiontitle + + + .. cpp:function:: void onemkl::blas::trmv(sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_nonunit, std::int64_t n, sycl::buffer &a, std::int64_t lda, sycl::buffer &x, std::int64_t incx) .. container:: section - :name: GUID-E1436726-01FE-4206-871E-B905F59A96B4 .. rubric:: Input Parameters - :name: input-parameters :class: sectiontitle - exec_queue + queue The queue where the routine should be executed. upper_lower - Specifies whether ``A`` is upper or lower triangular. See - :ref:`onemkl_datatypes` for more - details. + Specifies whether ``A`` is upper or lower triangular. See :ref:`onemkl_datatypes` for more details. 
trans - Specifies op(``A``), the transposition operation applied to ``A``. - See - :ref:`onemkl_datatypes` for more - details. + Specifies op(``A``), the transposition operation applied to ``A``. See :ref:`onemkl_datatypes` for more details. unit_nonunit - Specifies whether the matrix ``A`` is unit triangular or not. See - :ref:`onemkl_datatypes` - for more details. + Specifies whether the matrix ``A`` is unit triangular or not. See :ref:`onemkl_datatypes` for more details. n @@ -129,11 +118,9 @@ trmv .. container:: section - :name: GUID-7BF1D5C9-EB8C-4BD6-B0E7-A66DAC3221F9 .. rubric:: Output Parameters - :name: output-parameters :class: sectiontitle @@ -141,15 +128,105 @@ trmv Buffer holding the updated vector ``x``. -.. container:: familylinks +trmv (USM Version) +------------------ +.. container:: - .. container:: parentlink + .. container:: section - **Parent topic:** :ref:`blas-level-2-routines` - + .. rubric:: Syntax + :class: sectiontitle -.. container:: + .. container:: dlsyntaxpara + + + .. cpp:function:: sycl::event onemkl::blas::trmv(sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_nonunit, std::int64_t n, const T *a, std::int64_t lda, T *x, std::int64_t incx, const sycl::vector_class &dependencies = {}) + .. container:: section + + + .. rubric:: Input Parameters + :class: sectiontitle + + + queue + The queue where the routine should be executed. + + + upper_lower + Specifies whether ``A`` is upper or lower triangular. See :ref:`onemkl_datatypes` for more details. + + + trans + Specifies op(``A``), the transposition operation applied to + ``A``. See :ref:`onemkl_datatypes` for more details. + + unit_nonunit + Specifies whether the matrix ``A`` is unit triangular or not. See :ref:`onemkl_datatypes` for more details. + + + n + Number of rows and columns of ``A``. Must be at least zero. + + + a + Pointer to input matrix ``A``. The array holding input matrix + ``A`` must have size at least ``lda``\ \*\ ``n``.
See `Matrix + and Vector + Storage <../matrix-storage.html>`__ for + more details. + + + lda + Leading dimension of matrix ``A``. Must be at least ``n``, and + positive. + + + x + Pointer to input vector ``x``. The array holding input vector + ``x`` must be of size at least (1 + (``n`` - 1)*abs(``incx``)). + See `Matrix and Vector + Storage <../matrix-storage.html>`__ for + more details. + + + incx + Stride of vector ``x``. + + + dependencies + List of events to wait for before starting computation, if any. + If omitted, defaults to no dependencies. + + + .. container:: section + + + .. rubric:: Output Parameters + :class: sectiontitle + + + x + Pointer to the updated vector ``x``. + + + .. container:: section + + + .. rubric:: Return Values + :class: sectiontitle + + + Output event to wait on to ensure computation is complete. + + +.. container:: familylinks + + + .. container:: parentlink + + + **Parent topic:** :ref:`blas-level-2-routines` diff --git a/docs/domains/blas/trsm.rst b/docs/domains/blas/trsm.rst index 958d231db..ec5e0ede5 100644 --- a/docs/domains/blas/trsm.rst +++ b/docs/domains/blas/trsm.rst @@ -1,4 +1,4 @@ -.. _trsm: +.. _onemkl_blas_trsm: trsm ==== @@ -10,18 +10,8 @@ trsm Solves a triangular matrix equation (forward or backward solve). - .. container:: section - :name: GUID-6F8E0E22-B30A-4825-B508-CEDE0CAC8B90 - - - .. rubric:: Syntax - :name: syntax - :class: sectiontitle - - .. cpp:function:: void trsm(queue &exec_queue, side left_right, uplo upper_lower, transpose transa, diag unit_diag, std::int64_t m, std::int64_t n, T alpha, buffer &a, std::int64_t lda, buffer &b, std::int64_t ldb) - - trsm supports the following precisions. + ``trsm`` supports the following precisions. .. list-table:: @@ -37,18 +27,14 @@ trsm .. container:: section - :name: GUID-AE6CFEF4-4058-49C3-BABC-2B05D6594555 .. 
rubric:: Description - :name: description :class: sectiontitle - The trsm routines solve one of the following matrix equations: - + The ``trsm`` routines solve one of the following matrix equations: - op(A)*X = alpha*B, @@ -57,9 +43,6 @@ trsm or - - - X*op(A) = alpha*B, @@ -84,45 +67,46 @@ trsm is overwritten by the solution matrix ``X``. +trsm (Buffer Version) +--------------------- + +.. container:: + + .. container:: section + + + .. rubric:: Syntax + :class: sectiontitle + + + .. cpp:function:: void onemkl::blas::trsm(sycl::queue &queue, side left_right, uplo upper_lower, transpose transa, diag unit_diag, std::int64_t m, std::int64_t n, T alpha, sycl::buffer &a, std::int64_t lda, sycl::buffer &b, std::int64_t ldb) .. container:: section - :name: GUID-0BBDCB60-8CDE-4EBD-BDE5-F7688B4B29F4 .. rubric:: Input Parameters - :name: input-parameters :class: sectiontitle - exec_queue + queue The queue where the routine should be executed. left_right Specifies whether ``A`` multiplies ``X`` on the left - (``side::left``) or on the right (``side::right``). See - :ref:`onemkl_datatypes` for more - details. + (``side::left``) or on the right (``side::right``). See :ref:`onemkl_datatypes` for more details. uplo - Specifies whether the matrix ``A`` is upper or lower triangular. - See - :ref:`onemkl_datatypes` for more - details. + Specifies whether the matrix ``A`` is upper or lower triangular. See :ref:`onemkl_datatypes` for more details. trans - Specifies op(``A``), the transposition operation applied to ``A``. - See - :ref:`onemkl_datatypes` for more - details. + Specifies op(``A``), the transposition operation applied to ``A``. See :ref:`onemkl_datatypes` for more details. unit_diag Specifies whether ``A`` is assumed to be unit triangular (all - diagonal elements are 1). See - :ref:`onemkl_datatypes` for more - details. + diagonal elements are 1). See :ref:`onemkl_datatypes` for more details. m @@ -166,11 +150,9 @@ trsm .. 
container:: section - :name: GUID-7AC6C3B9-7A31-4E0B-B770-FD607E7F9BE5 .. rubric:: Output Parameters - :name: output-parameters :class: sectiontitle @@ -179,11 +161,9 @@ trsm .. container:: section - :name: EXAMPLE_5EF48B8A07D849EA84A74FE22F0D5B24 .. rubric:: Notes - :name: notes :class: sectiontitle @@ -191,15 +171,136 @@ trsm not need to be initialized at entry. -.. container:: familylinks +trsm (USM Version) +------------------ +.. container:: - .. container:: parentlink + .. container:: section - **Parent topic:** :ref:`blas-level-3-routines` - + .. rubric:: Syntax + :class: sectiontitle -.. container:: + .. container:: dlsyntaxpara + + + .. cpp:function:: sycl::event onemkl::blas::trsm(sycl::queue &queue, side left_right, uplo upper_lower, transpose transa, diag unit_diag, std::int64_t m, std::int64_t n, T alpha, const T* a, std::int64_t lda, T* b, std::int64_t ldb, const sycl::vector_class<sycl::event> &dependencies = {}) + .. container:: section + + + .. rubric:: Input Parameters + :class: sectiontitle + + + queue + The queue where the routine should be executed. + + + left_right + Specifies whether ``A`` multiplies ``X`` on the left + (``side::left``) or on the right (``side::right``). See :ref:`onemkl_datatypes` for more details. + + + upper_lower + Specifies whether the matrix ``A`` is upper or lower + triangular. See :ref:`onemkl_datatypes` for more details. + + + transa + Specifies op(``A``), the transposition operation applied to + ``A``. See :ref:`onemkl_datatypes` for more details. + + + unit_diag + Specifies whether ``A`` is assumed to be unit triangular (all + diagonal elements are 1). See :ref:`onemkl_datatypes` for more details. + + + m + Specifies the number of rows of ``B``. The value of ``m`` must + be at least zero. + + n + Specifies the number of columns of ``B``. The value of ``n`` + must be at least zero. + + + alpha + Scaling factor for the solution. + + + a + Pointer to input matrix ``A``. 
Must have size at least + ``lda``\ \*\ ``m`` if ``left_right`` = ``side::left``, or + ``lda``\ \*\ ``n`` if ``left_right`` = ``side::right``. See + `Matrix and Vector + Storage <../matrix-storage.html>`__ for + more details. + + + lda + Leading dimension of ``A``. Must be at least ``m`` if + ``left_right`` = ``side::left``, and at least ``n`` if + ``left_right`` = ``side::right``. Must be positive. + + + b + Pointer to input/output matrix ``B``. Must have size at least + ``ldb``\ \*\ ``n``. See `Matrix and Vector + Storage <../matrix-storage.html>`__ for + more details. + + + ldb + Leading dimension of ``B``. Must be at least ``m`` and + positive. + + + dependencies + List of events to wait for before starting computation, if any. + If omitted, defaults to no dependencies. + + + .. container:: section + + + .. rubric:: Output Parameters + :class: sectiontitle + + + b + Pointer to the output matrix. Overwritten by the solution + matrix ``X``. + + + .. container:: section + + + .. rubric:: Notes + :class: sectiontitle + + + If ``alpha`` = 0, matrix ``B`` is set to zero, and ``A`` and ``B`` + do not need to be initialized at entry. + + + .. container:: section + + + .. rubric:: Return Values + :class: sectiontitle + + + Output event to wait on to ensure computation is complete. + + +.. container:: familylinks + + + .. container:: parentlink + + + **Parent topic:** :ref:`blas-level-3-routines` diff --git a/docs/domains/blas/trsm_batch.rst b/docs/domains/blas/trsm_batch.rst index 0cf82ce21..c6710ee74 100644 --- a/docs/domains/blas/trsm_batch.rst +++ b/docs/domains/blas/trsm_batch.rst @@ -1,4 +1,4 @@ -.. _trsm_batch: +.. _onemkl_blas_trsm_batch: trsm_batch ========== @@ -6,28 +6,10 @@ trsm_batch .. container:: - - Computes groups of matrix-matrix product with general matrices. - - - .. container:: section - :name: GUID-6F8E0E22-B30A-4825-B508-CEDE0CAC8B90 - - - .. rubric:: Syntax - :name: syntax - :class: sectiontitle - - - **Group API** - - - .. 
cpp:function:: void trsm_batch(queue &exec_queue, buffer &left_right_array, buffer &upper_lower_array, buffer &trans_array, buffer &unit_diag_array, buffer &m_array, buffer &n_array, buffer &alpha_array, buffer &a_array, buffer &lda_array, buffer &b_array, buffer ldb_array, std::int64_t group_count, buffer &group_size_array) - - **Strided API** - - - .. cpp:function:: void trsm_batch(queue &exec_queue, side left_right, uplo upper_lower, transpose trans, diag unit_diag, std::int64_t m, std::int64_t n, T alpha, buffer &a, std::int64_t lda, std::int64_t stridea, buffer &b, std::int64_t ldb, std::int64_t strideb, std::int64_t batch_size) + The ``trsm_batch`` routines are batched versions of :ref:`trsm <onemkl_blas_trsm>`, performing + multiple ``trsm`` operations in a single call. Each ``trsm`` + solves an equation of the form op(A) \* X = alpha \* B or X \* op(A) = alpha \* B. + ``trsm_batch`` supports the following precisions. @@ -42,60 +24,28 @@ trsm_batch * - ``std::complex`` - +trsm_batch (Buffer Version) +--------------------------- .. container:: section - :name: GUID-AE6CFEF4-4058-49C3-BABC-2B05D6594555 .. rubric:: Description - :name: description :class: sectiontitle - - The trsm_batch routines solve a series of equations of the form op(A) - \* X = alpha \* B or X \* op(A) = alpha \* B. They are similar to the - trsm routine counterparts, but the trsm_batch routines solve linear - equations with groups of matrices. The groups contain matrices with - the same parameters. - - - For the group API, the operation is defined as - - - :: - - - offa = 0, offb = 0 - for i = 0 … group_count – 1 - left_right, uplo, trans, unit_diag, m, n, lda, ldb, alpha and group_size at position i in left_right_array, uplo_array, trans_array, unit_diag_array, m_array, n_array, lda_array, ldb_array, alpha_array and group_size_array - sizea = left_right == onemkl::side::L ? 
lda * m : lda * n; - sizeb = ldb * n; - for j = 0 … group_size – 1 - A and B are matrices of size sizea and sizeb at offset offa and offb in a and b. - if (left_right == onemkl::side::L) then - computes X such that op(A) * X = alpha * B - else - computes X such that X * op(A) = alpha * B - end if - B := X - offa += sizea, offb += sizeb - end for - end for - - - For the strided API, the operation is defined as - + The buffer version of ``trsm_batch`` supports only the strided API. + + The strided API operation is defined as :: for i = 0 … batch_size – 1 A and B are matrices at offset i * stridea and i * strideb in a and b. - if (left_right == onemkl::side::L) then - computes X such that op(A) * X = alpha * B + if (left_right == onemkl::side::left) then + compute X such that op(A) * X = alpha * B else - computes X such that X * op(A) = alpha * B + compute X such that X * op(A) = alpha * B end if B := X end for @@ -104,215 +54,69 @@ trsm_batch where: - - op(``A``) is one of op(``A``) = ``A``, or op(A) = ``A``\ :sup:`T`, - or op(``A``) = ``A``\ :sup:`H` + op(``A``) is one of op(``A``) = ``A``, or op(A) = ``A``\ :sup:`T`, + or op(``A``) = ``A``\ :sup:`H` - - alpha is a scalar + ``alpha`` is a scalar - - ``A`` is a triangular matrix + ``A`` is a triangular matrix - - ``B`` and ``X`` are ``m`` x ``n`` general matrices - - - - The a and b buffers contains all the input matrices. The stride - between matrices is either given by the exact size of the matrix - (for the group API) or by the stride parameter. The total number - of matrices in a and b is given by the - - |image0| - - for the strided - API. + ``B`` and ``X`` are ``m`` x ``n`` general matrices ``A`` is either ``m`` x ``m`` or ``n`` x ``n``,depending on whether - it multiplies ``X`` on the leftor right. On return, the matrix ``B`` - is overwrittenby the solution matrix ``X``. - - -.. container:: section - :name: GUID-863264A0-4CE9-495F-A617-102E46D7A41A - - - .. 
rubric:: Input Parameters - Group API - :name: input-parameters---group-api - :class: sectiontitle - - - left_right_array - Buffer holding ``group_count onemkl::side`` value. - - - For the group ``i``, ``left_right`` is the ``i``\ th element in - the left_right_array buffer and specifies whether ``A`` multiplies - ``X`` on the left (``side::left``) or on the right - (``side::right``). See - :ref:`onemkl_datatypes` for more - details. - - - uplo_array - Buffer holding ``group_count onemkl::uplo`` value. - - - For the group ``i``, ``uplo`` is the ``i``\ th element in the - uplo_array buffer and specifies whether ``A`` is upper or lower - triangular. See - :ref:`onemkl_datatypes` for more - details. - - - trans_array - Buffer holding ``group_count onemkl::transpose`` value. - - - For the group ``i``, ``trans`` is the ``i``\ th element in the - trans_array buffer and specifies the form of ``op``\ (``A``) used - in the matrix multiplication. See - :ref:`onemkl_datatypes` for more - details. - - - unit_diag__array - Buffer holding ``group_count onemkl::diag`` value. - - - For the group ``i``, ``unit_diag`` is the ``i``\ th element in the - unit_diag_array buffer and specifies whether ``A`` is assumed to - be unit triangular (all diagonal elements are 1). See - :ref:`onemkl_datatypes` for more - details. - + it multiplies ``X`` on the left or right. On return, the matrix ``B`` + is overwritten by the solution matrix ``X``. - m_array - Buffer holding ``group_count`` integer. For the group ``i``, ``m`` - is the ``i``\ th element in the m_array buffer and specifies the - number of rows of ``B``. Must be at least zero. - - - n_array - Buffer holding ``group_count`` integer. For the group ``i``, ``n`` - is the ``i``\ th element in the n_array buffer and specifies the - number of columns of ``B``. Must be at least zero. - - - alpha_array - Buffer holding ``group_count`` scalar element. 
For the group - ``i``, ``alpha`` is the ``i``\ th element in the alpha_array - buffer and specifies the scaling factor for the matrix-matrix - product. - - - a - Buffer holding the input matrix ``A``. The total size of the - buffer ``a`` must be at least the sum of the sizes of all the - matricies ``A``. That is, - - - |image1| - - - where - ``sizeai = lda_array[i] * (left_right == onemkl::side::L ? m : n)`` - - - See `Matrix - Storage <../matrix-storage.html>`__ for - more details. - - - lda_array - Buffer holding ``group_count`` integer. For the group ``i``, - ``lda`` is the ``i``\ th element in the lda_array buffer and - specifies the leading dimension of ``A``. Must be at least ``m`` - if ``A`` is not transposed, and at least ``k`` if ``A`` is - transposed. Must be positive. - - - b - Buffer holding the input matrix ``B``. The total size of the - buffer ``b`` must be at least the sum of the sizes of all the - matricies ``B``. That is, - - - |image2| - - - See `Matrix - Storage <../matrix-storage.html>`__ for - more details. + The ``a`` and ``b`` buffers contain all the input matrices. The stride + between matrices is given by the ``stridea`` and ``strideb`` parameters. The total number + of matrices in the ``a`` and ``b`` buffers is given by the ``batch_size`` parameter. + - ldb_array - Buffer holding ``group_count`` integer. For the group ``i``, - ``ldb`` is the ``i``\ th element in the ldb_array buffer and - specifies the leading dimension of ``B``. Must be at least ``n``. - Must be positive. + **Strided API** + .. container:: section - group_count - Specifies the number of groups. Must be at least 0. + .. rubric:: Syntax + :class: sectiontitle - group_size_array - Buffer holding the group_count integer. For the group ``i``, - ``ldb`` is the ``i``\ th element in the group_size_array buffer - specifies the number of matrix multiply operations in - group\ ``i``. Each element in group_size_array must be at least 0. + .. 
cpp:function:: void onemkl::blas::trsm_batch(sycl::queue &queue, side left_right, uplo upper_lower, transpose trans, diag unit_diag, std::int64_t m, std::int64_t n, T alpha, sycl::buffer &a, std::int64_t lda, std::int64_t stridea, sycl::buffer &b, std::int64_t ldb, std::int64_t strideb, std::int64_t batch_size) .. container:: section - :name: GUID-1E4953E6-F7B1-4FEE-BA5A-8C4BD51DC700 - .. rubric:: Output Parameters - Group API - :name: output-parameters---group-api + .. rubric:: Input Parameters :class: sectiontitle - b - Output buffer, overwritten by the ``total_batch_count`` solution - matrices ``X``. - - -.. container:: section - :name: GUID-D067773A-45A3-4D24-B10A-46E27834947E - - - .. rubric:: Input Parameters - Strided API - :name: input-parameters---strided-api - :class: sectiontitle + queue + The queue where the routine should be executed. left_right Specifies whether the matrices ``A`` multiply ``X`` on the left - (``side::left``) or on the right (``side::right``). See - :ref:`onemkl_datatypes` for more - details. + (``side::left``) or on the right (``side::right``). See :ref:`onemkl_datatypes` for more details. - uplo + upper_lower Specifies whether the matrices ``A`` are upper or lower - triangular. See - :ref:`onemkl_datatypes` for more - details. + triangular. See :ref:`onemkl_datatypes` for more details. trans Specifies ``op(A)``, the transposition operation applied to the - matrices ``A``. See - :ref:`onemkl_datatypes` for more - details. + matrices ``A``. See :ref:`onemkl_datatypes` for more details. unit_diag Specifies whether the matrices ``A`` are assumed to be unit - triangular (all diagonal elements are 1.). See - :ref:`onemkl_datatypes` for more - details. + triangular (all diagonal elements are 1). See :ref:`onemkl_datatypes` for more details. m @@ -328,59 +132,30 @@ trsm_batch a - Buffer holding the input matrices ``A``. Must have size at least - ``stridea*batch_size``. 
+ Buffer holding the input matrices ``A`` with size ``stridea*batch_size``. lda Leading dimension of the matrices ``A``. Must be at least ``m`` if - left_right = ``side::left``, and at least ``n`` if left_right = + ``left_right`` = ``side::left``, and at least ``n`` if ``left_right`` = ``side::right``. Must be positive. stridea - Stride between the different ``A`` matrices. - - - If left_right = ``side::left``, the matrices ``A`` are - ``m``-by-``m`` matrices, so stridea must be at least lda\*\ ``m``. - - - If left_right = ``side::right``, the matrices ``A`` are - ``n``-by-``n`` matrices, so stridea must be at least lda\*\ ``n``. + Stride between different ``A`` matrices. b - Buffer holding the input matrices ``B``. Must have size at least - ``strideb*batch_size``. + Buffer holding the input matrices ``B`` with size ``strideb*batch_size``. ldb - Leading dimension of the matrices ``B``. Must be at least ``m`` - and must be positive. + Leading dimension of the matrices ``B``. Must be at least ``m``. + Must be positive. strideb - Stride between the different ``B`` matrices. Must be at least - ldb\*\ ``n``. - - - beta - Scaling factor for the matrices ``C``. - - - c - Buffer holding input/output matrices ``C``. Must have size at - least ``stridec*batch_size``. - - - ldc - Leading dimension of ``C``. Must be positive and at least ``m``. - - - stridec - Stride between the different ``C`` matrices. Must be at least - ``ldc*n``. + Stride between different ``B`` matrices. batch_size @@ -388,11 +163,9 @@ trsm_batch .. container:: section - :name: GUID-98C3DE17-4F5F-41A1-B431-48148153ABBA - .. rubric:: Output Parameters - Strided API - :name: output-parameters---strided-api + .. rubric:: Output Parameters :class: sectiontitle @@ -402,16 +175,15 @@ trsm_batch .. container:: section - :name: GUID-AC72653A-4AC8-4B9D-B7A9-13A725AA19BF .. 
rubric:: Notes - :name: notes :class: sectiontitle - If ``alpha`` = 0, matrix ``B`` is set to zero, and the matrices ``A`` - and ``B`` do not need to be initialized before calling trsm_batch. + If ``alpha`` = 0, matrix ``B`` is set to zero and the matrices ``A`` + and ``B`` do not need to be initialized before calling ``trsm_batch``. + .. container:: familylinks @@ -423,14 +195,3 @@ trsm_batch **Parent topic:** :ref:`blas-like-extensions` - -.. container:: - - -.. |image0| image:: ../equations/GUID-D352DB8F-BC76-4A5E-A7CA-5B4CAAA90ee1.png - :class: img-middle -.. |image1| image:: ../equations/GUID-D352DB8F-BC76-4A5E-A7CA-5B4CAAA90ee2.png - :class: img-middle -.. |image2| image:: ../equations/GUID-D352DB8F-BC76-4A5E-A7CA-5B4CAAA90ee3.png - :class: img-middle - diff --git a/docs/domains/blas/trsv.rst b/docs/domains/blas/trsv.rst index e1dba6e43..8662d3197 100644 --- a/docs/domains/blas/trsv.rst +++ b/docs/domains/blas/trsv.rst @@ -1,4 +1,4 @@ -.. _trsv: +.. _onemkl_blas_trsv: trsv ==== @@ -11,16 +11,6 @@ trsv triangular matrix. - .. container:: section - :name: GUID-9BA4C1B6-479B-41B1-BCA8-7826F40DA952 - - - .. rubric:: Syntax - :name: syntax - :class: sectiontitle - - - .. cpp:function:: void trsv(queue &exec_queue, uplo upper_lower, transpose trans, diag unit_nonunit, std::int64_t n, std::int64_t k, buffer &a, std::int64_t lda, buffer &x, std::int64_t incx) ``trsv`` supports the following precisions. @@ -38,20 +28,16 @@ trsv .. container:: section - :name: GUID-D500B67B-5DD6-4471-B0BD-53FD9A3C7BF2 .. rubric:: Description - :name: description :class: sectiontitle - The trsv routines compute a matrix-vector product with a triangular - band matrix. The operation is defined as + The ``trsv`` routines solve a system of linear equations whose coefficients are in a triangular + matrix. The operation is defined as - - op(A)*x = b @@ -70,36 +56,41 @@ trsv ``b`` and ``x`` are vectors of length ``n``. +trsv (Buffer Version) +--------------------- + +.. container:: + + .. container:: section + + + .. 
rubric:: Syntax + :class: sectiontitle + + + .. cpp:function:: void onemkl::blas::trsv(sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_nonunit, std::int64_t n, sycl::buffer<T,1> &a, std::int64_t lda, sycl::buffer<T,1> &x, std::int64_t incx) + .. container:: section - :name: GUID-E1436726-01FE-4206-871E-B905F59A96B4 .. rubric:: Input Parameters - :name: input-parameters :class: sectiontitle - exec_queue + queue The queue where the routine should be executed. upper_lower - Specifies whether ``A`` is upper or lower triangular. See - :ref:`onemkl_datatypes` for more - details. + Specifies whether ``A`` is upper or lower triangular. See :ref:`onemkl_datatypes` for more details. trans - Specifies op(``A``), the transposition operation applied to ``A``. - See - :ref:`onemkl_datatypes` for more - details. + Specifies op(``A``), the transposition operation applied to ``A``. See :ref:`onemkl_datatypes` for more details. unit_nonunit - Specifies whether the matrix ``A`` is unit triangular or not. See - :ref:`onemkl_datatypes` - for more details. + Specifies whether the matrix ``A`` is unit triangular or not. See :ref:`onemkl_datatypes` for more details. n @@ -131,11 +122,9 @@ trsv .. container:: section - :name: GUID-7E0AF44F-2D83-41A3-A58E-50400ECDBD9A .. rubric:: Output Parameters - :name: output-parameters :class: sectiontitle @@ -143,15 +132,105 @@ trsv Buffer holding the solution vector ``x``. -.. container:: familylinks +trsv (USM Version) +------------------ +.. container:: - .. container:: parentlink + .. container:: section - **Parent topic:** :ref:`blas-level-2-routines` - + .. rubric:: Syntax + :class: sectiontitle -.. container:: + .. container:: dlsyntaxpara + + + .. cpp:function:: sycl::event onemkl::blas::trsv(sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_nonunit, std::int64_t n, const T *a, std::int64_t lda, T *x, std::int64_t incx, const sycl::vector_class<sycl::event> &dependencies = {}) + .. 
container:: section + + + .. rubric:: Input Parameters + :class: sectiontitle + + + queue + The queue where the routine should be executed. + + + upper_lower + Specifies whether ``A`` is upper or lower triangular. See :ref:`onemkl_datatypes` for more details. + + trans + Specifies op(``A``), the transposition operation applied to + ``A``. See :ref:`onemkl_datatypes` for more details. + + unit_nonunit + Specifies whether the matrix ``A`` is unit triangular or not. See :ref:`onemkl_datatypes` for more details. + + + n + Numbers of rows and columns of ``A``. Must be at least zero. + + + a + Pointer to input matrix ``A``. The array holding input matrix + ``A`` must have size at least ``lda``\ \*\ ``n``. See `Matrix + and Vector + Storage <../matrix-storage.html>`__ for + more details. + + + lda + Leading dimension of matrix ``A``. Must be at least ``n``, and + positive. + + + x + Pointer to the ``n``-element right-hand side vector ``b``. The + array holding the ``n``-element right-hand side vector ``b`` + must be of size at least (1 + (``n`` - 1)*abs(``incx``)). See + `Matrix and Vector + Storage <../matrix-storage.html>`__ for + more details. + + + incx + Stride of vector ``x``. + + + dependencies + List of events to wait for before starting computation, if any. + If omitted, defaults to no dependencies. + + + .. container:: section + + + .. rubric:: Output Parameters + :class: sectiontitle + + + x + Pointer to the solution vector ``x``. + + + .. container:: section + + + .. rubric:: Return Values + :class: sectiontitle + + + Output event to wait on to ensure computation is complete. + + +.. container:: familylinks + + + .. 
container:: parentlink + + + **Parent topic:** :ref:`blas-level-2-routines` diff --git a/include/onemkl/blas/blas.hpp b/include/onemkl/blas/blas.hpp index d06042049..acc1111d6 100644 --- a/include/onemkl/blas/blas.hpp +++ b/include/onemkl/blas/blas.hpp @@ -45,6 +45,8 @@ namespace onemkl { namespace blas { +// Buffer APIs + static inline void asum(cl::sycl::queue &queue, std::int64_t n, cl::sycl::buffer, 1> &x, std::int64_t incx, cl::sycl::buffer &result) { @@ -298,76 +300,6 @@ static inline void gemm(cl::sycl::queue &queue, transpose transa, transpose tran gemm_postcondition(queue, transa, transb, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc); } -static inline void gemm_batch(cl::sycl::queue &queue, cl::sycl::buffer &transa, - cl::sycl::buffer &transb, - cl::sycl::buffer &m, - cl::sycl::buffer &n, - cl::sycl::buffer &k, - cl::sycl::buffer &alpha, cl::sycl::buffer &a, - cl::sycl::buffer &lda, cl::sycl::buffer &b, - cl::sycl::buffer &ldb, - cl::sycl::buffer &beta, cl::sycl::buffer &c, - cl::sycl::buffer &ldc, std::int64_t group_count, - cl::sycl::buffer &group_size) { - gemm_batch_precondition(queue, transa, transb, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc, - group_count, group_size); - detail::gemm_batch(select_backend(queue), queue, transa, transb, m, n, k, alpha, a, lda, b, ldb, - beta, c, ldc, group_count, group_size); - gemm_batch_postcondition(queue, transa, transb, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc, - group_count, group_size); -} - -static inline void gemm_batch( - cl::sycl::queue &queue, cl::sycl::buffer &transa, - cl::sycl::buffer &transb, cl::sycl::buffer &m, - cl::sycl::buffer &n, cl::sycl::buffer &k, - cl::sycl::buffer &alpha, cl::sycl::buffer &a, - cl::sycl::buffer &lda, cl::sycl::buffer &b, - cl::sycl::buffer &ldb, cl::sycl::buffer &beta, - cl::sycl::buffer &c, cl::sycl::buffer &ldc, - std::int64_t group_count, cl::sycl::buffer &group_size) { - gemm_batch_precondition(queue, transa, transb, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc, - 
group_count, group_size); - detail::gemm_batch(select_backend(queue), queue, transa, transb, m, n, k, alpha, a, lda, b, ldb, - beta, c, ldc, group_count, group_size); - gemm_batch_postcondition(queue, transa, transb, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc, - group_count, group_size); -} - -static inline void gemm_batch( - cl::sycl::queue &queue, cl::sycl::buffer &transa, - cl::sycl::buffer &transb, cl::sycl::buffer &m, - cl::sycl::buffer &n, cl::sycl::buffer &k, - cl::sycl::buffer, 1> &alpha, cl::sycl::buffer, 1> &a, - cl::sycl::buffer &lda, cl::sycl::buffer, 1> &b, - cl::sycl::buffer &ldb, cl::sycl::buffer, 1> &beta, - cl::sycl::buffer, 1> &c, cl::sycl::buffer &ldc, - std::int64_t group_count, cl::sycl::buffer &group_size) { - gemm_batch_precondition(queue, transa, transb, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc, - group_count, group_size); - detail::gemm_batch(select_backend(queue), queue, transa, transb, m, n, k, alpha, a, lda, b, ldb, - beta, c, ldc, group_count, group_size); - gemm_batch_postcondition(queue, transa, transb, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc, - group_count, group_size); -} - -static inline void gemm_batch( - cl::sycl::queue &queue, cl::sycl::buffer &transa, - cl::sycl::buffer &transb, cl::sycl::buffer &m, - cl::sycl::buffer &n, cl::sycl::buffer &k, - cl::sycl::buffer, 1> &alpha, cl::sycl::buffer, 1> &a, - cl::sycl::buffer &lda, cl::sycl::buffer, 1> &b, - cl::sycl::buffer &ldb, cl::sycl::buffer, 1> &beta, - cl::sycl::buffer, 1> &c, cl::sycl::buffer &ldc, - std::int64_t group_count, cl::sycl::buffer &group_size) { - gemm_batch_precondition(queue, transa, transb, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc, - group_count, group_size); - detail::gemm_batch(select_backend(queue), queue, transa, transb, m, n, k, alpha, a, lda, b, ldb, - beta, c, ldc, group_count, group_size); - gemm_batch_postcondition(queue, transa, transb, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc, - group_count, group_size); -} - static inline void 
gemm_batch(cl::sycl::queue &queue, transpose transa, transpose transb, std::int64_t m, std::int64_t n, std::int64_t k, float alpha, cl::sycl::buffer &a, std::int64_t lda, @@ -1660,72 +1592,6 @@ static inline void trsm(cl::sycl::queue &queue, side left_right, uplo upper_lowe ldb); } -static inline void trsm_batch(cl::sycl::queue &queue, cl::sycl::buffer &left_right, - cl::sycl::buffer &upper_lower, - cl::sycl::buffer &trans, - cl::sycl::buffer &unit_diag, - cl::sycl::buffer &m, - cl::sycl::buffer &n, - cl::sycl::buffer &alpha, cl::sycl::buffer &a, - cl::sycl::buffer &lda, cl::sycl::buffer &b, - cl::sycl::buffer &ldb, std::int64_t group_count, - cl::sycl::buffer &group_size) { - trsm_batch_precondition(queue, left_right, upper_lower, trans, unit_diag, m, n, alpha, a, lda, - b, ldb, group_count, group_size); - detail::trsm_batch(select_backend(queue), queue, left_right, upper_lower, trans, unit_diag, m, - n, alpha, a, lda, b, ldb, group_count, group_size); - trsm_batch_postcondition(queue, left_right, upper_lower, trans, unit_diag, m, n, alpha, a, lda, - b, ldb, group_count, group_size); -} - -static inline void trsm_batch( - cl::sycl::queue &queue, cl::sycl::buffer &left_right, - cl::sycl::buffer &upper_lower, cl::sycl::buffer &trans, - cl::sycl::buffer &unit_diag, cl::sycl::buffer &m, - cl::sycl::buffer &n, cl::sycl::buffer &alpha, - cl::sycl::buffer &a, cl::sycl::buffer &lda, - cl::sycl::buffer &b, cl::sycl::buffer &ldb, - std::int64_t group_count, cl::sycl::buffer &group_size) { - trsm_batch_precondition(queue, left_right, upper_lower, trans, unit_diag, m, n, alpha, a, lda, - b, ldb, group_count, group_size); - detail::trsm_batch(select_backend(queue), queue, left_right, upper_lower, trans, unit_diag, m, - n, alpha, a, lda, b, ldb, group_count, group_size); - trsm_batch_postcondition(queue, left_right, upper_lower, trans, unit_diag, m, n, alpha, a, lda, - b, ldb, group_count, group_size); -} - -static inline void trsm_batch( - cl::sycl::queue &queue, 
cl::sycl::buffer &left_right, - cl::sycl::buffer &upper_lower, cl::sycl::buffer &trans, - cl::sycl::buffer &unit_diag, cl::sycl::buffer &m, - cl::sycl::buffer &n, cl::sycl::buffer, 1> &alpha, - cl::sycl::buffer, 1> &a, cl::sycl::buffer &lda, - cl::sycl::buffer, 1> &b, cl::sycl::buffer &ldb, - std::int64_t group_count, cl::sycl::buffer &group_size) { - trsm_batch_precondition(queue, left_right, upper_lower, trans, unit_diag, m, n, alpha, a, lda, - b, ldb, group_count, group_size); - detail::trsm_batch(select_backend(queue), queue, left_right, upper_lower, trans, unit_diag, m, - n, alpha, a, lda, b, ldb, group_count, group_size); - trsm_batch_postcondition(queue, left_right, upper_lower, trans, unit_diag, m, n, alpha, a, lda, - b, ldb, group_count, group_size); -} - -static inline void trsm_batch( - cl::sycl::queue &queue, cl::sycl::buffer &left_right, - cl::sycl::buffer &upper_lower, cl::sycl::buffer &trans, - cl::sycl::buffer &unit_diag, cl::sycl::buffer &m, - cl::sycl::buffer &n, cl::sycl::buffer, 1> &alpha, - cl::sycl::buffer, 1> &a, cl::sycl::buffer &lda, - cl::sycl::buffer, 1> &b, cl::sycl::buffer &ldb, - std::int64_t group_count, cl::sycl::buffer &group_size) { - trsm_batch_precondition(queue, left_right, upper_lower, trans, unit_diag, m, n, alpha, a, lda, - b, ldb, group_count, group_size); - detail::trsm_batch(select_backend(queue), queue, left_right, upper_lower, trans, unit_diag, m, - n, alpha, a, lda, b, ldb, group_count, group_size); - trsm_batch_postcondition(queue, left_right, upper_lower, trans, unit_diag, m, n, alpha, a, lda, - b, ldb, group_count, group_size); -} - static inline void trsm_batch(cl::sycl::queue &queue, side left_right, uplo upper_lower, transpose trans, diag unit_diag, std::int64_t m, std::int64_t n, float alpha, cl::sycl::buffer &a, std::int64_t lda, @@ -1814,6 +1680,1939 @@ static inline void trsv(cl::sycl::queue &queue, uplo upper_lower, transpose tran trsv_postcondition(queue, upper_lower, trans, unit_diag, n, a, lda, x, incx); } 
+// USM APIs + +static inline cl::sycl::event asum( + cl::sycl::queue &queue, std::int64_t n, const std::complex *x, std::int64_t incx, + float *result, const cl::sycl::vector_class &dependencies = {}) { + asum_precondition(queue, n, x, incx, result, dependencies); + auto done = detail::asum(select_backend(queue), queue, n, x, incx, result, dependencies); + asum_postcondition(queue, n, x, incx, result, dependencies); + return done; +} + +static inline cl::sycl::event asum( + cl::sycl::queue &queue, std::int64_t n, const std::complex *x, std::int64_t incx, + double *result, const cl::sycl::vector_class &dependencies = {}) { + asum_precondition(queue, n, x, incx, result, dependencies); + auto done = detail::asum(select_backend(queue), queue, n, x, incx, result, dependencies); + asum_postcondition(queue, n, x, incx, result, dependencies); + return done; +} + +static inline cl::sycl::event asum( + cl::sycl::queue &queue, std::int64_t n, const float *x, std::int64_t incx, float *result, + const cl::sycl::vector_class &dependencies = {}) { + asum_precondition(queue, n, x, incx, result, dependencies); + auto done = detail::asum(select_backend(queue), queue, n, x, incx, result, dependencies); + asum_postcondition(queue, n, x, incx, result, dependencies); + return done; +} + +static inline cl::sycl::event asum( + cl::sycl::queue &queue, std::int64_t n, const double *x, std::int64_t incx, double *result, + const cl::sycl::vector_class &dependencies = {}) { + asum_precondition(queue, n, x, incx, result, dependencies); + auto done = detail::asum(select_backend(queue), queue, n, x, incx, result, dependencies); + asum_postcondition(queue, n, x, incx, result, dependencies); + return done; +} + +static inline cl::sycl::event axpy( + cl::sycl::queue &queue, std::int64_t n, float alpha, const float *x, std::int64_t incx, + float *y, std::int64_t incy, const cl::sycl::vector_class &dependencies = {}) { + axpy_precondition(queue, n, alpha, x, incx, y, incy, dependencies); + auto done 
= + detail::axpy(select_backend(queue), queue, n, alpha, x, incx, y, incy, dependencies); + axpy_postcondition(queue, n, alpha, x, incx, y, incy, dependencies); + return done; +} + +static inline cl::sycl::event axpy( + cl::sycl::queue &queue, std::int64_t n, double alpha, const double *x, std::int64_t incx, + double *y, std::int64_t incy, + const cl::sycl::vector_class &dependencies = {}) { + axpy_precondition(queue, n, alpha, x, incx, y, incy, dependencies); + auto done = + detail::axpy(select_backend(queue), queue, n, alpha, x, incx, y, incy, dependencies); + axpy_postcondition(queue, n, alpha, x, incx, y, incy, dependencies); + return done; +} + +static inline cl::sycl::event axpy( + cl::sycl::queue &queue, std::int64_t n, std::complex alpha, const std::complex *x, + std::int64_t incx, std::complex *y, std::int64_t incy, + const cl::sycl::vector_class &dependencies = {}) { + axpy_precondition(queue, n, alpha, x, incx, y, incy, dependencies); + auto done = + detail::axpy(select_backend(queue), queue, n, alpha, x, incx, y, incy, dependencies); + axpy_postcondition(queue, n, alpha, x, incx, y, incy, dependencies); + return done; +} + +static inline cl::sycl::event axpy( + cl::sycl::queue &queue, std::int64_t n, std::complex alpha, + const std::complex *x, std::int64_t incx, std::complex *y, std::int64_t incy, + const cl::sycl::vector_class &dependencies = {}) { + axpy_precondition(queue, n, alpha, x, incx, y, incy, dependencies); + auto done = + detail::axpy(select_backend(queue), queue, n, alpha, x, incx, y, incy, dependencies); + axpy_postcondition(queue, n, alpha, x, incx, y, incy, dependencies); + return done; +} + +static inline cl::sycl::event axpy_batch( + cl::sycl::queue &queue, std::int64_t *n, float *alpha, const float **x, std::int64_t *incx, + float **y, std::int64_t *incy, std::int64_t group_count, std::int64_t *group_size, + const cl::sycl::vector_class &dependencies = {}) { + axpy_batch_precondition(queue, n, alpha, x, incx, y, incy, group_count, 
group_size, + dependencies); + auto done = detail::axpy_batch(select_backend(queue), queue, n, alpha, x, incx, y, incy, + group_count, group_size, dependencies); + axpy_batch_postcondition(queue, n, alpha, x, incx, y, incy, group_count, group_size, + dependencies); + return done; +} + +static inline cl::sycl::event axpy_batch( + cl::sycl::queue &queue, std::int64_t *n, double *alpha, const double **x, std::int64_t *incx, + double **y, std::int64_t *incy, std::int64_t group_count, std::int64_t *group_size, + const cl::sycl::vector_class &dependencies = {}) { + axpy_batch_precondition(queue, n, alpha, x, incx, y, incy, group_count, group_size, + dependencies); + auto done = detail::axpy_batch(select_backend(queue), queue, n, alpha, x, incx, y, incy, + group_count, group_size, dependencies); + axpy_batch_postcondition(queue, n, alpha, x, incx, y, incy, group_count, group_size, + dependencies); + return done; +} + +static inline cl::sycl::event axpy_batch( + cl::sycl::queue &queue, std::int64_t *n, std::complex *alpha, + const std::complex **x, std::int64_t *incx, std::complex **y, std::int64_t *incy, + std::int64_t group_count, std::int64_t *group_size, + const cl::sycl::vector_class &dependencies = {}) { + axpy_batch_precondition(queue, n, alpha, x, incx, y, incy, group_count, group_size, + dependencies); + auto done = detail::axpy_batch(select_backend(queue), queue, n, alpha, x, incx, y, incy, + group_count, group_size, dependencies); + axpy_batch_postcondition(queue, n, alpha, x, incx, y, incy, group_count, group_size, + dependencies); + return done; +} + +static inline cl::sycl::event axpy_batch( + cl::sycl::queue &queue, std::int64_t *n, std::complex *alpha, + const std::complex **x, std::int64_t *incx, std::complex **y, + std::int64_t *incy, std::int64_t group_count, std::int64_t *group_size, + const cl::sycl::vector_class &dependencies = {}) { + axpy_batch_precondition(queue, n, alpha, x, incx, y, incy, group_count, group_size, + dependencies); + auto done = 
detail::axpy_batch(select_backend(queue), queue, n, alpha, x, incx, y, incy, + group_count, group_size, dependencies); + axpy_batch_postcondition(queue, n, alpha, x, incx, y, incy, group_count, group_size, + dependencies); + return done; +} + +static inline cl::sycl::event copy( + cl::sycl::queue &queue, std::int64_t n, const float *x, std::int64_t incx, float *y, + std::int64_t incy, const cl::sycl::vector_class &dependencies = {}) { + copy_precondition(queue, n, x, incx, y, incy, dependencies); + auto done = detail::copy(select_backend(queue), queue, n, x, incx, y, incy, dependencies); + copy_postcondition(queue, n, x, incx, y, incy, dependencies); + return done; +} + +static inline cl::sycl::event copy( + cl::sycl::queue &queue, std::int64_t n, const double *x, std::int64_t incx, double *y, + std::int64_t incy, const cl::sycl::vector_class &dependencies = {}) { + copy_precondition(queue, n, x, incx, y, incy, dependencies); + auto done = detail::copy(select_backend(queue), queue, n, x, incx, y, incy, dependencies); + copy_postcondition(queue, n, x, incx, y, incy, dependencies); + return done; +} + +static inline cl::sycl::event copy( + cl::sycl::queue &queue, std::int64_t n, const std::complex *x, std::int64_t incx, + std::complex *y, std::int64_t incy, + const cl::sycl::vector_class &dependencies = {}) { + copy_precondition(queue, n, x, incx, y, incy, dependencies); + auto done = detail::copy(select_backend(queue), queue, n, x, incx, y, incy, dependencies); + copy_postcondition(queue, n, x, incx, y, incy, dependencies); + return done; +} + +static inline cl::sycl::event copy( + cl::sycl::queue &queue, std::int64_t n, const std::complex *x, std::int64_t incx, + std::complex *y, std::int64_t incy, + const cl::sycl::vector_class &dependencies = {}) { + copy_precondition(queue, n, x, incx, y, incy, dependencies); + auto done = detail::copy(select_backend(queue), queue, n, x, incx, y, incy, dependencies); + copy_postcondition(queue, n, x, incx, y, incy, 
dependencies); + return done; +} + +static inline cl::sycl::event dot( + cl::sycl::queue &queue, std::int64_t n, const float *x, std::int64_t incx, const float *y, + std::int64_t incy, float *result, + const cl::sycl::vector_class &dependencies = {}) { + dot_precondition(queue, n, x, incx, y, incy, result, dependencies); + auto done = + detail::dot(select_backend(queue), queue, n, x, incx, y, incy, result, dependencies); + dot_postcondition(queue, n, x, incx, y, incy, result, dependencies); + return done; +} + +static inline cl::sycl::event dot( + cl::sycl::queue &queue, std::int64_t n, const double *x, std::int64_t incx, const double *y, + std::int64_t incy, double *result, + const cl::sycl::vector_class &dependencies = {}) { + dot_precondition(queue, n, x, incx, y, incy, result, dependencies); + auto done = + detail::dot(select_backend(queue), queue, n, x, incx, y, incy, result, dependencies); + dot_postcondition(queue, n, x, incx, y, incy, result, dependencies); + return done; +} + +static inline cl::sycl::event dot( + cl::sycl::queue &queue, std::int64_t n, const float *x, std::int64_t incx, const float *y, + std::int64_t incy, double *result, + const cl::sycl::vector_class &dependencies = {}) { + dot_precondition(queue, n, x, incx, y, incy, result, dependencies); + auto done = + detail::dot(select_backend(queue), queue, n, x, incx, y, incy, result, dependencies); + dot_postcondition(queue, n, x, incx, y, incy, result, dependencies); + return done; +} + +static inline cl::sycl::event dotc( + cl::sycl::queue &queue, std::int64_t n, const std::complex *x, std::int64_t incx, + const std::complex *y, std::int64_t incy, std::complex *result, + const cl::sycl::vector_class &dependencies = {}) { + dotc_precondition(queue, n, x, incx, y, incy, result, dependencies); + auto done = + detail::dotc(select_backend(queue), queue, n, x, incx, y, incy, result, dependencies); + dotc_postcondition(queue, n, x, incx, y, incy, result, dependencies); + return done; +} + +static 
inline cl::sycl::event dotc( + cl::sycl::queue &queue, std::int64_t n, const std::complex *x, std::int64_t incx, + const std::complex *y, std::int64_t incy, std::complex *result, + const cl::sycl::vector_class &dependencies = {}) { + dotc_precondition(queue, n, x, incx, y, incy, result, dependencies); + auto done = + detail::dotc(select_backend(queue), queue, n, x, incx, y, incy, result, dependencies); + dotc_postcondition(queue, n, x, incx, y, incy, result, dependencies); + return done; +} + +static inline cl::sycl::event dotu( + cl::sycl::queue &queue, std::int64_t n, const std::complex *x, std::int64_t incx, + const std::complex *y, std::int64_t incy, std::complex *result, + const cl::sycl::vector_class &dependencies = {}) { + dotu_precondition(queue, n, x, incx, y, incy, result, dependencies); + auto done = + detail::dotu(select_backend(queue), queue, n, x, incx, y, incy, result, dependencies); + dotu_postcondition(queue, n, x, incx, y, incy, result, dependencies); + return done; +} + +static inline cl::sycl::event dotu( + cl::sycl::queue &queue, std::int64_t n, const std::complex *x, std::int64_t incx, + const std::complex *y, std::int64_t incy, std::complex *result, + const cl::sycl::vector_class &dependencies = {}) { + dotu_precondition(queue, n, x, incx, y, incy, result, dependencies); + auto done = + detail::dotu(select_backend(queue), queue, n, x, incx, y, incy, result, dependencies); + dotu_postcondition(queue, n, x, incx, y, incy, result, dependencies); + return done; +} + +static inline cl::sycl::event gbmv( + cl::sycl::queue &queue, transpose trans, std::int64_t m, std::int64_t n, std::int64_t kl, + std::int64_t ku, float alpha, const float *a, std::int64_t lda, const float *x, + std::int64_t incx, float beta, float *y, std::int64_t incy, + const cl::sycl::vector_class &dependencies = {}) { + gbmv_precondition(queue, trans, m, n, kl, ku, alpha, a, lda, x, incx, beta, y, incy, + dependencies); + auto done = detail::gbmv(select_backend(queue), queue, 
trans, m, n, kl, ku, alpha, a, lda, x, + incx, beta, y, incy, dependencies); + gbmv_postcondition(queue, trans, m, n, kl, ku, alpha, a, lda, x, incx, beta, y, incy, + dependencies); + return done; +} + +static inline cl::sycl::event gbmv( + cl::sycl::queue &queue, transpose trans, std::int64_t m, std::int64_t n, std::int64_t kl, + std::int64_t ku, double alpha, const double *a, std::int64_t lda, const double *x, + std::int64_t incx, double beta, double *y, std::int64_t incy, + const cl::sycl::vector_class &dependencies = {}) { + gbmv_precondition(queue, trans, m, n, kl, ku, alpha, a, lda, x, incx, beta, y, incy, + dependencies); + auto done = detail::gbmv(select_backend(queue), queue, trans, m, n, kl, ku, alpha, a, lda, x, + incx, beta, y, incy, dependencies); + gbmv_postcondition(queue, trans, m, n, kl, ku, alpha, a, lda, x, incx, beta, y, incy, + dependencies); + return done; +} + +static inline cl::sycl::event gbmv( + cl::sycl::queue &queue, transpose trans, std::int64_t m, std::int64_t n, std::int64_t kl, + std::int64_t ku, std::complex alpha, const std::complex *a, std::int64_t lda, + const std::complex *x, std::int64_t incx, std::complex beta, + std::complex *y, std::int64_t incy, + const cl::sycl::vector_class &dependencies = {}) { + gbmv_precondition(queue, trans, m, n, kl, ku, alpha, a, lda, x, incx, beta, y, incy, + dependencies); + auto done = detail::gbmv(select_backend(queue), queue, trans, m, n, kl, ku, alpha, a, lda, x, + incx, beta, y, incy, dependencies); + gbmv_postcondition(queue, trans, m, n, kl, ku, alpha, a, lda, x, incx, beta, y, incy, + dependencies); + return done; +} + +static inline cl::sycl::event gbmv( + cl::sycl::queue &queue, transpose trans, std::int64_t m, std::int64_t n, std::int64_t kl, + std::int64_t ku, std::complex alpha, const std::complex *a, std::int64_t lda, + const std::complex *x, std::int64_t incx, std::complex beta, + std::complex *y, std::int64_t incy, + const cl::sycl::vector_class &dependencies = {}) { + 
gbmv_precondition(queue, trans, m, n, kl, ku, alpha, a, lda, x, incx, beta, y, incy, + dependencies); + auto done = detail::gbmv(select_backend(queue), queue, trans, m, n, kl, ku, alpha, a, lda, x, + incx, beta, y, incy, dependencies); + gbmv_postcondition(queue, trans, m, n, kl, ku, alpha, a, lda, x, incx, beta, y, incy, + dependencies); + return done; +} + +static inline cl::sycl::event gemm( + cl::sycl::queue &queue, transpose transa, transpose transb, std::int64_t m, std::int64_t n, + std::int64_t k, float alpha, const float *a, std::int64_t lda, const float *b, std::int64_t ldb, + float beta, float *c, std::int64_t ldc, + const cl::sycl::vector_class &dependencies = {}) { + gemm_precondition(queue, transa, transb, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc, + dependencies); + auto done = detail::gemm(select_backend(queue), queue, transa, transb, m, n, k, alpha, a, lda, + b, ldb, beta, c, ldc, dependencies); + gemm_postcondition(queue, transa, transb, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc, + dependencies); + return done; +} + +static inline cl::sycl::event gemm( + cl::sycl::queue &queue, transpose transa, transpose transb, std::int64_t m, std::int64_t n, + std::int64_t k, double alpha, const double *a, std::int64_t lda, const double *b, + std::int64_t ldb, double beta, double *c, std::int64_t ldc, + const cl::sycl::vector_class &dependencies = {}) { + gemm_precondition(queue, transa, transb, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc, + dependencies); + auto done = detail::gemm(select_backend(queue), queue, transa, transb, m, n, k, alpha, a, lda, + b, ldb, beta, c, ldc, dependencies); + gemm_postcondition(queue, transa, transb, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc, + dependencies); + return done; +} + +static inline cl::sycl::event gemm( + cl::sycl::queue &queue, transpose transa, transpose transb, std::int64_t m, std::int64_t n, + std::int64_t k, std::complex alpha, const std::complex *a, std::int64_t lda, + const std::complex *b, 
std::int64_t ldb, std::complex beta, + std::complex *c, std::int64_t ldc, + const cl::sycl::vector_class &dependencies = {}) { + gemm_precondition(queue, transa, transb, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc, + dependencies); + auto done = detail::gemm(select_backend(queue), queue, transa, transb, m, n, k, alpha, a, lda, + b, ldb, beta, c, ldc, dependencies); + gemm_postcondition(queue, transa, transb, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc, + dependencies); + return done; +} + +static inline cl::sycl::event gemm( + cl::sycl::queue &queue, transpose transa, transpose transb, std::int64_t m, std::int64_t n, + std::int64_t k, std::complex alpha, const std::complex *a, std::int64_t lda, + const std::complex *b, std::int64_t ldb, std::complex beta, + std::complex *c, std::int64_t ldc, + const cl::sycl::vector_class &dependencies = {}) { + gemm_precondition(queue, transa, transb, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc, + dependencies); + auto done = detail::gemm(select_backend(queue), queue, transa, transb, m, n, k, alpha, a, lda, + b, ldb, beta, c, ldc, dependencies); + gemm_postcondition(queue, transa, transb, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc, + dependencies); + return done; +} + +static inline cl::sycl::event gemm_batch( + cl::sycl::queue &queue, transpose *transa, transpose *transb, std::int64_t *m, std::int64_t *n, + std::int64_t *k, float *alpha, const float **a, std::int64_t *lda, const float **b, + std::int64_t *ldb, float *beta, float **c, std::int64_t *ldc, std::int64_t group_count, + std::int64_t *group_size, const cl::sycl::vector_class &dependencies = {}) { + gemm_batch_precondition(queue, transa, transb, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc, + group_count, group_size, dependencies); + auto done = + detail::gemm_batch(select_backend(queue), queue, transa, transb, m, n, k, alpha, a, lda, b, + ldb, beta, c, ldc, group_count, group_size, dependencies); + gemm_batch_postcondition(queue, transa, transb, m, n, k, alpha, a, 
lda, b, ldb, beta, c, ldc, + group_count, group_size, dependencies); + return done; +} + +static inline cl::sycl::event gemm_batch( + cl::sycl::queue &queue, transpose *transa, transpose *transb, std::int64_t *m, std::int64_t *n, + std::int64_t *k, double *alpha, const double **a, std::int64_t *lda, const double **b, + std::int64_t *ldb, double *beta, double **c, std::int64_t *ldc, std::int64_t group_count, + std::int64_t *group_size, const cl::sycl::vector_class &dependencies = {}) { + gemm_batch_precondition(queue, transa, transb, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc, + group_count, group_size, dependencies); + auto done = + detail::gemm_batch(select_backend(queue), queue, transa, transb, m, n, k, alpha, a, lda, b, + ldb, beta, c, ldc, group_count, group_size, dependencies); + gemm_batch_postcondition(queue, transa, transb, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc, + group_count, group_size, dependencies); + return done; +} + +static inline cl::sycl::event gemm_batch( + cl::sycl::queue &queue, transpose *transa, transpose *transb, std::int64_t *m, std::int64_t *n, + std::int64_t *k, std::complex *alpha, const std::complex **a, std::int64_t *lda, + const std::complex **b, std::int64_t *ldb, std::complex *beta, + std::complex **c, std::int64_t *ldc, std::int64_t group_count, std::int64_t *group_size, + const cl::sycl::vector_class &dependencies = {}) { + gemm_batch_precondition(queue, transa, transb, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc, + group_count, group_size, dependencies); + auto done = + detail::gemm_batch(select_backend(queue), queue, transa, transb, m, n, k, alpha, a, lda, b, + ldb, beta, c, ldc, group_count, group_size, dependencies); + gemm_batch_postcondition(queue, transa, transb, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc, + group_count, group_size, dependencies); + return done; +} + +static inline cl::sycl::event gemm_batch( + cl::sycl::queue &queue, transpose *transa, transpose *transb, std::int64_t *m, std::int64_t *n, + 
std::int64_t *k, std::complex *alpha, const std::complex **a, std::int64_t *lda, + const std::complex **b, std::int64_t *ldb, std::complex *beta, + std::complex **c, std::int64_t *ldc, std::int64_t group_count, std::int64_t *group_size, + const cl::sycl::vector_class &dependencies = {}) { + gemm_batch_precondition(queue, transa, transb, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc, + group_count, group_size, dependencies); + auto done = + detail::gemm_batch(select_backend(queue), queue, transa, transb, m, n, k, alpha, a, lda, b, + ldb, beta, c, ldc, group_count, group_size, dependencies); + gemm_batch_postcondition(queue, transa, transb, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc, + group_count, group_size, dependencies); + return done; +} + +static inline cl::sycl::event gemm_batch( + cl::sycl::queue &queue, transpose transa, transpose transb, std::int64_t m, std::int64_t n, + std::int64_t k, float alpha, const float *a, std::int64_t lda, std::int64_t stride_a, + const float *b, std::int64_t ldb, std::int64_t stride_b, float beta, float *c, std::int64_t ldc, + std::int64_t stride_c, std::int64_t batch_size, + const cl::sycl::vector_class &dependencies = {}) { + gemm_batch_precondition(queue, transa, transb, m, n, k, alpha, a, lda, stride_a, b, ldb, + stride_b, beta, c, ldc, stride_c, batch_size, dependencies); + auto done = detail::gemm_batch(select_backend(queue), queue, transa, transb, m, n, k, alpha, a, + lda, stride_a, b, ldb, stride_b, beta, c, ldc, stride_c, + batch_size, dependencies); + gemm_batch_postcondition(queue, transa, transb, m, n, k, alpha, a, lda, stride_a, b, ldb, + stride_b, beta, c, ldc, stride_c, batch_size, dependencies); + return done; +} + +static inline cl::sycl::event gemm_batch( + cl::sycl::queue &queue, transpose transa, transpose transb, std::int64_t m, std::int64_t n, + std::int64_t k, double alpha, const double *a, std::int64_t lda, std::int64_t stride_a, + const double *b, std::int64_t ldb, std::int64_t stride_b, double beta, 
double *c, + std::int64_t ldc, std::int64_t stride_c, std::int64_t batch_size, + const cl::sycl::vector_class &dependencies = {}) { + gemm_batch_precondition(queue, transa, transb, m, n, k, alpha, a, lda, stride_a, b, ldb, + stride_b, beta, c, ldc, stride_c, batch_size, dependencies); + auto done = detail::gemm_batch(select_backend(queue), queue, transa, transb, m, n, k, alpha, a, + lda, stride_a, b, ldb, stride_b, beta, c, ldc, stride_c, + batch_size, dependencies); + gemm_batch_postcondition(queue, transa, transb, m, n, k, alpha, a, lda, stride_a, b, ldb, + stride_b, beta, c, ldc, stride_c, batch_size, dependencies); + return done; +} + +static inline cl::sycl::event gemm_batch( + cl::sycl::queue &queue, transpose transa, transpose transb, std::int64_t m, std::int64_t n, + std::int64_t k, std::complex alpha, const std::complex *a, std::int64_t lda, + std::int64_t stride_a, const std::complex *b, std::int64_t ldb, std::int64_t stride_b, + std::complex beta, std::complex *c, std::int64_t ldc, std::int64_t stride_c, + std::int64_t batch_size, const cl::sycl::vector_class &dependencies = {}) { + gemm_batch_precondition(queue, transa, transb, m, n, k, alpha, a, lda, stride_a, b, ldb, + stride_b, beta, c, ldc, stride_c, batch_size, dependencies); + auto done = detail::gemm_batch(select_backend(queue), queue, transa, transb, m, n, k, alpha, a, + lda, stride_a, b, ldb, stride_b, beta, c, ldc, stride_c, + batch_size, dependencies); + gemm_batch_postcondition(queue, transa, transb, m, n, k, alpha, a, lda, stride_a, b, ldb, + stride_b, beta, c, ldc, stride_c, batch_size, dependencies); + return done; +} + +static inline cl::sycl::event gemm_batch( + cl::sycl::queue &queue, transpose transa, transpose transb, std::int64_t m, std::int64_t n, + std::int64_t k, std::complex alpha, const std::complex *a, std::int64_t lda, + std::int64_t stride_a, const std::complex *b, std::int64_t ldb, std::int64_t stride_b, + std::complex beta, std::complex *c, std::int64_t ldc, std::int64_t 
stride_c, + std::int64_t batch_size, const cl::sycl::vector_class &dependencies = {}) { + gemm_batch_precondition(queue, transa, transb, m, n, k, alpha, a, lda, stride_a, b, ldb, + stride_b, beta, c, ldc, stride_c, batch_size, dependencies); + auto done = detail::gemm_batch(select_backend(queue), queue, transa, transb, m, n, k, alpha, a, + lda, stride_a, b, ldb, stride_b, beta, c, ldc, stride_c, + batch_size, dependencies); + gemm_batch_postcondition(queue, transa, transb, m, n, k, alpha, a, lda, stride_a, b, ldb, + stride_b, beta, c, ldc, stride_c, batch_size, dependencies); + return done; +} + +static inline cl::sycl::event gemmt( + cl::sycl::queue &queue, uplo upper_lower, transpose transa, transpose transb, std::int64_t n, + std::int64_t k, float alpha, const float *a, std::int64_t lda, const float *b, std::int64_t ldb, + float beta, float *c, std::int64_t ldc, + const cl::sycl::vector_class &dependencies = {}) { + gemmt_precondition(queue, upper_lower, transa, transb, n, k, alpha, a, lda, b, ldb, beta, c, + ldc, dependencies); + auto done = detail::gemmt(select_backend(queue), queue, upper_lower, transa, transb, n, k, + alpha, a, lda, b, ldb, beta, c, ldc, dependencies); + gemmt_postcondition(queue, upper_lower, transa, transb, n, k, alpha, a, lda, b, ldb, beta, c, + ldc, dependencies); + return done; +} + +static inline cl::sycl::event gemmt( + cl::sycl::queue &queue, uplo upper_lower, transpose transa, transpose transb, std::int64_t n, + std::int64_t k, double alpha, const double *a, std::int64_t lda, const double *b, + std::int64_t ldb, double beta, double *c, std::int64_t ldc, + const cl::sycl::vector_class &dependencies = {}) { + gemmt_precondition(queue, upper_lower, transa, transb, n, k, alpha, a, lda, b, ldb, beta, c, + ldc, dependencies); + auto done = detail::gemmt(select_backend(queue), queue, upper_lower, transa, transb, n, k, + alpha, a, lda, b, ldb, beta, c, ldc, dependencies); + gemmt_postcondition(queue, upper_lower, transa, transb, n, k, 
alpha, a, lda, b, ldb, beta, c, + ldc, dependencies); + return done; +} + +static inline cl::sycl::event gemmt( + cl::sycl::queue &queue, uplo upper_lower, transpose transa, transpose transb, std::int64_t n, + std::int64_t k, std::complex alpha, const std::complex *a, std::int64_t lda, + const std::complex *b, std::int64_t ldb, std::complex beta, + std::complex *c, std::int64_t ldc, + const cl::sycl::vector_class &dependencies = {}) { + gemmt_precondition(queue, upper_lower, transa, transb, n, k, alpha, a, lda, b, ldb, beta, c, + ldc, dependencies); + auto done = detail::gemmt(select_backend(queue), queue, upper_lower, transa, transb, n, k, + alpha, a, lda, b, ldb, beta, c, ldc, dependencies); + gemmt_postcondition(queue, upper_lower, transa, transb, n, k, alpha, a, lda, b, ldb, beta, c, + ldc, dependencies); + return done; +} + +static inline cl::sycl::event gemmt( + cl::sycl::queue &queue, uplo upper_lower, transpose transa, transpose transb, std::int64_t n, + std::int64_t k, std::complex alpha, const std::complex *a, std::int64_t lda, + const std::complex *b, std::int64_t ldb, std::complex beta, + std::complex *c, std::int64_t ldc, + const cl::sycl::vector_class &dependencies = {}) { + gemmt_precondition(queue, upper_lower, transa, transb, n, k, alpha, a, lda, b, ldb, beta, c, + ldc, dependencies); + auto done = detail::gemmt(select_backend(queue), queue, upper_lower, transa, transb, n, k, + alpha, a, lda, b, ldb, beta, c, ldc, dependencies); + gemmt_postcondition(queue, upper_lower, transa, transb, n, k, alpha, a, lda, b, ldb, beta, c, + ldc, dependencies); + return done; +} + +static inline cl::sycl::event gemv( + cl::sycl::queue &queue, transpose trans, std::int64_t m, std::int64_t n, float alpha, + const float *a, std::int64_t lda, const float *x, std::int64_t incx, float beta, float *y, + std::int64_t incy, const cl::sycl::vector_class &dependencies = {}) { + gemv_precondition(queue, trans, m, n, alpha, a, lda, x, incx, beta, y, incy, dependencies); + auto 
done = detail::gemv(select_backend(queue), queue, trans, m, n, alpha, a, lda, x, incx, + beta, y, incy, dependencies); + gemv_postcondition(queue, trans, m, n, alpha, a, lda, x, incx, beta, y, incy, dependencies); + return done; +} + +static inline cl::sycl::event gemv( + cl::sycl::queue &queue, transpose trans, std::int64_t m, std::int64_t n, double alpha, + const double *a, std::int64_t lda, const double *x, std::int64_t incx, double beta, double *y, + std::int64_t incy, const cl::sycl::vector_class &dependencies = {}) { + gemv_precondition(queue, trans, m, n, alpha, a, lda, x, incx, beta, y, incy, dependencies); + auto done = detail::gemv(select_backend(queue), queue, trans, m, n, alpha, a, lda, x, incx, + beta, y, incy, dependencies); + gemv_postcondition(queue, trans, m, n, alpha, a, lda, x, incx, beta, y, incy, dependencies); + return done; +} + +static inline cl::sycl::event gemv( + cl::sycl::queue &queue, transpose trans, std::int64_t m, std::int64_t n, + std::complex alpha, const std::complex *a, std::int64_t lda, + const std::complex *x, std::int64_t incx, std::complex beta, + std::complex *y, std::int64_t incy, + const cl::sycl::vector_class &dependencies = {}) { + gemv_precondition(queue, trans, m, n, alpha, a, lda, x, incx, beta, y, incy, dependencies); + auto done = detail::gemv(select_backend(queue), queue, trans, m, n, alpha, a, lda, x, incx, + beta, y, incy, dependencies); + gemv_postcondition(queue, trans, m, n, alpha, a, lda, x, incx, beta, y, incy, dependencies); + return done; +} + +static inline cl::sycl::event gemv( + cl::sycl::queue &queue, transpose trans, std::int64_t m, std::int64_t n, + std::complex alpha, const std::complex *a, std::int64_t lda, + const std::complex *x, std::int64_t incx, std::complex beta, + std::complex *y, std::int64_t incy, + const cl::sycl::vector_class &dependencies = {}) { + gemv_precondition(queue, trans, m, n, alpha, a, lda, x, incx, beta, y, incy, dependencies); + auto done = 
detail::gemv(select_backend(queue), queue, trans, m, n, alpha, a, lda, x, incx, + beta, y, incy, dependencies); + gemv_postcondition(queue, trans, m, n, alpha, a, lda, x, incx, beta, y, incy, dependencies); + return done; +} + +static inline cl::sycl::event ger( + cl::sycl::queue &queue, std::int64_t m, std::int64_t n, float alpha, const float *x, + std::int64_t incx, const float *y, std::int64_t incy, float *a, std::int64_t lda, + const cl::sycl::vector_class &dependencies = {}) { + ger_precondition(queue, m, n, alpha, x, incx, y, incy, a, lda, dependencies); + auto done = detail::ger(select_backend(queue), queue, m, n, alpha, x, incx, y, incy, a, lda, + dependencies); + ger_postcondition(queue, m, n, alpha, x, incx, y, incy, a, lda, dependencies); + return done; +} + +static inline cl::sycl::event ger( + cl::sycl::queue &queue, std::int64_t m, std::int64_t n, double alpha, const double *x, + std::int64_t incx, const double *y, std::int64_t incy, double *a, std::int64_t lda, + const cl::sycl::vector_class &dependencies = {}) { + ger_precondition(queue, m, n, alpha, x, incx, y, incy, a, lda, dependencies); + auto done = detail::ger(select_backend(queue), queue, m, n, alpha, x, incx, y, incy, a, lda, + dependencies); + ger_postcondition(queue, m, n, alpha, x, incx, y, incy, a, lda, dependencies); + return done; +} + +static inline cl::sycl::event gerc( + cl::sycl::queue &queue, std::int64_t m, std::int64_t n, std::complex alpha, + const std::complex *x, std::int64_t incx, const std::complex *y, + std::int64_t incy, std::complex *a, std::int64_t lda, + const cl::sycl::vector_class &dependencies = {}) { + gerc_precondition(queue, m, n, alpha, x, incx, y, incy, a, lda, dependencies); + auto done = detail::gerc(select_backend(queue), queue, m, n, alpha, x, incx, y, incy, a, lda, + dependencies); + gerc_postcondition(queue, m, n, alpha, x, incx, y, incy, a, lda, dependencies); + return done; +} + +static inline cl::sycl::event gerc( + cl::sycl::queue &queue, std::int64_t 
m, std::int64_t n, std::complex alpha, + const std::complex *x, std::int64_t incx, const std::complex *y, + std::int64_t incy, std::complex *a, std::int64_t lda, + const cl::sycl::vector_class &dependencies = {}) { + gerc_precondition(queue, m, n, alpha, x, incx, y, incy, a, lda, dependencies); + auto done = detail::gerc(select_backend(queue), queue, m, n, alpha, x, incx, y, incy, a, lda, + dependencies); + gerc_postcondition(queue, m, n, alpha, x, incx, y, incy, a, lda, dependencies); + return done; +} + +static inline cl::sycl::event geru( + cl::sycl::queue &queue, std::int64_t m, std::int64_t n, std::complex alpha, + const std::complex *x, std::int64_t incx, const std::complex *y, + std::int64_t incy, std::complex *a, std::int64_t lda, + const cl::sycl::vector_class &dependencies = {}) { + geru_precondition(queue, m, n, alpha, x, incx, y, incy, a, lda, dependencies); + auto done = detail::geru(select_backend(queue), queue, m, n, alpha, x, incx, y, incy, a, lda, + dependencies); + geru_postcondition(queue, m, n, alpha, x, incx, y, incy, a, lda, dependencies); + return done; +} + +static inline cl::sycl::event geru( + cl::sycl::queue &queue, std::int64_t m, std::int64_t n, std::complex alpha, + const std::complex *x, std::int64_t incx, const std::complex *y, + std::int64_t incy, std::complex *a, std::int64_t lda, + const cl::sycl::vector_class &dependencies = {}) { + geru_precondition(queue, m, n, alpha, x, incx, y, incy, a, lda, dependencies); + auto done = detail::geru(select_backend(queue), queue, m, n, alpha, x, incx, y, incy, a, lda, + dependencies); + geru_postcondition(queue, m, n, alpha, x, incx, y, incy, a, lda, dependencies); + return done; +} + +static inline cl::sycl::event hbmv( + cl::sycl::queue &queue, uplo upper_lower, std::int64_t n, std::int64_t k, + std::complex alpha, const std::complex *a, std::int64_t lda, + const std::complex *x, std::int64_t incx, std::complex beta, + std::complex *y, std::int64_t incy, + const cl::sycl::vector_class 
&dependencies = {}) { + hbmv_precondition(queue, upper_lower, n, k, alpha, a, lda, x, incx, beta, y, incy, + dependencies); + auto done = detail::hbmv(select_backend(queue), queue, upper_lower, n, k, alpha, a, lda, x, + incx, beta, y, incy, dependencies); + hbmv_postcondition(queue, upper_lower, n, k, alpha, a, lda, x, incx, beta, y, incy, + dependencies); + return done; +} + +static inline cl::sycl::event hbmv( + cl::sycl::queue &queue, uplo upper_lower, std::int64_t n, std::int64_t k, + std::complex alpha, const std::complex *a, std::int64_t lda, + const std::complex *x, std::int64_t incx, std::complex beta, + std::complex *y, std::int64_t incy, + const cl::sycl::vector_class &dependencies = {}) { + hbmv_precondition(queue, upper_lower, n, k, alpha, a, lda, x, incx, beta, y, incy, + dependencies); + auto done = detail::hbmv(select_backend(queue), queue, upper_lower, n, k, alpha, a, lda, x, + incx, beta, y, incy, dependencies); + hbmv_postcondition(queue, upper_lower, n, k, alpha, a, lda, x, incx, beta, y, incy, + dependencies); + return done; +} + +static inline cl::sycl::event hemm( + cl::sycl::queue &queue, side left_right, uplo upper_lower, std::int64_t m, std::int64_t n, + std::complex alpha, const std::complex *a, std::int64_t lda, + const std::complex *b, std::int64_t ldb, std::complex beta, + std::complex *c, std::int64_t ldc, + const cl::sycl::vector_class &dependencies = {}) { + hemm_precondition(queue, left_right, upper_lower, m, n, alpha, a, lda, b, ldb, beta, c, ldc, + dependencies); + auto done = detail::hemm(select_backend(queue), queue, left_right, upper_lower, m, n, alpha, a, + lda, b, ldb, beta, c, ldc, dependencies); + hemm_postcondition(queue, left_right, upper_lower, m, n, alpha, a, lda, b, ldb, beta, c, ldc, + dependencies); + return done; +} + +static inline cl::sycl::event hemm( + cl::sycl::queue &queue, side left_right, uplo upper_lower, std::int64_t m, std::int64_t n, + std::complex alpha, const std::complex *a, std::int64_t lda, + const 
std::complex *b, std::int64_t ldb, std::complex beta, + std::complex *c, std::int64_t ldc, + const cl::sycl::vector_class &dependencies = {}) { + hemm_precondition(queue, left_right, upper_lower, m, n, alpha, a, lda, b, ldb, beta, c, ldc, + dependencies); + auto done = detail::hemm(select_backend(queue), queue, left_right, upper_lower, m, n, alpha, a, + lda, b, ldb, beta, c, ldc, dependencies); + hemm_postcondition(queue, left_right, upper_lower, m, n, alpha, a, lda, b, ldb, beta, c, ldc, + dependencies); + return done; +} + +static inline cl::sycl::event hemv( + cl::sycl::queue &queue, uplo upper_lower, std::int64_t n, std::complex alpha, + const std::complex *a, std::int64_t lda, const std::complex *x, std::int64_t incx, + std::complex beta, std::complex *y, std::int64_t incy, + const cl::sycl::vector_class &dependencies = {}) { + hemv_precondition(queue, upper_lower, n, alpha, a, lda, x, incx, beta, y, incy, dependencies); + auto done = detail::hemv(select_backend(queue), queue, upper_lower, n, alpha, a, lda, x, incx, + beta, y, incy, dependencies); + hemv_postcondition(queue, upper_lower, n, alpha, a, lda, x, incx, beta, y, incy, dependencies); + return done; +} + +static inline cl::sycl::event hemv( + cl::sycl::queue &queue, uplo upper_lower, std::int64_t n, std::complex alpha, + const std::complex *a, std::int64_t lda, const std::complex *x, + std::int64_t incx, std::complex beta, std::complex *y, std::int64_t incy, + const cl::sycl::vector_class &dependencies = {}) { + hemv_precondition(queue, upper_lower, n, alpha, a, lda, x, incx, beta, y, incy, dependencies); + auto done = detail::hemv(select_backend(queue), queue, upper_lower, n, alpha, a, lda, x, incx, + beta, y, incy, dependencies); + hemv_postcondition(queue, upper_lower, n, alpha, a, lda, x, incx, beta, y, incy, dependencies); + return done; +} + +static inline cl::sycl::event her( + cl::sycl::queue &queue, uplo upper_lower, std::int64_t n, float alpha, + const std::complex *x, std::int64_t incx, 
std::complex *a, std::int64_t lda, + const cl::sycl::vector_class &dependencies = {}) { + her_precondition(queue, upper_lower, n, alpha, x, incx, a, lda, dependencies); + auto done = detail::her(select_backend(queue), queue, upper_lower, n, alpha, x, incx, a, lda, + dependencies); + her_postcondition(queue, upper_lower, n, alpha, x, incx, a, lda, dependencies); + return done; +} + +static inline cl::sycl::event her( + cl::sycl::queue &queue, uplo upper_lower, std::int64_t n, double alpha, + const std::complex *x, std::int64_t incx, std::complex *a, std::int64_t lda, + const cl::sycl::vector_class &dependencies = {}) { + her_precondition(queue, upper_lower, n, alpha, x, incx, a, lda, dependencies); + auto done = detail::her(select_backend(queue), queue, upper_lower, n, alpha, x, incx, a, lda, + dependencies); + her_postcondition(queue, upper_lower, n, alpha, x, incx, a, lda, dependencies); + return done; +} + +static inline cl::sycl::event her2( + cl::sycl::queue &queue, uplo upper_lower, std::int64_t n, std::complex alpha, + const std::complex *x, std::int64_t incx, const std::complex *y, + std::int64_t incy, std::complex *a, std::int64_t lda, + const cl::sycl::vector_class &dependencies = {}) { + her2_precondition(queue, upper_lower, n, alpha, x, incx, y, incy, a, lda, dependencies); + auto done = detail::her2(select_backend(queue), queue, upper_lower, n, alpha, x, incx, y, incy, + a, lda, dependencies); + her2_postcondition(queue, upper_lower, n, alpha, x, incx, y, incy, a, lda, dependencies); + return done; +} + +static inline cl::sycl::event her2( + cl::sycl::queue &queue, uplo upper_lower, std::int64_t n, std::complex alpha, + const std::complex *x, std::int64_t incx, const std::complex *y, + std::int64_t incy, std::complex *a, std::int64_t lda, + const cl::sycl::vector_class &dependencies = {}) { + her2_precondition(queue, upper_lower, n, alpha, x, incx, y, incy, a, lda, dependencies); + auto done = detail::her2(select_backend(queue), queue, upper_lower, n, 
alpha, x, incx, y, incy, + a, lda, dependencies); + her2_postcondition(queue, upper_lower, n, alpha, x, incx, y, incy, a, lda, dependencies); + return done; +} + +static inline cl::sycl::event her2k( + cl::sycl::queue &queue, uplo upper_lower, transpose trans, std::int64_t n, std::int64_t k, + std::complex alpha, const std::complex *a, std::int64_t lda, + const std::complex *b, std::int64_t ldb, float beta, std::complex *c, + std::int64_t ldc, const cl::sycl::vector_class &dependencies = {}) { + her2k_precondition(queue, upper_lower, trans, n, k, alpha, a, lda, b, ldb, beta, c, ldc, + dependencies); + auto done = detail::her2k(select_backend(queue), queue, upper_lower, trans, n, k, alpha, a, lda, + b, ldb, beta, c, ldc, dependencies); + her2k_postcondition(queue, upper_lower, trans, n, k, alpha, a, lda, b, ldb, beta, c, ldc, + dependencies); + return done; +} + +static inline cl::sycl::event her2k( + cl::sycl::queue &queue, uplo upper_lower, transpose trans, std::int64_t n, std::int64_t k, + std::complex alpha, const std::complex *a, std::int64_t lda, + const std::complex *b, std::int64_t ldb, double beta, std::complex *c, + std::int64_t ldc, const cl::sycl::vector_class &dependencies = {}) { + her2k_precondition(queue, upper_lower, trans, n, k, alpha, a, lda, b, ldb, beta, c, ldc, + dependencies); + auto done = detail::her2k(select_backend(queue), queue, upper_lower, trans, n, k, alpha, a, lda, + b, ldb, beta, c, ldc, dependencies); + her2k_postcondition(queue, upper_lower, trans, n, k, alpha, a, lda, b, ldb, beta, c, ldc, + dependencies); + return done; +} + +static inline cl::sycl::event herk( + cl::sycl::queue &queue, uplo upper_lower, transpose trans, std::int64_t n, std::int64_t k, + float alpha, const std::complex *a, std::int64_t lda, float beta, std::complex *c, + std::int64_t ldc, const cl::sycl::vector_class &dependencies = {}) { + herk_precondition(queue, upper_lower, trans, n, k, alpha, a, lda, beta, c, ldc, dependencies); + auto done = 
detail::herk(select_backend(queue), queue, upper_lower, trans, n, k, alpha, a, lda, + beta, c, ldc, dependencies); + herk_postcondition(queue, upper_lower, trans, n, k, alpha, a, lda, beta, c, ldc, dependencies); + return done; +} + +static inline cl::sycl::event herk( + cl::sycl::queue &queue, uplo upper_lower, transpose trans, std::int64_t n, std::int64_t k, + double alpha, const std::complex *a, std::int64_t lda, double beta, + std::complex *c, std::int64_t ldc, + const cl::sycl::vector_class &dependencies = {}) { + herk_precondition(queue, upper_lower, trans, n, k, alpha, a, lda, beta, c, ldc, dependencies); + auto done = detail::herk(select_backend(queue), queue, upper_lower, trans, n, k, alpha, a, lda, + beta, c, ldc, dependencies); + herk_postcondition(queue, upper_lower, trans, n, k, alpha, a, lda, beta, c, ldc, dependencies); + return done; +} + +static inline cl::sycl::event hpmv( + cl::sycl::queue &queue, uplo upper_lower, std::int64_t n, std::complex alpha, + const std::complex *a, const std::complex *x, std::int64_t incx, + std::complex beta, std::complex *y, std::int64_t incy, + const cl::sycl::vector_class &dependencies = {}) { + hpmv_precondition(queue, upper_lower, n, alpha, a, x, incx, beta, y, incy, dependencies); + auto done = detail::hpmv(select_backend(queue), queue, upper_lower, n, alpha, a, x, incx, beta, + y, incy, dependencies); + hpmv_postcondition(queue, upper_lower, n, alpha, a, x, incx, beta, y, incy, dependencies); + return done; +} + +static inline cl::sycl::event hpmv( + cl::sycl::queue &queue, uplo upper_lower, std::int64_t n, std::complex alpha, + const std::complex *a, const std::complex *x, std::int64_t incx, + std::complex beta, std::complex *y, std::int64_t incy, + const cl::sycl::vector_class &dependencies = {}) { + hpmv_precondition(queue, upper_lower, n, alpha, a, x, incx, beta, y, incy, dependencies); + auto done = detail::hpmv(select_backend(queue), queue, upper_lower, n, alpha, a, x, incx, beta, + y, incy, dependencies); 
+ hpmv_postcondition(queue, upper_lower, n, alpha, a, x, incx, beta, y, incy, dependencies); + return done; +} + +static inline cl::sycl::event hpr( + cl::sycl::queue &queue, uplo upper_lower, std::int64_t n, float alpha, + const std::complex *x, std::int64_t incx, std::complex *a, + const cl::sycl::vector_class &dependencies = {}) { + hpr_precondition(queue, upper_lower, n, alpha, x, incx, a, dependencies); + auto done = + detail::hpr(select_backend(queue), queue, upper_lower, n, alpha, x, incx, a, dependencies); + hpr_postcondition(queue, upper_lower, n, alpha, x, incx, a, dependencies); + return done; +} + +static inline cl::sycl::event hpr( + cl::sycl::queue &queue, uplo upper_lower, std::int64_t n, double alpha, + const std::complex *x, std::int64_t incx, std::complex *a, + const cl::sycl::vector_class &dependencies = {}) { + hpr_precondition(queue, upper_lower, n, alpha, x, incx, a, dependencies); + auto done = + detail::hpr(select_backend(queue), queue, upper_lower, n, alpha, x, incx, a, dependencies); + hpr_postcondition(queue, upper_lower, n, alpha, x, incx, a, dependencies); + return done; +} + +static inline cl::sycl::event hpr2( + cl::sycl::queue &queue, uplo upper_lower, std::int64_t n, std::complex alpha, + const std::complex *x, std::int64_t incx, const std::complex *y, + std::int64_t incy, std::complex *a, + const cl::sycl::vector_class &dependencies = {}) { + hpr2_precondition(queue, upper_lower, n, alpha, x, incx, y, incy, a, dependencies); + auto done = detail::hpr2(select_backend(queue), queue, upper_lower, n, alpha, x, incx, y, incy, + a, dependencies); + hpr2_postcondition(queue, upper_lower, n, alpha, x, incx, y, incy, a, dependencies); + return done; +} + +static inline cl::sycl::event hpr2( + cl::sycl::queue &queue, uplo upper_lower, std::int64_t n, std::complex alpha, + const std::complex *x, std::int64_t incx, const std::complex *y, + std::int64_t incy, std::complex *a, + const cl::sycl::vector_class &dependencies = {}) { + 
hpr2_precondition(queue, upper_lower, n, alpha, x, incx, y, incy, a, dependencies); + auto done = detail::hpr2(select_backend(queue), queue, upper_lower, n, alpha, x, incx, y, incy, + a, dependencies); + hpr2_postcondition(queue, upper_lower, n, alpha, x, incx, y, incy, a, dependencies); + return done; +} + +static inline cl::sycl::event iamax( + cl::sycl::queue &queue, std::int64_t n, const float *x, std::int64_t incx, std::int64_t *result, + const cl::sycl::vector_class &dependencies = {}) { + iamax_precondition(queue, n, x, incx, result, dependencies); + auto done = detail::iamax(select_backend(queue), queue, n, x, incx, result, dependencies); + iamax_postcondition(queue, n, x, incx, result, dependencies); + return done; +} + +static inline cl::sycl::event iamax( + cl::sycl::queue &queue, std::int64_t n, const double *x, std::int64_t incx, + std::int64_t *result, const cl::sycl::vector_class &dependencies = {}) { + iamax_precondition(queue, n, x, incx, result, dependencies); + auto done = detail::iamax(select_backend(queue), queue, n, x, incx, result, dependencies); + iamax_postcondition(queue, n, x, incx, result, dependencies); + return done; +} + +static inline cl::sycl::event iamax( + cl::sycl::queue &queue, std::int64_t n, const std::complex *x, std::int64_t incx, + std::int64_t *result, const cl::sycl::vector_class &dependencies = {}) { + iamax_precondition(queue, n, x, incx, result, dependencies); + auto done = detail::iamax(select_backend(queue), queue, n, x, incx, result, dependencies); + iamax_postcondition(queue, n, x, incx, result, dependencies); + return done; +} + +static inline cl::sycl::event iamax( + cl::sycl::queue &queue, std::int64_t n, const std::complex *x, std::int64_t incx, + std::int64_t *result, const cl::sycl::vector_class &dependencies = {}) { + iamax_precondition(queue, n, x, incx, result, dependencies); + auto done = detail::iamax(select_backend(queue), queue, n, x, incx, result, dependencies); + iamax_postcondition(queue, n, x, 
incx, result, dependencies); + return done; +} + +static inline cl::sycl::event iamin( + cl::sycl::queue &queue, std::int64_t n, const float *x, std::int64_t incx, std::int64_t *result, + const cl::sycl::vector_class &dependencies = {}) { + iamin_precondition(queue, n, x, incx, result, dependencies); + auto done = detail::iamin(select_backend(queue), queue, n, x, incx, result, dependencies); + iamin_postcondition(queue, n, x, incx, result, dependencies); + return done; +} + +static inline cl::sycl::event iamin( + cl::sycl::queue &queue, std::int64_t n, const double *x, std::int64_t incx, + std::int64_t *result, const cl::sycl::vector_class &dependencies = {}) { + iamin_precondition(queue, n, x, incx, result, dependencies); + auto done = detail::iamin(select_backend(queue), queue, n, x, incx, result, dependencies); + iamin_postcondition(queue, n, x, incx, result, dependencies); + return done; +} + +static inline cl::sycl::event iamin( + cl::sycl::queue &queue, std::int64_t n, const std::complex *x, std::int64_t incx, + std::int64_t *result, const cl::sycl::vector_class &dependencies = {}) { + iamin_precondition(queue, n, x, incx, result, dependencies); + auto done = detail::iamin(select_backend(queue), queue, n, x, incx, result, dependencies); + iamin_postcondition(queue, n, x, incx, result, dependencies); + return done; +} + +static inline cl::sycl::event iamin( + cl::sycl::queue &queue, std::int64_t n, const std::complex *x, std::int64_t incx, + std::int64_t *result, const cl::sycl::vector_class &dependencies = {}) { + iamin_precondition(queue, n, x, incx, result, dependencies); + auto done = detail::iamin(select_backend(queue), queue, n, x, incx, result, dependencies); + iamin_postcondition(queue, n, x, incx, result, dependencies); + return done; +} + +static inline cl::sycl::event nrm2( + cl::sycl::queue &queue, std::int64_t n, const std::complex *x, std::int64_t incx, + float *result, const cl::sycl::vector_class &dependencies = {}) { + nrm2_precondition(queue, 
n, x, incx, result, dependencies); + auto done = detail::nrm2(select_backend(queue), queue, n, x, incx, result, dependencies); + nrm2_postcondition(queue, n, x, incx, result, dependencies); + return done; +} + +static inline cl::sycl::event nrm2( + cl::sycl::queue &queue, std::int64_t n, const std::complex *x, std::int64_t incx, + double *result, const cl::sycl::vector_class &dependencies = {}) { + nrm2_precondition(queue, n, x, incx, result, dependencies); + auto done = detail::nrm2(select_backend(queue), queue, n, x, incx, result, dependencies); + nrm2_postcondition(queue, n, x, incx, result, dependencies); + return done; +} + +static inline cl::sycl::event nrm2( + cl::sycl::queue &queue, std::int64_t n, const float *x, std::int64_t incx, float *result, + const cl::sycl::vector_class &dependencies = {}) { + nrm2_precondition(queue, n, x, incx, result, dependencies); + auto done = detail::nrm2(select_backend(queue), queue, n, x, incx, result, dependencies); + nrm2_postcondition(queue, n, x, incx, result, dependencies); + return done; +} + +static inline cl::sycl::event nrm2( + cl::sycl::queue &queue, std::int64_t n, const double *x, std::int64_t incx, double *result, + const cl::sycl::vector_class &dependencies = {}) { + nrm2_precondition(queue, n, x, incx, result, dependencies); + auto done = detail::nrm2(select_backend(queue), queue, n, x, incx, result, dependencies); + nrm2_postcondition(queue, n, x, incx, result, dependencies); + return done; +} + +static inline cl::sycl::event rot( + cl::sycl::queue &queue, std::int64_t n, std::complex *x, std::int64_t incx, + std::complex *y, std::int64_t incy, float c, float s, + const cl::sycl::vector_class &dependencies = {}) { + rot_precondition(queue, n, x, incx, y, incy, c, s, dependencies); + auto done = detail::rot(select_backend(queue), queue, n, x, incx, y, incy, c, s, dependencies); + rot_postcondition(queue, n, x, incx, y, incy, c, s, dependencies); + return done; +} + +static inline cl::sycl::event rot( + 
cl::sycl::queue &queue, std::int64_t n, std::complex *x, std::int64_t incx, + std::complex *y, std::int64_t incy, double c, double s, + const cl::sycl::vector_class &dependencies = {}) { + rot_precondition(queue, n, x, incx, y, incy, c, s, dependencies); + auto done = detail::rot(select_backend(queue), queue, n, x, incx, y, incy, c, s, dependencies); + rot_postcondition(queue, n, x, incx, y, incy, c, s, dependencies); + return done; +} + +static inline cl::sycl::event rot( + cl::sycl::queue &queue, std::int64_t n, float *x, std::int64_t incx, float *y, + std::int64_t incy, float c, float s, + const cl::sycl::vector_class &dependencies = {}) { + rot_precondition(queue, n, x, incx, y, incy, c, s, dependencies); + auto done = detail::rot(select_backend(queue), queue, n, x, incx, y, incy, c, s, dependencies); + rot_postcondition(queue, n, x, incx, y, incy, c, s, dependencies); + return done; +} + +static inline cl::sycl::event rot( + cl::sycl::queue &queue, std::int64_t n, double *x, std::int64_t incx, double *y, + std::int64_t incy, double c, double s, + const cl::sycl::vector_class &dependencies = {}) { + rot_precondition(queue, n, x, incx, y, incy, c, s, dependencies); + auto done = detail::rot(select_backend(queue), queue, n, x, incx, y, incy, c, s, dependencies); + rot_postcondition(queue, n, x, incx, y, incy, c, s, dependencies); + return done; +} + +static inline cl::sycl::event rotg( + cl::sycl::queue &queue, float *a, float *b, float *c, float *s, + const cl::sycl::vector_class &dependencies = {}) { + rotg_precondition(queue, a, b, c, s, dependencies); + auto done = detail::rotg(select_backend(queue), queue, a, b, c, s, dependencies); + rotg_postcondition(queue, a, b, c, s, dependencies); + return done; +} + +static inline cl::sycl::event rotg( + cl::sycl::queue &queue, double *a, double *b, double *c, double *s, + const cl::sycl::vector_class &dependencies = {}) { + rotg_precondition(queue, a, b, c, s, dependencies); + auto done = 
detail::rotg(select_backend(queue), queue, a, b, c, s, dependencies); + rotg_postcondition(queue, a, b, c, s, dependencies); + return done; +} + +static inline cl::sycl::event rotg( + cl::sycl::queue &queue, std::complex *a, std::complex *b, float *c, + std::complex *s, const cl::sycl::vector_class &dependencies = {}) { + rotg_precondition(queue, a, b, c, s, dependencies); + auto done = detail::rotg(select_backend(queue), queue, a, b, c, s, dependencies); + rotg_postcondition(queue, a, b, c, s, dependencies); + return done; +} + +static inline cl::sycl::event rotg( + cl::sycl::queue &queue, std::complex *a, std::complex *b, double *c, + std::complex *s, const cl::sycl::vector_class &dependencies = {}) { + rotg_precondition(queue, a, b, c, s, dependencies); + auto done = detail::rotg(select_backend(queue), queue, a, b, c, s, dependencies); + rotg_postcondition(queue, a, b, c, s, dependencies); + return done; +} + +static inline cl::sycl::event rotm( + cl::sycl::queue &queue, std::int64_t n, float *x, std::int64_t incx, float *y, + std::int64_t incy, float *param, + const cl::sycl::vector_class &dependencies = {}) { + rotm_precondition(queue, n, x, incx, y, incy, param, dependencies); + auto done = + detail::rotm(select_backend(queue), queue, n, x, incx, y, incy, param, dependencies); + rotm_postcondition(queue, n, x, incx, y, incy, param, dependencies); + return done; +} + +static inline cl::sycl::event rotm( + cl::sycl::queue &queue, std::int64_t n, double *x, std::int64_t incx, double *y, + std::int64_t incy, double *param, + const cl::sycl::vector_class &dependencies = {}) { + rotm_precondition(queue, n, x, incx, y, incy, param, dependencies); + auto done = + detail::rotm(select_backend(queue), queue, n, x, incx, y, incy, param, dependencies); + rotm_postcondition(queue, n, x, incx, y, incy, param, dependencies); + return done; +} + +static inline cl::sycl::event rotmg( + cl::sycl::queue &queue, float *d1, float *d2, float *x1, float y1, float *param, + const 
cl::sycl::vector_class &dependencies = {}) { + rotmg_precondition(queue, d1, d2, x1, y1, param, dependencies); + auto done = detail::rotmg(select_backend(queue), queue, d1, d2, x1, y1, param, dependencies); + rotmg_postcondition(queue, d1, d2, x1, y1, param, dependencies); + return done; +} + +static inline cl::sycl::event rotmg( + cl::sycl::queue &queue, double *d1, double *d2, double *x1, double y1, double *param, + const cl::sycl::vector_class &dependencies = {}) { + rotmg_precondition(queue, d1, d2, x1, y1, param, dependencies); + auto done = detail::rotmg(select_backend(queue), queue, d1, d2, x1, y1, param, dependencies); + rotmg_postcondition(queue, d1, d2, x1, y1, param, dependencies); + return done; +} + +static inline cl::sycl::event sbmv( + cl::sycl::queue &queue, uplo upper_lower, std::int64_t n, std::int64_t k, float alpha, + const float *a, std::int64_t lda, const float *x, std::int64_t incx, float beta, float *y, + std::int64_t incy, const cl::sycl::vector_class &dependencies = {}) { + sbmv_precondition(queue, upper_lower, n, k, alpha, a, lda, x, incx, beta, y, incy, + dependencies); + auto done = detail::sbmv(select_backend(queue), queue, upper_lower, n, k, alpha, a, lda, x, + incx, beta, y, incy, dependencies); + sbmv_postcondition(queue, upper_lower, n, k, alpha, a, lda, x, incx, beta, y, incy, + dependencies); + return done; +} + +static inline cl::sycl::event sbmv( + cl::sycl::queue &queue, uplo upper_lower, std::int64_t n, std::int64_t k, double alpha, + const double *a, std::int64_t lda, const double *x, std::int64_t incx, double beta, double *y, + std::int64_t incy, const cl::sycl::vector_class &dependencies = {}) { + sbmv_precondition(queue, upper_lower, n, k, alpha, a, lda, x, incx, beta, y, incy, + dependencies); + auto done = detail::sbmv(select_backend(queue), queue, upper_lower, n, k, alpha, a, lda, x, + incx, beta, y, incy, dependencies); + sbmv_postcondition(queue, upper_lower, n, k, alpha, a, lda, x, incx, beta, y, incy, + 
dependencies); + return done; +} + +static inline cl::sycl::event scal( + cl::sycl::queue &queue, std::int64_t n, float alpha, float *x, std::int64_t incx, + const cl::sycl::vector_class &dependencies = {}) { + scal_precondition(queue, n, alpha, x, incx, dependencies); + auto done = detail::scal(select_backend(queue), queue, n, alpha, x, incx, dependencies); + scal_postcondition(queue, n, alpha, x, incx, dependencies); + return done; +} + +static inline cl::sycl::event scal( + cl::sycl::queue &queue, std::int64_t n, double alpha, double *x, std::int64_t incx, + const cl::sycl::vector_class &dependencies = {}) { + scal_precondition(queue, n, alpha, x, incx, dependencies); + auto done = detail::scal(select_backend(queue), queue, n, alpha, x, incx, dependencies); + scal_postcondition(queue, n, alpha, x, incx, dependencies); + return done; +} + +static inline cl::sycl::event scal( + cl::sycl::queue &queue, std::int64_t n, std::complex alpha, std::complex *x, + std::int64_t incx, const cl::sycl::vector_class &dependencies = {}) { + scal_precondition(queue, n, alpha, x, incx, dependencies); + auto done = detail::scal(select_backend(queue), queue, n, alpha, x, incx, dependencies); + scal_postcondition(queue, n, alpha, x, incx, dependencies); + return done; +} + +static inline cl::sycl::event scal( + cl::sycl::queue &queue, std::int64_t n, std::complex alpha, std::complex *x, + std::int64_t incx, const cl::sycl::vector_class &dependencies = {}) { + scal_precondition(queue, n, alpha, x, incx, dependencies); + auto done = detail::scal(select_backend(queue), queue, n, alpha, x, incx, dependencies); + scal_postcondition(queue, n, alpha, x, incx, dependencies); + return done; +} + +static inline cl::sycl::event scal( + cl::sycl::queue &queue, std::int64_t n, float alpha, std::complex *x, std::int64_t incx, + const cl::sycl::vector_class &dependencies = {}) { + scal_precondition(queue, n, alpha, x, incx, dependencies); + auto done = detail::scal(select_backend(queue), queue, n, 
alpha, x, incx, dependencies); + scal_postcondition(queue, n, alpha, x, incx, dependencies); + return done; +} + +static inline cl::sycl::event scal( + cl::sycl::queue &queue, std::int64_t n, double alpha, std::complex *x, + std::int64_t incx, const cl::sycl::vector_class &dependencies = {}) { + scal_precondition(queue, n, alpha, x, incx, dependencies); + auto done = detail::scal(select_backend(queue), queue, n, alpha, x, incx, dependencies); + scal_postcondition(queue, n, alpha, x, incx, dependencies); + return done; +} + +static inline cl::sycl::event sdsdot( + cl::sycl::queue &queue, std::int64_t n, float sb, const float *x, std::int64_t incx, + const float *y, std::int64_t incy, float *result, + const cl::sycl::vector_class &dependencies = {}) { + sdsdot_precondition(queue, n, sb, x, incx, y, incy, result, dependencies); + auto done = + detail::sdsdot(select_backend(queue), queue, n, sb, x, incx, y, incy, result, dependencies); + sdsdot_postcondition(queue, n, sb, x, incx, y, incy, result, dependencies); + return done; +} + +static inline cl::sycl::event spmv( + cl::sycl::queue &queue, uplo upper_lower, std::int64_t n, float alpha, const float *a, + const float *x, std::int64_t incx, float beta, float *y, std::int64_t incy, + const cl::sycl::vector_class &dependencies = {}) { + spmv_precondition(queue, upper_lower, n, alpha, a, x, incx, beta, y, incy, dependencies); + auto done = detail::spmv(select_backend(queue), queue, upper_lower, n, alpha, a, x, incx, beta, + y, incy, dependencies); + spmv_postcondition(queue, upper_lower, n, alpha, a, x, incx, beta, y, incy, dependencies); + return done; +} + +static inline cl::sycl::event spmv( + cl::sycl::queue &queue, uplo upper_lower, std::int64_t n, double alpha, const double *a, + const double *x, std::int64_t incx, double beta, double *y, std::int64_t incy, + const cl::sycl::vector_class &dependencies = {}) { + spmv_precondition(queue, upper_lower, n, alpha, a, x, incx, beta, y, incy, dependencies); + auto done = 
detail::spmv(select_backend(queue), queue, upper_lower, n, alpha, a, x, incx, beta, + y, incy, dependencies); + spmv_postcondition(queue, upper_lower, n, alpha, a, x, incx, beta, y, incy, dependencies); + return done; +} + +static inline cl::sycl::event spr( + cl::sycl::queue &queue, uplo upper_lower, std::int64_t n, float alpha, const float *x, + std::int64_t incx, float *a, const cl::sycl::vector_class &dependencies = {}) { + spr_precondition(queue, upper_lower, n, alpha, x, incx, a, dependencies); + auto done = + detail::spr(select_backend(queue), queue, upper_lower, n, alpha, x, incx, a, dependencies); + spr_postcondition(queue, upper_lower, n, alpha, x, incx, a, dependencies); + return done; +} + +static inline cl::sycl::event spr( + cl::sycl::queue &queue, uplo upper_lower, std::int64_t n, double alpha, const double *x, + std::int64_t incx, double *a, + const cl::sycl::vector_class &dependencies = {}) { + spr_precondition(queue, upper_lower, n, alpha, x, incx, a, dependencies); + auto done = + detail::spr(select_backend(queue), queue, upper_lower, n, alpha, x, incx, a, dependencies); + spr_postcondition(queue, upper_lower, n, alpha, x, incx, a, dependencies); + return done; +} + +static inline cl::sycl::event spr2( + cl::sycl::queue &queue, uplo upper_lower, std::int64_t n, float alpha, const float *x, + std::int64_t incx, const float *y, std::int64_t incy, float *a, + const cl::sycl::vector_class &dependencies = {}) { + spr2_precondition(queue, upper_lower, n, alpha, x, incx, y, incy, a, dependencies); + auto done = detail::spr2(select_backend(queue), queue, upper_lower, n, alpha, x, incx, y, incy, + a, dependencies); + spr2_postcondition(queue, upper_lower, n, alpha, x, incx, y, incy, a, dependencies); + return done; +} + +static inline cl::sycl::event spr2( + cl::sycl::queue &queue, uplo upper_lower, std::int64_t n, double alpha, const double *x, + std::int64_t incx, const double *y, std::int64_t incy, double *a, + const cl::sycl::vector_class 
&dependencies = {}) { + spr2_precondition(queue, upper_lower, n, alpha, x, incx, y, incy, a, dependencies); + auto done = detail::spr2(select_backend(queue), queue, upper_lower, n, alpha, x, incx, y, incy, + a, dependencies); + spr2_postcondition(queue, upper_lower, n, alpha, x, incx, y, incy, a, dependencies); + return done; +} + +static inline cl::sycl::event swap( + cl::sycl::queue &queue, std::int64_t n, float *x, std::int64_t incx, float *y, + std::int64_t incy, const cl::sycl::vector_class &dependencies = {}) { + swap_precondition(queue, n, x, incx, y, incy, dependencies); + auto done = detail::swap(select_backend(queue), queue, n, x, incx, y, incy, dependencies); + swap_postcondition(queue, n, x, incx, y, incy, dependencies); + return done; +} + +static inline cl::sycl::event swap( + cl::sycl::queue &queue, std::int64_t n, double *x, std::int64_t incx, double *y, + std::int64_t incy, const cl::sycl::vector_class &dependencies = {}) { + swap_precondition(queue, n, x, incx, y, incy, dependencies); + auto done = detail::swap(select_backend(queue), queue, n, x, incx, y, incy, dependencies); + swap_postcondition(queue, n, x, incx, y, incy, dependencies); + return done; +} + +static inline cl::sycl::event swap( + cl::sycl::queue &queue, std::int64_t n, std::complex *x, std::int64_t incx, + std::complex *y, std::int64_t incy, + const cl::sycl::vector_class &dependencies = {}) { + swap_precondition(queue, n, x, incx, y, incy, dependencies); + auto done = detail::swap(select_backend(queue), queue, n, x, incx, y, incy, dependencies); + swap_postcondition(queue, n, x, incx, y, incy, dependencies); + return done; +} + +static inline cl::sycl::event swap( + cl::sycl::queue &queue, std::int64_t n, std::complex *x, std::int64_t incx, + std::complex *y, std::int64_t incy, + const cl::sycl::vector_class &dependencies = {}) { + swap_precondition(queue, n, x, incx, y, incy, dependencies); + auto done = detail::swap(select_backend(queue), queue, n, x, incx, y, incy, 
dependencies); + swap_postcondition(queue, n, x, incx, y, incy, dependencies); + return done; +} + +static inline cl::sycl::event symm( + cl::sycl::queue &queue, side left_right, uplo upper_lower, std::int64_t m, std::int64_t n, + float alpha, const float *a, std::int64_t lda, const float *b, std::int64_t ldb, float beta, + float *c, std::int64_t ldc, const cl::sycl::vector_class &dependencies = {}) { + symm_precondition(queue, left_right, upper_lower, m, n, alpha, a, lda, b, ldb, beta, c, ldc, + dependencies); + auto done = detail::symm(select_backend(queue), queue, left_right, upper_lower, m, n, alpha, a, + lda, b, ldb, beta, c, ldc, dependencies); + symm_postcondition(queue, left_right, upper_lower, m, n, alpha, a, lda, b, ldb, beta, c, ldc, + dependencies); + return done; +} + +static inline cl::sycl::event symm( + cl::sycl::queue &queue, side left_right, uplo upper_lower, std::int64_t m, std::int64_t n, + double alpha, const double *a, std::int64_t lda, const double *b, std::int64_t ldb, double beta, + double *c, std::int64_t ldc, const cl::sycl::vector_class &dependencies = {}) { + symm_precondition(queue, left_right, upper_lower, m, n, alpha, a, lda, b, ldb, beta, c, ldc, + dependencies); + auto done = detail::symm(select_backend(queue), queue, left_right, upper_lower, m, n, alpha, a, + lda, b, ldb, beta, c, ldc, dependencies); + symm_postcondition(queue, left_right, upper_lower, m, n, alpha, a, lda, b, ldb, beta, c, ldc, + dependencies); + return done; +} + +static inline cl::sycl::event symm( + cl::sycl::queue &queue, side left_right, uplo upper_lower, std::int64_t m, std::int64_t n, + std::complex alpha, const std::complex *a, std::int64_t lda, + const std::complex *b, std::int64_t ldb, std::complex beta, + std::complex *c, std::int64_t ldc, + const cl::sycl::vector_class &dependencies = {}) { + symm_precondition(queue, left_right, upper_lower, m, n, alpha, a, lda, b, ldb, beta, c, ldc, + dependencies); + auto done = detail::symm(select_backend(queue), 
queue, left_right, upper_lower, m, n, alpha, a, + lda, b, ldb, beta, c, ldc, dependencies); + symm_postcondition(queue, left_right, upper_lower, m, n, alpha, a, lda, b, ldb, beta, c, ldc, + dependencies); + return done; +} + +static inline cl::sycl::event symm( + cl::sycl::queue &queue, side left_right, uplo upper_lower, std::int64_t m, std::int64_t n, + std::complex alpha, const std::complex *a, std::int64_t lda, + const std::complex *b, std::int64_t ldb, std::complex beta, + std::complex *c, std::int64_t ldc, + const cl::sycl::vector_class &dependencies = {}) { + symm_precondition(queue, left_right, upper_lower, m, n, alpha, a, lda, b, ldb, beta, c, ldc, + dependencies); + auto done = detail::symm(select_backend(queue), queue, left_right, upper_lower, m, n, alpha, a, + lda, b, ldb, beta, c, ldc, dependencies); + symm_postcondition(queue, left_right, upper_lower, m, n, alpha, a, lda, b, ldb, beta, c, ldc, + dependencies); + return done; +} + +static inline cl::sycl::event symv( + cl::sycl::queue &queue, uplo upper_lower, std::int64_t n, float alpha, const float *a, + std::int64_t lda, const float *x, std::int64_t incx, float beta, float *y, std::int64_t incy, + const cl::sycl::vector_class &dependencies = {}) { + symv_precondition(queue, upper_lower, n, alpha, a, lda, x, incx, beta, y, incy, dependencies); + auto done = detail::symv(select_backend(queue), queue, upper_lower, n, alpha, a, lda, x, incx, + beta, y, incy, dependencies); + symv_postcondition(queue, upper_lower, n, alpha, a, lda, x, incx, beta, y, incy, dependencies); + return done; +} + +static inline cl::sycl::event symv( + cl::sycl::queue &queue, uplo upper_lower, std::int64_t n, double alpha, const double *a, + std::int64_t lda, const double *x, std::int64_t incx, double beta, double *y, std::int64_t incy, + const cl::sycl::vector_class &dependencies = {}) { + symv_precondition(queue, upper_lower, n, alpha, a, lda, x, incx, beta, y, incy, dependencies); + auto done = 
detail::symv(select_backend(queue), queue, upper_lower, n, alpha, a, lda, x, incx, + beta, y, incy, dependencies); + symv_postcondition(queue, upper_lower, n, alpha, a, lda, x, incx, beta, y, incy, dependencies); + return done; +} + +static inline cl::sycl::event syr( + cl::sycl::queue &queue, uplo upper_lower, std::int64_t n, float alpha, const float *x, + std::int64_t incx, float *a, std::int64_t lda, + const cl::sycl::vector_class &dependencies = {}) { + syr_precondition(queue, upper_lower, n, alpha, x, incx, a, lda, dependencies); + auto done = detail::syr(select_backend(queue), queue, upper_lower, n, alpha, x, incx, a, lda, + dependencies); + syr_postcondition(queue, upper_lower, n, alpha, x, incx, a, lda, dependencies); + return done; +} + +static inline cl::sycl::event syr( + cl::sycl::queue &queue, uplo upper_lower, std::int64_t n, double alpha, const double *x, + std::int64_t incx, double *a, std::int64_t lda, + const cl::sycl::vector_class &dependencies = {}) { + syr_precondition(queue, upper_lower, n, alpha, x, incx, a, lda, dependencies); + auto done = detail::syr(select_backend(queue), queue, upper_lower, n, alpha, x, incx, a, lda, + dependencies); + syr_postcondition(queue, upper_lower, n, alpha, x, incx, a, lda, dependencies); + return done; +} + +static inline cl::sycl::event syr2( + cl::sycl::queue &queue, uplo upper_lower, std::int64_t n, float alpha, const float *x, + std::int64_t incx, const float *y, std::int64_t incy, float *a, std::int64_t lda, + const cl::sycl::vector_class &dependencies = {}) { + syr2_precondition(queue, upper_lower, n, alpha, x, incx, y, incy, a, lda, dependencies); + auto done = detail::syr2(select_backend(queue), queue, upper_lower, n, alpha, x, incx, y, incy, + a, lda, dependencies); + syr2_postcondition(queue, upper_lower, n, alpha, x, incx, y, incy, a, lda, dependencies); + return done; +} + +static inline cl::sycl::event syr2( + cl::sycl::queue &queue, uplo upper_lower, std::int64_t n, double alpha, const double *x, 
+ std::int64_t incx, const double *y, std::int64_t incy, double *a, std::int64_t lda, + const cl::sycl::vector_class &dependencies = {}) { + syr2_precondition(queue, upper_lower, n, alpha, x, incx, y, incy, a, lda, dependencies); + auto done = detail::syr2(select_backend(queue), queue, upper_lower, n, alpha, x, incx, y, incy, + a, lda, dependencies); + syr2_postcondition(queue, upper_lower, n, alpha, x, incx, y, incy, a, lda, dependencies); + return done; +} + +static inline cl::sycl::event syr2k( + cl::sycl::queue &queue, uplo upper_lower, transpose trans, std::int64_t n, std::int64_t k, + float alpha, const float *a, std::int64_t lda, const float *b, std::int64_t ldb, float beta, + float *c, std::int64_t ldc, const cl::sycl::vector_class &dependencies = {}) { + syr2k_precondition(queue, upper_lower, trans, n, k, alpha, a, lda, b, ldb, beta, c, ldc, + dependencies); + auto done = detail::syr2k(select_backend(queue), queue, upper_lower, trans, n, k, alpha, a, lda, + b, ldb, beta, c, ldc, dependencies); + syr2k_postcondition(queue, upper_lower, trans, n, k, alpha, a, lda, b, ldb, beta, c, ldc, + dependencies); + return done; +} + +static inline cl::sycl::event syr2k( + cl::sycl::queue &queue, uplo upper_lower, transpose trans, std::int64_t n, std::int64_t k, + double alpha, const double *a, std::int64_t lda, const double *b, std::int64_t ldb, double beta, + double *c, std::int64_t ldc, const cl::sycl::vector_class &dependencies = {}) { + syr2k_precondition(queue, upper_lower, trans, n, k, alpha, a, lda, b, ldb, beta, c, ldc, + dependencies); + auto done = detail::syr2k(select_backend(queue), queue, upper_lower, trans, n, k, alpha, a, lda, + b, ldb, beta, c, ldc, dependencies); + syr2k_postcondition(queue, upper_lower, trans, n, k, alpha, a, lda, b, ldb, beta, c, ldc, + dependencies); + return done; +} + +static inline cl::sycl::event syr2k( + cl::sycl::queue &queue, uplo upper_lower, transpose trans, std::int64_t n, std::int64_t k, + std::complex alpha, const 
std::complex *a, std::int64_t lda, + const std::complex *b, std::int64_t ldb, std::complex beta, + std::complex *c, std::int64_t ldc, + const cl::sycl::vector_class &dependencies = {}) { + syr2k_precondition(queue, upper_lower, trans, n, k, alpha, a, lda, b, ldb, beta, c, ldc, + dependencies); + auto done = detail::syr2k(select_backend(queue), queue, upper_lower, trans, n, k, alpha, a, lda, + b, ldb, beta, c, ldc, dependencies); + syr2k_postcondition(queue, upper_lower, trans, n, k, alpha, a, lda, b, ldb, beta, c, ldc, + dependencies); + return done; +} + +static inline cl::sycl::event syr2k( + cl::sycl::queue &queue, uplo upper_lower, transpose trans, std::int64_t n, std::int64_t k, + std::complex alpha, const std::complex *a, std::int64_t lda, + const std::complex *b, std::int64_t ldb, std::complex beta, + std::complex *c, std::int64_t ldc, + const cl::sycl::vector_class &dependencies = {}) { + syr2k_precondition(queue, upper_lower, trans, n, k, alpha, a, lda, b, ldb, beta, c, ldc, + dependencies); + auto done = detail::syr2k(select_backend(queue), queue, upper_lower, trans, n, k, alpha, a, lda, + b, ldb, beta, c, ldc, dependencies); + syr2k_postcondition(queue, upper_lower, trans, n, k, alpha, a, lda, b, ldb, beta, c, ldc, + dependencies); + return done; +} + +static inline cl::sycl::event syrk( + cl::sycl::queue &queue, uplo upper_lower, transpose trans, std::int64_t n, std::int64_t k, + float alpha, const float *a, std::int64_t lda, float beta, float *c, std::int64_t ldc, + const cl::sycl::vector_class &dependencies = {}) { + syrk_precondition(queue, upper_lower, trans, n, k, alpha, a, lda, beta, c, ldc, dependencies); + auto done = detail::syrk(select_backend(queue), queue, upper_lower, trans, n, k, alpha, a, lda, + beta, c, ldc, dependencies); + syrk_postcondition(queue, upper_lower, trans, n, k, alpha, a, lda, beta, c, ldc, dependencies); + return done; +} + +static inline cl::sycl::event syrk( + cl::sycl::queue &queue, uplo upper_lower, transpose trans, 
std::int64_t n, std::int64_t k, + double alpha, const double *a, std::int64_t lda, double beta, double *c, std::int64_t ldc, + const cl::sycl::vector_class &dependencies = {}) { + syrk_precondition(queue, upper_lower, trans, n, k, alpha, a, lda, beta, c, ldc, dependencies); + auto done = detail::syrk(select_backend(queue), queue, upper_lower, trans, n, k, alpha, a, lda, + beta, c, ldc, dependencies); + syrk_postcondition(queue, upper_lower, trans, n, k, alpha, a, lda, beta, c, ldc, dependencies); + return done; +} + +static inline cl::sycl::event syrk( + cl::sycl::queue &queue, uplo upper_lower, transpose trans, std::int64_t n, std::int64_t k, + std::complex alpha, const std::complex *a, std::int64_t lda, + std::complex beta, std::complex *c, std::int64_t ldc, + const cl::sycl::vector_class &dependencies = {}) { + syrk_precondition(queue, upper_lower, trans, n, k, alpha, a, lda, beta, c, ldc, dependencies); + auto done = detail::syrk(select_backend(queue), queue, upper_lower, trans, n, k, alpha, a, lda, + beta, c, ldc, dependencies); + syrk_postcondition(queue, upper_lower, trans, n, k, alpha, a, lda, beta, c, ldc, dependencies); + return done; +} + +static inline cl::sycl::event syrk( + cl::sycl::queue &queue, uplo upper_lower, transpose trans, std::int64_t n, std::int64_t k, + std::complex alpha, const std::complex *a, std::int64_t lda, + std::complex beta, std::complex *c, std::int64_t ldc, + const cl::sycl::vector_class &dependencies = {}) { + syrk_precondition(queue, upper_lower, trans, n, k, alpha, a, lda, beta, c, ldc, dependencies); + auto done = detail::syrk(select_backend(queue), queue, upper_lower, trans, n, k, alpha, a, lda, + beta, c, ldc, dependencies); + syrk_postcondition(queue, upper_lower, trans, n, k, alpha, a, lda, beta, c, ldc, dependencies); + return done; +} + +static inline cl::sycl::event tbmv( + cl::sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag, std::int64_t n, + std::int64_t k, const float *a, std::int64_t lda, 
float *x, std::int64_t incx, + const cl::sycl::vector_class &dependencies = {}) { + tbmv_precondition(queue, upper_lower, trans, unit_diag, n, k, a, lda, x, incx, dependencies); + auto done = detail::tbmv(select_backend(queue), queue, upper_lower, trans, unit_diag, n, k, a, + lda, x, incx, dependencies); + tbmv_postcondition(queue, upper_lower, trans, unit_diag, n, k, a, lda, x, incx, dependencies); + return done; +} + +static inline cl::sycl::event tbmv( + cl::sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag, std::int64_t n, + std::int64_t k, const double *a, std::int64_t lda, double *x, std::int64_t incx, + const cl::sycl::vector_class &dependencies = {}) { + tbmv_precondition(queue, upper_lower, trans, unit_diag, n, k, a, lda, x, incx, dependencies); + auto done = detail::tbmv(select_backend(queue), queue, upper_lower, trans, unit_diag, n, k, a, + lda, x, incx, dependencies); + tbmv_postcondition(queue, upper_lower, trans, unit_diag, n, k, a, lda, x, incx, dependencies); + return done; +} + +static inline cl::sycl::event tbmv( + cl::sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag, std::int64_t n, + std::int64_t k, const std::complex *a, std::int64_t lda, std::complex *x, + std::int64_t incx, const cl::sycl::vector_class &dependencies = {}) { + tbmv_precondition(queue, upper_lower, trans, unit_diag, n, k, a, lda, x, incx, dependencies); + auto done = detail::tbmv(select_backend(queue), queue, upper_lower, trans, unit_diag, n, k, a, + lda, x, incx, dependencies); + tbmv_postcondition(queue, upper_lower, trans, unit_diag, n, k, a, lda, x, incx, dependencies); + return done; +} + +static inline cl::sycl::event tbmv( + cl::sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag, std::int64_t n, + std::int64_t k, const std::complex *a, std::int64_t lda, std::complex *x, + std::int64_t incx, const cl::sycl::vector_class &dependencies = {}) { + tbmv_precondition(queue, upper_lower, trans, unit_diag, n, k, a, lda, 
x, incx, dependencies); + auto done = detail::tbmv(select_backend(queue), queue, upper_lower, trans, unit_diag, n, k, a, + lda, x, incx, dependencies); + tbmv_postcondition(queue, upper_lower, trans, unit_diag, n, k, a, lda, x, incx, dependencies); + return done; +} + +static inline cl::sycl::event tbsv( + cl::sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag, std::int64_t n, + std::int64_t k, const float *a, std::int64_t lda, float *x, std::int64_t incx, + const cl::sycl::vector_class &dependencies = {}) { + tbsv_precondition(queue, upper_lower, trans, unit_diag, n, k, a, lda, x, incx, dependencies); + auto done = detail::tbsv(select_backend(queue), queue, upper_lower, trans, unit_diag, n, k, a, + lda, x, incx, dependencies); + tbsv_postcondition(queue, upper_lower, trans, unit_diag, n, k, a, lda, x, incx, dependencies); + return done; +} + +static inline cl::sycl::event tbsv( + cl::sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag, std::int64_t n, + std::int64_t k, const double *a, std::int64_t lda, double *x, std::int64_t incx, + const cl::sycl::vector_class &dependencies = {}) { + tbsv_precondition(queue, upper_lower, trans, unit_diag, n, k, a, lda, x, incx, dependencies); + auto done = detail::tbsv(select_backend(queue), queue, upper_lower, trans, unit_diag, n, k, a, + lda, x, incx, dependencies); + tbsv_postcondition(queue, upper_lower, trans, unit_diag, n, k, a, lda, x, incx, dependencies); + return done; +} + +static inline cl::sycl::event tbsv( + cl::sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag, std::int64_t n, + std::int64_t k, const std::complex *a, std::int64_t lda, std::complex *x, + std::int64_t incx, const cl::sycl::vector_class &dependencies = {}) { + tbsv_precondition(queue, upper_lower, trans, unit_diag, n, k, a, lda, x, incx, dependencies); + auto done = detail::tbsv(select_backend(queue), queue, upper_lower, trans, unit_diag, n, k, a, + lda, x, incx, dependencies); + 
tbsv_postcondition(queue, upper_lower, trans, unit_diag, n, k, a, lda, x, incx, dependencies); + return done; +} + +static inline cl::sycl::event tbsv( + cl::sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag, std::int64_t n, + std::int64_t k, const std::complex *a, std::int64_t lda, std::complex *x, + std::int64_t incx, const cl::sycl::vector_class &dependencies = {}) { + tbsv_precondition(queue, upper_lower, trans, unit_diag, n, k, a, lda, x, incx, dependencies); + auto done = detail::tbsv(select_backend(queue), queue, upper_lower, trans, unit_diag, n, k, a, + lda, x, incx, dependencies); + tbsv_postcondition(queue, upper_lower, trans, unit_diag, n, k, a, lda, x, incx, dependencies); + return done; +} + +static inline cl::sycl::event tpmv( + cl::sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag, std::int64_t n, + const float *a, float *x, std::int64_t incx, + const cl::sycl::vector_class &dependencies = {}) { + tpmv_precondition(queue, upper_lower, trans, unit_diag, n, a, x, incx, dependencies); + auto done = detail::tpmv(select_backend(queue), queue, upper_lower, trans, unit_diag, n, a, x, + incx, dependencies); + tpmv_postcondition(queue, upper_lower, trans, unit_diag, n, a, x, incx, dependencies); + return done; +} + +static inline cl::sycl::event tpmv( + cl::sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag, std::int64_t n, + const double *a, double *x, std::int64_t incx, + const cl::sycl::vector_class &dependencies = {}) { + tpmv_precondition(queue, upper_lower, trans, unit_diag, n, a, x, incx, dependencies); + auto done = detail::tpmv(select_backend(queue), queue, upper_lower, trans, unit_diag, n, a, x, + incx, dependencies); + tpmv_postcondition(queue, upper_lower, trans, unit_diag, n, a, x, incx, dependencies); + return done; +} + +static inline cl::sycl::event tpmv( + cl::sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag, std::int64_t n, + const std::complex *a, std::complex 
*x, std::int64_t incx, + const cl::sycl::vector_class &dependencies = {}) { + tpmv_precondition(queue, upper_lower, trans, unit_diag, n, a, x, incx, dependencies); + auto done = detail::tpmv(select_backend(queue), queue, upper_lower, trans, unit_diag, n, a, x, + incx, dependencies); + tpmv_postcondition(queue, upper_lower, trans, unit_diag, n, a, x, incx, dependencies); + return done; +} + +static inline cl::sycl::event tpmv( + cl::sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag, std::int64_t n, + const std::complex *a, std::complex *x, std::int64_t incx, + const cl::sycl::vector_class &dependencies = {}) { + tpmv_precondition(queue, upper_lower, trans, unit_diag, n, a, x, incx, dependencies); + auto done = detail::tpmv(select_backend(queue), queue, upper_lower, trans, unit_diag, n, a, x, + incx, dependencies); + tpmv_postcondition(queue, upper_lower, trans, unit_diag, n, a, x, incx, dependencies); + return done; +} + +static inline cl::sycl::event tpsv( + cl::sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag, std::int64_t n, + const float *a, float *x, std::int64_t incx, + const cl::sycl::vector_class &dependencies = {}) { + tpsv_precondition(queue, upper_lower, trans, unit_diag, n, a, x, incx, dependencies); + auto done = detail::tpsv(select_backend(queue), queue, upper_lower, trans, unit_diag, n, a, x, + incx, dependencies); + tpsv_postcondition(queue, upper_lower, trans, unit_diag, n, a, x, incx, dependencies); + return done; +} + +static inline cl::sycl::event tpsv( + cl::sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag, std::int64_t n, + const double *a, double *x, std::int64_t incx, + const cl::sycl::vector_class &dependencies = {}) { + tpsv_precondition(queue, upper_lower, trans, unit_diag, n, a, x, incx, dependencies); + auto done = detail::tpsv(select_backend(queue), queue, upper_lower, trans, unit_diag, n, a, x, + incx, dependencies); + tpsv_postcondition(queue, upper_lower, trans, unit_diag, 
n, a, x, incx, dependencies); + return done; +} + +static inline cl::sycl::event tpsv( + cl::sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag, std::int64_t n, + const std::complex *a, std::complex *x, std::int64_t incx, + const cl::sycl::vector_class &dependencies = {}) { + tpsv_precondition(queue, upper_lower, trans, unit_diag, n, a, x, incx, dependencies); + auto done = detail::tpsv(select_backend(queue), queue, upper_lower, trans, unit_diag, n, a, x, + incx, dependencies); + tpsv_postcondition(queue, upper_lower, trans, unit_diag, n, a, x, incx, dependencies); + return done; +} + +static inline cl::sycl::event tpsv( + cl::sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag, std::int64_t n, + const std::complex *a, std::complex *x, std::int64_t incx, + const cl::sycl::vector_class &dependencies = {}) { + tpsv_precondition(queue, upper_lower, trans, unit_diag, n, a, x, incx, dependencies); + auto done = detail::tpsv(select_backend(queue), queue, upper_lower, trans, unit_diag, n, a, x, + incx, dependencies); + tpsv_postcondition(queue, upper_lower, trans, unit_diag, n, a, x, incx, dependencies); + return done; +} + +static inline cl::sycl::event trmm( + cl::sycl::queue &queue, side left_right, uplo upper_lower, transpose trans, diag unit_diag, + std::int64_t m, std::int64_t n, float alpha, const float *a, std::int64_t lda, float *b, + std::int64_t ldb, const cl::sycl::vector_class &dependencies = {}) { + trmm_precondition(queue, left_right, upper_lower, trans, unit_diag, m, n, alpha, a, lda, b, ldb, + dependencies); + auto done = detail::trmm(select_backend(queue), queue, left_right, upper_lower, trans, + unit_diag, m, n, alpha, a, lda, b, ldb, dependencies); + trmm_postcondition(queue, left_right, upper_lower, trans, unit_diag, m, n, alpha, a, lda, b, + ldb, dependencies); + return done; +} + +static inline cl::sycl::event trmm( + cl::sycl::queue &queue, side left_right, uplo upper_lower, transpose trans, diag unit_diag, + 
std::int64_t m, std::int64_t n, double alpha, const double *a, std::int64_t lda, double *b, + std::int64_t ldb, const cl::sycl::vector_class &dependencies = {}) { + trmm_precondition(queue, left_right, upper_lower, trans, unit_diag, m, n, alpha, a, lda, b, ldb, + dependencies); + auto done = detail::trmm(select_backend(queue), queue, left_right, upper_lower, trans, + unit_diag, m, n, alpha, a, lda, b, ldb, dependencies); + trmm_postcondition(queue, left_right, upper_lower, trans, unit_diag, m, n, alpha, a, lda, b, + ldb, dependencies); + return done; +} + +static inline cl::sycl::event trmm( + cl::sycl::queue &queue, side left_right, uplo upper_lower, transpose trans, diag unit_diag, + std::int64_t m, std::int64_t n, std::complex alpha, const std::complex *a, + std::int64_t lda, std::complex *b, std::int64_t ldb, + const cl::sycl::vector_class &dependencies = {}) { + trmm_precondition(queue, left_right, upper_lower, trans, unit_diag, m, n, alpha, a, lda, b, ldb, + dependencies); + auto done = detail::trmm(select_backend(queue), queue, left_right, upper_lower, trans, + unit_diag, m, n, alpha, a, lda, b, ldb, dependencies); + trmm_postcondition(queue, left_right, upper_lower, trans, unit_diag, m, n, alpha, a, lda, b, + ldb, dependencies); + return done; +} + +static inline cl::sycl::event trmm( + cl::sycl::queue &queue, side left_right, uplo upper_lower, transpose trans, diag unit_diag, + std::int64_t m, std::int64_t n, std::complex alpha, const std::complex *a, + std::int64_t lda, std::complex *b, std::int64_t ldb, + const cl::sycl::vector_class &dependencies = {}) { + trmm_precondition(queue, left_right, upper_lower, trans, unit_diag, m, n, alpha, a, lda, b, ldb, + dependencies); + auto done = detail::trmm(select_backend(queue), queue, left_right, upper_lower, trans, + unit_diag, m, n, alpha, a, lda, b, ldb, dependencies); + trmm_postcondition(queue, left_right, upper_lower, trans, unit_diag, m, n, alpha, a, lda, b, + ldb, dependencies); + return done; +} + +static 
inline cl::sycl::event trmv( + cl::sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag, std::int64_t n, + const float *a, std::int64_t lda, float *x, std::int64_t incx, + const cl::sycl::vector_class &dependencies = {}) { + trmv_precondition(queue, upper_lower, trans, unit_diag, n, a, lda, x, incx, dependencies); + auto done = detail::trmv(select_backend(queue), queue, upper_lower, trans, unit_diag, n, a, lda, + x, incx, dependencies); + trmv_postcondition(queue, upper_lower, trans, unit_diag, n, a, lda, x, incx, dependencies); + return done; +} + +static inline cl::sycl::event trmv( + cl::sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag, std::int64_t n, + const double *a, std::int64_t lda, double *x, std::int64_t incx, + const cl::sycl::vector_class &dependencies = {}) { + trmv_precondition(queue, upper_lower, trans, unit_diag, n, a, lda, x, incx, dependencies); + auto done = detail::trmv(select_backend(queue), queue, upper_lower, trans, unit_diag, n, a, lda, + x, incx, dependencies); + trmv_postcondition(queue, upper_lower, trans, unit_diag, n, a, lda, x, incx, dependencies); + return done; +} + +static inline cl::sycl::event trmv( + cl::sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag, std::int64_t n, + const std::complex *a, std::int64_t lda, std::complex *x, std::int64_t incx, + const cl::sycl::vector_class &dependencies = {}) { + trmv_precondition(queue, upper_lower, trans, unit_diag, n, a, lda, x, incx, dependencies); + auto done = detail::trmv(select_backend(queue), queue, upper_lower, trans, unit_diag, n, a, lda, + x, incx, dependencies); + trmv_postcondition(queue, upper_lower, trans, unit_diag, n, a, lda, x, incx, dependencies); + return done; +} + +static inline cl::sycl::event trmv( + cl::sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag, std::int64_t n, + const std::complex *a, std::int64_t lda, std::complex *x, std::int64_t incx, + const cl::sycl::vector_class 
&dependencies = {}) { + trmv_precondition(queue, upper_lower, trans, unit_diag, n, a, lda, x, incx, dependencies); + auto done = detail::trmv(select_backend(queue), queue, upper_lower, trans, unit_diag, n, a, lda, + x, incx, dependencies); + trmv_postcondition(queue, upper_lower, trans, unit_diag, n, a, lda, x, incx, dependencies); + return done; +} + +static inline cl::sycl::event trsm( + cl::sycl::queue &queue, side left_right, uplo upper_lower, transpose trans, diag unit_diag, + std::int64_t m, std::int64_t n, float alpha, const float *a, std::int64_t lda, float *b, + std::int64_t ldb, const cl::sycl::vector_class &dependencies = {}) { + trsm_precondition(queue, left_right, upper_lower, trans, unit_diag, m, n, alpha, a, lda, b, ldb, + dependencies); + auto done = detail::trsm(select_backend(queue), queue, left_right, upper_lower, trans, + unit_diag, m, n, alpha, a, lda, b, ldb, dependencies); + trsm_postcondition(queue, left_right, upper_lower, trans, unit_diag, m, n, alpha, a, lda, b, + ldb, dependencies); + return done; +} + +static inline cl::sycl::event trsm( + cl::sycl::queue &queue, side left_right, uplo upper_lower, transpose trans, diag unit_diag, + std::int64_t m, std::int64_t n, double alpha, const double *a, std::int64_t lda, double *b, + std::int64_t ldb, const cl::sycl::vector_class &dependencies = {}) { + trsm_precondition(queue, left_right, upper_lower, trans, unit_diag, m, n, alpha, a, lda, b, ldb, + dependencies); + auto done = detail::trsm(select_backend(queue), queue, left_right, upper_lower, trans, + unit_diag, m, n, alpha, a, lda, b, ldb, dependencies); + trsm_postcondition(queue, left_right, upper_lower, trans, unit_diag, m, n, alpha, a, lda, b, + ldb, dependencies); + return done; +} + +static inline cl::sycl::event trsm( + cl::sycl::queue &queue, side left_right, uplo upper_lower, transpose trans, diag unit_diag, + std::int64_t m, std::int64_t n, std::complex alpha, const std::complex *a, + std::int64_t lda, std::complex *b, std::int64_t 
ldb, + const cl::sycl::vector_class &dependencies = {}) { + trsm_precondition(queue, left_right, upper_lower, trans, unit_diag, m, n, alpha, a, lda, b, ldb, + dependencies); + auto done = detail::trsm(select_backend(queue), queue, left_right, upper_lower, trans, + unit_diag, m, n, alpha, a, lda, b, ldb, dependencies); + trsm_postcondition(queue, left_right, upper_lower, trans, unit_diag, m, n, alpha, a, lda, b, + ldb, dependencies); + return done; +} + +static inline cl::sycl::event trsm( + cl::sycl::queue &queue, side left_right, uplo upper_lower, transpose trans, diag unit_diag, + std::int64_t m, std::int64_t n, std::complex alpha, const std::complex *a, + std::int64_t lda, std::complex *b, std::int64_t ldb, + const cl::sycl::vector_class &dependencies = {}) { + trsm_precondition(queue, left_right, upper_lower, trans, unit_diag, m, n, alpha, a, lda, b, ldb, + dependencies); + auto done = detail::trsm(select_backend(queue), queue, left_right, upper_lower, trans, + unit_diag, m, n, alpha, a, lda, b, ldb, dependencies); + trsm_postcondition(queue, left_right, upper_lower, trans, unit_diag, m, n, alpha, a, lda, b, + ldb, dependencies); + return done; +} + +static inline cl::sycl::event trsv( + cl::sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag, std::int64_t n, + const float *a, std::int64_t lda, float *x, std::int64_t incx, + const cl::sycl::vector_class &dependencies = {}) { + trsv_precondition(queue, upper_lower, trans, unit_diag, n, a, lda, x, incx, dependencies); + auto done = detail::trsv(select_backend(queue), queue, upper_lower, trans, unit_diag, n, a, lda, + x, incx, dependencies); + trsv_postcondition(queue, upper_lower, trans, unit_diag, n, a, lda, x, incx, dependencies); + return done; +} + +static inline cl::sycl::event trsv( + cl::sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag, std::int64_t n, + const double *a, std::int64_t lda, double *x, std::int64_t incx, + const cl::sycl::vector_class &dependencies = {}) 
{ + trsv_precondition(queue, upper_lower, trans, unit_diag, n, a, lda, x, incx, dependencies); + auto done = detail::trsv(select_backend(queue), queue, upper_lower, trans, unit_diag, n, a, lda, + x, incx, dependencies); + trsv_postcondition(queue, upper_lower, trans, unit_diag, n, a, lda, x, incx, dependencies); + return done; +} + +static inline cl::sycl::event trsv( + cl::sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag, std::int64_t n, + const std::complex *a, std::int64_t lda, std::complex *x, std::int64_t incx, + const cl::sycl::vector_class &dependencies = {}) { + trsv_precondition(queue, upper_lower, trans, unit_diag, n, a, lda, x, incx, dependencies); + auto done = detail::trsv(select_backend(queue), queue, upper_lower, trans, unit_diag, n, a, lda, + x, incx, dependencies); + trsv_postcondition(queue, upper_lower, trans, unit_diag, n, a, lda, x, incx, dependencies); + return done; +} + +static inline cl::sycl::event trsv( + cl::sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag, std::int64_t n, + const std::complex *a, std::int64_t lda, std::complex *x, std::int64_t incx, + const cl::sycl::vector_class &dependencies = {}) { + trsv_precondition(queue, upper_lower, trans, unit_diag, n, a, lda, x, incx, dependencies); + auto done = detail::trsv(select_backend(queue), queue, upper_lower, trans, unit_diag, n, a, lda, + x, incx, dependencies); + trsv_postcondition(queue, upper_lower, trans, unit_diag, n, a, lda, x, incx, dependencies); + return done; +} + } //namespace blas } //namespace onemkl diff --git a/include/onemkl/blas/detail/blas_ct_templates.hpp b/include/onemkl/blas/detail/blas_ct_templates.hpp new file mode 100644 index 000000000..765dcd788 --- /dev/null +++ b/include/onemkl/blas/detail/blas_ct_templates.hpp @@ -0,0 +1,2089 @@ +/******************************************************************************* +* Copyright 2020 Intel Corporation +* +* Licensed under the Apache License, Version 2.0 (the "License"); 
+* you may not use this file except in compliance with the License. +* You may obtain a copy of the License at +* +* http://www.apache.org/licenses/LICENSE-2.0 +* +* Unless required by applicable law or agreed to in writing, +* software distributed under the License is distributed on an "AS IS" BASIS, +* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +* See the License for the specific language governing permissions +* and limitations under the License. +* +* +* SPDX-License-Identifier: Apache-2.0 +*******************************************************************************/ + +// +// Generated based on onemkl/blas/blas.hpp +// + +#ifndef _DETAIL_COMMON_BLAS_HPP__ +#define _DETAIL_COMMON_BLAS_HPP__ + +#include +#include + +#include "onemkl/detail/backends.hpp" +#include "onemkl/detail/libraries.hpp" +#include "onemkl/types.hpp" + +namespace onemkl { +namespace blas { + +// Buffer APIs + +template +static inline void syr2(cl::sycl::queue &queue, uplo upper_lower, std::int64_t n, float alpha, + cl::sycl::buffer &x, std::int64_t incx, + cl::sycl::buffer &y, std::int64_t incy, + cl::sycl::buffer &a, std::int64_t lda); + +template +static inline void syr2(cl::sycl::queue &queue, uplo upper_lower, std::int64_t n, double alpha, + cl::sycl::buffer &x, std::int64_t incx, + cl::sycl::buffer &y, std::int64_t incy, + cl::sycl::buffer &a, std::int64_t lda); + +template +static inline void scal(cl::sycl::queue &queue, std::int64_t n, float alpha, + cl::sycl::buffer &x, std::int64_t incx); + +template +static inline void scal(cl::sycl::queue &queue, std::int64_t n, double alpha, + cl::sycl::buffer &x, std::int64_t incx); + +template +static inline void scal(cl::sycl::queue &queue, std::int64_t n, std::complex alpha, + cl::sycl::buffer, 1> &x, std::int64_t incx); + +template +static inline void scal(cl::sycl::queue &queue, std::int64_t n, std::complex alpha, + cl::sycl::buffer, 1> &x, std::int64_t incx); + +template +static inline void 
scal(cl::sycl::queue &queue, std::int64_t n, float alpha, + cl::sycl::buffer, 1> &x, std::int64_t incx); + +template +static inline void scal(cl::sycl::queue &queue, std::int64_t n, double alpha, + cl::sycl::buffer, 1> &x, std::int64_t incx); + +template +static inline void trmv(cl::sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag, + std::int64_t n, cl::sycl::buffer &a, std::int64_t lda, + cl::sycl::buffer &x, std::int64_t incx); + +template +static inline void trmv(cl::sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag, + std::int64_t n, cl::sycl::buffer &a, std::int64_t lda, + cl::sycl::buffer &x, std::int64_t incx); + +template +static inline void trmv(cl::sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag, + std::int64_t n, cl::sycl::buffer, 1> &a, + std::int64_t lda, cl::sycl::buffer, 1> &x, + std::int64_t incx); + +template +static inline void trmv(cl::sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag, + std::int64_t n, cl::sycl::buffer, 1> &a, + std::int64_t lda, cl::sycl::buffer, 1> &x, + std::int64_t incx); + +template +static inline void tpmv(cl::sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag, + std::int64_t n, cl::sycl::buffer &a, + cl::sycl::buffer &x, std::int64_t incx); + +template +static inline void tpmv(cl::sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag, + std::int64_t n, cl::sycl::buffer &a, + cl::sycl::buffer &x, std::int64_t incx); + +template +static inline void tpmv(cl::sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag, + std::int64_t n, cl::sycl::buffer, 1> &a, + cl::sycl::buffer, 1> &x, std::int64_t incx); + +template +static inline void tpmv(cl::sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag, + std::int64_t n, cl::sycl::buffer, 1> &a, + cl::sycl::buffer, 1> &x, std::int64_t incx); + +template +static inline void spr(cl::sycl::queue &queue, uplo upper_lower, std::int64_t n, 
float alpha, + cl::sycl::buffer &x, std::int64_t incx, + cl::sycl::buffer &a); + +template +static inline void spr(cl::sycl::queue &queue, uplo upper_lower, std::int64_t n, double alpha, + cl::sycl::buffer &x, std::int64_t incx, + cl::sycl::buffer &a); + +template +static inline void hpmv(cl::sycl::queue &queue, uplo upper_lower, std::int64_t n, + std::complex alpha, cl::sycl::buffer, 1> &a, + cl::sycl::buffer, 1> &x, std::int64_t incx, + std::complex beta, cl::sycl::buffer, 1> &y, + std::int64_t incy); + +template +static inline void hpmv(cl::sycl::queue &queue, uplo upper_lower, std::int64_t n, + std::complex alpha, cl::sycl::buffer, 1> &a, + cl::sycl::buffer, 1> &x, std::int64_t incx, + std::complex beta, cl::sycl::buffer, 1> &y, + std::int64_t incy); + +template +static inline void syrk(cl::sycl::queue &queue, uplo upper_lower, transpose trans, std::int64_t n, + std::int64_t k, float alpha, cl::sycl::buffer &a, + std::int64_t lda, float beta, cl::sycl::buffer &c, + std::int64_t ldc); + +template +static inline void syrk(cl::sycl::queue &queue, uplo upper_lower, transpose trans, std::int64_t n, + std::int64_t k, double alpha, cl::sycl::buffer &a, + std::int64_t lda, double beta, cl::sycl::buffer &c, + std::int64_t ldc); + +template +static inline void syrk(cl::sycl::queue &queue, uplo upper_lower, transpose trans, std::int64_t n, + std::int64_t k, std::complex alpha, + cl::sycl::buffer, 1> &a, std::int64_t lda, + std::complex beta, cl::sycl::buffer, 1> &c, + std::int64_t ldc); + +template +static inline void syrk(cl::sycl::queue &queue, uplo upper_lower, transpose trans, std::int64_t n, + std::int64_t k, std::complex alpha, + cl::sycl::buffer, 1> &a, std::int64_t lda, + std::complex beta, cl::sycl::buffer, 1> &c, + std::int64_t ldc); + +template +static inline void her2(cl::sycl::queue &queue, uplo upper_lower, std::int64_t n, + std::complex alpha, cl::sycl::buffer, 1> &x, + std::int64_t incx, cl::sycl::buffer, 1> &y, + std::int64_t incy, cl::sycl::buffer, 1> 
&a, + std::int64_t lda); + +template +static inline void her2(cl::sycl::queue &queue, uplo upper_lower, std::int64_t n, + std::complex alpha, cl::sycl::buffer, 1> &x, + std::int64_t incx, cl::sycl::buffer, 1> &y, + std::int64_t incy, cl::sycl::buffer, 1> &a, + std::int64_t lda); + +template +static inline void hbmv(cl::sycl::queue &queue, uplo upper_lower, std::int64_t n, std::int64_t k, + std::complex alpha, cl::sycl::buffer, 1> &a, + std::int64_t lda, cl::sycl::buffer, 1> &x, + std::int64_t incx, std::complex beta, + cl::sycl::buffer, 1> &y, std::int64_t incy); + +template +static inline void hbmv(cl::sycl::queue &queue, uplo upper_lower, std::int64_t n, std::int64_t k, + std::complex alpha, cl::sycl::buffer, 1> &a, + std::int64_t lda, cl::sycl::buffer, 1> &x, + std::int64_t incx, std::complex beta, + cl::sycl::buffer, 1> &y, std::int64_t incy); + +template +static inline void rot(cl::sycl::queue &queue, std::int64_t n, + cl::sycl::buffer, 1> &x, std::int64_t incx, + cl::sycl::buffer, 1> &y, std::int64_t incy, float c, + float s); + +template +static inline void rot(cl::sycl::queue &queue, std::int64_t n, + cl::sycl::buffer, 1> &x, std::int64_t incx, + cl::sycl::buffer, 1> &y, std::int64_t incy, double c, + double s); + +template +static inline void rot(cl::sycl::queue &queue, std::int64_t n, cl::sycl::buffer &x, + std::int64_t incx, cl::sycl::buffer &y, std::int64_t incy, float c, + float s); + +template +static inline void rot(cl::sycl::queue &queue, std::int64_t n, cl::sycl::buffer &x, + std::int64_t incx, cl::sycl::buffer &y, std::int64_t incy, + double c, double s); + +template +static inline void axpy(cl::sycl::queue &queue, std::int64_t n, float alpha, + cl::sycl::buffer &x, std::int64_t incx, + cl::sycl::buffer &y, std::int64_t incy); + +template +static inline void axpy(cl::sycl::queue &queue, std::int64_t n, double alpha, + cl::sycl::buffer &x, std::int64_t incx, + cl::sycl::buffer &y, std::int64_t incy); + +template +static inline void 
axpy(cl::sycl::queue &queue, std::int64_t n, std::complex alpha, + cl::sycl::buffer, 1> &x, std::int64_t incx, + cl::sycl::buffer, 1> &y, std::int64_t incy); + +template +static inline void axpy(cl::sycl::queue &queue, std::int64_t n, std::complex alpha, + cl::sycl::buffer, 1> &x, std::int64_t incx, + cl::sycl::buffer, 1> &y, std::int64_t incy); + +template +static inline void gerc(cl::sycl::queue &queue, std::int64_t m, std::int64_t n, + std::complex alpha, cl::sycl::buffer, 1> &x, + std::int64_t incx, cl::sycl::buffer, 1> &y, + std::int64_t incy, cl::sycl::buffer, 1> &a, + std::int64_t lda); + +template +static inline void gerc(cl::sycl::queue &queue, std::int64_t m, std::int64_t n, + std::complex alpha, cl::sycl::buffer, 1> &x, + std::int64_t incx, cl::sycl::buffer, 1> &y, + std::int64_t incy, cl::sycl::buffer, 1> &a, + std::int64_t lda); + +template +static inline void syr2k(cl::sycl::queue &queue, uplo upper_lower, transpose trans, std::int64_t n, + std::int64_t k, float alpha, cl::sycl::buffer &a, + std::int64_t lda, cl::sycl::buffer &b, std::int64_t ldb, + float beta, cl::sycl::buffer &c, std::int64_t ldc); + +template +static inline void syr2k(cl::sycl::queue &queue, uplo upper_lower, transpose trans, std::int64_t n, + std::int64_t k, double alpha, cl::sycl::buffer &a, + std::int64_t lda, cl::sycl::buffer &b, std::int64_t ldb, + double beta, cl::sycl::buffer &c, std::int64_t ldc); + +template +static inline void syr2k(cl::sycl::queue &queue, uplo upper_lower, transpose trans, std::int64_t n, + std::int64_t k, std::complex alpha, + cl::sycl::buffer, 1> &a, std::int64_t lda, + cl::sycl::buffer, 1> &b, std::int64_t ldb, + std::complex beta, cl::sycl::buffer, 1> &c, + std::int64_t ldc); + +template +static inline void syr2k(cl::sycl::queue &queue, uplo upper_lower, transpose trans, std::int64_t n, + std::int64_t k, std::complex alpha, + cl::sycl::buffer, 1> &a, std::int64_t lda, + cl::sycl::buffer, 1> &b, std::int64_t ldb, + std::complex beta, cl::sycl::buffer, 
1> &c, + std::int64_t ldc); + +template +static inline void gemv(cl::sycl::queue &queue, transpose trans, std::int64_t m, std::int64_t n, + float alpha, cl::sycl::buffer &a, std::int64_t lda, + cl::sycl::buffer &x, std::int64_t incx, float beta, + cl::sycl::buffer &y, std::int64_t incy); + +template +static inline void gemv(cl::sycl::queue &queue, transpose trans, std::int64_t m, std::int64_t n, + double alpha, cl::sycl::buffer &a, std::int64_t lda, + cl::sycl::buffer &x, std::int64_t incx, double beta, + cl::sycl::buffer &y, std::int64_t incy); + +template +static inline void gemv(cl::sycl::queue &queue, transpose trans, std::int64_t m, std::int64_t n, + std::complex alpha, cl::sycl::buffer, 1> &a, + std::int64_t lda, cl::sycl::buffer, 1> &x, + std::int64_t incx, std::complex beta, + cl::sycl::buffer, 1> &y, std::int64_t incy); + +template +static inline void gemv(cl::sycl::queue &queue, transpose trans, std::int64_t m, std::int64_t n, + std::complex alpha, cl::sycl::buffer, 1> &a, + std::int64_t lda, cl::sycl::buffer, 1> &x, + std::int64_t incx, std::complex beta, + cl::sycl::buffer, 1> &y, std::int64_t incy); + +template +static inline void her(cl::sycl::queue &queue, uplo upper_lower, std::int64_t n, float alpha, + cl::sycl::buffer, 1> &x, std::int64_t incx, + cl::sycl::buffer, 1> &a, std::int64_t lda); + +template +static inline void her(cl::sycl::queue &queue, uplo upper_lower, std::int64_t n, double alpha, + cl::sycl::buffer, 1> &x, std::int64_t incx, + cl::sycl::buffer, 1> &a, std::int64_t lda); + +template +static inline void hpr(cl::sycl::queue &queue, uplo upper_lower, std::int64_t n, float alpha, + cl::sycl::buffer, 1> &x, std::int64_t incx, + cl::sycl::buffer, 1> &a); + +template +static inline void hpr(cl::sycl::queue &queue, uplo upper_lower, std::int64_t n, double alpha, + cl::sycl::buffer, 1> &x, std::int64_t incx, + cl::sycl::buffer, 1> &a); + +template +static inline void iamin(cl::sycl::queue &queue, std::int64_t n, cl::sycl::buffer &x, + 
std::int64_t incx, cl::sycl::buffer &result); + +template +static inline void iamin(cl::sycl::queue &queue, std::int64_t n, cl::sycl::buffer &x, + std::int64_t incx, cl::sycl::buffer &result); + +template +static inline void iamin(cl::sycl::queue &queue, std::int64_t n, + cl::sycl::buffer, 1> &x, std::int64_t incx, + cl::sycl::buffer &result); + +template +static inline void iamin(cl::sycl::queue &queue, std::int64_t n, + cl::sycl::buffer, 1> &x, std::int64_t incx, + cl::sycl::buffer &result); + +template +static inline void gemm_batch(cl::sycl::queue &queue, transpose transa, transpose transb, + std::int64_t m, std::int64_t n, std::int64_t k, float alpha, + cl::sycl::buffer &a, std::int64_t lda, + std::int64_t stride_a, cl::sycl::buffer &b, + std::int64_t ldb, std::int64_t stride_b, float beta, + cl::sycl::buffer &c, std::int64_t ldc, + std::int64_t stride_c, std::int64_t batch_size); + +template +static inline void gemm_batch(cl::sycl::queue &queue, transpose transa, transpose transb, + std::int64_t m, std::int64_t n, std::int64_t k, double alpha, + cl::sycl::buffer &a, std::int64_t lda, + std::int64_t stride_a, cl::sycl::buffer &b, + std::int64_t ldb, std::int64_t stride_b, double beta, + cl::sycl::buffer &c, std::int64_t ldc, + std::int64_t stride_c, std::int64_t batch_size); + +template +static inline void gemm_batch(cl::sycl::queue &queue, transpose transa, transpose transb, + std::int64_t m, std::int64_t n, std::int64_t k, + std::complex alpha, + cl::sycl::buffer, 1> &a, std::int64_t lda, + std::int64_t stride_a, cl::sycl::buffer, 1> &b, + std::int64_t ldb, std::int64_t stride_b, std::complex beta, + cl::sycl::buffer, 1> &c, std::int64_t ldc, + std::int64_t stride_c, std::int64_t batch_size); + +template +static inline void gemm_batch(cl::sycl::queue &queue, transpose transa, transpose transb, + std::int64_t m, std::int64_t n, std::int64_t k, + std::complex alpha, + cl::sycl::buffer, 1> &a, std::int64_t lda, + std::int64_t stride_a, cl::sycl::buffer, 1> &b, 
+ std::int64_t ldb, std::int64_t stride_b, std::complex beta, + cl::sycl::buffer, 1> &c, std::int64_t ldc, + std::int64_t stride_c, std::int64_t batch_size); + +template +static inline void spmv(cl::sycl::queue &queue, uplo upper_lower, std::int64_t n, float alpha, + cl::sycl::buffer &a, cl::sycl::buffer &x, + std::int64_t incx, float beta, cl::sycl::buffer &y, + std::int64_t incy); + +template +static inline void spmv(cl::sycl::queue &queue, uplo upper_lower, std::int64_t n, double alpha, + cl::sycl::buffer &a, cl::sycl::buffer &x, + std::int64_t incx, double beta, cl::sycl::buffer &y, + std::int64_t incy); + +template +static inline void gemm_ext(cl::sycl::queue &queue, transpose transa, transpose transb, + std::int64_t m, std::int64_t n, std::int64_t k, float alpha, + cl::sycl::buffer &a, std::int64_t lda, + cl::sycl::buffer &b, std::int64_t ldb, float beta, + cl::sycl::buffer &c, std::int64_t ldc); + +template +static inline void gemm_ext(cl::sycl::queue &queue, transpose transa, transpose transb, + offset offsetc, std::int64_t m, std::int64_t n, std::int64_t k, + float alpha, cl::sycl::buffer &a, std::int64_t lda, + int8_t ao, cl::sycl::buffer &b, std::int64_t ldb, + uint8_t bo, float beta, cl::sycl::buffer &c, + std::int64_t ldc, cl::sycl::buffer &co); + +template +static inline void gemm_ext(cl::sycl::queue &queue, transpose transa, transpose transb, + std::int64_t m, std::int64_t n, std::int64_t k, float alpha, + cl::sycl::buffer &a, std::int64_t lda, + cl::sycl::buffer &b, std::int64_t ldb, float beta, + cl::sycl::buffer &c, std::int64_t ldc); + +template +static inline void gemm_ext(cl::sycl::queue &queue, transpose transa, transpose transb, + std::int64_t m, std::int64_t n, std::int64_t k, double alpha, + cl::sycl::buffer &a, std::int64_t lda, + cl::sycl::buffer &b, std::int64_t ldb, double beta, + cl::sycl::buffer &c, std::int64_t ldc); + +template +static inline void gemm_ext(cl::sycl::queue &queue, transpose transa, transpose transb, + std::int64_t m, 
std::int64_t n, std::int64_t k, + std::complex alpha, cl::sycl::buffer, 1> &a, + std::int64_t lda, cl::sycl::buffer, 1> &b, + std::int64_t ldb, std::complex beta, + cl::sycl::buffer, 1> &c, std::int64_t ldc); + +template +static inline void gemm_ext(cl::sycl::queue &queue, transpose transa, transpose transb, + std::int64_t m, std::int64_t n, std::int64_t k, + std::complex alpha, + cl::sycl::buffer, 1> &a, std::int64_t lda, + cl::sycl::buffer, 1> &b, std::int64_t ldb, + std::complex beta, cl::sycl::buffer, 1> &c, + std::int64_t ldc); + +template +static inline void gemm_ext(cl::sycl::queue &queue, transpose transa, transpose transb, + std::int64_t m, std::int64_t n, std::int64_t k, half alpha, + cl::sycl::buffer &a, std::int64_t lda, + cl::sycl::buffer &b, std::int64_t ldb, half beta, + cl::sycl::buffer &c, std::int64_t ldc); + +template +static inline void swap(cl::sycl::queue &queue, std::int64_t n, cl::sycl::buffer &x, + std::int64_t incx, cl::sycl::buffer &y, std::int64_t incy); + +template +static inline void swap(cl::sycl::queue &queue, std::int64_t n, cl::sycl::buffer &x, + std::int64_t incx, cl::sycl::buffer &y, std::int64_t incy); + +template +static inline void swap(cl::sycl::queue &queue, std::int64_t n, + cl::sycl::buffer, 1> &x, std::int64_t incx, + cl::sycl::buffer, 1> &y, std::int64_t incy); + +template +static inline void swap(cl::sycl::queue &queue, std::int64_t n, + cl::sycl::buffer, 1> &x, std::int64_t incx, + cl::sycl::buffer, 1> &y, std::int64_t incy); + +template +static inline void geru(cl::sycl::queue &queue, std::int64_t m, std::int64_t n, + std::complex alpha, cl::sycl::buffer, 1> &x, + std::int64_t incx, cl::sycl::buffer, 1> &y, + std::int64_t incy, cl::sycl::buffer, 1> &a, + std::int64_t lda); + +template +static inline void geru(cl::sycl::queue &queue, std::int64_t m, std::int64_t n, + std::complex alpha, cl::sycl::buffer, 1> &x, + std::int64_t incx, cl::sycl::buffer, 1> &y, + std::int64_t incy, cl::sycl::buffer, 1> &a, + std::int64_t 
lda); + +template +static inline void nrm2(cl::sycl::queue &queue, std::int64_t n, + cl::sycl::buffer, 1> &x, std::int64_t incx, + cl::sycl::buffer &result); + +template +static inline void nrm2(cl::sycl::queue &queue, std::int64_t n, + cl::sycl::buffer, 1> &x, std::int64_t incx, + cl::sycl::buffer &result); + +template +static inline void nrm2(cl::sycl::queue &queue, std::int64_t n, cl::sycl::buffer &x, + std::int64_t incx, cl::sycl::buffer &result); + +template +static inline void nrm2(cl::sycl::queue &queue, std::int64_t n, cl::sycl::buffer &x, + std::int64_t incx, cl::sycl::buffer &result); + +template +static inline void gemm(cl::sycl::queue &queue, transpose transa, transpose transb, std::int64_t m, + std::int64_t n, std::int64_t k, float alpha, cl::sycl::buffer &a, + std::int64_t lda, cl::sycl::buffer &b, std::int64_t ldb, + float beta, cl::sycl::buffer &c, std::int64_t ldc); + +template +static inline void gemm(cl::sycl::queue &queue, transpose transa, transpose transb, std::int64_t m, + std::int64_t n, std::int64_t k, double alpha, + cl::sycl::buffer &a, std::int64_t lda, + cl::sycl::buffer &b, std::int64_t ldb, double beta, + cl::sycl::buffer &c, std::int64_t ldc); + +template +static inline void gemm(cl::sycl::queue &queue, transpose transa, transpose transb, std::int64_t m, + std::int64_t n, std::int64_t k, std::complex alpha, + cl::sycl::buffer, 1> &a, std::int64_t lda, + cl::sycl::buffer, 1> &b, std::int64_t ldb, + std::complex beta, cl::sycl::buffer, 1> &c, + std::int64_t ldc); + +template +static inline void gemm(cl::sycl::queue &queue, transpose transa, transpose transb, std::int64_t m, + std::int64_t n, std::int64_t k, std::complex alpha, + cl::sycl::buffer, 1> &a, std::int64_t lda, + cl::sycl::buffer, 1> &b, std::int64_t ldb, + std::complex beta, cl::sycl::buffer, 1> &c, + std::int64_t ldc); + +template +static inline void gemm(cl::sycl::queue &queue, transpose transa, transpose transb, std::int64_t m, + std::int64_t n, std::int64_t k, half 
alpha, cl::sycl::buffer &a, + std::int64_t lda, cl::sycl::buffer &b, std::int64_t ldb, half beta, + cl::sycl::buffer &c, std::int64_t ldc); + +template +static inline void herk(cl::sycl::queue &queue, uplo upper_lower, transpose trans, std::int64_t n, + std::int64_t k, float alpha, cl::sycl::buffer, 1> &a, + std::int64_t lda, float beta, cl::sycl::buffer, 1> &c, + std::int64_t ldc); + +template +static inline void herk(cl::sycl::queue &queue, uplo upper_lower, transpose trans, std::int64_t n, + std::int64_t k, double alpha, cl::sycl::buffer, 1> &a, + std::int64_t lda, double beta, cl::sycl::buffer, 1> &c, + std::int64_t ldc); + +template +static inline void ger(cl::sycl::queue &queue, std::int64_t m, std::int64_t n, float alpha, + cl::sycl::buffer &x, std::int64_t incx, + cl::sycl::buffer &y, std::int64_t incy, + cl::sycl::buffer &a, std::int64_t lda); + +template +static inline void ger(cl::sycl::queue &queue, std::int64_t m, std::int64_t n, double alpha, + cl::sycl::buffer &x, std::int64_t incx, + cl::sycl::buffer &y, std::int64_t incy, + cl::sycl::buffer &a, std::int64_t lda); + +template +static inline void trsm(cl::sycl::queue &queue, side left_right, uplo upper_lower, transpose trans, + diag unit_diag, std::int64_t m, std::int64_t n, float alpha, + cl::sycl::buffer &a, std::int64_t lda, + cl::sycl::buffer &b, std::int64_t ldb); + +template +static inline void trsm(cl::sycl::queue &queue, side left_right, uplo upper_lower, transpose trans, + diag unit_diag, std::int64_t m, std::int64_t n, double alpha, + cl::sycl::buffer &a, std::int64_t lda, + cl::sycl::buffer &b, std::int64_t ldb); + +template +static inline void trsm(cl::sycl::queue &queue, side left_right, uplo upper_lower, transpose trans, + diag unit_diag, std::int64_t m, std::int64_t n, std::complex alpha, + cl::sycl::buffer, 1> &a, std::int64_t lda, + cl::sycl::buffer, 1> &b, std::int64_t ldb); + +template +static inline void trsm(cl::sycl::queue &queue, side left_right, uplo upper_lower, transpose 
trans, + diag unit_diag, std::int64_t m, std::int64_t n, std::complex alpha, + cl::sycl::buffer, 1> &a, std::int64_t lda, + cl::sycl::buffer, 1> &b, std::int64_t ldb); + +template +static inline void dotu(cl::sycl::queue &queue, std::int64_t n, + cl::sycl::buffer, 1> &x, std::int64_t incx, + cl::sycl::buffer, 1> &y, std::int64_t incy, + cl::sycl::buffer, 1> &result); + +template +static inline void dotu(cl::sycl::queue &queue, std::int64_t n, + cl::sycl::buffer, 1> &x, std::int64_t incx, + cl::sycl::buffer, 1> &y, std::int64_t incy, + cl::sycl::buffer, 1> &result); + +template +static inline void hemm(cl::sycl::queue &queue, side left_right, uplo upper_lower, std::int64_t m, + std::int64_t n, std::complex alpha, + cl::sycl::buffer, 1> &a, std::int64_t lda, + cl::sycl::buffer, 1> &b, std::int64_t ldb, + std::complex beta, cl::sycl::buffer, 1> &c, + std::int64_t ldc); + +template +static inline void hemm(cl::sycl::queue &queue, side left_right, uplo upper_lower, std::int64_t m, + std::int64_t n, std::complex alpha, + cl::sycl::buffer, 1> &a, std::int64_t lda, + cl::sycl::buffer, 1> &b, std::int64_t ldb, + std::complex beta, cl::sycl::buffer, 1> &c, + std::int64_t ldc); + +template +static inline void hpr2(cl::sycl::queue &queue, uplo upper_lower, std::int64_t n, + std::complex alpha, cl::sycl::buffer, 1> &x, + std::int64_t incx, cl::sycl::buffer, 1> &y, + std::int64_t incy, cl::sycl::buffer, 1> &a); + +template +static inline void hpr2(cl::sycl::queue &queue, uplo upper_lower, std::int64_t n, + std::complex alpha, cl::sycl::buffer, 1> &x, + std::int64_t incx, cl::sycl::buffer, 1> &y, + std::int64_t incy, cl::sycl::buffer, 1> &a); + +template +static inline void gbmv(cl::sycl::queue &queue, transpose trans, std::int64_t m, std::int64_t n, + std::int64_t kl, std::int64_t ku, float alpha, + cl::sycl::buffer &a, std::int64_t lda, + cl::sycl::buffer &x, std::int64_t incx, float beta, + cl::sycl::buffer &y, std::int64_t incy); + +template +static inline void 
gbmv(cl::sycl::queue &queue, transpose trans, std::int64_t m, std::int64_t n, + std::int64_t kl, std::int64_t ku, double alpha, + cl::sycl::buffer &a, std::int64_t lda, + cl::sycl::buffer &x, std::int64_t incx, double beta, + cl::sycl::buffer &y, std::int64_t incy); + +template +static inline void gbmv(cl::sycl::queue &queue, transpose trans, std::int64_t m, std::int64_t n, + std::int64_t kl, std::int64_t ku, std::complex alpha, + cl::sycl::buffer, 1> &a, std::int64_t lda, + cl::sycl::buffer, 1> &x, std::int64_t incx, + std::complex beta, cl::sycl::buffer, 1> &y, + std::int64_t incy); + +template +static inline void gbmv(cl::sycl::queue &queue, transpose trans, std::int64_t m, std::int64_t n, + std::int64_t kl, std::int64_t ku, std::complex alpha, + cl::sycl::buffer, 1> &a, std::int64_t lda, + cl::sycl::buffer, 1> &x, std::int64_t incx, + std::complex beta, cl::sycl::buffer, 1> &y, + std::int64_t incy); + +template +static inline void tbmv(cl::sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag, + std::int64_t n, std::int64_t k, cl::sycl::buffer &a, + std::int64_t lda, cl::sycl::buffer &x, std::int64_t incx); + +template +static inline void tbmv(cl::sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag, + std::int64_t n, std::int64_t k, cl::sycl::buffer &a, + std::int64_t lda, cl::sycl::buffer &x, std::int64_t incx); + +template +static inline void tbmv(cl::sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag, + std::int64_t n, std::int64_t k, cl::sycl::buffer, 1> &a, + std::int64_t lda, cl::sycl::buffer, 1> &x, + std::int64_t incx); + +template +static inline void tbmv(cl::sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag, + std::int64_t n, std::int64_t k, + cl::sycl::buffer, 1> &a, std::int64_t lda, + cl::sycl::buffer, 1> &x, std::int64_t incx); + +template +static inline void symm(cl::sycl::queue &queue, side left_right, uplo upper_lower, std::int64_t m, + std::int64_t n, float alpha, 
cl::sycl::buffer &a, + std::int64_t lda, cl::sycl::buffer &b, std::int64_t ldb, + float beta, cl::sycl::buffer &c, std::int64_t ldc); + +template +static inline void symm(cl::sycl::queue &queue, side left_right, uplo upper_lower, std::int64_t m, + std::int64_t n, double alpha, cl::sycl::buffer &a, + std::int64_t lda, cl::sycl::buffer &b, std::int64_t ldb, + double beta, cl::sycl::buffer &c, std::int64_t ldc); + +template +static inline void symm(cl::sycl::queue &queue, side left_right, uplo upper_lower, std::int64_t m, + std::int64_t n, std::complex alpha, + cl::sycl::buffer, 1> &a, std::int64_t lda, + cl::sycl::buffer, 1> &b, std::int64_t ldb, + std::complex beta, cl::sycl::buffer, 1> &c, + std::int64_t ldc); + +template +static inline void symm(cl::sycl::queue &queue, side left_right, uplo upper_lower, std::int64_t m, + std::int64_t n, std::complex alpha, + cl::sycl::buffer, 1> &a, std::int64_t lda, + cl::sycl::buffer, 1> &b, std::int64_t ldb, + std::complex beta, cl::sycl::buffer, 1> &c, + std::int64_t ldc); + +template +static inline void dotc(cl::sycl::queue &queue, std::int64_t n, + cl::sycl::buffer, 1> &x, std::int64_t incx, + cl::sycl::buffer, 1> &y, std::int64_t incy, + cl::sycl::buffer, 1> &result); + +template +static inline void dotc(cl::sycl::queue &queue, std::int64_t n, + cl::sycl::buffer, 1> &x, std::int64_t incx, + cl::sycl::buffer, 1> &y, std::int64_t incy, + cl::sycl::buffer, 1> &result); + +template +static inline void syr(cl::sycl::queue &queue, uplo upper_lower, std::int64_t n, float alpha, + cl::sycl::buffer &x, std::int64_t incx, + cl::sycl::buffer &a, std::int64_t lda); + +template +static inline void syr(cl::sycl::queue &queue, uplo upper_lower, std::int64_t n, double alpha, + cl::sycl::buffer &x, std::int64_t incx, + cl::sycl::buffer &a, std::int64_t lda); + +template +static inline void trmm(cl::sycl::queue &queue, side left_right, uplo upper_lower, transpose trans, + diag unit_diag, std::int64_t m, std::int64_t n, float alpha, + 
cl::sycl::buffer &a, std::int64_t lda, + cl::sycl::buffer &b, std::int64_t ldb); + +template +static inline void trmm(cl::sycl::queue &queue, side left_right, uplo upper_lower, transpose trans, + diag unit_diag, std::int64_t m, std::int64_t n, double alpha, + cl::sycl::buffer &a, std::int64_t lda, + cl::sycl::buffer &b, std::int64_t ldb); + +template +static inline void trmm(cl::sycl::queue &queue, side left_right, uplo upper_lower, transpose trans, + diag unit_diag, std::int64_t m, std::int64_t n, std::complex alpha, + cl::sycl::buffer, 1> &a, std::int64_t lda, + cl::sycl::buffer, 1> &b, std::int64_t ldb); + +template +static inline void trmm(cl::sycl::queue &queue, side left_right, uplo upper_lower, transpose trans, + diag unit_diag, std::int64_t m, std::int64_t n, std::complex alpha, + cl::sycl::buffer, 1> &a, std::int64_t lda, + cl::sycl::buffer, 1> &b, std::int64_t ldb); + +template +static inline void rotmg(cl::sycl::queue &queue, cl::sycl::buffer &d1, + cl::sycl::buffer &d2, cl::sycl::buffer &x1, float y1, + cl::sycl::buffer &param); + +template +static inline void rotmg(cl::sycl::queue &queue, cl::sycl::buffer &d1, + cl::sycl::buffer &d2, cl::sycl::buffer &x1, + double y1, cl::sycl::buffer &param); + +template +static inline void tpsv(cl::sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag, + std::int64_t n, cl::sycl::buffer &a, + cl::sycl::buffer &x, std::int64_t incx); + +template +static inline void tpsv(cl::sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag, + std::int64_t n, cl::sycl::buffer &a, + cl::sycl::buffer &x, std::int64_t incx); + +template +static inline void tpsv(cl::sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag, + std::int64_t n, cl::sycl::buffer, 1> &a, + cl::sycl::buffer, 1> &x, std::int64_t incx); + +template +static inline void tpsv(cl::sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag, + std::int64_t n, cl::sycl::buffer, 1> &a, + cl::sycl::buffer, 1> &x, 
std::int64_t incx); + +template +static inline void trsv(cl::sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag, + std::int64_t n, cl::sycl::buffer &a, std::int64_t lda, + cl::sycl::buffer &x, std::int64_t incx); + +template +static inline void trsv(cl::sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag, + std::int64_t n, cl::sycl::buffer &a, std::int64_t lda, + cl::sycl::buffer &x, std::int64_t incx); + +template +static inline void trsv(cl::sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag, + std::int64_t n, cl::sycl::buffer, 1> &a, + std::int64_t lda, cl::sycl::buffer, 1> &x, + std::int64_t incx); + +template +static inline void trsv(cl::sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag, + std::int64_t n, cl::sycl::buffer, 1> &a, + std::int64_t lda, cl::sycl::buffer, 1> &x, + std::int64_t incx); + +template +static inline void copy(cl::sycl::queue &queue, std::int64_t n, cl::sycl::buffer &x, + std::int64_t incx, cl::sycl::buffer &y, std::int64_t incy); + +template +static inline void copy(cl::sycl::queue &queue, std::int64_t n, cl::sycl::buffer &x, + std::int64_t incx, cl::sycl::buffer &y, std::int64_t incy); + +template +static inline void copy(cl::sycl::queue &queue, std::int64_t n, + cl::sycl::buffer, 1> &x, std::int64_t incx, + cl::sycl::buffer, 1> &y, std::int64_t incy); + +template +static inline void copy(cl::sycl::queue &queue, std::int64_t n, + cl::sycl::buffer, 1> &x, std::int64_t incx, + cl::sycl::buffer, 1> &y, std::int64_t incy); + +template +static inline void hemv(cl::sycl::queue &queue, uplo upper_lower, std::int64_t n, + std::complex alpha, cl::sycl::buffer, 1> &a, + std::int64_t lda, cl::sycl::buffer, 1> &x, + std::int64_t incx, std::complex beta, + cl::sycl::buffer, 1> &y, std::int64_t incy); + +template +static inline void hemv(cl::sycl::queue &queue, uplo upper_lower, std::int64_t n, + std::complex alpha, cl::sycl::buffer, 1> &a, + std::int64_t lda, 
cl::sycl::buffer, 1> &x, + std::int64_t incx, std::complex beta, + cl::sycl::buffer, 1> &y, std::int64_t incy); + +template +static inline void gemmt(cl::sycl::queue &queue, uplo upper_lower, transpose transa, + transpose transb, std::int64_t n, std::int64_t k, float alpha, + cl::sycl::buffer &a, std::int64_t lda, + cl::sycl::buffer &b, std::int64_t ldb, float beta, + cl::sycl::buffer &c, std::int64_t ldc); + +template +static inline void gemmt(cl::sycl::queue &queue, uplo upper_lower, transpose transa, + transpose transb, std::int64_t n, std::int64_t k, double alpha, + cl::sycl::buffer &a, std::int64_t lda, + cl::sycl::buffer &b, std::int64_t ldb, double beta, + cl::sycl::buffer &c, std::int64_t ldc); + +template +static inline void gemmt(cl::sycl::queue &queue, uplo upper_lower, transpose transa, + transpose transb, std::int64_t n, std::int64_t k, + std::complex alpha, cl::sycl::buffer, 1> &a, + std::int64_t lda, cl::sycl::buffer, 1> &b, + std::int64_t ldb, std::complex beta, + cl::sycl::buffer, 1> &c, std::int64_t ldc); + +template +static inline void gemmt(cl::sycl::queue &queue, uplo upper_lower, transpose transa, + transpose transb, std::int64_t n, std::int64_t k, + std::complex alpha, cl::sycl::buffer, 1> &a, + std::int64_t lda, cl::sycl::buffer, 1> &b, + std::int64_t ldb, std::complex beta, + cl::sycl::buffer, 1> &c, std::int64_t ldc); + +template +static inline void sbmv(cl::sycl::queue &queue, uplo upper_lower, std::int64_t n, std::int64_t k, + float alpha, cl::sycl::buffer &a, std::int64_t lda, + cl::sycl::buffer &x, std::int64_t incx, float beta, + cl::sycl::buffer &y, std::int64_t incy); + +template +static inline void sbmv(cl::sycl::queue &queue, uplo upper_lower, std::int64_t n, std::int64_t k, + double alpha, cl::sycl::buffer &a, std::int64_t lda, + cl::sycl::buffer &x, std::int64_t incx, double beta, + cl::sycl::buffer &y, std::int64_t incy); + +template +static inline void asum(cl::sycl::queue &queue, std::int64_t n, + cl::sycl::buffer, 1> &x, 
std::int64_t incx, + cl::sycl::buffer &result); + +template +static inline void asum(cl::sycl::queue &queue, std::int64_t n, + cl::sycl::buffer, 1> &x, std::int64_t incx, + cl::sycl::buffer &result); + +template +static inline void asum(cl::sycl::queue &queue, std::int64_t n, cl::sycl::buffer &x, + std::int64_t incx, cl::sycl::buffer &result); + +template +static inline void asum(cl::sycl::queue &queue, std::int64_t n, cl::sycl::buffer &x, + std::int64_t incx, cl::sycl::buffer &result); + +template +static inline void tbsv(cl::sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag, + std::int64_t n, std::int64_t k, cl::sycl::buffer &a, + std::int64_t lda, cl::sycl::buffer &x, std::int64_t incx); + +template +static inline void tbsv(cl::sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag, + std::int64_t n, std::int64_t k, cl::sycl::buffer &a, + std::int64_t lda, cl::sycl::buffer &x, std::int64_t incx); + +template +static inline void tbsv(cl::sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag, + std::int64_t n, std::int64_t k, cl::sycl::buffer, 1> &a, + std::int64_t lda, cl::sycl::buffer, 1> &x, + std::int64_t incx); + +template +static inline void tbsv(cl::sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag, + std::int64_t n, std::int64_t k, + cl::sycl::buffer, 1> &a, std::int64_t lda, + cl::sycl::buffer, 1> &x, std::int64_t incx); + +template +static inline void spr2(cl::sycl::queue &queue, uplo upper_lower, std::int64_t n, float alpha, + cl::sycl::buffer &x, std::int64_t incx, + cl::sycl::buffer &y, std::int64_t incy, + cl::sycl::buffer &a); + +template +static inline void spr2(cl::sycl::queue &queue, uplo upper_lower, std::int64_t n, double alpha, + cl::sycl::buffer &x, std::int64_t incx, + cl::sycl::buffer &y, std::int64_t incy, + cl::sycl::buffer &a); + +template +static inline void iamax(cl::sycl::queue &queue, std::int64_t n, cl::sycl::buffer &x, + std::int64_t incx, cl::sycl::buffer 
&result); + +template +static inline void iamax(cl::sycl::queue &queue, std::int64_t n, cl::sycl::buffer &x, + std::int64_t incx, cl::sycl::buffer &result); + +template +static inline void iamax(cl::sycl::queue &queue, std::int64_t n, + cl::sycl::buffer, 1> &x, std::int64_t incx, + cl::sycl::buffer &result); + +template +static inline void iamax(cl::sycl::queue &queue, std::int64_t n, + cl::sycl::buffer, 1> &x, std::int64_t incx, + cl::sycl::buffer &result); + +template +static inline void trsm_batch(cl::sycl::queue &queue, side left_right, uplo upper_lower, + transpose trans, diag unit_diag, std::int64_t m, std::int64_t n, + float alpha, cl::sycl::buffer &a, std::int64_t lda, + std::int64_t stride_a, cl::sycl::buffer &b, + std::int64_t ldb, std::int64_t stride_b, std::int64_t batch_size); + +template +static inline void trsm_batch(cl::sycl::queue &queue, side left_right, uplo upper_lower, + transpose trans, diag unit_diag, std::int64_t m, std::int64_t n, + double alpha, cl::sycl::buffer &a, std::int64_t lda, + std::int64_t stride_a, cl::sycl::buffer &b, + std::int64_t ldb, std::int64_t stride_b, std::int64_t batch_size); + +template +static inline void trsm_batch(cl::sycl::queue &queue, side left_right, uplo upper_lower, + transpose trans, diag unit_diag, std::int64_t m, std::int64_t n, + std::complex alpha, + cl::sycl::buffer, 1> &a, std::int64_t lda, + std::int64_t stride_a, cl::sycl::buffer, 1> &b, + std::int64_t ldb, std::int64_t stride_b, std::int64_t batch_size); + +template +static inline void trsm_batch(cl::sycl::queue &queue, side left_right, uplo upper_lower, + transpose trans, diag unit_diag, std::int64_t m, std::int64_t n, + std::complex alpha, + cl::sycl::buffer, 1> &a, std::int64_t lda, + std::int64_t stride_a, cl::sycl::buffer, 1> &b, + std::int64_t ldb, std::int64_t stride_b, std::int64_t batch_size); + +template +static inline void rotm(cl::sycl::queue &queue, std::int64_t n, cl::sycl::buffer &x, + std::int64_t incx, cl::sycl::buffer &y, 
std::int64_t incy, + cl::sycl::buffer &param); + +template +static inline void rotm(cl::sycl::queue &queue, std::int64_t n, cl::sycl::buffer &x, + std::int64_t incx, cl::sycl::buffer &y, std::int64_t incy, + cl::sycl::buffer &param); + +template +static inline void rotg(cl::sycl::queue &queue, cl::sycl::buffer &a, + cl::sycl::buffer &b, cl::sycl::buffer &c, + cl::sycl::buffer &s); + +template +static inline void rotg(cl::sycl::queue &queue, cl::sycl::buffer &a, + cl::sycl::buffer &b, cl::sycl::buffer &c, + cl::sycl::buffer &s); + +template +static inline void rotg(cl::sycl::queue &queue, cl::sycl::buffer, 1> &a, + cl::sycl::buffer, 1> &b, cl::sycl::buffer &c, + cl::sycl::buffer, 1> &s); + +template +static inline void rotg(cl::sycl::queue &queue, cl::sycl::buffer, 1> &a, + cl::sycl::buffer, 1> &b, + cl::sycl::buffer &c, + cl::sycl::buffer, 1> &s); + +template +static inline void sdsdot(cl::sycl::queue &queue, std::int64_t n, float sb, + cl::sycl::buffer &x, std::int64_t incx, + cl::sycl::buffer &y, std::int64_t incy, + cl::sycl::buffer &result); + +template +static inline void her2k(cl::sycl::queue &queue, uplo upper_lower, transpose trans, std::int64_t n, + std::int64_t k, std::complex alpha, + cl::sycl::buffer, 1> &a, std::int64_t lda, + cl::sycl::buffer, 1> &b, std::int64_t ldb, float beta, + cl::sycl::buffer, 1> &c, std::int64_t ldc); + +template +static inline void her2k(cl::sycl::queue &queue, uplo upper_lower, transpose trans, std::int64_t n, + std::int64_t k, std::complex alpha, + cl::sycl::buffer, 1> &a, std::int64_t lda, + cl::sycl::buffer, 1> &b, std::int64_t ldb, + double beta, cl::sycl::buffer, 1> &c, + std::int64_t ldc); + +template +static inline void dot(cl::sycl::queue &queue, std::int64_t n, cl::sycl::buffer &x, + std::int64_t incx, cl::sycl::buffer &y, std::int64_t incy, + cl::sycl::buffer &result); + +template +static inline void dot(cl::sycl::queue &queue, std::int64_t n, cl::sycl::buffer &x, + std::int64_t incx, cl::sycl::buffer &y, std::int64_t incy, 
+ cl::sycl::buffer &result); + +template +static inline void dot(cl::sycl::queue &queue, std::int64_t n, cl::sycl::buffer &x, + std::int64_t incx, cl::sycl::buffer &y, std::int64_t incy, + cl::sycl::buffer &result); + +template +static inline void symv(cl::sycl::queue &queue, uplo upper_lower, std::int64_t n, float alpha, + cl::sycl::buffer &a, std::int64_t lda, + cl::sycl::buffer &x, std::int64_t incx, float beta, + cl::sycl::buffer &y, std::int64_t incy); + +template +static inline void symv(cl::sycl::queue &queue, uplo upper_lower, std::int64_t n, double alpha, + cl::sycl::buffer &a, std::int64_t lda, + cl::sycl::buffer &x, std::int64_t incx, double beta, + cl::sycl::buffer &y, std::int64_t incy); + +// USM APIs + +template +static inline cl::sycl::event syr2( + cl::sycl::queue &queue, uplo upper_lower, std::int64_t n, float alpha, const float *x, + std::int64_t incx, const float *y, std::int64_t incy, float *a, std::int64_t lda, + const cl::sycl::vector_class &dependencies = {}); + +template +static inline cl::sycl::event syr2( + cl::sycl::queue &queue, uplo upper_lower, std::int64_t n, double alpha, const double *x, + std::int64_t incx, const double *y, std::int64_t incy, double *a, std::int64_t lda, + const cl::sycl::vector_class &dependencies = {}); + +template +static inline cl::sycl::event scal( + cl::sycl::queue &queue, std::int64_t n, float alpha, float *x, std::int64_t incx, + const cl::sycl::vector_class &dependencies = {}); + +template +static inline cl::sycl::event scal( + cl::sycl::queue &queue, std::int64_t n, double alpha, double *x, std::int64_t incx, + const cl::sycl::vector_class &dependencies = {}); + +template +static inline cl::sycl::event scal( + cl::sycl::queue &queue, std::int64_t n, std::complex alpha, std::complex *x, + std::int64_t incx, const cl::sycl::vector_class &dependencies = {}); + +template +static inline cl::sycl::event scal( + cl::sycl::queue &queue, std::int64_t n, std::complex alpha, std::complex *x, + std::int64_t incx, 
const cl::sycl::vector_class &dependencies = {}); + +template +static inline cl::sycl::event scal( + cl::sycl::queue &queue, std::int64_t n, float alpha, std::complex *x, std::int64_t incx, + const cl::sycl::vector_class &dependencies = {}); + +template +static inline cl::sycl::event scal( + cl::sycl::queue &queue, std::int64_t n, double alpha, std::complex *x, + std::int64_t incx, const cl::sycl::vector_class &dependencies = {}); + +template +static inline cl::sycl::event trmv( + cl::sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag, std::int64_t n, + const float *a, std::int64_t lda, float *x, std::int64_t incx, + const cl::sycl::vector_class &dependencies = {}); + +template +static inline cl::sycl::event trmv( + cl::sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag, std::int64_t n, + const double *a, std::int64_t lda, double *x, std::int64_t incx, + const cl::sycl::vector_class &dependencies = {}); + +template +static inline cl::sycl::event trmv( + cl::sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag, std::int64_t n, + const std::complex *a, std::int64_t lda, std::complex *x, std::int64_t incx, + const cl::sycl::vector_class &dependencies = {}); + +template +static inline cl::sycl::event trmv( + cl::sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag, std::int64_t n, + const std::complex *a, std::int64_t lda, std::complex *x, std::int64_t incx, + const cl::sycl::vector_class &dependencies = {}); + +template +static inline cl::sycl::event tpmv( + cl::sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag, std::int64_t n, + const float *a, float *x, std::int64_t incx, + const cl::sycl::vector_class &dependencies = {}); + +template +static inline cl::sycl::event tpmv( + cl::sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag, std::int64_t n, + const double *a, double *x, std::int64_t incx, + const cl::sycl::vector_class &dependencies = {}); + +template 
+static inline cl::sycl::event tpmv( + cl::sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag, std::int64_t n, + const std::complex *a, std::complex *x, std::int64_t incx, + const cl::sycl::vector_class &dependencies = {}); + +template +static inline cl::sycl::event tpmv( + cl::sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag, std::int64_t n, + const std::complex *a, std::complex *x, std::int64_t incx, + const cl::sycl::vector_class &dependencies = {}); + +template +static inline cl::sycl::event spr(cl::sycl::queue &queue, uplo upper_lower, std::int64_t n, + float alpha, const float *x, std::int64_t incx, float *a, + const cl::sycl::vector_class &dependencies = {}); + +template +static inline cl::sycl::event spr(cl::sycl::queue &queue, uplo upper_lower, std::int64_t n, + double alpha, const double *x, std::int64_t incx, double *a, + const cl::sycl::vector_class &dependencies = {}); + +template +static inline cl::sycl::event hpmv( + cl::sycl::queue &queue, uplo upper_lower, std::int64_t n, std::complex alpha, + const std::complex *a, const std::complex *x, std::int64_t incx, + std::complex beta, std::complex *y, std::int64_t incy, + const cl::sycl::vector_class &dependencies = {}); + +template +static inline cl::sycl::event hpmv( + cl::sycl::queue &queue, uplo upper_lower, std::int64_t n, std::complex alpha, + const std::complex *a, const std::complex *x, std::int64_t incx, + std::complex beta, std::complex *y, std::int64_t incy, + const cl::sycl::vector_class &dependencies = {}); + +template +static inline cl::sycl::event syrk( + cl::sycl::queue &queue, uplo upper_lower, transpose trans, std::int64_t n, std::int64_t k, + float alpha, const float *a, std::int64_t lda, float beta, float *c, std::int64_t ldc, + const cl::sycl::vector_class &dependencies = {}); + +template +static inline cl::sycl::event syrk( + cl::sycl::queue &queue, uplo upper_lower, transpose trans, std::int64_t n, std::int64_t k, + double alpha, const double 
*a, std::int64_t lda, double beta, double *c, std::int64_t ldc, + const cl::sycl::vector_class &dependencies = {}); + +template +static inline cl::sycl::event syrk( + cl::sycl::queue &queue, uplo upper_lower, transpose trans, std::int64_t n, std::int64_t k, + std::complex alpha, const std::complex *a, std::int64_t lda, + std::complex beta, std::complex *c, std::int64_t ldc, + const cl::sycl::vector_class &dependencies = {}); + +template +static inline cl::sycl::event syrk( + cl::sycl::queue &queue, uplo upper_lower, transpose trans, std::int64_t n, std::int64_t k, + std::complex alpha, const std::complex *a, std::int64_t lda, + std::complex beta, std::complex *c, std::int64_t ldc, + const cl::sycl::vector_class &dependencies = {}); + +template +static inline cl::sycl::event her2( + cl::sycl::queue &queue, uplo upper_lower, std::int64_t n, std::complex alpha, + const std::complex *x, std::int64_t incx, const std::complex *y, + std::int64_t incy, std::complex *a, std::int64_t lda, + const cl::sycl::vector_class &dependencies = {}); + +template +static inline cl::sycl::event her2( + cl::sycl::queue &queue, uplo upper_lower, std::int64_t n, std::complex alpha, + const std::complex *x, std::int64_t incx, const std::complex *y, + std::int64_t incy, std::complex *a, std::int64_t lda, + const cl::sycl::vector_class &dependencies = {}); + +template +static inline cl::sycl::event hbmv( + cl::sycl::queue &queue, uplo upper_lower, std::int64_t n, std::int64_t k, + std::complex alpha, const std::complex *a, std::int64_t lda, + const std::complex *x, std::int64_t incx, std::complex beta, + std::complex *y, std::int64_t incy, + const cl::sycl::vector_class &dependencies = {}); + +template +static inline cl::sycl::event hbmv( + cl::sycl::queue &queue, uplo upper_lower, std::int64_t n, std::int64_t k, + std::complex alpha, const std::complex *a, std::int64_t lda, + const std::complex *x, std::int64_t incx, std::complex beta, + std::complex *y, std::int64_t incy, + const 
cl::sycl::vector_class &dependencies = {}); + +template +static inline cl::sycl::event rot(cl::sycl::queue &queue, std::int64_t n, std::complex *x, + std::int64_t incx, std::complex *y, std::int64_t incy, + float c, float s, + const cl::sycl::vector_class &dependencies = {}); + +template +static inline cl::sycl::event rot(cl::sycl::queue &queue, std::int64_t n, std::complex *x, + std::int64_t incx, std::complex *y, std::int64_t incy, + double c, double s, + const cl::sycl::vector_class &dependencies = {}); + +template +static inline cl::sycl::event rot(cl::sycl::queue &queue, std::int64_t n, float *x, + std::int64_t incx, float *y, std::int64_t incy, float c, float s, + const cl::sycl::vector_class &dependencies = {}); + +template +static inline cl::sycl::event rot(cl::sycl::queue &queue, std::int64_t n, double *x, + std::int64_t incx, double *y, std::int64_t incy, double c, + double s, + const cl::sycl::vector_class &dependencies = {}); + +template +static inline cl::sycl::event axpy( + cl::sycl::queue &queue, std::int64_t n, float alpha, const float *x, std::int64_t incx, + float *y, std::int64_t incy, const cl::sycl::vector_class &dependencies = {}); + +template +static inline cl::sycl::event axpy( + cl::sycl::queue &queue, std::int64_t n, double alpha, const double *x, std::int64_t incx, + double *y, std::int64_t incy, const cl::sycl::vector_class &dependencies = {}); + +template +static inline cl::sycl::event axpy( + cl::sycl::queue &queue, std::int64_t n, std::complex alpha, const std::complex *x, + std::int64_t incx, std::complex *y, std::int64_t incy, + const cl::sycl::vector_class &dependencies = {}); + +template +static inline cl::sycl::event axpy( + cl::sycl::queue &queue, std::int64_t n, std::complex alpha, + const std::complex *x, std::int64_t incx, std::complex *y, std::int64_t incy, + const cl::sycl::vector_class &dependencies = {}); + +template +static inline cl::sycl::event axpy_batch( + cl::sycl::queue &queue, std::int64_t *n, float *alpha, const 
float **x, std::int64_t *incx, + float **y, std::int64_t *incy, std::int64_t group_count, std::int64_t *group_size, + const cl::sycl::vector_class &dependencies = {}); + +template +static inline cl::sycl::event axpy_batch( + cl::sycl::queue &queue, std::int64_t *n, double *alpha, const double **x, std::int64_t *incx, + double **y, std::int64_t *incy, std::int64_t group_count, std::int64_t *group_size, + const cl::sycl::vector_class &dependencies = {}); + +template +static inline cl::sycl::event axpy_batch( + cl::sycl::queue &queue, std::int64_t *n, std::complex *alpha, + const std::complex **x, std::int64_t *incx, std::complex **y, std::int64_t *incy, + std::int64_t group_count, std::int64_t *group_size, + const cl::sycl::vector_class &dependencies = {}); + +template +static inline cl::sycl::event axpy_batch( + cl::sycl::queue &queue, std::int64_t *n, std::complex *alpha, + const std::complex **x, std::int64_t *incx, std::complex **y, + std::int64_t *incy, std::int64_t group_count, std::int64_t *group_size, + const cl::sycl::vector_class &dependencies = {}); + +template +static inline cl::sycl::event gerc( + cl::sycl::queue &queue, std::int64_t m, std::int64_t n, std::complex alpha, + const std::complex *x, std::int64_t incx, const std::complex *y, + std::int64_t incy, std::complex *a, std::int64_t lda, + const cl::sycl::vector_class &dependencies = {}); + +template +static inline cl::sycl::event gerc( + cl::sycl::queue &queue, std::int64_t m, std::int64_t n, std::complex alpha, + const std::complex *x, std::int64_t incx, const std::complex *y, + std::int64_t incy, std::complex *a, std::int64_t lda, + const cl::sycl::vector_class &dependencies = {}); + +template +static inline cl::sycl::event syr2k( + cl::sycl::queue &queue, uplo upper_lower, transpose trans, std::int64_t n, std::int64_t k, + float alpha, const float *a, std::int64_t lda, const float *b, std::int64_t ldb, float beta, + float *c, std::int64_t ldc, const cl::sycl::vector_class &dependencies = {}); + 
+template +static inline cl::sycl::event syr2k( + cl::sycl::queue &queue, uplo upper_lower, transpose trans, std::int64_t n, std::int64_t k, + double alpha, const double *a, std::int64_t lda, const double *b, std::int64_t ldb, double beta, + double *c, std::int64_t ldc, const cl::sycl::vector_class &dependencies = {}); + +template +static inline cl::sycl::event syr2k( + cl::sycl::queue &queue, uplo upper_lower, transpose trans, std::int64_t n, std::int64_t k, + std::complex alpha, const std::complex *a, std::int64_t lda, + const std::complex *b, std::int64_t ldb, std::complex beta, + std::complex *c, std::int64_t ldc, + const cl::sycl::vector_class &dependencies = {}); + +template +static inline cl::sycl::event syr2k( + cl::sycl::queue &queue, uplo upper_lower, transpose trans, std::int64_t n, std::int64_t k, + std::complex alpha, const std::complex *a, std::int64_t lda, + const std::complex *b, std::int64_t ldb, std::complex beta, + std::complex *c, std::int64_t ldc, + const cl::sycl::vector_class &dependencies = {}); + +template +static inline cl::sycl::event gemv( + cl::sycl::queue &queue, transpose trans, std::int64_t m, std::int64_t n, float alpha, + const float *a, std::int64_t lda, const float *x, std::int64_t incx, float beta, float *y, + std::int64_t incy, const cl::sycl::vector_class &dependencies = {}); + +template +static inline cl::sycl::event gemv( + cl::sycl::queue &queue, transpose trans, std::int64_t m, std::int64_t n, double alpha, + const double *a, std::int64_t lda, const double *x, std::int64_t incx, double beta, double *y, + std::int64_t incy, const cl::sycl::vector_class &dependencies = {}); + +template +static inline cl::sycl::event gemv( + cl::sycl::queue &queue, transpose trans, std::int64_t m, std::int64_t n, + std::complex alpha, const std::complex *a, std::int64_t lda, + const std::complex *x, std::int64_t incx, std::complex beta, + std::complex *y, std::int64_t incy, + const cl::sycl::vector_class &dependencies = {}); + +template 
+static inline cl::sycl::event gemv( + cl::sycl::queue &queue, transpose trans, std::int64_t m, std::int64_t n, + std::complex alpha, const std::complex *a, std::int64_t lda, + const std::complex *x, std::int64_t incx, std::complex beta, + std::complex *y, std::int64_t incy, + const cl::sycl::vector_class &dependencies = {}); + +template +static inline cl::sycl::event her(cl::sycl::queue &queue, uplo upper_lower, std::int64_t n, + float alpha, const std::complex *x, std::int64_t incx, + std::complex *a, std::int64_t lda, + const cl::sycl::vector_class &dependencies = {}); + +template +static inline cl::sycl::event her(cl::sycl::queue &queue, uplo upper_lower, std::int64_t n, + double alpha, const std::complex *x, std::int64_t incx, + std::complex *a, std::int64_t lda, + const cl::sycl::vector_class &dependencies = {}); + +template +static inline cl::sycl::event hpr(cl::sycl::queue &queue, uplo upper_lower, std::int64_t n, + float alpha, const std::complex *x, std::int64_t incx, + std::complex *a, + const cl::sycl::vector_class &dependencies = {}); + +template +static inline cl::sycl::event hpr(cl::sycl::queue &queue, uplo upper_lower, std::int64_t n, + double alpha, const std::complex *x, std::int64_t incx, + std::complex *a, + const cl::sycl::vector_class &dependencies = {}); + +template +static inline cl::sycl::event iamin( + cl::sycl::queue &queue, std::int64_t n, const float *x, std::int64_t incx, std::int64_t *result, + const cl::sycl::vector_class &dependencies = {}); + +template +static inline cl::sycl::event iamin( + cl::sycl::queue &queue, std::int64_t n, const double *x, std::int64_t incx, + std::int64_t *result, const cl::sycl::vector_class &dependencies = {}); + +template +static inline cl::sycl::event iamin( + cl::sycl::queue &queue, std::int64_t n, const std::complex *x, std::int64_t incx, + std::int64_t *result, const cl::sycl::vector_class &dependencies = {}); + +template +static inline cl::sycl::event iamin( + cl::sycl::queue &queue, std::int64_t 
n, const std::complex *x, std::int64_t incx, + std::int64_t *result, const cl::sycl::vector_class &dependencies = {}); + +template +static inline cl::sycl::event gemm_batch( + cl::sycl::queue &queue, transpose *transa, transpose *transb, std::int64_t *m, std::int64_t *n, + std::int64_t *k, float *alpha, const float **a, std::int64_t *lda, const float **b, + std::int64_t *ldb, float *beta, float **c, std::int64_t *ldc, std::int64_t group_count, + std::int64_t *group_size, const cl::sycl::vector_class &dependencies = {}); + +template +static inline cl::sycl::event gemm_batch( + cl::sycl::queue &queue, transpose *transa, transpose *transb, std::int64_t *m, std::int64_t *n, + std::int64_t *k, double *alpha, const double **a, std::int64_t *lda, const double **b, + std::int64_t *ldb, double *beta, double **c, std::int64_t *ldc, std::int64_t group_count, + std::int64_t *group_size, const cl::sycl::vector_class &dependencies = {}); + +template +static inline cl::sycl::event gemm_batch( + cl::sycl::queue &queue, transpose *transa, transpose *transb, std::int64_t *m, std::int64_t *n, + std::int64_t *k, std::complex *alpha, const std::complex **a, std::int64_t *lda, + const std::complex **b, std::int64_t *ldb, std::complex *beta, + std::complex **c, std::int64_t *ldc, std::int64_t group_count, std::int64_t *group_size, + const cl::sycl::vector_class &dependencies = {}); + +template +static inline cl::sycl::event gemm_batch( + cl::sycl::queue &queue, transpose *transa, transpose *transb, std::int64_t *m, std::int64_t *n, + std::int64_t *k, std::complex *alpha, const std::complex **a, std::int64_t *lda, + const std::complex **b, std::int64_t *ldb, std::complex *beta, + std::complex **c, std::int64_t *ldc, std::int64_t group_count, std::int64_t *group_size, + const cl::sycl::vector_class &dependencies = {}); + +template +static inline cl::sycl::event gemm_batch( + cl::sycl::queue &queue, transpose transa, transpose transb, std::int64_t m, std::int64_t n, + std::int64_t k, float 
alpha, const float *a, std::int64_t lda, std::int64_t stride_a, + const float *b, std::int64_t ldb, std::int64_t stride_b, float beta, float *c, std::int64_t ldc, + std::int64_t stride_c, std::int64_t batch_size, + const cl::sycl::vector_class &dependencies = {}); + +template +static inline cl::sycl::event gemm_batch( + cl::sycl::queue &queue, transpose transa, transpose transb, std::int64_t m, std::int64_t n, + std::int64_t k, double alpha, const double *a, std::int64_t lda, std::int64_t stride_a, + const double *b, std::int64_t ldb, std::int64_t stride_b, double beta, double *c, + std::int64_t ldc, std::int64_t stride_c, std::int64_t batch_size, + const cl::sycl::vector_class &dependencies = {}); + +template +static inline cl::sycl::event gemm_batch( + cl::sycl::queue &queue, transpose transa, transpose transb, std::int64_t m, std::int64_t n, + std::int64_t k, std::complex alpha, const std::complex *a, std::int64_t lda, + std::int64_t stride_a, const std::complex *b, std::int64_t ldb, std::int64_t stride_b, + std::complex beta, std::complex *c, std::int64_t ldc, std::int64_t stride_c, + std::int64_t batch_size, const cl::sycl::vector_class &dependencies = {}); + +template +static inline cl::sycl::event gemm_batch( + cl::sycl::queue &queue, transpose transa, transpose transb, std::int64_t m, std::int64_t n, + std::int64_t k, std::complex alpha, const std::complex *a, std::int64_t lda, + std::int64_t stride_a, const std::complex *b, std::int64_t ldb, std::int64_t stride_b, + std::complex beta, std::complex *c, std::int64_t ldc, std::int64_t stride_c, + std::int64_t batch_size, const cl::sycl::vector_class &dependencies = {}); + +template +static inline cl::sycl::event spmv( + cl::sycl::queue &queue, uplo upper_lower, std::int64_t n, float alpha, const float *a, + const float *x, std::int64_t incx, float beta, float *y, std::int64_t incy, + const cl::sycl::vector_class &dependencies = {}); + +template +static inline cl::sycl::event spmv( + cl::sycl::queue &queue, 
uplo upper_lower, std::int64_t n, double alpha, const double *a, + const double *x, std::int64_t incx, double beta, double *y, std::int64_t incy, + const cl::sycl::vector_class &dependencies = {}); + +template +static inline cl::sycl::event swap( + cl::sycl::queue &queue, std::int64_t n, float *x, std::int64_t incx, float *y, + std::int64_t incy, const cl::sycl::vector_class &dependencies = {}); + +template +static inline cl::sycl::event swap( + cl::sycl::queue &queue, std::int64_t n, double *x, std::int64_t incx, double *y, + std::int64_t incy, const cl::sycl::vector_class &dependencies = {}); + +template +static inline cl::sycl::event swap( + cl::sycl::queue &queue, std::int64_t n, std::complex *x, std::int64_t incx, + std::complex *y, std::int64_t incy, + const cl::sycl::vector_class &dependencies = {}); + +template +static inline cl::sycl::event swap( + cl::sycl::queue &queue, std::int64_t n, std::complex *x, std::int64_t incx, + std::complex *y, std::int64_t incy, + const cl::sycl::vector_class &dependencies = {}); + +template +static inline cl::sycl::event geru( + cl::sycl::queue &queue, std::int64_t m, std::int64_t n, std::complex alpha, + const std::complex *x, std::int64_t incx, const std::complex *y, + std::int64_t incy, std::complex *a, std::int64_t lda, + const cl::sycl::vector_class &dependencies = {}); + +template +static inline cl::sycl::event geru( + cl::sycl::queue &queue, std::int64_t m, std::int64_t n, std::complex alpha, + const std::complex *x, std::int64_t incx, const std::complex *y, + std::int64_t incy, std::complex *a, std::int64_t lda, + const cl::sycl::vector_class &dependencies = {}); + +template +static inline cl::sycl::event nrm2( + cl::sycl::queue &queue, std::int64_t n, const std::complex *x, std::int64_t incx, + float *result, const cl::sycl::vector_class &dependencies = {}); + +template +static inline cl::sycl::event nrm2( + cl::sycl::queue &queue, std::int64_t n, const std::complex *x, std::int64_t incx, + double *result, const 
cl::sycl::vector_class &dependencies = {}); + +template +static inline cl::sycl::event nrm2( + cl::sycl::queue &queue, std::int64_t n, const float *x, std::int64_t incx, float *result, + const cl::sycl::vector_class &dependencies = {}); + +template +static inline cl::sycl::event nrm2( + cl::sycl::queue &queue, std::int64_t n, const double *x, std::int64_t incx, double *result, + const cl::sycl::vector_class &dependencies = {}); + +template +static inline cl::sycl::event gemm( + cl::sycl::queue &queue, transpose transa, transpose transb, std::int64_t m, std::int64_t n, + std::int64_t k, float alpha, const float *a, std::int64_t lda, const float *b, std::int64_t ldb, + float beta, float *c, std::int64_t ldc, + const cl::sycl::vector_class &dependencies = {}); + +template +static inline cl::sycl::event gemm( + cl::sycl::queue &queue, transpose transa, transpose transb, std::int64_t m, std::int64_t n, + std::int64_t k, double alpha, const double *a, std::int64_t lda, const double *b, + std::int64_t ldb, double beta, double *c, std::int64_t ldc, + const cl::sycl::vector_class &dependencies = {}); + +template +static inline cl::sycl::event gemm( + cl::sycl::queue &queue, transpose transa, transpose transb, std::int64_t m, std::int64_t n, + std::int64_t k, std::complex alpha, const std::complex *a, std::int64_t lda, + const std::complex *b, std::int64_t ldb, std::complex beta, + std::complex *c, std::int64_t ldc, + const cl::sycl::vector_class &dependencies = {}); + +template +static inline cl::sycl::event gemm( + cl::sycl::queue &queue, transpose transa, transpose transb, std::int64_t m, std::int64_t n, + std::int64_t k, std::complex alpha, const std::complex *a, std::int64_t lda, + const std::complex *b, std::int64_t ldb, std::complex beta, + std::complex *c, std::int64_t ldc, + const cl::sycl::vector_class &dependencies = {}); + +template +static inline cl::sycl::event herk( + cl::sycl::queue &queue, uplo upper_lower, transpose trans, std::int64_t n, std::int64_t k, + 
float alpha, const std::complex *a, std::int64_t lda, float beta, std::complex *c, + std::int64_t ldc, const cl::sycl::vector_class &dependencies = {}); + +template +static inline cl::sycl::event herk( + cl::sycl::queue &queue, uplo upper_lower, transpose trans, std::int64_t n, std::int64_t k, + double alpha, const std::complex *a, std::int64_t lda, double beta, + std::complex *c, std::int64_t ldc, + const cl::sycl::vector_class &dependencies = {}); + +template +static inline cl::sycl::event ger(cl::sycl::queue &queue, std::int64_t m, std::int64_t n, + float alpha, const float *x, std::int64_t incx, const float *y, + std::int64_t incy, float *a, std::int64_t lda, + const cl::sycl::vector_class &dependencies = {}); + +template +static inline cl::sycl::event ger(cl::sycl::queue &queue, std::int64_t m, std::int64_t n, + double alpha, const double *x, std::int64_t incx, const double *y, + std::int64_t incy, double *a, std::int64_t lda, + const cl::sycl::vector_class &dependencies = {}); + +template +static inline cl::sycl::event trsm( + cl::sycl::queue &queue, side left_right, uplo upper_lower, transpose trans, diag unit_diag, + std::int64_t m, std::int64_t n, float alpha, const float *a, std::int64_t lda, float *b, + std::int64_t ldb, const cl::sycl::vector_class &dependencies = {}); + +template +static inline cl::sycl::event trsm( + cl::sycl::queue &queue, side left_right, uplo upper_lower, transpose trans, diag unit_diag, + std::int64_t m, std::int64_t n, double alpha, const double *a, std::int64_t lda, double *b, + std::int64_t ldb, const cl::sycl::vector_class &dependencies = {}); + +template +static inline cl::sycl::event trsm( + cl::sycl::queue &queue, side left_right, uplo upper_lower, transpose trans, diag unit_diag, + std::int64_t m, std::int64_t n, std::complex alpha, const std::complex *a, + std::int64_t lda, std::complex *b, std::int64_t ldb, + const cl::sycl::vector_class &dependencies = {}); + +template +static inline cl::sycl::event trsm( + 
cl::sycl::queue &queue, side left_right, uplo upper_lower, transpose trans, diag unit_diag, + std::int64_t m, std::int64_t n, std::complex alpha, const std::complex *a, + std::int64_t lda, std::complex *b, std::int64_t ldb, + const cl::sycl::vector_class &dependencies = {}); + +template +static inline cl::sycl::event dotu( + cl::sycl::queue &queue, std::int64_t n, const std::complex *x, std::int64_t incx, + const std::complex *y, std::int64_t incy, std::complex *result, + const cl::sycl::vector_class &dependencies = {}); + +template +static inline cl::sycl::event dotu( + cl::sycl::queue &queue, std::int64_t n, const std::complex *x, std::int64_t incx, + const std::complex *y, std::int64_t incy, std::complex *result, + const cl::sycl::vector_class &dependencies = {}); + +template +static inline cl::sycl::event hemm( + cl::sycl::queue &queue, side left_right, uplo upper_lower, std::int64_t m, std::int64_t n, + std::complex alpha, const std::complex *a, std::int64_t lda, + const std::complex *b, std::int64_t ldb, std::complex beta, + std::complex *c, std::int64_t ldc, + const cl::sycl::vector_class &dependencies = {}); + +template +static inline cl::sycl::event hemm( + cl::sycl::queue &queue, side left_right, uplo upper_lower, std::int64_t m, std::int64_t n, + std::complex alpha, const std::complex *a, std::int64_t lda, + const std::complex *b, std::int64_t ldb, std::complex beta, + std::complex *c, std::int64_t ldc, + const cl::sycl::vector_class &dependencies = {}); + +template +static inline cl::sycl::event hpr2( + cl::sycl::queue &queue, uplo upper_lower, std::int64_t n, std::complex alpha, + const std::complex *x, std::int64_t incx, const std::complex *y, + std::int64_t incy, std::complex *a, + const cl::sycl::vector_class &dependencies = {}); + +template +static inline cl::sycl::event hpr2( + cl::sycl::queue &queue, uplo upper_lower, std::int64_t n, std::complex alpha, + const std::complex *x, std::int64_t incx, const std::complex *y, + std::int64_t incy, 
std::complex *a, + const cl::sycl::vector_class &dependencies = {}); + +template +static inline cl::sycl::event gbmv( + cl::sycl::queue &queue, transpose trans, std::int64_t m, std::int64_t n, std::int64_t kl, + std::int64_t ku, float alpha, const float *a, std::int64_t lda, const float *x, + std::int64_t incx, float beta, float *y, std::int64_t incy, + const cl::sycl::vector_class &dependencies = {}); + +template +static inline cl::sycl::event gbmv( + cl::sycl::queue &queue, transpose trans, std::int64_t m, std::int64_t n, std::int64_t kl, + std::int64_t ku, double alpha, const double *a, std::int64_t lda, const double *x, + std::int64_t incx, double beta, double *y, std::int64_t incy, + const cl::sycl::vector_class &dependencies = {}); + +template +static inline cl::sycl::event gbmv( + cl::sycl::queue &queue, transpose trans, std::int64_t m, std::int64_t n, std::int64_t kl, + std::int64_t ku, std::complex alpha, const std::complex *a, std::int64_t lda, + const std::complex *x, std::int64_t incx, std::complex beta, + std::complex *y, std::int64_t incy, + const cl::sycl::vector_class &dependencies = {}); + +template +static inline cl::sycl::event gbmv( + cl::sycl::queue &queue, transpose trans, std::int64_t m, std::int64_t n, std::int64_t kl, + std::int64_t ku, std::complex alpha, const std::complex *a, std::int64_t lda, + const std::complex *x, std::int64_t incx, std::complex beta, + std::complex *y, std::int64_t incy, + const cl::sycl::vector_class &dependencies = {}); + +template +static inline cl::sycl::event tbmv( + cl::sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag, std::int64_t n, + std::int64_t k, const float *a, std::int64_t lda, float *x, std::int64_t incx, + const cl::sycl::vector_class &dependencies = {}); + +template +static inline cl::sycl::event tbmv( + cl::sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag, std::int64_t n, + std::int64_t k, const double *a, std::int64_t lda, double *x, std::int64_t incx, + 
const cl::sycl::vector_class &dependencies = {}); + +template +static inline cl::sycl::event tbmv( + cl::sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag, std::int64_t n, + std::int64_t k, const std::complex *a, std::int64_t lda, std::complex *x, + std::int64_t incx, const cl::sycl::vector_class &dependencies = {}); + +template +static inline cl::sycl::event tbmv( + cl::sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag, std::int64_t n, + std::int64_t k, const std::complex *a, std::int64_t lda, std::complex *x, + std::int64_t incx, const cl::sycl::vector_class &dependencies = {}); + +template +static inline cl::sycl::event symm( + cl::sycl::queue &queue, side left_right, uplo upper_lower, std::int64_t m, std::int64_t n, + float alpha, const float *a, std::int64_t lda, const float *b, std::int64_t ldb, float beta, + float *c, std::int64_t ldc, const cl::sycl::vector_class &dependencies = {}); + +template +static inline cl::sycl::event symm( + cl::sycl::queue &queue, side left_right, uplo upper_lower, std::int64_t m, std::int64_t n, + double alpha, const double *a, std::int64_t lda, const double *b, std::int64_t ldb, double beta, + double *c, std::int64_t ldc, const cl::sycl::vector_class &dependencies = {}); + +template +static inline cl::sycl::event symm( + cl::sycl::queue &queue, side left_right, uplo upper_lower, std::int64_t m, std::int64_t n, + std::complex alpha, const std::complex *a, std::int64_t lda, + const std::complex *b, std::int64_t ldb, std::complex beta, + std::complex *c, std::int64_t ldc, + const cl::sycl::vector_class &dependencies = {}); + +template +static inline cl::sycl::event symm( + cl::sycl::queue &queue, side left_right, uplo upper_lower, std::int64_t m, std::int64_t n, + std::complex alpha, const std::complex *a, std::int64_t lda, + const std::complex *b, std::int64_t ldb, std::complex beta, + std::complex *c, std::int64_t ldc, + const cl::sycl::vector_class &dependencies = {}); + +template +static 
inline cl::sycl::event dotc( + cl::sycl::queue &queue, std::int64_t n, const std::complex *x, std::int64_t incx, + const std::complex *y, std::int64_t incy, std::complex *result, + const cl::sycl::vector_class &dependencies = {}); + +template +static inline cl::sycl::event dotc( + cl::sycl::queue &queue, std::int64_t n, const std::complex *x, std::int64_t incx, + const std::complex *y, std::int64_t incy, std::complex *result, + const cl::sycl::vector_class &dependencies = {}); + +template +static inline cl::sycl::event syr(cl::sycl::queue &queue, uplo upper_lower, std::int64_t n, + float alpha, const float *x, std::int64_t incx, float *a, + std::int64_t lda, + const cl::sycl::vector_class &dependencies = {}); + +template +static inline cl::sycl::event syr(cl::sycl::queue &queue, uplo upper_lower, std::int64_t n, + double alpha, const double *x, std::int64_t incx, double *a, + std::int64_t lda, + const cl::sycl::vector_class &dependencies = {}); + +template +static inline cl::sycl::event trmm( + cl::sycl::queue &queue, side left_right, uplo upper_lower, transpose trans, diag unit_diag, + std::int64_t m, std::int64_t n, float alpha, const float *a, std::int64_t lda, float *b, + std::int64_t ldb, const cl::sycl::vector_class &dependencies = {}); + +template +static inline cl::sycl::event trmm( + cl::sycl::queue &queue, side left_right, uplo upper_lower, transpose trans, diag unit_diag, + std::int64_t m, std::int64_t n, double alpha, const double *a, std::int64_t lda, double *b, + std::int64_t ldb, const cl::sycl::vector_class &dependencies = {}); + +template +static inline cl::sycl::event trmm( + cl::sycl::queue &queue, side left_right, uplo upper_lower, transpose trans, diag unit_diag, + std::int64_t m, std::int64_t n, std::complex alpha, const std::complex *a, + std::int64_t lda, std::complex *b, std::int64_t ldb, + const cl::sycl::vector_class &dependencies = {}); + +template +static inline cl::sycl::event trmm( + cl::sycl::queue &queue, side left_right, uplo 
upper_lower, transpose trans, diag unit_diag, + std::int64_t m, std::int64_t n, std::complex alpha, const std::complex *a, + std::int64_t lda, std::complex *b, std::int64_t ldb, + const cl::sycl::vector_class &dependencies = {}); + +template +static inline cl::sycl::event rotmg( + cl::sycl::queue &queue, float *d1, float *d2, float *x1, float y1, float *param, + const cl::sycl::vector_class &dependencies = {}); + +template +static inline cl::sycl::event rotmg( + cl::sycl::queue &queue, double *d1, double *d2, double *x1, double y1, double *param, + const cl::sycl::vector_class &dependencies = {}); + +template +static inline cl::sycl::event tpsv( + cl::sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag, std::int64_t n, + const float *a, float *x, std::int64_t incx, + const cl::sycl::vector_class &dependencies = {}); + +template +static inline cl::sycl::event tpsv( + cl::sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag, std::int64_t n, + const double *a, double *x, std::int64_t incx, + const cl::sycl::vector_class &dependencies = {}); + +template +static inline cl::sycl::event tpsv( + cl::sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag, std::int64_t n, + const std::complex *a, std::complex *x, std::int64_t incx, + const cl::sycl::vector_class &dependencies = {}); + +template +static inline cl::sycl::event tpsv( + cl::sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag, std::int64_t n, + const std::complex *a, std::complex *x, std::int64_t incx, + const cl::sycl::vector_class &dependencies = {}); + +template +static inline cl::sycl::event trsv( + cl::sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag, std::int64_t n, + const float *a, std::int64_t lda, float *x, std::int64_t incx, + const cl::sycl::vector_class &dependencies = {}); + +template +static inline cl::sycl::event trsv( + cl::sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag, std::int64_t n, 
+ const double *a, std::int64_t lda, double *x, std::int64_t incx, + const cl::sycl::vector_class &dependencies = {}); + +template +static inline cl::sycl::event trsv( + cl::sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag, std::int64_t n, + const std::complex *a, std::int64_t lda, std::complex *x, std::int64_t incx, + const cl::sycl::vector_class &dependencies = {}); + +template +static inline cl::sycl::event trsv( + cl::sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag, std::int64_t n, + const std::complex *a, std::int64_t lda, std::complex *x, std::int64_t incx, + const cl::sycl::vector_class &dependencies = {}); + +template +static inline cl::sycl::event copy( + cl::sycl::queue &queue, std::int64_t n, const float *x, std::int64_t incx, float *y, + std::int64_t incy, const cl::sycl::vector_class &dependencies = {}); + +template +static inline cl::sycl::event copy( + cl::sycl::queue &queue, std::int64_t n, const double *x, std::int64_t incx, double *y, + std::int64_t incy, const cl::sycl::vector_class &dependencies = {}); + +template +static inline cl::sycl::event copy( + cl::sycl::queue &queue, std::int64_t n, const std::complex *x, std::int64_t incx, + std::complex *y, std::int64_t incy, + const cl::sycl::vector_class &dependencies = {}); + +template +static inline cl::sycl::event copy( + cl::sycl::queue &queue, std::int64_t n, const std::complex *x, std::int64_t incx, + std::complex *y, std::int64_t incy, + const cl::sycl::vector_class &dependencies = {}); + +template +static inline cl::sycl::event hemv( + cl::sycl::queue &queue, uplo upper_lower, std::int64_t n, std::complex alpha, + const std::complex *a, std::int64_t lda, const std::complex *x, std::int64_t incx, + std::complex beta, std::complex *y, std::int64_t incy, + const cl::sycl::vector_class &dependencies = {}); + +template +static inline cl::sycl::event hemv( + cl::sycl::queue &queue, uplo upper_lower, std::int64_t n, std::complex alpha, + const std::complex 
*a, std::int64_t lda, const std::complex *x, + std::int64_t incx, std::complex beta, std::complex *y, std::int64_t incy, + const cl::sycl::vector_class &dependencies = {}); + +template +static inline cl::sycl::event gemmt( + cl::sycl::queue &queue, uplo upper_lower, transpose transa, transpose transb, std::int64_t n, + std::int64_t k, float alpha, const float *a, std::int64_t lda, const float *b, std::int64_t ldb, + float beta, float *c, std::int64_t ldc, + const cl::sycl::vector_class &dependencies = {}); + +template +static inline cl::sycl::event gemmt( + cl::sycl::queue &queue, uplo upper_lower, transpose transa, transpose transb, std::int64_t n, + std::int64_t k, double alpha, const double *a, std::int64_t lda, const double *b, + std::int64_t ldb, double beta, double *c, std::int64_t ldc, + const cl::sycl::vector_class &dependencies = {}); + +template +static inline cl::sycl::event gemmt( + cl::sycl::queue &queue, uplo upper_lower, transpose transa, transpose transb, std::int64_t n, + std::int64_t k, std::complex alpha, const std::complex *a, std::int64_t lda, + const std::complex *b, std::int64_t ldb, std::complex beta, + std::complex *c, std::int64_t ldc, + const cl::sycl::vector_class &dependencies = {}); + +template +static inline cl::sycl::event gemmt( + cl::sycl::queue &queue, uplo upper_lower, transpose transa, transpose transb, std::int64_t n, + std::int64_t k, std::complex alpha, const std::complex *a, std::int64_t lda, + const std::complex *b, std::int64_t ldb, std::complex beta, + std::complex *c, std::int64_t ldc, + const cl::sycl::vector_class &dependencies = {}); + +template +static inline cl::sycl::event sbmv( + cl::sycl::queue &queue, uplo upper_lower, std::int64_t n, std::int64_t k, float alpha, + const float *a, std::int64_t lda, const float *x, std::int64_t incx, float beta, float *y, + std::int64_t incy, const cl::sycl::vector_class &dependencies = {}); + +template +static inline cl::sycl::event sbmv( + cl::sycl::queue &queue, uplo 
upper_lower, std::int64_t n, std::int64_t k, double alpha, + const double *a, std::int64_t lda, const double *x, std::int64_t incx, double beta, double *y, + std::int64_t incy, const cl::sycl::vector_class &dependencies = {}); + +template +static inline cl::sycl::event asum( + cl::sycl::queue &queue, std::int64_t n, const std::complex *x, std::int64_t incx, + float *result, const cl::sycl::vector_class &dependencies = {}); + +template +static inline cl::sycl::event asum( + cl::sycl::queue &queue, std::int64_t n, const std::complex *x, std::int64_t incx, + double *result, const cl::sycl::vector_class &dependencies = {}); + +template +static inline cl::sycl::event asum( + cl::sycl::queue &queue, std::int64_t n, const float *x, std::int64_t incx, float *result, + const cl::sycl::vector_class &dependencies = {}); + +template +static inline cl::sycl::event asum( + cl::sycl::queue &queue, std::int64_t n, const double *x, std::int64_t incx, double *result, + const cl::sycl::vector_class &dependencies = {}); + +template +static inline cl::sycl::event tbsv( + cl::sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag, std::int64_t n, + std::int64_t k, const float *a, std::int64_t lda, float *x, std::int64_t incx, + const cl::sycl::vector_class &dependencies = {}); + +template +static inline cl::sycl::event tbsv( + cl::sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag, std::int64_t n, + std::int64_t k, const double *a, std::int64_t lda, double *x, std::int64_t incx, + const cl::sycl::vector_class &dependencies = {}); + +template +static inline cl::sycl::event tbsv( + cl::sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag, std::int64_t n, + std::int64_t k, const std::complex *a, std::int64_t lda, std::complex *x, + std::int64_t incx, const cl::sycl::vector_class &dependencies = {}); + +template +static inline cl::sycl::event tbsv( + cl::sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag, std::int64_t n, 
+ std::int64_t k, const std::complex *a, std::int64_t lda, std::complex *x, + std::int64_t incx, const cl::sycl::vector_class &dependencies = {}); + +template +static inline cl::sycl::event spr2( + cl::sycl::queue &queue, uplo upper_lower, std::int64_t n, float alpha, const float *x, + std::int64_t incx, const float *y, std::int64_t incy, float *a, + const cl::sycl::vector_class &dependencies = {}); + +template +static inline cl::sycl::event spr2( + cl::sycl::queue &queue, uplo upper_lower, std::int64_t n, double alpha, const double *x, + std::int64_t incx, const double *y, std::int64_t incy, double *a, + const cl::sycl::vector_class &dependencies = {}); + +template +static inline cl::sycl::event iamax( + cl::sycl::queue &queue, std::int64_t n, const float *x, std::int64_t incx, std::int64_t *result, + const cl::sycl::vector_class &dependencies = {}); + +template +static inline cl::sycl::event iamax( + cl::sycl::queue &queue, std::int64_t n, const double *x, std::int64_t incx, + std::int64_t *result, const cl::sycl::vector_class &dependencies = {}); + +template +static inline cl::sycl::event iamax( + cl::sycl::queue &queue, std::int64_t n, const std::complex *x, std::int64_t incx, + std::int64_t *result, const cl::sycl::vector_class &dependencies = {}); + +template +static inline cl::sycl::event iamax( + cl::sycl::queue &queue, std::int64_t n, const std::complex *x, std::int64_t incx, + std::int64_t *result, const cl::sycl::vector_class &dependencies = {}); + +template +static inline cl::sycl::event rotm( + cl::sycl::queue &queue, std::int64_t n, float *x, std::int64_t incx, float *y, + std::int64_t incy, float *param, + const cl::sycl::vector_class &dependencies = {}); + +template +static inline cl::sycl::event rotm( + cl::sycl::queue &queue, std::int64_t n, double *x, std::int64_t incx, double *y, + std::int64_t incy, double *param, + const cl::sycl::vector_class &dependencies = {}); + +template +static inline cl::sycl::event rotg( + cl::sycl::queue &queue, float 
*a, float *b, float *c, float *s, + const cl::sycl::vector_class &dependencies = {}); + +template +static inline cl::sycl::event rotg( + cl::sycl::queue &queue, double *a, double *b, double *c, double *s, + const cl::sycl::vector_class &dependencies = {}); + +template +static inline cl::sycl::event rotg( + cl::sycl::queue &queue, std::complex *a, std::complex *b, float *c, + std::complex *s, const cl::sycl::vector_class &dependencies = {}); + +template +static inline cl::sycl::event rotg( + cl::sycl::queue &queue, std::complex *a, std::complex *b, double *c, + std::complex *s, const cl::sycl::vector_class &dependencies = {}); + +template +static inline cl::sycl::event sdsdot( + cl::sycl::queue &queue, std::int64_t n, float sb, const float *x, std::int64_t incx, + const float *y, std::int64_t incy, float *result, + const cl::sycl::vector_class &dependencies = {}); + +template +static inline cl::sycl::event her2k( + cl::sycl::queue &queue, uplo upper_lower, transpose trans, std::int64_t n, std::int64_t k, + std::complex alpha, const std::complex *a, std::int64_t lda, + const std::complex *b, std::int64_t ldb, float beta, std::complex *c, + std::int64_t ldc, const cl::sycl::vector_class &dependencies = {}); + +template +static inline cl::sycl::event her2k( + cl::sycl::queue &queue, uplo upper_lower, transpose trans, std::int64_t n, std::int64_t k, + std::complex alpha, const std::complex *a, std::int64_t lda, + const std::complex *b, std::int64_t ldb, double beta, std::complex *c, + std::int64_t ldc, const cl::sycl::vector_class &dependencies = {}); + +template +static inline cl::sycl::event dot(cl::sycl::queue &queue, std::int64_t n, const float *x, + std::int64_t incx, const float *y, std::int64_t incy, + float *result, + const cl::sycl::vector_class &dependencies = {}); + +template +static inline cl::sycl::event dot(cl::sycl::queue &queue, std::int64_t n, const double *x, + std::int64_t incx, const double *y, std::int64_t incy, + double *result, + const 
cl::sycl::vector_class &dependencies = {}); + +template +static inline cl::sycl::event dot(cl::sycl::queue &queue, std::int64_t n, const float *x, + std::int64_t incx, const float *y, std::int64_t incy, + double *result, + const cl::sycl::vector_class &dependencies = {}); + +template +static inline cl::sycl::event symv( + cl::sycl::queue &queue, uplo upper_lower, std::int64_t n, float alpha, const float *a, + std::int64_t lda, const float *x, std::int64_t incx, float beta, float *y, std::int64_t incy, + const cl::sycl::vector_class &dependencies = {}); + +template +static inline cl::sycl::event symv( + cl::sycl::queue &queue, uplo upper_lower, std::int64_t n, double alpha, const double *a, + std::int64_t lda, const double *x, std::int64_t incx, double beta, double *y, std::int64_t incy, + const cl::sycl::vector_class &dependencies = {}); + +} //namespace blas +} //namespace onemkl + +#endif //_DETAIL_COMMON_BLAS_HPP__ diff --git a/include/onemkl/blas/detail/blas_loader.hpp b/include/onemkl/blas/detail/blas_loader.hpp index e14766f81..bec2b9fb5 100644 --- a/include/onemkl/blas/detail/blas_loader.hpp +++ b/include/onemkl/blas/detail/blas_loader.hpp @@ -31,6 +31,8 @@ namespace onemkl { namespace blas { namespace detail { +// Buffer APIs + ONEMKL_EXPORT void herk(char *libname, cl::sycl::queue &queue, uplo upper_lower, transpose trans, std::int64_t n, std::int64_t k, float alpha, cl::sycl::buffer, 1> &a, std::int64_t lda, float beta, @@ -91,42 +93,6 @@ ONEMKL_EXPORT void spr(char *libname, cl::sycl::queue &queue, uplo upper_lower, double alpha, cl::sycl::buffer &x, std::int64_t incx, cl::sycl::buffer &a); -ONEMKL_EXPORT void gemm_batch( - char *libname, cl::sycl::queue &queue, cl::sycl::buffer &transa, - cl::sycl::buffer &transb, cl::sycl::buffer &m, - cl::sycl::buffer &n, cl::sycl::buffer &k, - cl::sycl::buffer &alpha, cl::sycl::buffer &a, - cl::sycl::buffer &lda, cl::sycl::buffer &b, - cl::sycl::buffer &ldb, cl::sycl::buffer &beta, - cl::sycl::buffer &c, 
cl::sycl::buffer &ldc, std::int64_t group_count, - cl::sycl::buffer &group_size); -ONEMKL_EXPORT void gemm_batch( - char *libname, cl::sycl::queue &queue, cl::sycl::buffer &transa, - cl::sycl::buffer &transb, cl::sycl::buffer &m, - cl::sycl::buffer &n, cl::sycl::buffer &k, - cl::sycl::buffer &alpha, cl::sycl::buffer &a, - cl::sycl::buffer &lda, cl::sycl::buffer &b, - cl::sycl::buffer &ldb, cl::sycl::buffer &beta, - cl::sycl::buffer &c, cl::sycl::buffer &ldc, - std::int64_t group_count, cl::sycl::buffer &group_size); -ONEMKL_EXPORT void gemm_batch( - char *libname, cl::sycl::queue &queue, cl::sycl::buffer &transa, - cl::sycl::buffer &transb, cl::sycl::buffer &m, - cl::sycl::buffer &n, cl::sycl::buffer &k, - cl::sycl::buffer, 1> &alpha, cl::sycl::buffer, 1> &a, - cl::sycl::buffer &lda, cl::sycl::buffer, 1> &b, - cl::sycl::buffer &ldb, cl::sycl::buffer, 1> &beta, - cl::sycl::buffer, 1> &c, cl::sycl::buffer &ldc, - std::int64_t group_count, cl::sycl::buffer &group_size); -ONEMKL_EXPORT void gemm_batch( - char *libname, cl::sycl::queue &queue, cl::sycl::buffer &transa, - cl::sycl::buffer &transb, cl::sycl::buffer &m, - cl::sycl::buffer &n, cl::sycl::buffer &k, - cl::sycl::buffer, 1> &alpha, cl::sycl::buffer, 1> &a, - cl::sycl::buffer &lda, cl::sycl::buffer, 1> &b, - cl::sycl::buffer &ldb, cl::sycl::buffer, 1> &beta, - cl::sycl::buffer, 1> &c, cl::sycl::buffer &ldc, - std::int64_t group_count, cl::sycl::buffer &group_size); ONEMKL_EXPORT void gemm_batch(char *libname, cl::sycl::queue &queue, transpose transa, transpose transb, std::int64_t m, std::int64_t n, std::int64_t k, float alpha, cl::sycl::buffer &a, std::int64_t lda, @@ -757,38 +723,6 @@ ONEMKL_EXPORT void spr2(char *libname, cl::sycl::queue &queue, uplo upper_lower, cl::sycl::buffer &y, std::int64_t incy, cl::sycl::buffer &a); -ONEMKL_EXPORT void trsm_batch( - char *libname, cl::sycl::queue &queue, cl::sycl::buffer &left_right, - cl::sycl::buffer &upper_lower, cl::sycl::buffer &trans, - cl::sycl::buffer 
&unit_diag, cl::sycl::buffer &m, - cl::sycl::buffer &n, cl::sycl::buffer &alpha, - cl::sycl::buffer &a, cl::sycl::buffer &lda, - cl::sycl::buffer &b, cl::sycl::buffer &ldb, std::int64_t group_count, - cl::sycl::buffer &group_size); -ONEMKL_EXPORT void trsm_batch( - char *libname, cl::sycl::queue &queue, cl::sycl::buffer &left_right, - cl::sycl::buffer &upper_lower, cl::sycl::buffer &trans, - cl::sycl::buffer &unit_diag, cl::sycl::buffer &m, - cl::sycl::buffer &n, cl::sycl::buffer &alpha, - cl::sycl::buffer &a, cl::sycl::buffer &lda, - cl::sycl::buffer &b, cl::sycl::buffer &ldb, - std::int64_t group_count, cl::sycl::buffer &group_size); -ONEMKL_EXPORT void trsm_batch( - char *libname, cl::sycl::queue &queue, cl::sycl::buffer &left_right, - cl::sycl::buffer &upper_lower, cl::sycl::buffer &trans, - cl::sycl::buffer &unit_diag, cl::sycl::buffer &m, - cl::sycl::buffer &n, cl::sycl::buffer, 1> &alpha, - cl::sycl::buffer, 1> &a, cl::sycl::buffer &lda, - cl::sycl::buffer, 1> &b, cl::sycl::buffer &ldb, - std::int64_t group_count, cl::sycl::buffer &group_size); -ONEMKL_EXPORT void trsm_batch( - char *libname, cl::sycl::queue &queue, cl::sycl::buffer &left_right, - cl::sycl::buffer &upper_lower, cl::sycl::buffer &trans, - cl::sycl::buffer &unit_diag, cl::sycl::buffer &m, - cl::sycl::buffer &n, cl::sycl::buffer, 1> &alpha, - cl::sycl::buffer, 1> &a, cl::sycl::buffer &lda, - cl::sycl::buffer, 1> &b, cl::sycl::buffer &ldb, - std::int64_t group_count, cl::sycl::buffer &group_size); ONEMKL_EXPORT void trsm_batch(char *libname, cl::sycl::queue &queue, side left_right, uplo upper_lower, transpose trans, diag unit_diag, std::int64_t m, std::int64_t n, float alpha, cl::sycl::buffer &a, @@ -868,6 +802,797 @@ ONEMKL_EXPORT void rotg(char *libname, cl::sycl::queue &queue, cl::sycl::buffer, 1> &b, cl::sycl::buffer &c, cl::sycl::buffer, 1> &s); + +// USM APIs + +ONEMKL_EXPORT cl::sycl::event herk( + char *libname, cl::sycl::queue &queue, uplo upper_lower, transpose trans, std::int64_t n, + 
std::int64_t k, float alpha, const std::complex *a, std::int64_t lda, float beta, + std::complex *c, std::int64_t ldc, + const cl::sycl::vector_class &dependencies = {}); +ONEMKL_EXPORT cl::sycl::event herk( + char *libname, cl::sycl::queue &queue, uplo upper_lower, transpose trans, std::int64_t n, + std::int64_t k, double alpha, const std::complex *a, std::int64_t lda, double beta, + std::complex *c, std::int64_t ldc, + const cl::sycl::vector_class &dependencies = {}); + +ONEMKL_EXPORT cl::sycl::event scal( + char *libname, cl::sycl::queue &queue, std::int64_t n, float alpha, float *x, std::int64_t incx, + const cl::sycl::vector_class &dependencies = {}); +ONEMKL_EXPORT cl::sycl::event scal( + char *libname, cl::sycl::queue &queue, std::int64_t n, double alpha, double *x, + std::int64_t incx, const cl::sycl::vector_class &dependencies = {}); +ONEMKL_EXPORT cl::sycl::event scal( + char *libname, cl::sycl::queue &queue, std::int64_t n, std::complex alpha, + std::complex *x, std::int64_t incx, + const cl::sycl::vector_class &dependencies = {}); +ONEMKL_EXPORT cl::sycl::event scal( + char *libname, cl::sycl::queue &queue, std::int64_t n, std::complex alpha, + std::complex *x, std::int64_t incx, + const cl::sycl::vector_class &dependencies = {}); +ONEMKL_EXPORT cl::sycl::event scal( + char *libname, cl::sycl::queue &queue, std::int64_t n, float alpha, std::complex *x, + std::int64_t incx, const cl::sycl::vector_class &dependencies = {}); +ONEMKL_EXPORT cl::sycl::event scal( + char *libname, cl::sycl::queue &queue, std::int64_t n, double alpha, std::complex *x, + std::int64_t incx, const cl::sycl::vector_class &dependencies = {}); + +ONEMKL_EXPORT cl::sycl::event trmv( + char *libname, cl::sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag, + std::int64_t n, const float *a, std::int64_t lda, float *x, std::int64_t incx, + const cl::sycl::vector_class &dependencies = {}); +ONEMKL_EXPORT cl::sycl::event trmv( + char *libname, cl::sycl::queue &queue, 
uplo upper_lower, transpose trans, diag unit_diag, + std::int64_t n, const double *a, std::int64_t lda, double *x, std::int64_t incx, + const cl::sycl::vector_class &dependencies = {}); +ONEMKL_EXPORT cl::sycl::event trmv( + char *libname, cl::sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag, + std::int64_t n, const std::complex *a, std::int64_t lda, std::complex *x, + std::int64_t incx, const cl::sycl::vector_class &dependencies = {}); +ONEMKL_EXPORT cl::sycl::event trmv( + char *libname, cl::sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag, + std::int64_t n, const std::complex *a, std::int64_t lda, std::complex *x, + std::int64_t incx, const cl::sycl::vector_class &dependencies = {}); + +ONEMKL_EXPORT cl::sycl::event tpmv( + char *libname, cl::sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag, + std::int64_t n, const float *a, float *x, std::int64_t incx, + const cl::sycl::vector_class &dependencies = {}); +ONEMKL_EXPORT cl::sycl::event tpmv( + char *libname, cl::sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag, + std::int64_t n, const double *a, double *x, std::int64_t incx, + const cl::sycl::vector_class &dependencies = {}); +ONEMKL_EXPORT cl::sycl::event tpmv( + char *libname, cl::sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag, + std::int64_t n, const std::complex *a, std::complex *x, std::int64_t incx, + const cl::sycl::vector_class &dependencies = {}); +ONEMKL_EXPORT cl::sycl::event tpmv( + char *libname, cl::sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag, + std::int64_t n, const std::complex *a, std::complex *x, std::int64_t incx, + const cl::sycl::vector_class &dependencies = {}); + +ONEMKL_EXPORT cl::sycl::event spr(char *libname, cl::sycl::queue &queue, uplo upper_lower, + std::int64_t n, float alpha, const float *x, std::int64_t incx, + float *a, + const cl::sycl::vector_class &dependencies = {}); +ONEMKL_EXPORT 
cl::sycl::event spr(char *libname, cl::sycl::queue &queue, uplo upper_lower, + std::int64_t n, double alpha, const double *x, std::int64_t incx, + double *a, + const cl::sycl::vector_class &dependencies = {}); + +ONEMKL_EXPORT cl::sycl::event gemm_batch( + char *libname, cl::sycl::queue &queue, transpose *transa, transpose *transb, std::int64_t *m, + std::int64_t *n, std::int64_t *k, float *alpha, const float **a, std::int64_t *lda, + const float **b, std::int64_t *ldb, float *beta, float **c, std::int64_t *ldc, + std::int64_t group_count, std::int64_t *group_size, + const cl::sycl::vector_class &dependencies = {}); +ONEMKL_EXPORT cl::sycl::event gemm_batch( + char *libname, cl::sycl::queue &queue, transpose *transa, transpose *transb, std::int64_t *m, + std::int64_t *n, std::int64_t *k, double *alpha, const double **a, std::int64_t *lda, + const double **b, std::int64_t *ldb, double *beta, double **c, std::int64_t *ldc, + std::int64_t group_count, std::int64_t *group_size, + const cl::sycl::vector_class &dependencies = {}); +ONEMKL_EXPORT cl::sycl::event gemm_batch( + char *libname, cl::sycl::queue &queue, transpose *transa, transpose *transb, std::int64_t *m, + std::int64_t *n, std::int64_t *k, std::complex *alpha, const std::complex **a, + std::int64_t *lda, const std::complex **b, std::int64_t *ldb, std::complex *beta, + std::complex **c, std::int64_t *ldc, std::int64_t group_count, std::int64_t *group_size, + const cl::sycl::vector_class &dependencies = {}); +ONEMKL_EXPORT cl::sycl::event gemm_batch( + char *libname, cl::sycl::queue &queue, transpose *transa, transpose *transb, std::int64_t *m, + std::int64_t *n, std::int64_t *k, std::complex *alpha, const std::complex **a, + std::int64_t *lda, const std::complex **b, std::int64_t *ldb, + std::complex *beta, std::complex **c, std::int64_t *ldc, + std::int64_t group_count, std::int64_t *group_size, + const cl::sycl::vector_class &dependencies = {}); +ONEMKL_EXPORT cl::sycl::event gemm_batch( + char *libname, 
cl::sycl::queue &queue, transpose transa, transpose transb, std::int64_t m, + std::int64_t n, std::int64_t k, float alpha, const float *a, std::int64_t lda, + std::int64_t stride_a, const float *b, std::int64_t ldb, std::int64_t stride_b, float beta, + float *c, std::int64_t ldc, std::int64_t stride_c, std::int64_t batch_size, + const cl::sycl::vector_class &dependencies = {}); +ONEMKL_EXPORT cl::sycl::event gemm_batch( + char *libname, cl::sycl::queue &queue, transpose transa, transpose transb, std::int64_t m, + std::int64_t n, std::int64_t k, double alpha, const double *a, std::int64_t lda, + std::int64_t stride_a, const double *b, std::int64_t ldb, std::int64_t stride_b, double beta, + double *c, std::int64_t ldc, std::int64_t stride_c, std::int64_t batch_size, + const cl::sycl::vector_class &dependencies = {}); +ONEMKL_EXPORT cl::sycl::event gemm_batch( + char *libname, cl::sycl::queue &queue, transpose transa, transpose transb, std::int64_t m, + std::int64_t n, std::int64_t k, std::complex alpha, const std::complex *a, + std::int64_t lda, std::int64_t stride_a, const std::complex *b, std::int64_t ldb, + std::int64_t stride_b, std::complex beta, std::complex *c, std::int64_t ldc, + std::int64_t stride_c, std::int64_t batch_size, + const cl::sycl::vector_class &dependencies = {}); +ONEMKL_EXPORT cl::sycl::event gemm_batch( + char *libname, cl::sycl::queue &queue, transpose transa, transpose transb, std::int64_t m, + std::int64_t n, std::int64_t k, std::complex alpha, const std::complex *a, + std::int64_t lda, std::int64_t stride_a, const std::complex *b, std::int64_t ldb, + std::int64_t stride_b, std::complex beta, std::complex *c, std::int64_t ldc, + std::int64_t stride_c, std::int64_t batch_size, + const cl::sycl::vector_class &dependencies = {}); + +ONEMKL_EXPORT cl::sycl::event syrk( + char *libname, cl::sycl::queue &queue, uplo upper_lower, transpose trans, std::int64_t n, + std::int64_t k, float alpha, const float *a, std::int64_t lda, float beta, float 
*c, + std::int64_t ldc, const cl::sycl::vector_class &dependencies = {}); +ONEMKL_EXPORT cl::sycl::event syrk( + char *libname, cl::sycl::queue &queue, uplo upper_lower, transpose trans, std::int64_t n, + std::int64_t k, double alpha, const double *a, std::int64_t lda, double beta, double *c, + std::int64_t ldc, const cl::sycl::vector_class &dependencies = {}); +ONEMKL_EXPORT cl::sycl::event syrk( + char *libname, cl::sycl::queue &queue, uplo upper_lower, transpose trans, std::int64_t n, + std::int64_t k, std::complex alpha, const std::complex *a, std::int64_t lda, + std::complex beta, std::complex *c, std::int64_t ldc, + const cl::sycl::vector_class &dependencies = {}); +ONEMKL_EXPORT cl::sycl::event syrk( + char *libname, cl::sycl::queue &queue, uplo upper_lower, transpose trans, std::int64_t n, + std::int64_t k, std::complex alpha, const std::complex *a, std::int64_t lda, + std::complex beta, std::complex *c, std::int64_t ldc, + const cl::sycl::vector_class &dependencies = {}); + +ONEMKL_EXPORT cl::sycl::event her2( + char *libname, cl::sycl::queue &queue, uplo upper_lower, std::int64_t n, + std::complex alpha, const std::complex *x, std::int64_t incx, + const std::complex *y, std::int64_t incy, std::complex *a, std::int64_t lda, + const cl::sycl::vector_class &dependencies = {}); +ONEMKL_EXPORT cl::sycl::event her2( + char *libname, cl::sycl::queue &queue, uplo upper_lower, std::int64_t n, + std::complex alpha, const std::complex *x, std::int64_t incx, + const std::complex *y, std::int64_t incy, std::complex *a, std::int64_t lda, + const cl::sycl::vector_class &dependencies = {}); + +ONEMKL_EXPORT cl::sycl::event hbmv( + char *libname, cl::sycl::queue &queue, uplo upper_lower, std::int64_t n, std::int64_t k, + std::complex alpha, const std::complex *a, std::int64_t lda, + const std::complex *x, std::int64_t incx, std::complex beta, + std::complex *y, std::int64_t incy, + const cl::sycl::vector_class &dependencies = {}); +ONEMKL_EXPORT cl::sycl::event hbmv( + 
char *libname, cl::sycl::queue &queue, uplo upper_lower, std::int64_t n, std::int64_t k, + std::complex alpha, const std::complex *a, std::int64_t lda, + const std::complex *x, std::int64_t incx, std::complex beta, + std::complex *y, std::int64_t incy, + const cl::sycl::vector_class &dependencies = {}); + +ONEMKL_EXPORT cl::sycl::event rot(char *libname, cl::sycl::queue &queue, std::int64_t n, + std::complex *x, std::int64_t incx, std::complex *y, + std::int64_t incy, float c, float s, + const cl::sycl::vector_class &dependencies = {}); +ONEMKL_EXPORT cl::sycl::event rot(char *libname, cl::sycl::queue &queue, std::int64_t n, + std::complex *x, std::int64_t incx, + std::complex *y, std::int64_t incy, double c, double s, + const cl::sycl::vector_class &dependencies = {}); +ONEMKL_EXPORT cl::sycl::event rot(char *libname, cl::sycl::queue &queue, std::int64_t n, float *x, + std::int64_t incx, float *y, std::int64_t incy, float c, float s, + const cl::sycl::vector_class &dependencies = {}); +ONEMKL_EXPORT cl::sycl::event rot(char *libname, cl::sycl::queue &queue, std::int64_t n, double *x, + std::int64_t incx, double *y, std::int64_t incy, double c, + double s, + const cl::sycl::vector_class &dependencies = {}); + +ONEMKL_EXPORT cl::sycl::event axpy( + char *libname, cl::sycl::queue &queue, std::int64_t n, float alpha, const float *x, + std::int64_t incx, float *y, std::int64_t incy, + const cl::sycl::vector_class &dependencies = {}); +ONEMKL_EXPORT cl::sycl::event axpy( + char *libname, cl::sycl::queue &queue, std::int64_t n, double alpha, const double *x, + std::int64_t incx, double *y, std::int64_t incy, + const cl::sycl::vector_class &dependencies = {}); +ONEMKL_EXPORT cl::sycl::event axpy( + char *libname, cl::sycl::queue &queue, std::int64_t n, std::complex alpha, + const std::complex *x, std::int64_t incx, std::complex *y, std::int64_t incy, + const cl::sycl::vector_class &dependencies = {}); +ONEMKL_EXPORT cl::sycl::event axpy( + char *libname, cl::sycl::queue 
&queue, std::int64_t n, std::complex alpha, + const std::complex *x, std::int64_t incx, std::complex *y, std::int64_t incy, + const cl::sycl::vector_class &dependencies = {}); + +ONEMKL_EXPORT cl::sycl::event axpy_batch( + char *libname, cl::sycl::queue &queue, std::int64_t *n, float *alpha, const float **x, + std::int64_t *incx, float **y, std::int64_t *incy, std::int64_t group_count, + std::int64_t *group_size, const cl::sycl::vector_class &dependencies = {}); +ONEMKL_EXPORT cl::sycl::event axpy_batch( + char *libname, cl::sycl::queue &queue, std::int64_t *n, double *alpha, const double **x, + std::int64_t *incx, double **y, std::int64_t *incy, std::int64_t group_count, + std::int64_t *group_size, const cl::sycl::vector_class &dependencies = {}); +ONEMKL_EXPORT cl::sycl::event axpy_batch( + char *libname, cl::sycl::queue &queue, std::int64_t *n, std::complex *alpha, + const std::complex **x, std::int64_t *incx, std::complex **y, std::int64_t *incy, + std::int64_t group_count, std::int64_t *group_size, + const cl::sycl::vector_class &dependencies = {}); +ONEMKL_EXPORT cl::sycl::event axpy_batch( + char *libname, cl::sycl::queue &queue, std::int64_t *n, std::complex *alpha, + const std::complex **x, std::int64_t *incx, std::complex **y, + std::int64_t *incy, std::int64_t group_count, std::int64_t *group_size, + const cl::sycl::vector_class &dependencies = {}); + +ONEMKL_EXPORT cl::sycl::event gerc( + char *libname, cl::sycl::queue &queue, std::int64_t m, std::int64_t n, + std::complex alpha, const std::complex *x, std::int64_t incx, + const std::complex *y, std::int64_t incy, std::complex *a, std::int64_t lda, + const cl::sycl::vector_class &dependencies = {}); +ONEMKL_EXPORT cl::sycl::event gerc( + char *libname, cl::sycl::queue &queue, std::int64_t m, std::int64_t n, + std::complex alpha, const std::complex *x, std::int64_t incx, + const std::complex *y, std::int64_t incy, std::complex *a, std::int64_t lda, + const cl::sycl::vector_class &dependencies = {}); + 
+ONEMKL_EXPORT cl::sycl::event syr2k( + char *libname, cl::sycl::queue &queue, uplo upper_lower, transpose trans, std::int64_t n, + std::int64_t k, float alpha, const float *a, std::int64_t lda, const float *b, std::int64_t ldb, + float beta, float *c, std::int64_t ldc, + const cl::sycl::vector_class &dependencies = {}); +ONEMKL_EXPORT cl::sycl::event syr2k( + char *libname, cl::sycl::queue &queue, uplo upper_lower, transpose trans, std::int64_t n, + std::int64_t k, double alpha, const double *a, std::int64_t lda, const double *b, + std::int64_t ldb, double beta, double *c, std::int64_t ldc, + const cl::sycl::vector_class &dependencies = {}); +ONEMKL_EXPORT cl::sycl::event syr2k( + char *libname, cl::sycl::queue &queue, uplo upper_lower, transpose trans, std::int64_t n, + std::int64_t k, std::complex alpha, const std::complex *a, std::int64_t lda, + const std::complex *b, std::int64_t ldb, std::complex beta, + std::complex *c, std::int64_t ldc, + const cl::sycl::vector_class &dependencies = {}); +ONEMKL_EXPORT cl::sycl::event syr2k( + char *libname, cl::sycl::queue &queue, uplo upper_lower, transpose trans, std::int64_t n, + std::int64_t k, std::complex alpha, const std::complex *a, std::int64_t lda, + const std::complex *b, std::int64_t ldb, std::complex beta, + std::complex *c, std::int64_t ldc, + const cl::sycl::vector_class &dependencies = {}); + +ONEMKL_EXPORT cl::sycl::event gemv( + char *libname, cl::sycl::queue &queue, transpose trans, std::int64_t m, std::int64_t n, + float alpha, const float *a, std::int64_t lda, const float *x, std::int64_t incx, float beta, + float *y, std::int64_t incy, const cl::sycl::vector_class &dependencies = {}); +ONEMKL_EXPORT cl::sycl::event gemv( + char *libname, cl::sycl::queue &queue, transpose trans, std::int64_t m, std::int64_t n, + double alpha, const double *a, std::int64_t lda, const double *x, std::int64_t incx, + double beta, double *y, std::int64_t incy, + const cl::sycl::vector_class &dependencies = {}); 
+ONEMKL_EXPORT cl::sycl::event gemv( + char *libname, cl::sycl::queue &queue, transpose trans, std::int64_t m, std::int64_t n, + std::complex alpha, const std::complex *a, std::int64_t lda, + const std::complex *x, std::int64_t incx, std::complex beta, + std::complex *y, std::int64_t incy, + const cl::sycl::vector_class &dependencies = {}); +ONEMKL_EXPORT cl::sycl::event gemv( + char *libname, cl::sycl::queue &queue, transpose trans, std::int64_t m, std::int64_t n, + std::complex alpha, const std::complex *a, std::int64_t lda, + const std::complex *x, std::int64_t incx, std::complex beta, + std::complex *y, std::int64_t incy, + const cl::sycl::vector_class &dependencies = {}); + +ONEMKL_EXPORT cl::sycl::event her(char *libname, cl::sycl::queue &queue, uplo upper_lower, + std::int64_t n, float alpha, const std::complex *x, + std::int64_t incx, std::complex *a, std::int64_t lda, + const cl::sycl::vector_class &dependencies = {}); +ONEMKL_EXPORT cl::sycl::event her(char *libname, cl::sycl::queue &queue, uplo upper_lower, + std::int64_t n, double alpha, const std::complex *x, + std::int64_t incx, std::complex *a, std::int64_t lda, + const cl::sycl::vector_class &dependencies = {}); + +ONEMKL_EXPORT cl::sycl::event hpr(char *libname, cl::sycl::queue &queue, uplo upper_lower, + std::int64_t n, float alpha, const std::complex *x, + std::int64_t incx, std::complex *a, + const cl::sycl::vector_class &dependencies = {}); +ONEMKL_EXPORT cl::sycl::event hpr(char *libname, cl::sycl::queue &queue, uplo upper_lower, + std::int64_t n, double alpha, const std::complex *x, + std::int64_t incx, std::complex *a, + const cl::sycl::vector_class &dependencies = {}); + +ONEMKL_EXPORT cl::sycl::event iamin( + char *libname, cl::sycl::queue &queue, std::int64_t n, const float *x, std::int64_t incx, + std::int64_t *result, const cl::sycl::vector_class &dependencies = {}); +ONEMKL_EXPORT cl::sycl::event iamin( + char *libname, cl::sycl::queue &queue, std::int64_t n, const double *x, 
std::int64_t incx, + std::int64_t *result, const cl::sycl::vector_class &dependencies = {}); +ONEMKL_EXPORT cl::sycl::event iamin( + char *libname, cl::sycl::queue &queue, std::int64_t n, const std::complex *x, + std::int64_t incx, std::int64_t *result, + const cl::sycl::vector_class &dependencies = {}); +ONEMKL_EXPORT cl::sycl::event iamin( + char *libname, cl::sycl::queue &queue, std::int64_t n, const std::complex *x, + std::int64_t incx, std::int64_t *result, + const cl::sycl::vector_class &dependencies = {}); + +ONEMKL_EXPORT cl::sycl::event hpmv( + char *libname, cl::sycl::queue &queue, uplo upper_lower, std::int64_t n, + std::complex alpha, const std::complex *a, const std::complex *x, + std::int64_t incx, std::complex beta, std::complex *y, std::int64_t incy, + const cl::sycl::vector_class &dependencies = {}); +ONEMKL_EXPORT cl::sycl::event hpmv( + char *libname, cl::sycl::queue &queue, uplo upper_lower, std::int64_t n, + std::complex alpha, const std::complex *a, const std::complex *x, + std::int64_t incx, std::complex beta, std::complex *y, std::int64_t incy, + const cl::sycl::vector_class &dependencies = {}); + +ONEMKL_EXPORT cl::sycl::event spmv( + char *libname, cl::sycl::queue &queue, uplo upper_lower, std::int64_t n, float alpha, + const float *a, const float *x, std::int64_t incx, float beta, float *y, std::int64_t incy, + const cl::sycl::vector_class &dependencies = {}); +ONEMKL_EXPORT cl::sycl::event spmv( + char *libname, cl::sycl::queue &queue, uplo upper_lower, std::int64_t n, double alpha, + const double *a, const double *x, std::int64_t incx, double beta, double *y, std::int64_t incy, + const cl::sycl::vector_class &dependencies = {}); + +ONEMKL_EXPORT cl::sycl::event rotmg( + char *libname, cl::sycl::queue &queue, float *d1, float *d2, float *x1, float y1, float *param, + const cl::sycl::vector_class &dependencies = {}); +ONEMKL_EXPORT cl::sycl::event rotmg( + char *libname, cl::sycl::queue &queue, double *d1, double *d2, double *x1, double 
y1, + double *param, const cl::sycl::vector_class &dependencies = {}); + +ONEMKL_EXPORT cl::sycl::event swap( + char *libname, cl::sycl::queue &queue, std::int64_t n, float *x, std::int64_t incx, float *y, + std::int64_t incy, const cl::sycl::vector_class &dependencies = {}); +ONEMKL_EXPORT cl::sycl::event swap( + char *libname, cl::sycl::queue &queue, std::int64_t n, double *x, std::int64_t incx, double *y, + std::int64_t incy, const cl::sycl::vector_class &dependencies = {}); +ONEMKL_EXPORT cl::sycl::event swap( + char *libname, cl::sycl::queue &queue, std::int64_t n, std::complex *x, + std::int64_t incx, std::complex *y, std::int64_t incy, + const cl::sycl::vector_class &dependencies = {}); +ONEMKL_EXPORT cl::sycl::event swap( + char *libname, cl::sycl::queue &queue, std::int64_t n, std::complex *x, + std::int64_t incx, std::complex *y, std::int64_t incy, + const cl::sycl::vector_class &dependencies = {}); + +ONEMKL_EXPORT cl::sycl::event geru( + char *libname, cl::sycl::queue &queue, std::int64_t m, std::int64_t n, + std::complex alpha, const std::complex *x, std::int64_t incx, + const std::complex *y, std::int64_t incy, std::complex *a, std::int64_t lda, + const cl::sycl::vector_class &dependencies = {}); +ONEMKL_EXPORT cl::sycl::event geru( + char *libname, cl::sycl::queue &queue, std::int64_t m, std::int64_t n, + std::complex alpha, const std::complex *x, std::int64_t incx, + const std::complex *y, std::int64_t incy, std::complex *a, std::int64_t lda, + const cl::sycl::vector_class &dependencies = {}); + +ONEMKL_EXPORT cl::sycl::event nrm2( + char *libname, cl::sycl::queue &queue, std::int64_t n, const std::complex *x, + std::int64_t incx, float *result, + const cl::sycl::vector_class &dependencies = {}); +ONEMKL_EXPORT cl::sycl::event nrm2( + char *libname, cl::sycl::queue &queue, std::int64_t n, const std::complex *x, + std::int64_t incx, double *result, + const cl::sycl::vector_class &dependencies = {}); +ONEMKL_EXPORT cl::sycl::event nrm2( + char 
*libname, cl::sycl::queue &queue, std::int64_t n, const float *x, std::int64_t incx, + float *result, const cl::sycl::vector_class &dependencies = {}); +ONEMKL_EXPORT cl::sycl::event nrm2( + char *libname, cl::sycl::queue &queue, std::int64_t n, const double *x, std::int64_t incx, + double *result, const cl::sycl::vector_class &dependencies = {}); + +ONEMKL_EXPORT cl::sycl::event gemmt( + char *libname, cl::sycl::queue &queue, uplo upper_lower, transpose transa, transpose transb, + std::int64_t n, std::int64_t k, float alpha, const float *a, std::int64_t lda, const float *b, + std::int64_t ldb, float beta, float *c, std::int64_t ldc, + const cl::sycl::vector_class &dependencies = {}); +ONEMKL_EXPORT cl::sycl::event gemmt( + char *libname, cl::sycl::queue &queue, uplo upper_lower, transpose transa, transpose transb, + std::int64_t n, std::int64_t k, double alpha, const double *a, std::int64_t lda, + const double *b, std::int64_t ldb, double beta, double *c, std::int64_t ldc, + const cl::sycl::vector_class &dependencies = {}); +ONEMKL_EXPORT cl::sycl::event gemmt( + char *libname, cl::sycl::queue &queue, uplo upper_lower, transpose transa, transpose transb, + std::int64_t n, std::int64_t k, std::complex alpha, const std::complex *a, + std::int64_t lda, const std::complex *b, std::int64_t ldb, std::complex beta, + std::complex *c, std::int64_t ldc, + const cl::sycl::vector_class &dependencies = {}); +ONEMKL_EXPORT cl::sycl::event gemmt( + char *libname, cl::sycl::queue &queue, uplo upper_lower, transpose transa, transpose transb, + std::int64_t n, std::int64_t k, std::complex alpha, const std::complex *a, + std::int64_t lda, const std::complex *b, std::int64_t ldb, std::complex beta, + std::complex *c, std::int64_t ldc, + const cl::sycl::vector_class &dependencies = {}); + +ONEMKL_EXPORT cl::sycl::event gemm( + char *libname, cl::sycl::queue &queue, transpose transa, transpose transb, std::int64_t m, + std::int64_t n, std::int64_t k, float alpha, const float *a, 
std::int64_t lda, const float *b, + std::int64_t ldb, float beta, float *c, std::int64_t ldc, + const cl::sycl::vector_class &dependencies = {}); +ONEMKL_EXPORT cl::sycl::event gemm( + char *libname, cl::sycl::queue &queue, transpose transa, transpose transb, std::int64_t m, + std::int64_t n, std::int64_t k, double alpha, const double *a, std::int64_t lda, + const double *b, std::int64_t ldb, double beta, double *c, std::int64_t ldc, + const cl::sycl::vector_class &dependencies = {}); +ONEMKL_EXPORT cl::sycl::event gemm( + char *libname, cl::sycl::queue &queue, transpose transa, transpose transb, std::int64_t m, + std::int64_t n, std::int64_t k, std::complex alpha, const std::complex *a, + std::int64_t lda, const std::complex *b, std::int64_t ldb, std::complex beta, + std::complex *c, std::int64_t ldc, + const cl::sycl::vector_class &dependencies = {}); +ONEMKL_EXPORT cl::sycl::event gemm( + char *libname, cl::sycl::queue &queue, transpose transa, transpose transb, std::int64_t m, + std::int64_t n, std::int64_t k, std::complex alpha, const std::complex *a, + std::int64_t lda, const std::complex *b, std::int64_t ldb, std::complex beta, + std::complex *c, std::int64_t ldc, + const cl::sycl::vector_class &dependencies = {}); + +ONEMKL_EXPORT cl::sycl::event syr2( + char *libname, cl::sycl::queue &queue, uplo upper_lower, std::int64_t n, float alpha, + const float *x, std::int64_t incx, const float *y, std::int64_t incy, float *a, + std::int64_t lda, const cl::sycl::vector_class &dependencies = {}); +ONEMKL_EXPORT cl::sycl::event syr2( + char *libname, cl::sycl::queue &queue, uplo upper_lower, std::int64_t n, double alpha, + const double *x, std::int64_t incx, const double *y, std::int64_t incy, double *a, + std::int64_t lda, const cl::sycl::vector_class &dependencies = {}); + +ONEMKL_EXPORT cl::sycl::event ger(char *libname, cl::sycl::queue &queue, std::int64_t m, + std::int64_t n, float alpha, const float *x, std::int64_t incx, + const float *y, std::int64_t incy, 
float *a, std::int64_t lda, + const cl::sycl::vector_class &dependencies = {}); +ONEMKL_EXPORT cl::sycl::event ger(char *libname, cl::sycl::queue &queue, std::int64_t m, + std::int64_t n, double alpha, const double *x, std::int64_t incx, + const double *y, std::int64_t incy, double *a, std::int64_t lda, + const cl::sycl::vector_class &dependencies = {}); + +ONEMKL_EXPORT cl::sycl::event trsm( + char *libname, cl::sycl::queue &queue, side left_right, uplo upper_lower, transpose trans, + diag unit_diag, std::int64_t m, std::int64_t n, float alpha, const float *a, std::int64_t lda, + float *b, std::int64_t ldb, const cl::sycl::vector_class &dependencies = {}); +ONEMKL_EXPORT cl::sycl::event trsm( + char *libname, cl::sycl::queue &queue, side left_right, uplo upper_lower, transpose trans, + diag unit_diag, std::int64_t m, std::int64_t n, double alpha, const double *a, std::int64_t lda, + double *b, std::int64_t ldb, const cl::sycl::vector_class &dependencies = {}); +ONEMKL_EXPORT cl::sycl::event trsm( + char *libname, cl::sycl::queue &queue, side left_right, uplo upper_lower, transpose trans, + diag unit_diag, std::int64_t m, std::int64_t n, std::complex alpha, + const std::complex *a, std::int64_t lda, std::complex *b, std::int64_t ldb, + const cl::sycl::vector_class &dependencies = {}); +ONEMKL_EXPORT cl::sycl::event trsm( + char *libname, cl::sycl::queue &queue, side left_right, uplo upper_lower, transpose trans, + diag unit_diag, std::int64_t m, std::int64_t n, std::complex alpha, + const std::complex *a, std::int64_t lda, std::complex *b, std::int64_t ldb, + const cl::sycl::vector_class &dependencies = {}); + +ONEMKL_EXPORT cl::sycl::event dotu( + char *libname, cl::sycl::queue &queue, std::int64_t n, const std::complex *x, + std::int64_t incx, const std::complex *y, std::int64_t incy, std::complex *result, + const cl::sycl::vector_class &dependencies = {}); +ONEMKL_EXPORT cl::sycl::event dotu( + char *libname, cl::sycl::queue &queue, std::int64_t n, const 
std::complex *x, + std::int64_t incx, const std::complex *y, std::int64_t incy, + std::complex *result, const cl::sycl::vector_class &dependencies = {}); + +ONEMKL_EXPORT cl::sycl::event hemm( + char *libname, cl::sycl::queue &queue, side left_right, uplo upper_lower, std::int64_t m, + std::int64_t n, std::complex alpha, const std::complex *a, std::int64_t lda, + const std::complex *b, std::int64_t ldb, std::complex beta, + std::complex *c, std::int64_t ldc, + const cl::sycl::vector_class &dependencies = {}); +ONEMKL_EXPORT cl::sycl::event hemm( + char *libname, cl::sycl::queue &queue, side left_right, uplo upper_lower, std::int64_t m, + std::int64_t n, std::complex alpha, const std::complex *a, std::int64_t lda, + const std::complex *b, std::int64_t ldb, std::complex beta, + std::complex *c, std::int64_t ldc, + const cl::sycl::vector_class &dependencies = {}); + +ONEMKL_EXPORT cl::sycl::event hpr2( + char *libname, cl::sycl::queue &queue, uplo upper_lower, std::int64_t n, + std::complex alpha, const std::complex *x, std::int64_t incx, + const std::complex *y, std::int64_t incy, std::complex *a, + const cl::sycl::vector_class &dependencies = {}); +ONEMKL_EXPORT cl::sycl::event hpr2( + char *libname, cl::sycl::queue &queue, uplo upper_lower, std::int64_t n, + std::complex alpha, const std::complex *x, std::int64_t incx, + const std::complex *y, std::int64_t incy, std::complex *a, + const cl::sycl::vector_class &dependencies = {}); + +ONEMKL_EXPORT cl::sycl::event gbmv( + char *libname, cl::sycl::queue &queue, transpose trans, std::int64_t m, std::int64_t n, + std::int64_t kl, std::int64_t ku, float alpha, const float *a, std::int64_t lda, const float *x, + std::int64_t incx, float beta, float *y, std::int64_t incy, + const cl::sycl::vector_class &dependencies = {}); +ONEMKL_EXPORT cl::sycl::event gbmv( + char *libname, cl::sycl::queue &queue, transpose trans, std::int64_t m, std::int64_t n, + std::int64_t kl, std::int64_t ku, double alpha, const double *a, 
std::int64_t lda, + const double *x, std::int64_t incx, double beta, double *y, std::int64_t incy, + const cl::sycl::vector_class &dependencies = {}); +ONEMKL_EXPORT cl::sycl::event gbmv( + char *libname, cl::sycl::queue &queue, transpose trans, std::int64_t m, std::int64_t n, + std::int64_t kl, std::int64_t ku, std::complex alpha, const std::complex *a, + std::int64_t lda, const std::complex *x, std::int64_t incx, std::complex beta, + std::complex *y, std::int64_t incy, + const cl::sycl::vector_class &dependencies = {}); +ONEMKL_EXPORT cl::sycl::event gbmv( + char *libname, cl::sycl::queue &queue, transpose trans, std::int64_t m, std::int64_t n, + std::int64_t kl, std::int64_t ku, std::complex alpha, const std::complex *a, + std::int64_t lda, const std::complex *x, std::int64_t incx, std::complex beta, + std::complex *y, std::int64_t incy, + const cl::sycl::vector_class &dependencies = {}); + +ONEMKL_EXPORT cl::sycl::event tbmv( + char *libname, cl::sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag, + std::int64_t n, std::int64_t k, const float *a, std::int64_t lda, float *x, std::int64_t incx, + const cl::sycl::vector_class &dependencies = {}); +ONEMKL_EXPORT cl::sycl::event tbmv( + char *libname, cl::sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag, + std::int64_t n, std::int64_t k, const double *a, std::int64_t lda, double *x, std::int64_t incx, + const cl::sycl::vector_class &dependencies = {}); +ONEMKL_EXPORT cl::sycl::event tbmv( + char *libname, cl::sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag, + std::int64_t n, std::int64_t k, const std::complex *a, std::int64_t lda, + std::complex *x, std::int64_t incx, + const cl::sycl::vector_class &dependencies = {}); +ONEMKL_EXPORT cl::sycl::event tbmv( + char *libname, cl::sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag, + std::int64_t n, std::int64_t k, const std::complex *a, std::int64_t lda, + std::complex *x, std::int64_t 
incx, + const cl::sycl::vector_class &dependencies = {}); + +ONEMKL_EXPORT cl::sycl::event symm( + char *libname, cl::sycl::queue &queue, side left_right, uplo upper_lower, std::int64_t m, + std::int64_t n, float alpha, const float *a, std::int64_t lda, const float *b, std::int64_t ldb, + float beta, float *c, std::int64_t ldc, + const cl::sycl::vector_class &dependencies = {}); +ONEMKL_EXPORT cl::sycl::event symm( + char *libname, cl::sycl::queue &queue, side left_right, uplo upper_lower, std::int64_t m, + std::int64_t n, double alpha, const double *a, std::int64_t lda, const double *b, + std::int64_t ldb, double beta, double *c, std::int64_t ldc, + const cl::sycl::vector_class &dependencies = {}); +ONEMKL_EXPORT cl::sycl::event symm( + char *libname, cl::sycl::queue &queue, side left_right, uplo upper_lower, std::int64_t m, + std::int64_t n, std::complex alpha, const std::complex *a, std::int64_t lda, + const std::complex *b, std::int64_t ldb, std::complex beta, + std::complex *c, std::int64_t ldc, + const cl::sycl::vector_class &dependencies = {}); +ONEMKL_EXPORT cl::sycl::event symm( + char *libname, cl::sycl::queue &queue, side left_right, uplo upper_lower, std::int64_t m, + std::int64_t n, std::complex alpha, const std::complex *a, std::int64_t lda, + const std::complex *b, std::int64_t ldb, std::complex beta, + std::complex *c, std::int64_t ldc, + const cl::sycl::vector_class &dependencies = {}); + +ONEMKL_EXPORT cl::sycl::event dotc( + char *libname, cl::sycl::queue &queue, std::int64_t n, const std::complex *x, + std::int64_t incx, const std::complex *y, std::int64_t incy, std::complex *result, + const cl::sycl::vector_class &dependencies = {}); +ONEMKL_EXPORT cl::sycl::event dotc( + char *libname, cl::sycl::queue &queue, std::int64_t n, const std::complex *x, + std::int64_t incx, const std::complex *y, std::int64_t incy, + std::complex *result, const cl::sycl::vector_class &dependencies = {}); + +ONEMKL_EXPORT cl::sycl::event syr(char *libname, 
cl::sycl::queue &queue, uplo upper_lower, + std::int64_t n, float alpha, const float *x, std::int64_t incx, + float *a, std::int64_t lda, + const cl::sycl::vector_class &dependencies = {}); +ONEMKL_EXPORT cl::sycl::event syr(char *libname, cl::sycl::queue &queue, uplo upper_lower, + std::int64_t n, double alpha, const double *x, std::int64_t incx, + double *a, std::int64_t lda, + const cl::sycl::vector_class &dependencies = {}); + +ONEMKL_EXPORT cl::sycl::event trmm( + char *libname, cl::sycl::queue &queue, side left_right, uplo upper_lower, transpose trans, + diag unit_diag, std::int64_t m, std::int64_t n, float alpha, const float *a, std::int64_t lda, + float *b, std::int64_t ldb, const cl::sycl::vector_class &dependencies = {}); +ONEMKL_EXPORT cl::sycl::event trmm( + char *libname, cl::sycl::queue &queue, side left_right, uplo upper_lower, transpose trans, + diag unit_diag, std::int64_t m, std::int64_t n, double alpha, const double *a, std::int64_t lda, + double *b, std::int64_t ldb, const cl::sycl::vector_class &dependencies = {}); +ONEMKL_EXPORT cl::sycl::event trmm( + char *libname, cl::sycl::queue &queue, side left_right, uplo upper_lower, transpose trans, + diag unit_diag, std::int64_t m, std::int64_t n, std::complex alpha, + const std::complex *a, std::int64_t lda, std::complex *b, std::int64_t ldb, + const cl::sycl::vector_class &dependencies = {}); +ONEMKL_EXPORT cl::sycl::event trmm( + char *libname, cl::sycl::queue &queue, side left_right, uplo upper_lower, transpose trans, + diag unit_diag, std::int64_t m, std::int64_t n, std::complex alpha, + const std::complex *a, std::int64_t lda, std::complex *b, std::int64_t ldb, + const cl::sycl::vector_class &dependencies = {}); + +ONEMKL_EXPORT cl::sycl::event symv( + char *libname, cl::sycl::queue &queue, uplo upper_lower, std::int64_t n, float alpha, + const float *a, std::int64_t lda, const float *x, std::int64_t incx, float beta, float *y, + std::int64_t incy, const cl::sycl::vector_class &dependencies = 
{}); +ONEMKL_EXPORT cl::sycl::event symv( + char *libname, cl::sycl::queue &queue, uplo upper_lower, std::int64_t n, double alpha, + const double *a, std::int64_t lda, const double *x, std::int64_t incx, double beta, double *y, + std::int64_t incy, const cl::sycl::vector_class &dependencies = {}); + +ONEMKL_EXPORT cl::sycl::event tpsv( + char *libname, cl::sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag, + std::int64_t n, const float *a, float *x, std::int64_t incx, + const cl::sycl::vector_class &dependencies = {}); +ONEMKL_EXPORT cl::sycl::event tpsv( + char *libname, cl::sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag, + std::int64_t n, const double *a, double *x, std::int64_t incx, + const cl::sycl::vector_class &dependencies = {}); +ONEMKL_EXPORT cl::sycl::event tpsv( + char *libname, cl::sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag, + std::int64_t n, const std::complex *a, std::complex *x, std::int64_t incx, + const cl::sycl::vector_class &dependencies = {}); +ONEMKL_EXPORT cl::sycl::event tpsv( + char *libname, cl::sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag, + std::int64_t n, const std::complex *a, std::complex *x, std::int64_t incx, + const cl::sycl::vector_class &dependencies = {}); + +ONEMKL_EXPORT cl::sycl::event trsv( + char *libname, cl::sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag, + std::int64_t n, const float *a, std::int64_t lda, float *x, std::int64_t incx, + const cl::sycl::vector_class &dependencies = {}); +ONEMKL_EXPORT cl::sycl::event trsv( + char *libname, cl::sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag, + std::int64_t n, const double *a, std::int64_t lda, double *x, std::int64_t incx, + const cl::sycl::vector_class &dependencies = {}); +ONEMKL_EXPORT cl::sycl::event trsv( + char *libname, cl::sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag, + std::int64_t n, const 
std::complex *a, std::int64_t lda, std::complex *x, + std::int64_t incx, const cl::sycl::vector_class &dependencies = {}); +ONEMKL_EXPORT cl::sycl::event trsv( + char *libname, cl::sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag, + std::int64_t n, const std::complex *a, std::int64_t lda, std::complex *x, + std::int64_t incx, const cl::sycl::vector_class &dependencies = {}); + +ONEMKL_EXPORT cl::sycl::event copy( + char *libname, cl::sycl::queue &queue, std::int64_t n, const float *x, std::int64_t incx, + float *y, std::int64_t incy, const cl::sycl::vector_class &dependencies = {}); +ONEMKL_EXPORT cl::sycl::event copy( + char *libname, cl::sycl::queue &queue, std::int64_t n, const double *x, std::int64_t incx, + double *y, std::int64_t incy, const cl::sycl::vector_class &dependencies = {}); +ONEMKL_EXPORT cl::sycl::event copy( + char *libname, cl::sycl::queue &queue, std::int64_t n, const std::complex *x, + std::int64_t incx, std::complex *y, std::int64_t incy, + const cl::sycl::vector_class &dependencies = {}); +ONEMKL_EXPORT cl::sycl::event copy( + char *libname, cl::sycl::queue &queue, std::int64_t n, const std::complex *x, + std::int64_t incx, std::complex *y, std::int64_t incy, + const cl::sycl::vector_class &dependencies = {}); + +ONEMKL_EXPORT cl::sycl::event hemv( + char *libname, cl::sycl::queue &queue, uplo upper_lower, std::int64_t n, + std::complex alpha, const std::complex *a, std::int64_t lda, + const std::complex *x, std::int64_t incx, std::complex beta, + std::complex *y, std::int64_t incy, + const cl::sycl::vector_class &dependencies = {}); +ONEMKL_EXPORT cl::sycl::event hemv( + char *libname, cl::sycl::queue &queue, uplo upper_lower, std::int64_t n, + std::complex alpha, const std::complex *a, std::int64_t lda, + const std::complex *x, std::int64_t incx, std::complex beta, + std::complex *y, std::int64_t incy, + const cl::sycl::vector_class &dependencies = {}); + +ONEMKL_EXPORT cl::sycl::event iamax( + char *libname, 
cl::sycl::queue &queue, std::int64_t n, const float *x, std::int64_t incx, + std::int64_t *result, const cl::sycl::vector_class &dependencies = {}); +ONEMKL_EXPORT cl::sycl::event iamax( + char *libname, cl::sycl::queue &queue, std::int64_t n, const double *x, std::int64_t incx, + std::int64_t *result, const cl::sycl::vector_class &dependencies = {}); +ONEMKL_EXPORT cl::sycl::event iamax( + char *libname, cl::sycl::queue &queue, std::int64_t n, const std::complex *x, + std::int64_t incx, std::int64_t *result, + const cl::sycl::vector_class &dependencies = {}); +ONEMKL_EXPORT cl::sycl::event iamax( + char *libname, cl::sycl::queue &queue, std::int64_t n, const std::complex *x, + std::int64_t incx, std::int64_t *result, + const cl::sycl::vector_class &dependencies = {}); + +ONEMKL_EXPORT cl::sycl::event sbmv( + char *libname, cl::sycl::queue &queue, uplo upper_lower, std::int64_t n, std::int64_t k, + float alpha, const float *a, std::int64_t lda, const float *x, std::int64_t incx, float beta, + float *y, std::int64_t incy, const cl::sycl::vector_class &dependencies = {}); +ONEMKL_EXPORT cl::sycl::event sbmv( + char *libname, cl::sycl::queue &queue, uplo upper_lower, std::int64_t n, std::int64_t k, + double alpha, const double *a, std::int64_t lda, const double *x, std::int64_t incx, + double beta, double *y, std::int64_t incy, + const cl::sycl::vector_class &dependencies = {}); + +ONEMKL_EXPORT cl::sycl::event asum( + char *libname, cl::sycl::queue &queue, std::int64_t n, const std::complex *x, + std::int64_t incx, float *result, + const cl::sycl::vector_class &dependencies = {}); +ONEMKL_EXPORT cl::sycl::event asum( + char *libname, cl::sycl::queue &queue, std::int64_t n, const std::complex *x, + std::int64_t incx, double *result, + const cl::sycl::vector_class &dependencies = {}); +ONEMKL_EXPORT cl::sycl::event asum( + char *libname, cl::sycl::queue &queue, std::int64_t n, const float *x, std::int64_t incx, + float *result, const cl::sycl::vector_class 
&dependencies = {}); +ONEMKL_EXPORT cl::sycl::event asum( + char *libname, cl::sycl::queue &queue, std::int64_t n, const double *x, std::int64_t incx, + double *result, const cl::sycl::vector_class &dependencies = {}); + +ONEMKL_EXPORT cl::sycl::event tbsv( + char *libname, cl::sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag, + std::int64_t n, std::int64_t k, const float *a, std::int64_t lda, float *x, std::int64_t incx, + const cl::sycl::vector_class &dependencies = {}); +ONEMKL_EXPORT cl::sycl::event tbsv( + char *libname, cl::sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag, + std::int64_t n, std::int64_t k, const double *a, std::int64_t lda, double *x, std::int64_t incx, + const cl::sycl::vector_class &dependencies = {}); +ONEMKL_EXPORT cl::sycl::event tbsv( + char *libname, cl::sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag, + std::int64_t n, std::int64_t k, const std::complex *a, std::int64_t lda, + std::complex *x, std::int64_t incx, + const cl::sycl::vector_class &dependencies = {}); +ONEMKL_EXPORT cl::sycl::event tbsv( + char *libname, cl::sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag, + std::int64_t n, std::int64_t k, const std::complex *a, std::int64_t lda, + std::complex *x, std::int64_t incx, + const cl::sycl::vector_class &dependencies = {}); + +ONEMKL_EXPORT cl::sycl::event spr2( + char *libname, cl::sycl::queue &queue, uplo upper_lower, std::int64_t n, float alpha, + const float *x, std::int64_t incx, const float *y, std::int64_t incy, float *a, + const cl::sycl::vector_class &dependencies = {}); +ONEMKL_EXPORT cl::sycl::event spr2( + char *libname, cl::sycl::queue &queue, uplo upper_lower, std::int64_t n, double alpha, + const double *x, std::int64_t incx, const double *y, std::int64_t incy, double *a, + const cl::sycl::vector_class &dependencies = {}); + +ONEMKL_EXPORT cl::sycl::event rotm( + char *libname, cl::sycl::queue &queue, std::int64_t n, float *x, 
std::int64_t incx, float *y, + std::int64_t incy, float *param, + const cl::sycl::vector_class &dependencies = {}); +ONEMKL_EXPORT cl::sycl::event rotm( + char *libname, cl::sycl::queue &queue, std::int64_t n, double *x, std::int64_t incx, double *y, + std::int64_t incy, double *param, + const cl::sycl::vector_class &dependencies = {}); + +ONEMKL_EXPORT cl::sycl::event dot(char *libname, cl::sycl::queue &queue, std::int64_t n, + const float *x, std::int64_t incx, const float *y, + std::int64_t incy, float *result, + const cl::sycl::vector_class &dependencies = {}); +ONEMKL_EXPORT cl::sycl::event dot(char *libname, cl::sycl::queue &queue, std::int64_t n, + const double *x, std::int64_t incx, const double *y, + std::int64_t incy, double *result, + const cl::sycl::vector_class &dependencies = {}); +ONEMKL_EXPORT cl::sycl::event dot(char *libname, cl::sycl::queue &queue, std::int64_t n, + const float *x, std::int64_t incx, const float *y, + std::int64_t incy, double *result, + const cl::sycl::vector_class &dependencies = {}); + +ONEMKL_EXPORT cl::sycl::event sdsdot( + char *libname, cl::sycl::queue &queue, std::int64_t n, float sb, const float *x, + std::int64_t incx, const float *y, std::int64_t incy, float *result, + const cl::sycl::vector_class &dependencies = {}); + +ONEMKL_EXPORT cl::sycl::event her2k( + char *libname, cl::sycl::queue &queue, uplo upper_lower, transpose trans, std::int64_t n, + std::int64_t k, std::complex alpha, const std::complex *a, std::int64_t lda, + const std::complex *b, std::int64_t ldb, float beta, std::complex *c, + std::int64_t ldc, const cl::sycl::vector_class &dependencies = {}); +ONEMKL_EXPORT cl::sycl::event her2k( + char *libname, cl::sycl::queue &queue, uplo upper_lower, transpose trans, std::int64_t n, + std::int64_t k, std::complex alpha, const std::complex *a, std::int64_t lda, + const std::complex *b, std::int64_t ldb, double beta, std::complex *c, + std::int64_t ldc, const cl::sycl::vector_class &dependencies = {}); + 
+ONEMKL_EXPORT cl::sycl::event rotg( + char *libname, cl::sycl::queue &queue, float *a, float *b, float *c, float *s, + const cl::sycl::vector_class &dependencies = {}); +ONEMKL_EXPORT cl::sycl::event rotg( + char *libname, cl::sycl::queue &queue, double *a, double *b, double *c, double *s, + const cl::sycl::vector_class &dependencies = {}); +ONEMKL_EXPORT cl::sycl::event rotg( + char *libname, cl::sycl::queue &queue, std::complex *a, std::complex *b, float *c, + std::complex *s, const cl::sycl::vector_class &dependencies = {}); +ONEMKL_EXPORT cl::sycl::event rotg( + char *libname, cl::sycl::queue &queue, std::complex *a, std::complex *b, + double *c, std::complex *s, + const cl::sycl::vector_class &dependencies = {}); + } //namespace detail } //namespace blas } //namespace onemkl diff --git a/include/onemkl/blas/detail/cublas/blas_ct.hpp b/include/onemkl/blas/detail/cublas/blas_ct.hpp index 4ac19b7f8..a18b167ce 100644 --- a/include/onemkl/blas/detail/cublas/blas_ct.hpp +++ b/include/onemkl/blas/detail/cublas/blas_ct.hpp @@ -33,14 +33,13 @@ #include "onemkl_blas_cublas.hpp" +#include "onemkl/blas/detail/blas_ct_templates.hpp" + namespace onemkl { namespace blas { -template -static inline void syr2(cl::sycl::queue &queue, uplo upper_lower, std::int64_t n, float alpha, - cl::sycl::buffer &x, std::int64_t incx, - cl::sycl::buffer &y, std::int64_t incy, - cl::sycl::buffer &a, std::int64_t lda); +// Buffer APIs + template <> void syr2(cl::sycl::queue &queue, uplo upper_lower, std::int64_t n, float alpha, @@ -52,11 +51,6 @@ void syr2(cl::sycl::queue &queue, uplo uppe syr2_postcondition(queue, upper_lower, n, alpha, x, incx, y, incy, a, lda); } -template -static inline void syr2(cl::sycl::queue &queue, uplo upper_lower, std::int64_t n, double alpha, - cl::sycl::buffer &x, std::int64_t incx, - cl::sycl::buffer &y, std::int64_t incy, - cl::sycl::buffer &a, std::int64_t lda); template <> void syr2(cl::sycl::queue &queue, uplo upper_lower, std::int64_t n, double alpha, @@ 
-68,9 +62,6 @@ void syr2(cl::sycl::queue &queue, uplo uppe syr2_postcondition(queue, upper_lower, n, alpha, x, incx, y, incy, a, lda); } -template -static inline void scal(cl::sycl::queue &queue, std::int64_t n, float alpha, - cl::sycl::buffer &x, std::int64_t incx); template <> void scal(cl::sycl::queue &queue, std::int64_t n, float alpha, cl::sycl::buffer &x, std::int64_t incx) { @@ -79,9 +70,6 @@ void scal(cl::sycl::queue &queue, std::int6 scal_postcondition(queue, n, alpha, x, incx); } -template -static inline void scal(cl::sycl::queue &queue, std::int64_t n, double alpha, - cl::sycl::buffer &x, std::int64_t incx); template <> void scal(cl::sycl::queue &queue, std::int64_t n, double alpha, cl::sycl::buffer &x, std::int64_t incx) { @@ -90,9 +78,6 @@ void scal(cl::sycl::queue &queue, std::int6 scal_postcondition(queue, n, alpha, x, incx); } -template -static inline void scal(cl::sycl::queue &queue, std::int64_t n, std::complex alpha, - cl::sycl::buffer, 1> &x, std::int64_t incx); template <> void scal(cl::sycl::queue &queue, std::int64_t n, std::complex alpha, @@ -103,9 +88,6 @@ void scal(cl::sycl::queue &queue, std::int6 scal_postcondition(queue, n, alpha, x, incx); } -template -static inline void scal(cl::sycl::queue &queue, std::int64_t n, std::complex alpha, - cl::sycl::buffer, 1> &x, std::int64_t incx); template <> void scal(cl::sycl::queue &queue, std::int64_t n, std::complex alpha, @@ -116,9 +98,6 @@ void scal(cl::sycl::queue &queue, std::int6 scal_postcondition(queue, n, alpha, x, incx); } -template -static inline void scal(cl::sycl::queue &queue, std::int64_t n, float alpha, - cl::sycl::buffer, 1> &x, std::int64_t incx); template <> void scal(cl::sycl::queue &queue, std::int64_t n, float alpha, cl::sycl::buffer, 1> &x, @@ -128,9 +107,6 @@ void scal(cl::sycl::queue &queue, std::int6 scal_postcondition(queue, n, alpha, x, incx); } -template -static inline void scal(cl::sycl::queue &queue, std::int64_t n, double alpha, - cl::sycl::buffer, 1> &x, 
std::int64_t incx); template <> void scal(cl::sycl::queue &queue, std::int64_t n, double alpha, cl::sycl::buffer, 1> &x, @@ -140,10 +116,6 @@ void scal(cl::sycl::queue &queue, std::int6 scal_postcondition(queue, n, alpha, x, incx); } -template -static inline void trmv(cl::sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag, - std::int64_t n, cl::sycl::buffer &a, std::int64_t lda, - cl::sycl::buffer &x, std::int64_t incx); template <> void trmv(cl::sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag, std::int64_t n, @@ -154,10 +126,6 @@ void trmv(cl::sycl::queue &queue, uplo uppe trmv_postcondition(queue, upper_lower, trans, unit_diag, n, a, lda, x, incx); } -template -static inline void trmv(cl::sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag, - std::int64_t n, cl::sycl::buffer &a, std::int64_t lda, - cl::sycl::buffer &x, std::int64_t incx); template <> void trmv(cl::sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag, std::int64_t n, @@ -168,11 +136,6 @@ void trmv(cl::sycl::queue &queue, uplo uppe trmv_postcondition(queue, upper_lower, trans, unit_diag, n, a, lda, x, incx); } -template -static inline void trmv(cl::sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag, - std::int64_t n, cl::sycl::buffer, 1> &a, - std::int64_t lda, cl::sycl::buffer, 1> &x, - std::int64_t incx); template <> void trmv(cl::sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag, std::int64_t n, @@ -185,11 +148,6 @@ void trmv(cl::sycl::queue &queue, uplo uppe trmv_postcondition(queue, upper_lower, trans, unit_diag, n, a, lda, x, incx); } -template -static inline void trmv(cl::sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag, - std::int64_t n, cl::sycl::buffer, 1> &a, - std::int64_t lda, cl::sycl::buffer, 1> &x, - std::int64_t incx); template <> void trmv(cl::sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag, std::int64_t n, @@ -202,10 +160,6 
@@ void trmv(cl::sycl::queue &queue, uplo uppe trmv_postcondition(queue, upper_lower, trans, unit_diag, n, a, lda, x, incx); } -template -static inline void tpmv(cl::sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag, - std::int64_t n, cl::sycl::buffer &a, - cl::sycl::buffer &x, std::int64_t incx); template <> void tpmv(cl::sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag, std::int64_t n, @@ -216,10 +170,6 @@ void tpmv(cl::sycl::queue &queue, uplo uppe tpmv_postcondition(queue, upper_lower, trans, unit_diag, n, a, x, incx); } -template -static inline void tpmv(cl::sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag, - std::int64_t n, cl::sycl::buffer &a, - cl::sycl::buffer &x, std::int64_t incx); template <> void tpmv(cl::sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag, std::int64_t n, @@ -230,10 +180,6 @@ void tpmv(cl::sycl::queue &queue, uplo uppe tpmv_postcondition(queue, upper_lower, trans, unit_diag, n, a, x, incx); } -template -static inline void tpmv(cl::sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag, - std::int64_t n, cl::sycl::buffer, 1> &a, - cl::sycl::buffer, 1> &x, std::int64_t incx); template <> void tpmv(cl::sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag, std::int64_t n, @@ -245,10 +191,6 @@ void tpmv(cl::sycl::queue &queue, uplo uppe tpmv_postcondition(queue, upper_lower, trans, unit_diag, n, a, x, incx); } -template -static inline void tpmv(cl::sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag, - std::int64_t n, cl::sycl::buffer, 1> &a, - cl::sycl::buffer, 1> &x, std::int64_t incx); template <> void tpmv(cl::sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag, std::int64_t n, @@ -260,10 +202,6 @@ void tpmv(cl::sycl::queue &queue, uplo uppe tpmv_postcondition(queue, upper_lower, trans, unit_diag, n, a, x, incx); } -template -static inline void spr(cl::sycl::queue &queue, uplo upper_lower, 
std::int64_t n, float alpha, - cl::sycl::buffer &x, std::int64_t incx, - cl::sycl::buffer &a); template <> void spr(cl::sycl::queue &queue, uplo upper_lower, std::int64_t n, float alpha, @@ -274,10 +212,6 @@ void spr(cl::sycl::queue &queue, uplo upper spr_postcondition(queue, upper_lower, n, alpha, x, incx, a); } -template -static inline void spr(cl::sycl::queue &queue, uplo upper_lower, std::int64_t n, double alpha, - cl::sycl::buffer &x, std::int64_t incx, - cl::sycl::buffer &a); template <> void spr(cl::sycl::queue &queue, uplo upper_lower, std::int64_t n, double alpha, @@ -288,12 +222,6 @@ void spr(cl::sycl::queue &queue, uplo upper spr_postcondition(queue, upper_lower, n, alpha, x, incx, a); } -template -static inline void hpmv(cl::sycl::queue &queue, uplo upper_lower, std::int64_t n, - std::complex alpha, cl::sycl::buffer, 1> &a, - cl::sycl::buffer, 1> &x, std::int64_t incx, - std::complex beta, cl::sycl::buffer, 1> &y, - std::int64_t incy); template <> void hpmv(cl::sycl::queue &queue, uplo upper_lower, std::int64_t n, std::complex alpha, @@ -307,12 +235,6 @@ void hpmv(cl::sycl::queue &queue, uplo uppe hpmv_postcondition(queue, upper_lower, n, alpha, a, x, incx, beta, y, incy); } -template -static inline void hpmv(cl::sycl::queue &queue, uplo upper_lower, std::int64_t n, - std::complex alpha, cl::sycl::buffer, 1> &a, - cl::sycl::buffer, 1> &x, std::int64_t incx, - std::complex beta, cl::sycl::buffer, 1> &y, - std::int64_t incy); template <> void hpmv(cl::sycl::queue &queue, uplo upper_lower, std::int64_t n, std::complex alpha, @@ -326,11 +248,6 @@ void hpmv(cl::sycl::queue &queue, uplo uppe hpmv_postcondition(queue, upper_lower, n, alpha, a, x, incx, beta, y, incy); } -template -static inline void syrk(cl::sycl::queue &queue, uplo upper_lower, transpose trans, std::int64_t n, - std::int64_t k, float alpha, cl::sycl::buffer &a, - std::int64_t lda, float beta, cl::sycl::buffer &c, - std::int64_t ldc); template <> void syrk(cl::sycl::queue &queue, uplo 
upper_lower, transpose trans, std::int64_t n, std::int64_t k, @@ -342,11 +259,6 @@ void syrk(cl::sycl::queue &queue, uplo uppe syrk_postcondition(queue, upper_lower, trans, n, k, alpha, a, lda, beta, c, ldc); } -template -static inline void syrk(cl::sycl::queue &queue, uplo upper_lower, transpose trans, std::int64_t n, - std::int64_t k, double alpha, cl::sycl::buffer &a, - std::int64_t lda, double beta, cl::sycl::buffer &c, - std::int64_t ldc); template <> void syrk(cl::sycl::queue &queue, uplo upper_lower, transpose trans, std::int64_t n, std::int64_t k, @@ -358,12 +270,6 @@ void syrk(cl::sycl::queue &queue, uplo uppe syrk_postcondition(queue, upper_lower, trans, n, k, alpha, a, lda, beta, c, ldc); } -template -static inline void syrk(cl::sycl::queue &queue, uplo upper_lower, transpose trans, std::int64_t n, - std::int64_t k, std::complex alpha, - cl::sycl::buffer, 1> &a, std::int64_t lda, - std::complex beta, cl::sycl::buffer, 1> &c, - std::int64_t ldc); template <> void syrk( cl::sycl::queue &queue, uplo upper_lower, transpose trans, std::int64_t n, std::int64_t k, @@ -374,12 +280,6 @@ void syrk( syrk_postcondition(queue, upper_lower, trans, n, k, alpha, a, lda, beta, c, ldc); } -template -static inline void syrk(cl::sycl::queue &queue, uplo upper_lower, transpose trans, std::int64_t n, - std::int64_t k, std::complex alpha, - cl::sycl::buffer, 1> &a, std::int64_t lda, - std::complex beta, cl::sycl::buffer, 1> &c, - std::int64_t ldc); template <> void syrk( cl::sycl::queue &queue, uplo upper_lower, transpose trans, std::int64_t n, std::int64_t k, @@ -390,12 +290,6 @@ void syrk( syrk_postcondition(queue, upper_lower, trans, n, k, alpha, a, lda, beta, c, ldc); } -template -static inline void her2(cl::sycl::queue &queue, uplo upper_lower, std::int64_t n, - std::complex alpha, cl::sycl::buffer, 1> &x, - std::int64_t incx, cl::sycl::buffer, 1> &y, - std::int64_t incy, cl::sycl::buffer, 1> &a, - std::int64_t lda); template <> void her2( cl::sycl::queue &queue, uplo 
upper_lower, std::int64_t n, std::complex alpha, @@ -407,12 +301,6 @@ void her2( her2_postcondition(queue, upper_lower, n, alpha, x, incx, y, incy, a, lda); } -template -static inline void her2(cl::sycl::queue &queue, uplo upper_lower, std::int64_t n, - std::complex alpha, cl::sycl::buffer, 1> &x, - std::int64_t incx, cl::sycl::buffer, 1> &y, - std::int64_t incy, cl::sycl::buffer, 1> &a, - std::int64_t lda); template <> void her2( cl::sycl::queue &queue, uplo upper_lower, std::int64_t n, std::complex alpha, @@ -424,12 +312,6 @@ void her2( her2_postcondition(queue, upper_lower, n, alpha, x, incx, y, incy, a, lda); } -template -static inline void hbmv(cl::sycl::queue &queue, uplo upper_lower, std::int64_t n, std::int64_t k, - std::complex alpha, cl::sycl::buffer, 1> &a, - std::int64_t lda, cl::sycl::buffer, 1> &x, - std::int64_t incx, std::complex beta, - cl::sycl::buffer, 1> &y, std::int64_t incy); template <> void hbmv( cl::sycl::queue &queue, uplo upper_lower, std::int64_t n, std::int64_t k, @@ -441,12 +323,6 @@ void hbmv( hbmv_postcondition(queue, upper_lower, n, k, alpha, a, lda, x, incx, beta, y, incy); } -template -static inline void hbmv(cl::sycl::queue &queue, uplo upper_lower, std::int64_t n, std::int64_t k, - std::complex alpha, cl::sycl::buffer, 1> &a, - std::int64_t lda, cl::sycl::buffer, 1> &x, - std::int64_t incx, std::complex beta, - cl::sycl::buffer, 1> &y, std::int64_t incy); template <> void hbmv( cl::sycl::queue &queue, uplo upper_lower, std::int64_t n, std::int64_t k, @@ -458,11 +334,6 @@ void hbmv( hbmv_postcondition(queue, upper_lower, n, k, alpha, a, lda, x, incx, beta, y, incy); } -template -static inline void rot(cl::sycl::queue &queue, std::int64_t n, - cl::sycl::buffer, 1> &x, std::int64_t incx, - cl::sycl::buffer, 1> &y, std::int64_t incy, float c, - float s); template <> void rot(cl::sycl::queue &queue, std::int64_t n, cl::sycl::buffer, 1> &x, @@ -474,11 +345,6 @@ void rot(cl::sycl::queue &queue, std::int64 rot_postcondition(queue, n, x, 
incx, y, incy, c, s); } -template -static inline void rot(cl::sycl::queue &queue, std::int64_t n, - cl::sycl::buffer, 1> &x, std::int64_t incx, - cl::sycl::buffer, 1> &y, std::int64_t incy, double c, - double s); template <> void rot(cl::sycl::queue &queue, std::int64_t n, cl::sycl::buffer, 1> &x, @@ -490,10 +356,6 @@ void rot(cl::sycl::queue &queue, std::int64 rot_postcondition(queue, n, x, incx, y, incy, c, s); } -template -static inline void rot(cl::sycl::queue &queue, std::int64_t n, cl::sycl::buffer &x, - std::int64_t incx, cl::sycl::buffer &y, std::int64_t incy, float c, - float s); template <> void rot(cl::sycl::queue &queue, std::int64_t n, cl::sycl::buffer &x, std::int64_t incx, @@ -504,10 +366,6 @@ void rot(cl::sycl::queue &queue, std::int64 rot_postcondition(queue, n, x, incx, y, incy, c, s); } -template -static inline void rot(cl::sycl::queue &queue, std::int64_t n, cl::sycl::buffer &x, - std::int64_t incx, cl::sycl::buffer &y, std::int64_t incy, - double c, double s); template <> void rot(cl::sycl::queue &queue, std::int64_t n, cl::sycl::buffer &x, std::int64_t incx, @@ -518,10 +376,6 @@ void rot(cl::sycl::queue &queue, std::int64 rot_postcondition(queue, n, x, incx, y, incy, c, s); } -template -static inline void axpy(cl::sycl::queue &queue, std::int64_t n, float alpha, - cl::sycl::buffer &x, std::int64_t incx, - cl::sycl::buffer &y, std::int64_t incy); template <> void axpy(cl::sycl::queue &queue, std::int64_t n, float alpha, cl::sycl::buffer &x, std::int64_t incx, @@ -531,10 +385,6 @@ void axpy(cl::sycl::queue &queue, std::int6 axpy_postcondition(queue, n, alpha, x, incx, y, incy); } -template -static inline void axpy(cl::sycl::queue &queue, std::int64_t n, double alpha, - cl::sycl::buffer &x, std::int64_t incx, - cl::sycl::buffer &y, std::int64_t incy); template <> void axpy(cl::sycl::queue &queue, std::int64_t n, double alpha, cl::sycl::buffer &x, std::int64_t incx, @@ -544,10 +394,6 @@ void axpy(cl::sycl::queue &queue, std::int6 
axpy_postcondition(queue, n, alpha, x, incx, y, incy); } -template -static inline void axpy(cl::sycl::queue &queue, std::int64_t n, std::complex alpha, - cl::sycl::buffer, 1> &x, std::int64_t incx, - cl::sycl::buffer, 1> &y, std::int64_t incy); template <> void axpy(cl::sycl::queue &queue, std::int64_t n, std::complex alpha, @@ -560,10 +406,6 @@ void axpy(cl::sycl::queue &queue, std::int6 axpy_postcondition(queue, n, alpha, x, incx, y, incy); } -template -static inline void axpy(cl::sycl::queue &queue, std::int64_t n, std::complex alpha, - cl::sycl::buffer, 1> &x, std::int64_t incx, - cl::sycl::buffer, 1> &y, std::int64_t incy); template <> void axpy(cl::sycl::queue &queue, std::int64_t n, std::complex alpha, @@ -576,12 +418,6 @@ void axpy(cl::sycl::queue &queue, std::int6 axpy_postcondition(queue, n, alpha, x, incx, y, incy); } -template -static inline void gerc(cl::sycl::queue &queue, std::int64_t m, std::int64_t n, - std::complex alpha, cl::sycl::buffer, 1> &x, - std::int64_t incx, cl::sycl::buffer, 1> &y, - std::int64_t incy, cl::sycl::buffer, 1> &a, - std::int64_t lda); template <> void gerc( cl::sycl::queue &queue, std::int64_t m, std::int64_t n, std::complex alpha, @@ -593,12 +429,6 @@ void gerc( gerc_postcondition(queue, m, n, alpha, x, incx, y, incy, a, lda); } -template -static inline void gerc(cl::sycl::queue &queue, std::int64_t m, std::int64_t n, - std::complex alpha, cl::sycl::buffer, 1> &x, - std::int64_t incx, cl::sycl::buffer, 1> &y, - std::int64_t incy, cl::sycl::buffer, 1> &a, - std::int64_t lda); template <> void gerc( cl::sycl::queue &queue, std::int64_t m, std::int64_t n, std::complex alpha, @@ -610,11 +440,6 @@ void gerc( gerc_postcondition(queue, m, n, alpha, x, incx, y, incy, a, lda); } -template -static inline void syr2k(cl::sycl::queue &queue, uplo upper_lower, transpose trans, std::int64_t n, - std::int64_t k, float alpha, cl::sycl::buffer &a, - std::int64_t lda, cl::sycl::buffer &b, std::int64_t ldb, - float beta, cl::sycl::buffer &c, 
std::int64_t ldc); template <> void syr2k(cl::sycl::queue &queue, uplo upper_lower, transpose trans, std::int64_t n, std::int64_t k, @@ -627,11 +452,6 @@ void syr2k(cl::sycl::queue &queue, uplo upp syr2k_postcondition(queue, upper_lower, trans, n, k, alpha, a, lda, b, ldb, beta, c, ldc); } -template -static inline void syr2k(cl::sycl::queue &queue, uplo upper_lower, transpose trans, std::int64_t n, - std::int64_t k, double alpha, cl::sycl::buffer &a, - std::int64_t lda, cl::sycl::buffer &b, std::int64_t ldb, - double beta, cl::sycl::buffer &c, std::int64_t ldc); template <> void syr2k(cl::sycl::queue &queue, uplo upper_lower, transpose trans, std::int64_t n, std::int64_t k, @@ -644,13 +464,6 @@ void syr2k(cl::sycl::queue &queue, uplo upp syr2k_postcondition(queue, upper_lower, trans, n, k, alpha, a, lda, b, ldb, beta, c, ldc); } -template -static inline void syr2k(cl::sycl::queue &queue, uplo upper_lower, transpose trans, std::int64_t n, - std::int64_t k, std::complex alpha, - cl::sycl::buffer, 1> &a, std::int64_t lda, - cl::sycl::buffer, 1> &b, std::int64_t ldb, - std::complex beta, cl::sycl::buffer, 1> &c, - std::int64_t ldc); template <> void syr2k( cl::sycl::queue &queue, uplo upper_lower, transpose trans, std::int64_t n, std::int64_t k, @@ -662,13 +475,6 @@ void syr2k( syr2k_postcondition(queue, upper_lower, trans, n, k, alpha, a, lda, b, ldb, beta, c, ldc); } -template -static inline void syr2k(cl::sycl::queue &queue, uplo upper_lower, transpose trans, std::int64_t n, - std::int64_t k, std::complex alpha, - cl::sycl::buffer, 1> &a, std::int64_t lda, - cl::sycl::buffer, 1> &b, std::int64_t ldb, - std::complex beta, cl::sycl::buffer, 1> &c, - std::int64_t ldc); template <> void syr2k( cl::sycl::queue &queue, uplo upper_lower, transpose trans, std::int64_t n, std::int64_t k, @@ -680,11 +486,6 @@ void syr2k( syr2k_postcondition(queue, upper_lower, trans, n, k, alpha, a, lda, b, ldb, beta, c, ldc); } -template -static inline void gemv(cl::sycl::queue &queue, 
transpose trans, std::int64_t m, std::int64_t n, - float alpha, cl::sycl::buffer &a, std::int64_t lda, - cl::sycl::buffer &x, std::int64_t incx, float beta, - cl::sycl::buffer &y, std::int64_t incy); template <> void gemv(cl::sycl::queue &queue, transpose trans, std::int64_t m, std::int64_t n, float alpha, @@ -697,11 +498,6 @@ void gemv(cl::sycl::queue &queue, transpose gemv_postcondition(queue, trans, m, n, alpha, a, lda, x, incx, beta, y, incy); } -template -static inline void gemv(cl::sycl::queue &queue, transpose trans, std::int64_t m, std::int64_t n, - double alpha, cl::sycl::buffer &a, std::int64_t lda, - cl::sycl::buffer &x, std::int64_t incx, double beta, - cl::sycl::buffer &y, std::int64_t incy); template <> void gemv(cl::sycl::queue &queue, transpose trans, std::int64_t m, std::int64_t n, double alpha, @@ -714,12 +510,6 @@ void gemv(cl::sycl::queue &queue, transpose gemv_postcondition(queue, trans, m, n, alpha, a, lda, x, incx, beta, y, incy); } -template -static inline void gemv(cl::sycl::queue &queue, transpose trans, std::int64_t m, std::int64_t n, - std::complex alpha, cl::sycl::buffer, 1> &a, - std::int64_t lda, cl::sycl::buffer, 1> &x, - std::int64_t incx, std::complex beta, - cl::sycl::buffer, 1> &y, std::int64_t incy); template <> void gemv( cl::sycl::queue &queue, transpose trans, std::int64_t m, std::int64_t n, @@ -731,12 +521,6 @@ void gemv( gemv_postcondition(queue, trans, m, n, alpha, a, lda, x, incx, beta, y, incy); } -template -static inline void gemv(cl::sycl::queue &queue, transpose trans, std::int64_t m, std::int64_t n, - std::complex alpha, cl::sycl::buffer, 1> &a, - std::int64_t lda, cl::sycl::buffer, 1> &x, - std::int64_t incx, std::complex beta, - cl::sycl::buffer, 1> &y, std::int64_t incy); template <> void gemv( cl::sycl::queue &queue, transpose trans, std::int64_t m, std::int64_t n, @@ -748,10 +532,6 @@ void gemv( gemv_postcondition(queue, trans, m, n, alpha, a, lda, x, incx, beta, y, incy); } -template -static inline void 
her(cl::sycl::queue &queue, uplo upper_lower, std::int64_t n, float alpha, - cl::sycl::buffer, 1> &x, std::int64_t incx, - cl::sycl::buffer, 1> &a, std::int64_t lda); template <> void her(cl::sycl::queue &queue, uplo upper_lower, std::int64_t n, float alpha, @@ -764,10 +544,6 @@ void her(cl::sycl::queue &queue, uplo upper her_postcondition(queue, upper_lower, n, alpha, x, incx, a, lda); } -template -static inline void her(cl::sycl::queue &queue, uplo upper_lower, std::int64_t n, double alpha, - cl::sycl::buffer, 1> &x, std::int64_t incx, - cl::sycl::buffer, 1> &a, std::int64_t lda); template <> void her(cl::sycl::queue &queue, uplo upper_lower, std::int64_t n, double alpha, @@ -780,10 +556,6 @@ void her(cl::sycl::queue &queue, uplo upper her_postcondition(queue, upper_lower, n, alpha, x, incx, a, lda); } -template -static inline void hpr(cl::sycl::queue &queue, uplo upper_lower, std::int64_t n, float alpha, - cl::sycl::buffer, 1> &x, std::int64_t incx, - cl::sycl::buffer, 1> &a); template <> void hpr(cl::sycl::queue &queue, uplo upper_lower, std::int64_t n, float alpha, @@ -795,10 +567,6 @@ void hpr(cl::sycl::queue &queue, uplo upper hpr_postcondition(queue, upper_lower, n, alpha, x, incx, a); } -template -static inline void hpr(cl::sycl::queue &queue, uplo upper_lower, std::int64_t n, double alpha, - cl::sycl::buffer, 1> &x, std::int64_t incx, - cl::sycl::buffer, 1> &a); template <> void hpr(cl::sycl::queue &queue, uplo upper_lower, std::int64_t n, double alpha, @@ -810,9 +578,6 @@ void hpr(cl::sycl::queue &queue, uplo upper hpr_postcondition(queue, upper_lower, n, alpha, x, incx, a); } -template -static inline void iamin(cl::sycl::queue &queue, std::int64_t n, cl::sycl::buffer &x, - std::int64_t incx, cl::sycl::buffer &result); template <> void iamin(cl::sycl::queue &queue, std::int64_t n, cl::sycl::buffer &x, std::int64_t incx, @@ -822,9 +587,6 @@ void iamin(cl::sycl::queue &queue, std::int iamin_postcondition(queue, n, x, incx, result); } -template -static 
inline void iamin(cl::sycl::queue &queue, std::int64_t n, cl::sycl::buffer &x, - std::int64_t incx, cl::sycl::buffer &result); template <> void iamin(cl::sycl::queue &queue, std::int64_t n, cl::sycl::buffer &x, std::int64_t incx, @@ -834,10 +596,6 @@ void iamin(cl::sycl::queue &queue, std::int iamin_postcondition(queue, n, x, incx, result); } -template -static inline void iamin(cl::sycl::queue &queue, std::int64_t n, - cl::sycl::buffer, 1> &x, std::int64_t incx, - cl::sycl::buffer &result); template <> void iamin(cl::sycl::queue &queue, std::int64_t n, cl::sycl::buffer, 1> &x, @@ -848,10 +606,6 @@ void iamin(cl::sycl::queue &queue, std::int iamin_postcondition(queue, n, x, incx, result); } -template -static inline void iamin(cl::sycl::queue &queue, std::int64_t n, - cl::sycl::buffer, 1> &x, std::int64_t incx, - cl::sycl::buffer &result); template <> void iamin(cl::sycl::queue &queue, std::int64_t n, cl::sycl::buffer, 1> &x, @@ -862,128 +616,6 @@ void iamin(cl::sycl::queue &queue, std::int iamin_postcondition(queue, n, x, incx, result); } -template -static inline void gemm_batch(cl::sycl::queue &queue, cl::sycl::buffer &transa, - cl::sycl::buffer &transb, - cl::sycl::buffer &m, - cl::sycl::buffer &n, - cl::sycl::buffer &k, - cl::sycl::buffer &alpha, cl::sycl::buffer &a, - cl::sycl::buffer &lda, cl::sycl::buffer &b, - cl::sycl::buffer &ldb, - cl::sycl::buffer &beta, cl::sycl::buffer &c, - cl::sycl::buffer &ldc, std::int64_t group_count, - cl::sycl::buffer &group_size); -template <> -void gemm_batch( - cl::sycl::queue &queue, cl::sycl::buffer &transa, - cl::sycl::buffer &transb, cl::sycl::buffer &m, - cl::sycl::buffer &n, cl::sycl::buffer &k, - cl::sycl::buffer &alpha, cl::sycl::buffer &a, - cl::sycl::buffer &lda, cl::sycl::buffer &b, - cl::sycl::buffer &ldb, cl::sycl::buffer &beta, - cl::sycl::buffer &c, cl::sycl::buffer &ldc, std::int64_t group_count, - cl::sycl::buffer &group_size) { - gemm_batch_precondition(queue, transa, transb, m, n, k, alpha, a, lda, b, ldb, 
beta, c, ldc, - group_count, group_size); - onemkl::cublas::gemm_batch(queue, transa, transb, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc, - group_count, group_size); - gemm_batch_postcondition(queue, transa, transb, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc, - group_count, group_size); -} - -template -static inline void gemm_batch( - cl::sycl::queue &queue, cl::sycl::buffer &transa, - cl::sycl::buffer &transb, cl::sycl::buffer &m, - cl::sycl::buffer &n, cl::sycl::buffer &k, - cl::sycl::buffer &alpha, cl::sycl::buffer &a, - cl::sycl::buffer &lda, cl::sycl::buffer &b, - cl::sycl::buffer &ldb, cl::sycl::buffer &beta, - cl::sycl::buffer &c, cl::sycl::buffer &ldc, - std::int64_t group_count, cl::sycl::buffer &group_size); -template <> -void gemm_batch( - cl::sycl::queue &queue, cl::sycl::buffer &transa, - cl::sycl::buffer &transb, cl::sycl::buffer &m, - cl::sycl::buffer &n, cl::sycl::buffer &k, - cl::sycl::buffer &alpha, cl::sycl::buffer &a, - cl::sycl::buffer &lda, cl::sycl::buffer &b, - cl::sycl::buffer &ldb, cl::sycl::buffer &beta, - cl::sycl::buffer &c, cl::sycl::buffer &ldc, - std::int64_t group_count, cl::sycl::buffer &group_size) { - gemm_batch_precondition(queue, transa, transb, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc, - group_count, group_size); - onemkl::cublas::gemm_batch(queue, transa, transb, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc, - group_count, group_size); - gemm_batch_postcondition(queue, transa, transb, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc, - group_count, group_size); -} - -template -static inline void gemm_batch( - cl::sycl::queue &queue, cl::sycl::buffer &transa, - cl::sycl::buffer &transb, cl::sycl::buffer &m, - cl::sycl::buffer &n, cl::sycl::buffer &k, - cl::sycl::buffer, 1> &alpha, cl::sycl::buffer, 1> &a, - cl::sycl::buffer &lda, cl::sycl::buffer, 1> &b, - cl::sycl::buffer &ldb, cl::sycl::buffer, 1> &beta, - cl::sycl::buffer, 1> &c, cl::sycl::buffer &ldc, - std::int64_t group_count, cl::sycl::buffer &group_size); -template 
<> -void gemm_batch( - cl::sycl::queue &queue, cl::sycl::buffer &transa, - cl::sycl::buffer &transb, cl::sycl::buffer &m, - cl::sycl::buffer &n, cl::sycl::buffer &k, - cl::sycl::buffer, 1> &alpha, cl::sycl::buffer, 1> &a, - cl::sycl::buffer &lda, cl::sycl::buffer, 1> &b, - cl::sycl::buffer &ldb, cl::sycl::buffer, 1> &beta, - cl::sycl::buffer, 1> &c, cl::sycl::buffer &ldc, - std::int64_t group_count, cl::sycl::buffer &group_size) { - gemm_batch_precondition(queue, transa, transb, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc, - group_count, group_size); - onemkl::cublas::gemm_batch(queue, transa, transb, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc, - group_count, group_size); - gemm_batch_postcondition(queue, transa, transb, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc, - group_count, group_size); -} - -template -static inline void gemm_batch( - cl::sycl::queue &queue, cl::sycl::buffer &transa, - cl::sycl::buffer &transb, cl::sycl::buffer &m, - cl::sycl::buffer &n, cl::sycl::buffer &k, - cl::sycl::buffer, 1> &alpha, cl::sycl::buffer, 1> &a, - cl::sycl::buffer &lda, cl::sycl::buffer, 1> &b, - cl::sycl::buffer &ldb, cl::sycl::buffer, 1> &beta, - cl::sycl::buffer, 1> &c, cl::sycl::buffer &ldc, - std::int64_t group_count, cl::sycl::buffer &group_size); -template <> -void gemm_batch( - cl::sycl::queue &queue, cl::sycl::buffer &transa, - cl::sycl::buffer &transb, cl::sycl::buffer &m, - cl::sycl::buffer &n, cl::sycl::buffer &k, - cl::sycl::buffer, 1> &alpha, cl::sycl::buffer, 1> &a, - cl::sycl::buffer &lda, cl::sycl::buffer, 1> &b, - cl::sycl::buffer &ldb, cl::sycl::buffer, 1> &beta, - cl::sycl::buffer, 1> &c, cl::sycl::buffer &ldc, - std::int64_t group_count, cl::sycl::buffer &group_size) { - gemm_batch_precondition(queue, transa, transb, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc, - group_count, group_size); - onemkl::cublas::gemm_batch(queue, transa, transb, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc, - group_count, group_size); - gemm_batch_postcondition(queue, 
transa, transb, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc, - group_count, group_size); -} - -template -static inline void gemm_batch(cl::sycl::queue &queue, transpose transa, transpose transb, - std::int64_t m, std::int64_t n, std::int64_t k, float alpha, - cl::sycl::buffer &a, std::int64_t lda, - std::int64_t stride_a, cl::sycl::buffer &b, - std::int64_t ldb, std::int64_t stride_b, float beta, - cl::sycl::buffer &c, std::int64_t ldc, - std::int64_t stride_c, std::int64_t batch_size); template <> void gemm_batch( cl::sycl::queue &queue, transpose transa, transpose transb, std::int64_t m, std::int64_t n, @@ -999,14 +631,6 @@ void gemm_batch( stride_b, beta, c, ldc, stride_c, batch_size); } -template -static inline void gemm_batch(cl::sycl::queue &queue, transpose transa, transpose transb, - std::int64_t m, std::int64_t n, std::int64_t k, double alpha, - cl::sycl::buffer &a, std::int64_t lda, - std::int64_t stride_a, cl::sycl::buffer &b, - std::int64_t ldb, std::int64_t stride_b, double beta, - cl::sycl::buffer &c, std::int64_t ldc, - std::int64_t stride_c, std::int64_t batch_size); template <> void gemm_batch( cl::sycl::queue &queue, transpose transa, transpose transb, std::int64_t m, std::int64_t n, @@ -1022,15 +646,6 @@ void gemm_batch( stride_b, beta, c, ldc, stride_c, batch_size); } -template -static inline void gemm_batch(cl::sycl::queue &queue, transpose transa, transpose transb, - std::int64_t m, std::int64_t n, std::int64_t k, - std::complex alpha, - cl::sycl::buffer, 1> &a, std::int64_t lda, - std::int64_t stride_a, cl::sycl::buffer, 1> &b, - std::int64_t ldb, std::int64_t stride_b, std::complex beta, - cl::sycl::buffer, 1> &c, std::int64_t ldc, - std::int64_t stride_c, std::int64_t batch_size); template <> void gemm_batch( cl::sycl::queue &queue, transpose transa, transpose transb, std::int64_t m, std::int64_t n, @@ -1047,15 +662,6 @@ void gemm_batch( stride_b, beta, c, ldc, stride_c, batch_size); } -template -static inline void 
gemm_batch(cl::sycl::queue &queue, transpose transa, transpose transb, - std::int64_t m, std::int64_t n, std::int64_t k, - std::complex alpha, - cl::sycl::buffer, 1> &a, std::int64_t lda, - std::int64_t stride_a, cl::sycl::buffer, 1> &b, - std::int64_t ldb, std::int64_t stride_b, std::complex beta, - cl::sycl::buffer, 1> &c, std::int64_t ldc, - std::int64_t stride_c, std::int64_t batch_size); template <> void gemm_batch( cl::sycl::queue &queue, transpose transa, transpose transb, std::int64_t m, std::int64_t n, @@ -1072,11 +678,6 @@ void gemm_batch( stride_b, beta, c, ldc, stride_c, batch_size); } -template -static inline void spmv(cl::sycl::queue &queue, uplo upper_lower, std::int64_t n, float alpha, - cl::sycl::buffer &a, cl::sycl::buffer &x, - std::int64_t incx, float beta, cl::sycl::buffer &y, - std::int64_t incy); template <> void spmv(cl::sycl::queue &queue, uplo upper_lower, std::int64_t n, float alpha, @@ -1089,11 +690,6 @@ void spmv(cl::sycl::queue &queue, uplo uppe spmv_postcondition(queue, upper_lower, n, alpha, a, x, incx, beta, y, incy); } -template -static inline void spmv(cl::sycl::queue &queue, uplo upper_lower, std::int64_t n, double alpha, - cl::sycl::buffer &a, cl::sycl::buffer &x, - std::int64_t incx, double beta, cl::sycl::buffer &y, - std::int64_t incy); template <> void spmv(cl::sycl::queue &queue, uplo upper_lower, std::int64_t n, double alpha, @@ -1106,12 +702,6 @@ void spmv(cl::sycl::queue &queue, uplo uppe spmv_postcondition(queue, upper_lower, n, alpha, a, x, incx, beta, y, incy); } -template -static inline void gemm_ext(cl::sycl::queue &queue, transpose transa, transpose transb, - std::int64_t m, std::int64_t n, std::int64_t k, float alpha, - cl::sycl::buffer &a, std::int64_t lda, - cl::sycl::buffer &b, std::int64_t ldb, float beta, - cl::sycl::buffer &c, std::int64_t ldc); template <> void gemm_ext(cl::sycl::queue &queue, transpose transa, transpose transb, std::int64_t m, std::int64_t n, @@ -1125,13 +715,6 @@ void 
gemm_ext(cl::sycl::queue &queue, trans gemm_ext_postcondition(queue, transa, transb, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc); } -template -static inline void gemm_ext(cl::sycl::queue &queue, transpose transa, transpose transb, - offset offsetc, std::int64_t m, std::int64_t n, std::int64_t k, - float alpha, cl::sycl::buffer &a, std::int64_t lda, - int8_t ao, cl::sycl::buffer &b, std::int64_t ldb, - uint8_t bo, float beta, cl::sycl::buffer &c, - std::int64_t ldc, cl::sycl::buffer &co); template <> void gemm_ext( cl::sycl::queue &queue, transpose transa, transpose transb, offset offsetc, std::int64_t m, @@ -1146,12 +729,6 @@ void gemm_ext( beta, c, ldc, co); } -template -static inline void gemm_ext(cl::sycl::queue &queue, transpose transa, transpose transb, - std::int64_t m, std::int64_t n, std::int64_t k, float alpha, - cl::sycl::buffer &a, std::int64_t lda, - cl::sycl::buffer &b, std::int64_t ldb, float beta, - cl::sycl::buffer &c, std::int64_t ldc); template <> void gemm_ext(cl::sycl::queue &queue, transpose transa, transpose transb, std::int64_t m, std::int64_t n, @@ -1165,12 +742,6 @@ void gemm_ext(cl::sycl::queue &queue, trans gemm_ext_postcondition(queue, transa, transb, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc); } -template -static inline void gemm_ext(cl::sycl::queue &queue, transpose transa, transpose transb, - std::int64_t m, std::int64_t n, std::int64_t k, double alpha, - cl::sycl::buffer &a, std::int64_t lda, - cl::sycl::buffer &b, std::int64_t ldb, double beta, - cl::sycl::buffer &c, std::int64_t ldc); template <> void gemm_ext(cl::sycl::queue &queue, transpose transa, transpose transb, std::int64_t m, std::int64_t n, @@ -1184,13 +755,6 @@ void gemm_ext(cl::sycl::queue &queue, trans gemm_ext_postcondition(queue, transa, transb, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc); } -template -static inline void gemm_ext(cl::sycl::queue &queue, transpose transa, transpose transb, - std::int64_t m, std::int64_t n, std::int64_t k, - std::complex alpha, 
cl::sycl::buffer, 1> &a, - std::int64_t lda, cl::sycl::buffer, 1> &b, - std::int64_t ldb, std::complex beta, - cl::sycl::buffer, 1> &c, std::int64_t ldc); template <> void gemm_ext( cl::sycl::queue &queue, transpose transa, transpose transb, std::int64_t m, std::int64_t n, @@ -1202,14 +766,6 @@ void gemm_ext( gemm_ext_postcondition(queue, transa, transb, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc); } -template -static inline void gemm_ext(cl::sycl::queue &queue, transpose transa, transpose transb, - std::int64_t m, std::int64_t n, std::int64_t k, - std::complex alpha, - cl::sycl::buffer, 1> &a, std::int64_t lda, - cl::sycl::buffer, 1> &b, std::int64_t ldb, - std::complex beta, cl::sycl::buffer, 1> &c, - std::int64_t ldc); template <> void gemm_ext( cl::sycl::queue &queue, transpose transa, transpose transb, std::int64_t m, std::int64_t n, @@ -1221,12 +777,6 @@ void gemm_ext( gemm_ext_postcondition(queue, transa, transb, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc); } -template -static inline void gemm_ext(cl::sycl::queue &queue, transpose transa, transpose transb, - std::int64_t m, std::int64_t n, std::int64_t k, half alpha, - cl::sycl::buffer &a, std::int64_t lda, - cl::sycl::buffer &b, std::int64_t ldb, half beta, - cl::sycl::buffer &c, std::int64_t ldc); template <> void gemm_ext(cl::sycl::queue &queue, transpose transa, transpose transb, std::int64_t m, std::int64_t n, @@ -1240,9 +790,6 @@ void gemm_ext(cl::sycl::queue &queue, trans gemm_ext_postcondition(queue, transa, transb, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc); } -template -static inline void swap(cl::sycl::queue &queue, std::int64_t n, cl::sycl::buffer &x, - std::int64_t incx, cl::sycl::buffer &y, std::int64_t incy); template <> void swap(cl::sycl::queue &queue, std::int64_t n, cl::sycl::buffer &x, std::int64_t incx, @@ -1252,9 +799,6 @@ void swap(cl::sycl::queue &queue, std::int6 swap_postcondition(queue, n, x, incx, y, incy); } -template -static inline void swap(cl::sycl::queue &queue, 
std::int64_t n, cl::sycl::buffer &x, - std::int64_t incx, cl::sycl::buffer &y, std::int64_t incy); template <> void swap(cl::sycl::queue &queue, std::int64_t n, cl::sycl::buffer &x, std::int64_t incx, @@ -1264,10 +808,6 @@ void swap(cl::sycl::queue &queue, std::int6 swap_postcondition(queue, n, x, incx, y, incy); } -template -static inline void swap(cl::sycl::queue &queue, std::int64_t n, - cl::sycl::buffer, 1> &x, std::int64_t incx, - cl::sycl::buffer, 1> &y, std::int64_t incy); template <> void swap(cl::sycl::queue &queue, std::int64_t n, cl::sycl::buffer, 1> &x, @@ -1279,10 +819,6 @@ void swap(cl::sycl::queue &queue, std::int6 swap_postcondition(queue, n, x, incx, y, incy); } -template -static inline void swap(cl::sycl::queue &queue, std::int64_t n, - cl::sycl::buffer, 1> &x, std::int64_t incx, - cl::sycl::buffer, 1> &y, std::int64_t incy); template <> void swap(cl::sycl::queue &queue, std::int64_t n, cl::sycl::buffer, 1> &x, @@ -1294,12 +830,6 @@ void swap(cl::sycl::queue &queue, std::int6 swap_postcondition(queue, n, x, incx, y, incy); } -template -static inline void geru(cl::sycl::queue &queue, std::int64_t m, std::int64_t n, - std::complex alpha, cl::sycl::buffer, 1> &x, - std::int64_t incx, cl::sycl::buffer, 1> &y, - std::int64_t incy, cl::sycl::buffer, 1> &a, - std::int64_t lda); template <> void geru( cl::sycl::queue &queue, std::int64_t m, std::int64_t n, std::complex alpha, @@ -1311,12 +841,6 @@ void geru( geru_postcondition(queue, m, n, alpha, x, incx, y, incy, a, lda); } -template -static inline void geru(cl::sycl::queue &queue, std::int64_t m, std::int64_t n, - std::complex alpha, cl::sycl::buffer, 1> &x, - std::int64_t incx, cl::sycl::buffer, 1> &y, - std::int64_t incy, cl::sycl::buffer, 1> &a, - std::int64_t lda); template <> void geru( cl::sycl::queue &queue, std::int64_t m, std::int64_t n, std::complex alpha, @@ -1328,10 +852,6 @@ void geru( geru_postcondition(queue, m, n, alpha, x, incx, y, incy, a, lda); } -template -static inline void 
nrm2(cl::sycl::queue &queue, std::int64_t n, - cl::sycl::buffer, 1> &x, std::int64_t incx, - cl::sycl::buffer &result); template <> void nrm2(cl::sycl::queue &queue, std::int64_t n, cl::sycl::buffer, 1> &x, @@ -1342,10 +862,6 @@ void nrm2(cl::sycl::queue &queue, std::int6 nrm2_postcondition(queue, n, x, incx, result); } -template -static inline void nrm2(cl::sycl::queue &queue, std::int64_t n, - cl::sycl::buffer, 1> &x, std::int64_t incx, - cl::sycl::buffer &result); template <> void nrm2(cl::sycl::queue &queue, std::int64_t n, cl::sycl::buffer, 1> &x, @@ -1356,9 +872,6 @@ void nrm2(cl::sycl::queue &queue, std::int6 nrm2_postcondition(queue, n, x, incx, result); } -template -static inline void nrm2(cl::sycl::queue &queue, std::int64_t n, cl::sycl::buffer &x, - std::int64_t incx, cl::sycl::buffer &result); template <> void nrm2(cl::sycl::queue &queue, std::int64_t n, cl::sycl::buffer &x, std::int64_t incx, @@ -1368,9 +881,6 @@ void nrm2(cl::sycl::queue &queue, std::int6 nrm2_postcondition(queue, n, x, incx, result); } -template -static inline void nrm2(cl::sycl::queue &queue, std::int64_t n, cl::sycl::buffer &x, - std::int64_t incx, cl::sycl::buffer &result); template <> void nrm2(cl::sycl::queue &queue, std::int64_t n, cl::sycl::buffer &x, std::int64_t incx, @@ -1380,11 +890,6 @@ void nrm2(cl::sycl::queue &queue, std::int6 nrm2_postcondition(queue, n, x, incx, result); } -template -static inline void gemm(cl::sycl::queue &queue, transpose transa, transpose transb, std::int64_t m, - std::int64_t n, std::int64_t k, float alpha, cl::sycl::buffer &a, - std::int64_t lda, cl::sycl::buffer &b, std::int64_t ldb, - float beta, cl::sycl::buffer &c, std::int64_t ldc); template <> void gemm(cl::sycl::queue &queue, transpose transa, transpose transb, std::int64_t m, std::int64_t n, @@ -1398,12 +903,6 @@ void gemm(cl::sycl::queue &queue, transpose gemm_postcondition(queue, transa, transb, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc); } -template -static inline void 
gemm(cl::sycl::queue &queue, transpose transa, transpose transb, std::int64_t m, - std::int64_t n, std::int64_t k, double alpha, - cl::sycl::buffer &a, std::int64_t lda, - cl::sycl::buffer &b, std::int64_t ldb, double beta, - cl::sycl::buffer &c, std::int64_t ldc); template <> void gemm(cl::sycl::queue &queue, transpose transa, transpose transb, std::int64_t m, std::int64_t n, @@ -1417,13 +916,6 @@ void gemm(cl::sycl::queue &queue, transpose gemm_postcondition(queue, transa, transb, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc); } -template -static inline void gemm(cl::sycl::queue &queue, transpose transa, transpose transb, std::int64_t m, - std::int64_t n, std::int64_t k, std::complex alpha, - cl::sycl::buffer, 1> &a, std::int64_t lda, - cl::sycl::buffer, 1> &b, std::int64_t ldb, - std::complex beta, cl::sycl::buffer, 1> &c, - std::int64_t ldc); template <> void gemm( cl::sycl::queue &queue, transpose transa, transpose transb, std::int64_t m, std::int64_t n, @@ -1435,13 +927,6 @@ void gemm( gemm_postcondition(queue, transa, transb, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc); } -template -static inline void gemm(cl::sycl::queue &queue, transpose transa, transpose transb, std::int64_t m, - std::int64_t n, std::int64_t k, std::complex alpha, - cl::sycl::buffer, 1> &a, std::int64_t lda, - cl::sycl::buffer, 1> &b, std::int64_t ldb, - std::complex beta, cl::sycl::buffer, 1> &c, - std::int64_t ldc); template <> void gemm( cl::sycl::queue &queue, transpose transa, transpose transb, std::int64_t m, std::int64_t n, @@ -1453,11 +938,6 @@ void gemm( gemm_postcondition(queue, transa, transb, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc); } -template -static inline void gemm(cl::sycl::queue &queue, transpose transa, transpose transb, std::int64_t m, - std::int64_t n, std::int64_t k, half alpha, cl::sycl::buffer &a, - std::int64_t lda, cl::sycl::buffer &b, std::int64_t ldb, half beta, - cl::sycl::buffer &c, std::int64_t ldc); template <> void gemm(cl::sycl::queue &queue, 
transpose transa, transpose transb, std::int64_t m, std::int64_t n, @@ -1471,11 +951,6 @@ void gemm(cl::sycl::queue &queue, transpose gemm_postcondition(queue, transa, transb, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc); } -template -static inline void herk(cl::sycl::queue &queue, uplo upper_lower, transpose trans, std::int64_t n, - std::int64_t k, float alpha, cl::sycl::buffer, 1> &a, - std::int64_t lda, float beta, cl::sycl::buffer, 1> &c, - std::int64_t ldc); template <> void herk( cl::sycl::queue &queue, uplo upper_lower, transpose trans, std::int64_t n, std::int64_t k, @@ -1486,11 +961,6 @@ void herk( herk_postcondition(queue, upper_lower, trans, n, k, alpha, a, lda, beta, c, ldc); } -template -static inline void herk(cl::sycl::queue &queue, uplo upper_lower, transpose trans, std::int64_t n, - std::int64_t k, double alpha, cl::sycl::buffer, 1> &a, - std::int64_t lda, double beta, cl::sycl::buffer, 1> &c, - std::int64_t ldc); template <> void herk( cl::sycl::queue &queue, uplo upper_lower, transpose trans, std::int64_t n, std::int64_t k, @@ -1501,11 +971,6 @@ void herk( herk_postcondition(queue, upper_lower, trans, n, k, alpha, a, lda, beta, c, ldc); } -template -static inline void ger(cl::sycl::queue &queue, std::int64_t m, std::int64_t n, float alpha, - cl::sycl::buffer &x, std::int64_t incx, - cl::sycl::buffer &y, std::int64_t incy, - cl::sycl::buffer &a, std::int64_t lda); template <> void ger(cl::sycl::queue &queue, std::int64_t m, std::int64_t n, float alpha, @@ -1517,11 +982,6 @@ void ger(cl::sycl::queue &queue, std::int64 ger_postcondition(queue, m, n, alpha, x, incx, y, incy, a, lda); } -template -static inline void ger(cl::sycl::queue &queue, std::int64_t m, std::int64_t n, double alpha, - cl::sycl::buffer &x, std::int64_t incx, - cl::sycl::buffer &y, std::int64_t incy, - cl::sycl::buffer &a, std::int64_t lda); template <> void ger(cl::sycl::queue &queue, std::int64_t m, std::int64_t n, double alpha, @@ -1533,11 +993,6 @@ void ger(cl::sycl::queue 
&queue, std::int64 ger_postcondition(queue, m, n, alpha, x, incx, y, incy, a, lda); } -template -static inline void trsm(cl::sycl::queue &queue, side left_right, uplo upper_lower, transpose trans, - diag unit_diag, std::int64_t m, std::int64_t n, float alpha, - cl::sycl::buffer &a, std::int64_t lda, - cl::sycl::buffer &b, std::int64_t ldb); template <> void trsm(cl::sycl::queue &queue, side left_right, uplo upper_lower, transpose trans, diag unit_diag, @@ -1552,11 +1007,6 @@ void trsm(cl::sycl::queue &queue, side left ldb); } -template -static inline void trsm(cl::sycl::queue &queue, side left_right, uplo upper_lower, transpose trans, - diag unit_diag, std::int64_t m, std::int64_t n, double alpha, - cl::sycl::buffer &a, std::int64_t lda, - cl::sycl::buffer &b, std::int64_t ldb); template <> void trsm(cl::sycl::queue &queue, side left_right, uplo upper_lower, transpose trans, diag unit_diag, @@ -1571,11 +1021,6 @@ void trsm(cl::sycl::queue &queue, side left ldb); } -template -static inline void trsm(cl::sycl::queue &queue, side left_right, uplo upper_lower, transpose trans, - diag unit_diag, std::int64_t m, std::int64_t n, std::complex alpha, - cl::sycl::buffer, 1> &a, std::int64_t lda, - cl::sycl::buffer, 1> &b, std::int64_t ldb); template <> void trsm( cl::sycl::queue &queue, side left_right, uplo upper_lower, transpose trans, diag unit_diag, @@ -1590,11 +1035,6 @@ void trsm( ldb); } -template -static inline void trsm(cl::sycl::queue &queue, side left_right, uplo upper_lower, transpose trans, - diag unit_diag, std::int64_t m, std::int64_t n, std::complex alpha, - cl::sycl::buffer, 1> &a, std::int64_t lda, - cl::sycl::buffer, 1> &b, std::int64_t ldb); template <> void trsm( cl::sycl::queue &queue, side left_right, uplo upper_lower, transpose trans, diag unit_diag, @@ -1609,11 +1049,6 @@ void trsm( ldb); } -template -static inline void dotu(cl::sycl::queue &queue, std::int64_t n, - cl::sycl::buffer, 1> &x, std::int64_t incx, - cl::sycl::buffer, 1> &y, std::int64_t 
incy, - cl::sycl::buffer, 1> &result); template <> void dotu(cl::sycl::queue &queue, std::int64_t n, cl::sycl::buffer, 1> &x, @@ -1626,11 +1061,6 @@ void dotu(cl::sycl::queue &queue, std::int6 dotu_postcondition(queue, n, x, incx, y, incy, result); } -template -static inline void dotu(cl::sycl::queue &queue, std::int64_t n, - cl::sycl::buffer, 1> &x, std::int64_t incx, - cl::sycl::buffer, 1> &y, std::int64_t incy, - cl::sycl::buffer, 1> &result); template <> void dotu(cl::sycl::queue &queue, std::int64_t n, cl::sycl::buffer, 1> &x, @@ -1643,13 +1073,6 @@ void dotu(cl::sycl::queue &queue, std::int6 dotu_postcondition(queue, n, x, incx, y, incy, result); } -template -static inline void hemm(cl::sycl::queue &queue, side left_right, uplo upper_lower, std::int64_t m, - std::int64_t n, std::complex alpha, - cl::sycl::buffer, 1> &a, std::int64_t lda, - cl::sycl::buffer, 1> &b, std::int64_t ldb, - std::complex beta, cl::sycl::buffer, 1> &c, - std::int64_t ldc); template <> void hemm( cl::sycl::queue &queue, side left_right, uplo upper_lower, std::int64_t m, std::int64_t n, @@ -1661,13 +1084,6 @@ void hemm( hemm_postcondition(queue, left_right, upper_lower, m, n, alpha, a, lda, b, ldb, beta, c, ldc); } -template -static inline void hemm(cl::sycl::queue &queue, side left_right, uplo upper_lower, std::int64_t m, - std::int64_t n, std::complex alpha, - cl::sycl::buffer, 1> &a, std::int64_t lda, - cl::sycl::buffer, 1> &b, std::int64_t ldb, - std::complex beta, cl::sycl::buffer, 1> &c, - std::int64_t ldc); template <> void hemm( cl::sycl::queue &queue, side left_right, uplo upper_lower, std::int64_t m, std::int64_t n, @@ -1679,11 +1095,6 @@ void hemm( hemm_postcondition(queue, left_right, upper_lower, m, n, alpha, a, lda, b, ldb, beta, c, ldc); } -template -static inline void hpr2(cl::sycl::queue &queue, uplo upper_lower, std::int64_t n, - std::complex alpha, cl::sycl::buffer, 1> &x, - std::int64_t incx, cl::sycl::buffer, 1> &y, - std::int64_t incy, cl::sycl::buffer, 1> &a); 
template <> void hpr2(cl::sycl::queue &queue, uplo upper_lower, std::int64_t n, std::complex alpha, @@ -1697,11 +1108,6 @@ void hpr2(cl::sycl::queue &queue, uplo uppe hpr2_postcondition(queue, upper_lower, n, alpha, x, incx, y, incy, a); } -template -static inline void hpr2(cl::sycl::queue &queue, uplo upper_lower, std::int64_t n, - std::complex alpha, cl::sycl::buffer, 1> &x, - std::int64_t incx, cl::sycl::buffer, 1> &y, - std::int64_t incy, cl::sycl::buffer, 1> &a); template <> void hpr2(cl::sycl::queue &queue, uplo upper_lower, std::int64_t n, std::complex alpha, @@ -1715,12 +1121,6 @@ void hpr2(cl::sycl::queue &queue, uplo uppe hpr2_postcondition(queue, upper_lower, n, alpha, x, incx, y, incy, a); } -template -static inline void gbmv(cl::sycl::queue &queue, transpose trans, std::int64_t m, std::int64_t n, - std::int64_t kl, std::int64_t ku, float alpha, - cl::sycl::buffer &a, std::int64_t lda, - cl::sycl::buffer &x, std::int64_t incx, float beta, - cl::sycl::buffer &y, std::int64_t incy); template <> void gbmv(cl::sycl::queue &queue, transpose trans, std::int64_t m, std::int64_t n, std::int64_t kl, @@ -1734,12 +1134,6 @@ void gbmv(cl::sycl::queue &queue, transpose gbmv_postcondition(queue, trans, m, n, kl, ku, alpha, a, lda, x, incx, beta, y, incy); } -template -static inline void gbmv(cl::sycl::queue &queue, transpose trans, std::int64_t m, std::int64_t n, - std::int64_t kl, std::int64_t ku, double alpha, - cl::sycl::buffer &a, std::int64_t lda, - cl::sycl::buffer &x, std::int64_t incx, double beta, - cl::sycl::buffer &y, std::int64_t incy); template <> void gbmv(cl::sycl::queue &queue, transpose trans, std::int64_t m, std::int64_t n, std::int64_t kl, @@ -1753,13 +1147,6 @@ void gbmv(cl::sycl::queue &queue, transpose gbmv_postcondition(queue, trans, m, n, kl, ku, alpha, a, lda, x, incx, beta, y, incy); } -template -static inline void gbmv(cl::sycl::queue &queue, transpose trans, std::int64_t m, std::int64_t n, - std::int64_t kl, std::int64_t ku, std::complex 
alpha, - cl::sycl::buffer, 1> &a, std::int64_t lda, - cl::sycl::buffer, 1> &x, std::int64_t incx, - std::complex beta, cl::sycl::buffer, 1> &y, - std::int64_t incy); template <> void gbmv( cl::sycl::queue &queue, transpose trans, std::int64_t m, std::int64_t n, std::int64_t kl, @@ -1771,13 +1158,6 @@ void gbmv( gbmv_postcondition(queue, trans, m, n, kl, ku, alpha, a, lda, x, incx, beta, y, incy); } -template -static inline void gbmv(cl::sycl::queue &queue, transpose trans, std::int64_t m, std::int64_t n, - std::int64_t kl, std::int64_t ku, std::complex alpha, - cl::sycl::buffer, 1> &a, std::int64_t lda, - cl::sycl::buffer, 1> &x, std::int64_t incx, - std::complex beta, cl::sycl::buffer, 1> &y, - std::int64_t incy); template <> void gbmv( cl::sycl::queue &queue, transpose trans, std::int64_t m, std::int64_t n, std::int64_t kl, @@ -1789,10 +1169,6 @@ void gbmv( gbmv_postcondition(queue, trans, m, n, kl, ku, alpha, a, lda, x, incx, beta, y, incy); } -template -static inline void tbmv(cl::sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag, - std::int64_t n, std::int64_t k, cl::sycl::buffer &a, - std::int64_t lda, cl::sycl::buffer &x, std::int64_t incx); template <> void tbmv(cl::sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag, std::int64_t n, @@ -1804,10 +1180,6 @@ void tbmv(cl::sycl::queue &queue, uplo uppe tbmv_postcondition(queue, upper_lower, trans, unit_diag, n, k, a, lda, x, incx); } -template -static inline void tbmv(cl::sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag, - std::int64_t n, std::int64_t k, cl::sycl::buffer &a, - std::int64_t lda, cl::sycl::buffer &x, std::int64_t incx); template <> void tbmv(cl::sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag, std::int64_t n, @@ -1819,11 +1191,6 @@ void tbmv(cl::sycl::queue &queue, uplo uppe tbmv_postcondition(queue, upper_lower, trans, unit_diag, n, k, a, lda, x, incx); } -template -static inline void tbmv(cl::sycl::queue &queue, 
uplo upper_lower, transpose trans, diag unit_diag, - std::int64_t n, std::int64_t k, cl::sycl::buffer, 1> &a, - std::int64_t lda, cl::sycl::buffer, 1> &x, - std::int64_t incx); template <> void tbmv( cl::sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag, std::int64_t n, @@ -1834,11 +1201,6 @@ void tbmv( tbmv_postcondition(queue, upper_lower, trans, unit_diag, n, k, a, lda, x, incx); } -template -static inline void tbmv(cl::sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag, - std::int64_t n, std::int64_t k, - cl::sycl::buffer, 1> &a, std::int64_t lda, - cl::sycl::buffer, 1> &x, std::int64_t incx); template <> void tbmv( cl::sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag, std::int64_t n, @@ -1849,11 +1211,6 @@ void tbmv( tbmv_postcondition(queue, upper_lower, trans, unit_diag, n, k, a, lda, x, incx); } -template -static inline void symm(cl::sycl::queue &queue, side left_right, uplo upper_lower, std::int64_t m, - std::int64_t n, float alpha, cl::sycl::buffer &a, - std::int64_t lda, cl::sycl::buffer &b, std::int64_t ldb, - float beta, cl::sycl::buffer &c, std::int64_t ldc); template <> void symm(cl::sycl::queue &queue, side left_right, uplo upper_lower, std::int64_t m, std::int64_t n, @@ -1866,11 +1223,6 @@ void symm(cl::sycl::queue &queue, side left symm_postcondition(queue, left_right, upper_lower, m, n, alpha, a, lda, b, ldb, beta, c, ldc); } -template -static inline void symm(cl::sycl::queue &queue, side left_right, uplo upper_lower, std::int64_t m, - std::int64_t n, double alpha, cl::sycl::buffer &a, - std::int64_t lda, cl::sycl::buffer &b, std::int64_t ldb, - double beta, cl::sycl::buffer &c, std::int64_t ldc); template <> void symm(cl::sycl::queue &queue, side left_right, uplo upper_lower, std::int64_t m, std::int64_t n, @@ -1883,13 +1235,6 @@ void symm(cl::sycl::queue &queue, side left symm_postcondition(queue, left_right, upper_lower, m, n, alpha, a, lda, b, ldb, beta, c, ldc); } -template -static 
inline void symm(cl::sycl::queue &queue, side left_right, uplo upper_lower, std::int64_t m, - std::int64_t n, std::complex alpha, - cl::sycl::buffer, 1> &a, std::int64_t lda, - cl::sycl::buffer, 1> &b, std::int64_t ldb, - std::complex beta, cl::sycl::buffer, 1> &c, - std::int64_t ldc); template <> void symm( cl::sycl::queue &queue, side left_right, uplo upper_lower, std::int64_t m, std::int64_t n, @@ -1901,13 +1246,6 @@ void symm( symm_postcondition(queue, left_right, upper_lower, m, n, alpha, a, lda, b, ldb, beta, c, ldc); } -template -static inline void symm(cl::sycl::queue &queue, side left_right, uplo upper_lower, std::int64_t m, - std::int64_t n, std::complex alpha, - cl::sycl::buffer, 1> &a, std::int64_t lda, - cl::sycl::buffer, 1> &b, std::int64_t ldb, - std::complex beta, cl::sycl::buffer, 1> &c, - std::int64_t ldc); template <> void symm( cl::sycl::queue &queue, side left_right, uplo upper_lower, std::int64_t m, std::int64_t n, @@ -1919,11 +1257,6 @@ void symm( symm_postcondition(queue, left_right, upper_lower, m, n, alpha, a, lda, b, ldb, beta, c, ldc); } -template -static inline void dotc(cl::sycl::queue &queue, std::int64_t n, - cl::sycl::buffer, 1> &x, std::int64_t incx, - cl::sycl::buffer, 1> &y, std::int64_t incy, - cl::sycl::buffer, 1> &result); template <> void dotc(cl::sycl::queue &queue, std::int64_t n, cl::sycl::buffer, 1> &x, @@ -1936,11 +1269,6 @@ void dotc(cl::sycl::queue &queue, std::int6 dotc_postcondition(queue, n, x, incx, y, incy, result); } -template -static inline void dotc(cl::sycl::queue &queue, std::int64_t n, - cl::sycl::buffer, 1> &x, std::int64_t incx, - cl::sycl::buffer, 1> &y, std::int64_t incy, - cl::sycl::buffer, 1> &result); template <> void dotc(cl::sycl::queue &queue, std::int64_t n, cl::sycl::buffer, 1> &x, @@ -1953,10 +1281,6 @@ void dotc(cl::sycl::queue &queue, std::int6 dotc_postcondition(queue, n, x, incx, y, incy, result); } -template -static inline void syr(cl::sycl::queue &queue, uplo upper_lower, std::int64_t n, 
float alpha, - cl::sycl::buffer &x, std::int64_t incx, - cl::sycl::buffer &a, std::int64_t lda); template <> void syr(cl::sycl::queue &queue, uplo upper_lower, std::int64_t n, float alpha, @@ -1967,10 +1291,6 @@ void syr(cl::sycl::queue &queue, uplo upper syr_postcondition(queue, upper_lower, n, alpha, x, incx, a, lda); } -template -static inline void syr(cl::sycl::queue &queue, uplo upper_lower, std::int64_t n, double alpha, - cl::sycl::buffer &x, std::int64_t incx, - cl::sycl::buffer &a, std::int64_t lda); template <> void syr(cl::sycl::queue &queue, uplo upper_lower, std::int64_t n, double alpha, @@ -1981,11 +1301,6 @@ void syr(cl::sycl::queue &queue, uplo upper syr_postcondition(queue, upper_lower, n, alpha, x, incx, a, lda); } -template -static inline void trmm(cl::sycl::queue &queue, side left_right, uplo upper_lower, transpose trans, - diag unit_diag, std::int64_t m, std::int64_t n, float alpha, - cl::sycl::buffer &a, std::int64_t lda, - cl::sycl::buffer &b, std::int64_t ldb); template <> void trmm(cl::sycl::queue &queue, side left_right, uplo upper_lower, transpose trans, diag unit_diag, @@ -2000,11 +1315,6 @@ void trmm(cl::sycl::queue &queue, side left ldb); } -template -static inline void trmm(cl::sycl::queue &queue, side left_right, uplo upper_lower, transpose trans, - diag unit_diag, std::int64_t m, std::int64_t n, double alpha, - cl::sycl::buffer &a, std::int64_t lda, - cl::sycl::buffer &b, std::int64_t ldb); template <> void trmm(cl::sycl::queue &queue, side left_right, uplo upper_lower, transpose trans, diag unit_diag, @@ -2019,11 +1329,6 @@ void trmm(cl::sycl::queue &queue, side left ldb); } -template -static inline void trmm(cl::sycl::queue &queue, side left_right, uplo upper_lower, transpose trans, - diag unit_diag, std::int64_t m, std::int64_t n, std::complex alpha, - cl::sycl::buffer, 1> &a, std::int64_t lda, - cl::sycl::buffer, 1> &b, std::int64_t ldb); template <> void trmm( cl::sycl::queue &queue, side left_right, uplo upper_lower, transpose 
trans, diag unit_diag, @@ -2038,11 +1343,6 @@ void trmm( ldb); } -template -static inline void trmm(cl::sycl::queue &queue, side left_right, uplo upper_lower, transpose trans, - diag unit_diag, std::int64_t m, std::int64_t n, std::complex alpha, - cl::sycl::buffer, 1> &a, std::int64_t lda, - cl::sycl::buffer, 1> &b, std::int64_t ldb); template <> void trmm( cl::sycl::queue &queue, side left_right, uplo upper_lower, transpose trans, diag unit_diag, @@ -2057,10 +1357,6 @@ void trmm( ldb); } -template -static inline void rotmg(cl::sycl::queue &queue, cl::sycl::buffer &d1, - cl::sycl::buffer &d2, cl::sycl::buffer &x1, float y1, - cl::sycl::buffer ¶m); template <> void rotmg(cl::sycl::queue &queue, cl::sycl::buffer &d1, @@ -2072,10 +1368,6 @@ void rotmg(cl::sycl::queue &queue, rotmg_postcondition(queue, d1, d2, x1, y1, param); } -template -static inline void rotmg(cl::sycl::queue &queue, cl::sycl::buffer &d1, - cl::sycl::buffer &d2, cl::sycl::buffer &x1, - double y1, cl::sycl::buffer ¶m); template <> void rotmg(cl::sycl::queue &queue, cl::sycl::buffer &d1, @@ -2087,10 +1379,6 @@ void rotmg(cl::sycl::queue &queue, rotmg_postcondition(queue, d1, d2, x1, y1, param); } -template -static inline void tpsv(cl::sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag, - std::int64_t n, cl::sycl::buffer &a, - cl::sycl::buffer &x, std::int64_t incx); template <> void tpsv(cl::sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag, std::int64_t n, @@ -2101,10 +1389,6 @@ void tpsv(cl::sycl::queue &queue, uplo uppe tpsv_postcondition(queue, upper_lower, trans, unit_diag, n, a, x, incx); } -template -static inline void tpsv(cl::sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag, - std::int64_t n, cl::sycl::buffer &a, - cl::sycl::buffer &x, std::int64_t incx); template <> void tpsv(cl::sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag, std::int64_t n, @@ -2115,10 +1399,6 @@ void tpsv(cl::sycl::queue &queue, uplo uppe 
tpsv_postcondition(queue, upper_lower, trans, unit_diag, n, a, x, incx); } -template -static inline void tpsv(cl::sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag, - std::int64_t n, cl::sycl::buffer, 1> &a, - cl::sycl::buffer, 1> &x, std::int64_t incx); template <> void tpsv(cl::sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag, std::int64_t n, @@ -2130,10 +1410,6 @@ void tpsv(cl::sycl::queue &queue, uplo uppe tpsv_postcondition(queue, upper_lower, trans, unit_diag, n, a, x, incx); } -template -static inline void tpsv(cl::sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag, - std::int64_t n, cl::sycl::buffer, 1> &a, - cl::sycl::buffer, 1> &x, std::int64_t incx); template <> void tpsv(cl::sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag, std::int64_t n, @@ -2145,10 +1421,6 @@ void tpsv(cl::sycl::queue &queue, uplo uppe tpsv_postcondition(queue, upper_lower, trans, unit_diag, n, a, x, incx); } -template -static inline void trsv(cl::sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag, - std::int64_t n, cl::sycl::buffer &a, std::int64_t lda, - cl::sycl::buffer &x, std::int64_t incx); template <> void trsv(cl::sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag, std::int64_t n, @@ -2159,10 +1431,6 @@ void trsv(cl::sycl::queue &queue, uplo uppe trsv_postcondition(queue, upper_lower, trans, unit_diag, n, a, lda, x, incx); } -template -static inline void trsv(cl::sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag, - std::int64_t n, cl::sycl::buffer &a, std::int64_t lda, - cl::sycl::buffer &x, std::int64_t incx); template <> void trsv(cl::sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag, std::int64_t n, @@ -2173,11 +1441,6 @@ void trsv(cl::sycl::queue &queue, uplo uppe trsv_postcondition(queue, upper_lower, trans, unit_diag, n, a, lda, x, incx); } -template -static inline void trsv(cl::sycl::queue &queue, uplo 
upper_lower, transpose trans, diag unit_diag, - std::int64_t n, cl::sycl::buffer, 1> &a, - std::int64_t lda, cl::sycl::buffer, 1> &x, - std::int64_t incx); template <> void trsv(cl::sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag, std::int64_t n, @@ -2190,11 +1453,6 @@ void trsv(cl::sycl::queue &queue, uplo uppe trsv_postcondition(queue, upper_lower, trans, unit_diag, n, a, lda, x, incx); } -template -static inline void trsv(cl::sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag, - std::int64_t n, cl::sycl::buffer, 1> &a, - std::int64_t lda, cl::sycl::buffer, 1> &x, - std::int64_t incx); template <> void trsv(cl::sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag, std::int64_t n, @@ -2207,9 +1465,6 @@ void trsv(cl::sycl::queue &queue, uplo uppe trsv_postcondition(queue, upper_lower, trans, unit_diag, n, a, lda, x, incx); } -template -static inline void copy(cl::sycl::queue &queue, std::int64_t n, cl::sycl::buffer &x, - std::int64_t incx, cl::sycl::buffer &y, std::int64_t incy); template <> void copy(cl::sycl::queue &queue, std::int64_t n, cl::sycl::buffer &x, std::int64_t incx, @@ -2219,9 +1474,6 @@ void copy(cl::sycl::queue &queue, std::int6 copy_postcondition(queue, n, x, incx, y, incy); } -template -static inline void copy(cl::sycl::queue &queue, std::int64_t n, cl::sycl::buffer &x, - std::int64_t incx, cl::sycl::buffer &y, std::int64_t incy); template <> void copy(cl::sycl::queue &queue, std::int64_t n, cl::sycl::buffer &x, std::int64_t incx, @@ -2231,10 +1483,6 @@ void copy(cl::sycl::queue &queue, std::int6 copy_postcondition(queue, n, x, incx, y, incy); } -template -static inline void copy(cl::sycl::queue &queue, std::int64_t n, - cl::sycl::buffer, 1> &x, std::int64_t incx, - cl::sycl::buffer, 1> &y, std::int64_t incy); template <> void copy(cl::sycl::queue &queue, std::int64_t n, cl::sycl::buffer, 1> &x, @@ -2246,10 +1494,6 @@ void copy(cl::sycl::queue &queue, std::int6 copy_postcondition(queue, n, 
x, incx, y, incy); } -template -static inline void copy(cl::sycl::queue &queue, std::int64_t n, - cl::sycl::buffer, 1> &x, std::int64_t incx, - cl::sycl::buffer, 1> &y, std::int64_t incy); template <> void copy(cl::sycl::queue &queue, std::int64_t n, cl::sycl::buffer, 1> &x, @@ -2261,12 +1505,6 @@ void copy(cl::sycl::queue &queue, std::int6 copy_postcondition(queue, n, x, incx, y, incy); } -template -static inline void hemv(cl::sycl::queue &queue, uplo upper_lower, std::int64_t n, - std::complex alpha, cl::sycl::buffer, 1> &a, - std::int64_t lda, cl::sycl::buffer, 1> &x, - std::int64_t incx, std::complex beta, - cl::sycl::buffer, 1> &y, std::int64_t incy); template <> void hemv( cl::sycl::queue &queue, uplo upper_lower, std::int64_t n, std::complex alpha, @@ -2278,12 +1516,6 @@ void hemv( hemv_postcondition(queue, upper_lower, n, alpha, a, lda, x, incx, beta, y, incy); } -template -static inline void hemv(cl::sycl::queue &queue, uplo upper_lower, std::int64_t n, - std::complex alpha, cl::sycl::buffer, 1> &a, - std::int64_t lda, cl::sycl::buffer, 1> &x, - std::int64_t incx, std::complex beta, - cl::sycl::buffer, 1> &y, std::int64_t incy); template <> void hemv( cl::sycl::queue &queue, uplo upper_lower, std::int64_t n, std::complex alpha, @@ -2295,12 +1527,6 @@ void hemv( hemv_postcondition(queue, upper_lower, n, alpha, a, lda, x, incx, beta, y, incy); } -template -static inline void gemmt(cl::sycl::queue &queue, uplo upper_lower, transpose transa, - transpose transb, std::int64_t n, std::int64_t k, float alpha, - cl::sycl::buffer &a, std::int64_t lda, - cl::sycl::buffer &b, std::int64_t ldb, float beta, - cl::sycl::buffer &c, std::int64_t ldc); template <> void gemmt(cl::sycl::queue &queue, uplo upper_lower, transpose transa, transpose transb, std::int64_t n, @@ -2317,12 +1543,6 @@ void gemmt(cl::sycl::queue &queue, uplo upp ldc); } -template -static inline void gemmt(cl::sycl::queue &queue, uplo upper_lower, transpose transa, - transpose transb, std::int64_t n, 
std::int64_t k, double alpha, - cl::sycl::buffer &a, std::int64_t lda, - cl::sycl::buffer &b, std::int64_t ldb, double beta, - cl::sycl::buffer &c, std::int64_t ldc); template <> void gemmt(cl::sycl::queue &queue, uplo upper_lower, transpose transa, transpose transb, std::int64_t n, @@ -2339,13 +1559,6 @@ void gemmt(cl::sycl::queue &queue, uplo upp ldc); } -template -static inline void gemmt(cl::sycl::queue &queue, uplo upper_lower, transpose transa, - transpose transb, std::int64_t n, std::int64_t k, - std::complex alpha, cl::sycl::buffer, 1> &a, - std::int64_t lda, cl::sycl::buffer, 1> &b, - std::int64_t ldb, std::complex beta, - cl::sycl::buffer, 1> &c, std::int64_t ldc); template <> void gemmt( cl::sycl::queue &queue, uplo upper_lower, transpose transa, transpose transb, std::int64_t n, @@ -2360,13 +1573,6 @@ void gemmt( ldc); } -template -static inline void gemmt(cl::sycl::queue &queue, uplo upper_lower, transpose transa, - transpose transb, std::int64_t n, std::int64_t k, - std::complex alpha, cl::sycl::buffer, 1> &a, - std::int64_t lda, cl::sycl::buffer, 1> &b, - std::int64_t ldb, std::complex beta, - cl::sycl::buffer, 1> &c, std::int64_t ldc); template <> void gemmt( cl::sycl::queue &queue, uplo upper_lower, transpose transa, transpose transb, std::int64_t n, @@ -2381,11 +1587,6 @@ void gemmt( ldc); } -template -static inline void sbmv(cl::sycl::queue &queue, uplo upper_lower, std::int64_t n, std::int64_t k, - float alpha, cl::sycl::buffer &a, std::int64_t lda, - cl::sycl::buffer &x, std::int64_t incx, float beta, - cl::sycl::buffer &y, std::int64_t incy); template <> void sbmv(cl::sycl::queue &queue, uplo upper_lower, std::int64_t n, std::int64_t k, float alpha, @@ -2398,11 +1599,6 @@ void sbmv(cl::sycl::queue &queue, uplo uppe sbmv_postcondition(queue, upper_lower, n, k, alpha, a, lda, x, incx, beta, y, incy); } -template -static inline void sbmv(cl::sycl::queue &queue, uplo upper_lower, std::int64_t n, std::int64_t k, - double alpha, cl::sycl::buffer &a, 
std::int64_t lda, - cl::sycl::buffer &x, std::int64_t incx, double beta, - cl::sycl::buffer &y, std::int64_t incy); template <> void sbmv(cl::sycl::queue &queue, uplo upper_lower, std::int64_t n, std::int64_t k, double alpha, @@ -2415,10 +1611,6 @@ void sbmv(cl::sycl::queue &queue, uplo uppe sbmv_postcondition(queue, upper_lower, n, k, alpha, a, lda, x, incx, beta, y, incy); } -template -static inline void asum(cl::sycl::queue &queue, std::int64_t n, - cl::sycl::buffer, 1> &x, std::int64_t incx, - cl::sycl::buffer &result); template <> void asum(cl::sycl::queue &queue, std::int64_t n, cl::sycl::buffer, 1> &x, @@ -2429,10 +1621,6 @@ void asum(cl::sycl::queue &queue, std::int6 asum_postcondition(queue, n, x, incx, result); } -template -static inline void asum(cl::sycl::queue &queue, std::int64_t n, - cl::sycl::buffer, 1> &x, std::int64_t incx, - cl::sycl::buffer &result); template <> void asum(cl::sycl::queue &queue, std::int64_t n, cl::sycl::buffer, 1> &x, @@ -2443,9 +1631,6 @@ void asum(cl::sycl::queue &queue, std::int6 asum_postcondition(queue, n, x, incx, result); } -template -static inline void asum(cl::sycl::queue &queue, std::int64_t n, cl::sycl::buffer &x, - std::int64_t incx, cl::sycl::buffer &result); template <> void asum(cl::sycl::queue &queue, std::int64_t n, cl::sycl::buffer &x, std::int64_t incx, @@ -2455,9 +1640,6 @@ void asum(cl::sycl::queue &queue, std::int6 asum_postcondition(queue, n, x, incx, result); } -template -static inline void asum(cl::sycl::queue &queue, std::int64_t n, cl::sycl::buffer &x, - std::int64_t incx, cl::sycl::buffer &result); template <> void asum(cl::sycl::queue &queue, std::int64_t n, cl::sycl::buffer &x, std::int64_t incx, @@ -2467,10 +1649,6 @@ void asum(cl::sycl::queue &queue, std::int6 asum_postcondition(queue, n, x, incx, result); } -template -static inline void tbsv(cl::sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag, - std::int64_t n, std::int64_t k, cl::sycl::buffer &a, - std::int64_t lda, 
cl::sycl::buffer &x, std::int64_t incx); template <> void tbsv(cl::sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag, std::int64_t n, @@ -2482,10 +1660,6 @@ void tbsv(cl::sycl::queue &queue, uplo uppe tbsv_postcondition(queue, upper_lower, trans, unit_diag, n, k, a, lda, x, incx); } -template -static inline void tbsv(cl::sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag, - std::int64_t n, std::int64_t k, cl::sycl::buffer &a, - std::int64_t lda, cl::sycl::buffer &x, std::int64_t incx); template <> void tbsv(cl::sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag, std::int64_t n, @@ -2497,11 +1671,6 @@ void tbsv(cl::sycl::queue &queue, uplo uppe tbsv_postcondition(queue, upper_lower, trans, unit_diag, n, k, a, lda, x, incx); } -template -static inline void tbsv(cl::sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag, - std::int64_t n, std::int64_t k, cl::sycl::buffer, 1> &a, - std::int64_t lda, cl::sycl::buffer, 1> &x, - std::int64_t incx); template <> void tbsv( cl::sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag, std::int64_t n, @@ -2512,11 +1681,6 @@ void tbsv( tbsv_postcondition(queue, upper_lower, trans, unit_diag, n, k, a, lda, x, incx); } -template -static inline void tbsv(cl::sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag, - std::int64_t n, std::int64_t k, - cl::sycl::buffer, 1> &a, std::int64_t lda, - cl::sycl::buffer, 1> &x, std::int64_t incx); template <> void tbsv( cl::sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag, std::int64_t n, @@ -2527,11 +1691,6 @@ void tbsv( tbsv_postcondition(queue, upper_lower, trans, unit_diag, n, k, a, lda, x, incx); } -template -static inline void spr2(cl::sycl::queue &queue, uplo upper_lower, std::int64_t n, float alpha, - cl::sycl::buffer &x, std::int64_t incx, - cl::sycl::buffer &y, std::int64_t incy, - cl::sycl::buffer &a); template <> void spr2(cl::sycl::queue &queue, uplo 
upper_lower, std::int64_t n, float alpha, @@ -2543,11 +1702,6 @@ void spr2(cl::sycl::queue &queue, uplo uppe spr2_postcondition(queue, upper_lower, n, alpha, x, incx, y, incy, a); } -template -static inline void spr2(cl::sycl::queue &queue, uplo upper_lower, std::int64_t n, double alpha, - cl::sycl::buffer &x, std::int64_t incx, - cl::sycl::buffer &y, std::int64_t incy, - cl::sycl::buffer &a); template <> void spr2(cl::sycl::queue &queue, uplo upper_lower, std::int64_t n, double alpha, @@ -2559,9 +1713,6 @@ void spr2(cl::sycl::queue &queue, uplo uppe spr2_postcondition(queue, upper_lower, n, alpha, x, incx, y, incy, a); } -template -static inline void iamax(cl::sycl::queue &queue, std::int64_t n, cl::sycl::buffer &x, - std::int64_t incx, cl::sycl::buffer &result); template <> void iamax(cl::sycl::queue &queue, std::int64_t n, cl::sycl::buffer &x, std::int64_t incx, @@ -2571,9 +1722,6 @@ void iamax(cl::sycl::queue &queue, std::int iamax_postcondition(queue, n, x, incx, result); } -template -static inline void iamax(cl::sycl::queue &queue, std::int64_t n, cl::sycl::buffer &x, - std::int64_t incx, cl::sycl::buffer &result); template <> void iamax(cl::sycl::queue &queue, std::int64_t n, cl::sycl::buffer &x, std::int64_t incx, @@ -2583,10 +1731,6 @@ void iamax(cl::sycl::queue &queue, std::int iamax_postcondition(queue, n, x, incx, result); } -template -static inline void iamax(cl::sycl::queue &queue, std::int64_t n, - cl::sycl::buffer, 1> &x, std::int64_t incx, - cl::sycl::buffer &result); template <> void iamax(cl::sycl::queue &queue, std::int64_t n, cl::sycl::buffer, 1> &x, @@ -2597,10 +1741,6 @@ void iamax(cl::sycl::queue &queue, std::int iamax_postcondition(queue, n, x, incx, result); } -template -static inline void iamax(cl::sycl::queue &queue, std::int64_t n, - cl::sycl::buffer, 1> &x, std::int64_t incx, - cl::sycl::buffer &result); template <> void iamax(cl::sycl::queue &queue, std::int64_t n, cl::sycl::buffer, 1> &x, @@ -2611,118 +1751,6 @@ void 
iamax(cl::sycl::queue &queue, std::int iamax_postcondition(queue, n, x, incx, result); } -template -static inline void trsm_batch(cl::sycl::queue &queue, cl::sycl::buffer &left_right, - cl::sycl::buffer &upper_lower, - cl::sycl::buffer &trans, - cl::sycl::buffer &unit_diag, - cl::sycl::buffer &m, - cl::sycl::buffer &n, - cl::sycl::buffer &alpha, cl::sycl::buffer &a, - cl::sycl::buffer &lda, cl::sycl::buffer &b, - cl::sycl::buffer &ldb, std::int64_t group_count, - cl::sycl::buffer &group_size); -template <> -void trsm_batch( - cl::sycl::queue &queue, cl::sycl::buffer &left_right, - cl::sycl::buffer &upper_lower, cl::sycl::buffer &trans, - cl::sycl::buffer &unit_diag, cl::sycl::buffer &m, - cl::sycl::buffer &n, cl::sycl::buffer &alpha, - cl::sycl::buffer &a, cl::sycl::buffer &lda, - cl::sycl::buffer &b, cl::sycl::buffer &ldb, std::int64_t group_count, - cl::sycl::buffer &group_size) { - trsm_batch_precondition(queue, left_right, upper_lower, trans, unit_diag, m, n, alpha, a, lda, - b, ldb, group_count, group_size); - onemkl::cublas::trsm_batch(queue, left_right, upper_lower, trans, unit_diag, m, n, alpha, a, - lda, b, ldb, group_count, group_size); - trsm_batch_postcondition(queue, left_right, upper_lower, trans, unit_diag, m, n, alpha, a, lda, - b, ldb, group_count, group_size); -} - -template -static inline void trsm_batch( - cl::sycl::queue &queue, cl::sycl::buffer &left_right, - cl::sycl::buffer &upper_lower, cl::sycl::buffer &trans, - cl::sycl::buffer &unit_diag, cl::sycl::buffer &m, - cl::sycl::buffer &n, cl::sycl::buffer &alpha, - cl::sycl::buffer &a, cl::sycl::buffer &lda, - cl::sycl::buffer &b, cl::sycl::buffer &ldb, - std::int64_t group_count, cl::sycl::buffer &group_size); -template <> -void trsm_batch( - cl::sycl::queue &queue, cl::sycl::buffer &left_right, - cl::sycl::buffer &upper_lower, cl::sycl::buffer &trans, - cl::sycl::buffer &unit_diag, cl::sycl::buffer &m, - cl::sycl::buffer &n, cl::sycl::buffer &alpha, - cl::sycl::buffer &a, cl::sycl::buffer 
&lda, - cl::sycl::buffer &b, cl::sycl::buffer &ldb, - std::int64_t group_count, cl::sycl::buffer &group_size) { - trsm_batch_precondition(queue, left_right, upper_lower, trans, unit_diag, m, n, alpha, a, lda, - b, ldb, group_count, group_size); - onemkl::cublas::trsm_batch(queue, left_right, upper_lower, trans, unit_diag, m, n, alpha, a, - lda, b, ldb, group_count, group_size); - trsm_batch_postcondition(queue, left_right, upper_lower, trans, unit_diag, m, n, alpha, a, lda, - b, ldb, group_count, group_size); -} - -template -static inline void trsm_batch( - cl::sycl::queue &queue, cl::sycl::buffer &left_right, - cl::sycl::buffer &upper_lower, cl::sycl::buffer &trans, - cl::sycl::buffer &unit_diag, cl::sycl::buffer &m, - cl::sycl::buffer &n, cl::sycl::buffer, 1> &alpha, - cl::sycl::buffer, 1> &a, cl::sycl::buffer &lda, - cl::sycl::buffer, 1> &b, cl::sycl::buffer &ldb, - std::int64_t group_count, cl::sycl::buffer &group_size); -template <> -void trsm_batch( - cl::sycl::queue &queue, cl::sycl::buffer &left_right, - cl::sycl::buffer &upper_lower, cl::sycl::buffer &trans, - cl::sycl::buffer &unit_diag, cl::sycl::buffer &m, - cl::sycl::buffer &n, cl::sycl::buffer, 1> &alpha, - cl::sycl::buffer, 1> &a, cl::sycl::buffer &lda, - cl::sycl::buffer, 1> &b, cl::sycl::buffer &ldb, - std::int64_t group_count, cl::sycl::buffer &group_size) { - trsm_batch_precondition(queue, left_right, upper_lower, trans, unit_diag, m, n, alpha, a, lda, - b, ldb, group_count, group_size); - onemkl::cublas::trsm_batch(queue, left_right, upper_lower, trans, unit_diag, m, n, alpha, a, - lda, b, ldb, group_count, group_size); - trsm_batch_postcondition(queue, left_right, upper_lower, trans, unit_diag, m, n, alpha, a, lda, - b, ldb, group_count, group_size); -} - -template -static inline void trsm_batch( - cl::sycl::queue &queue, cl::sycl::buffer &left_right, - cl::sycl::buffer &upper_lower, cl::sycl::buffer &trans, - cl::sycl::buffer &unit_diag, cl::sycl::buffer &m, - cl::sycl::buffer &n, 
cl::sycl::buffer, 1> &alpha, - cl::sycl::buffer, 1> &a, cl::sycl::buffer &lda, - cl::sycl::buffer, 1> &b, cl::sycl::buffer &ldb, - std::int64_t group_count, cl::sycl::buffer &group_size); -template <> -void trsm_batch( - cl::sycl::queue &queue, cl::sycl::buffer &left_right, - cl::sycl::buffer &upper_lower, cl::sycl::buffer &trans, - cl::sycl::buffer &unit_diag, cl::sycl::buffer &m, - cl::sycl::buffer &n, cl::sycl::buffer, 1> &alpha, - cl::sycl::buffer, 1> &a, cl::sycl::buffer &lda, - cl::sycl::buffer, 1> &b, cl::sycl::buffer &ldb, - std::int64_t group_count, cl::sycl::buffer &group_size) { - trsm_batch_precondition(queue, left_right, upper_lower, trans, unit_diag, m, n, alpha, a, lda, - b, ldb, group_count, group_size); - onemkl::cublas::trsm_batch(queue, left_right, upper_lower, trans, unit_diag, m, n, alpha, a, - lda, b, ldb, group_count, group_size); - trsm_batch_postcondition(queue, left_right, upper_lower, trans, unit_diag, m, n, alpha, a, lda, - b, ldb, group_count, group_size); -} - -template -static inline void trsm_batch(cl::sycl::queue &queue, side left_right, uplo upper_lower, - transpose trans, diag unit_diag, std::int64_t m, std::int64_t n, - float alpha, cl::sycl::buffer &a, std::int64_t lda, - std::int64_t stride_a, cl::sycl::buffer &b, - std::int64_t ldb, std::int64_t stride_b, std::int64_t batch_size); template <> void trsm_batch( cl::sycl::queue &queue, side left_right, uplo upper_lower, transpose trans, diag unit_diag, @@ -2737,12 +1765,6 @@ void trsm_batch( stride_a, b, ldb, stride_b, batch_size); } -template -static inline void trsm_batch(cl::sycl::queue &queue, side left_right, uplo upper_lower, - transpose trans, diag unit_diag, std::int64_t m, std::int64_t n, - double alpha, cl::sycl::buffer &a, std::int64_t lda, - std::int64_t stride_a, cl::sycl::buffer &b, - std::int64_t ldb, std::int64_t stride_b, std::int64_t batch_size); template <> void trsm_batch( cl::sycl::queue &queue, side left_right, uplo upper_lower, transpose trans, diag 
unit_diag, @@ -2757,13 +1779,6 @@ void trsm_batch( stride_a, b, ldb, stride_b, batch_size); } -template -static inline void trsm_batch(cl::sycl::queue &queue, side left_right, uplo upper_lower, - transpose trans, diag unit_diag, std::int64_t m, std::int64_t n, - std::complex alpha, - cl::sycl::buffer, 1> &a, std::int64_t lda, - std::int64_t stride_a, cl::sycl::buffer, 1> &b, - std::int64_t ldb, std::int64_t stride_b, std::int64_t batch_size); template <> void trsm_batch( cl::sycl::queue &queue, side left_right, uplo upper_lower, transpose trans, diag unit_diag, @@ -2779,13 +1794,6 @@ void trsm_batch( stride_a, b, ldb, stride_b, batch_size); } -template -static inline void trsm_batch(cl::sycl::queue &queue, side left_right, uplo upper_lower, - transpose trans, diag unit_diag, std::int64_t m, std::int64_t n, - std::complex alpha, - cl::sycl::buffer, 1> &a, std::int64_t lda, - std::int64_t stride_a, cl::sycl::buffer, 1> &b, - std::int64_t ldb, std::int64_t stride_b, std::int64_t batch_size); template <> void trsm_batch( cl::sycl::queue &queue, side left_right, uplo upper_lower, transpose trans, diag unit_diag, @@ -2801,10 +1809,6 @@ void trsm_batch( stride_a, b, ldb, stride_b, batch_size); } -template -static inline void rotm(cl::sycl::queue &queue, std::int64_t n, cl::sycl::buffer &x, - std::int64_t incx, cl::sycl::buffer &y, std::int64_t incy, - cl::sycl::buffer ¶m); template <> void rotm(cl::sycl::queue &queue, std::int64_t n, cl::sycl::buffer &x, std::int64_t incx, @@ -2815,10 +1819,6 @@ void rotm(cl::sycl::queue &queue, std::int6 rotm_postcondition(queue, n, x, incx, y, incy, param); } -template -static inline void rotm(cl::sycl::queue &queue, std::int64_t n, cl::sycl::buffer &x, - std::int64_t incx, cl::sycl::buffer &y, std::int64_t incy, - cl::sycl::buffer ¶m); template <> void rotm(cl::sycl::queue &queue, std::int64_t n, cl::sycl::buffer &x, std::int64_t incx, @@ -2829,10 +1829,6 @@ void rotm(cl::sycl::queue &queue, std::int6 rotm_postcondition(queue, n, x, 
incx, y, incy, param); } -template -static inline void rotg(cl::sycl::queue &queue, cl::sycl::buffer &a, - cl::sycl::buffer &b, cl::sycl::buffer &c, - cl::sycl::buffer &s); template <> void rotg(cl::sycl::queue &queue, cl::sycl::buffer &a, @@ -2844,10 +1840,6 @@ void rotg(cl::sycl::queue &queue, rotg_postcondition(queue, a, b, c, s); } -template -static inline void rotg(cl::sycl::queue &queue, cl::sycl::buffer &a, - cl::sycl::buffer &b, cl::sycl::buffer &c, - cl::sycl::buffer &s); template <> void rotg(cl::sycl::queue &queue, cl::sycl::buffer &a, @@ -2859,10 +1851,6 @@ void rotg(cl::sycl::queue &queue, rotg_postcondition(queue, a, b, c, s); } -template -static inline void rotg(cl::sycl::queue &queue, cl::sycl::buffer, 1> &a, - cl::sycl::buffer, 1> &b, cl::sycl::buffer &c, - cl::sycl::buffer, 1> &s); template <> void rotg(cl::sycl::queue &queue, cl::sycl::buffer, 1> &a, @@ -2874,11 +1862,6 @@ void rotg(cl::sycl::queue &queue, rotg_postcondition(queue, a, b, c, s); } -template -static inline void rotg(cl::sycl::queue &queue, cl::sycl::buffer, 1> &a, - cl::sycl::buffer, 1> &b, - cl::sycl::buffer &c, - cl::sycl::buffer, 1> &s); template <> void rotg(cl::sycl::queue &queue, cl::sycl::buffer, 1> &a, @@ -2890,11 +1873,6 @@ void rotg(cl::sycl::queue &queue, rotg_postcondition(queue, a, b, c, s); } -template -static inline void sdsdot(cl::sycl::queue &queue, std::int64_t n, float sb, - cl::sycl::buffer &x, std::int64_t incx, - cl::sycl::buffer &y, std::int64_t incy, - cl::sycl::buffer &result); template <> void sdsdot(cl::sycl::queue &queue, std::int64_t n, float sb, cl::sycl::buffer &x, std::int64_t incx, @@ -2905,12 +1883,6 @@ void sdsdot(cl::sycl::queue &queue, std::in sdsdot_postcondition(queue, n, sb, x, incx, y, incy, result); } -template -static inline void her2k(cl::sycl::queue &queue, uplo upper_lower, transpose trans, std::int64_t n, - std::int64_t k, std::complex alpha, - cl::sycl::buffer, 1> &a, std::int64_t lda, - cl::sycl::buffer, 1> &b, std::int64_t ldb, 
float beta, - cl::sycl::buffer, 1> &c, std::int64_t ldc); template <> void her2k( cl::sycl::queue &queue, uplo upper_lower, transpose trans, std::int64_t n, std::int64_t k, @@ -2922,13 +1894,6 @@ void her2k( her2k_postcondition(queue, upper_lower, trans, n, k, alpha, a, lda, b, ldb, beta, c, ldc); } -template -static inline void her2k(cl::sycl::queue &queue, uplo upper_lower, transpose trans, std::int64_t n, - std::int64_t k, std::complex alpha, - cl::sycl::buffer, 1> &a, std::int64_t lda, - cl::sycl::buffer, 1> &b, std::int64_t ldb, - double beta, cl::sycl::buffer, 1> &c, - std::int64_t ldc); template <> void her2k( cl::sycl::queue &queue, uplo upper_lower, transpose trans, std::int64_t n, std::int64_t k, @@ -2940,10 +1905,6 @@ void her2k( her2k_postcondition(queue, upper_lower, trans, n, k, alpha, a, lda, b, ldb, beta, c, ldc); } -template -static inline void dot(cl::sycl::queue &queue, std::int64_t n, cl::sycl::buffer &x, - std::int64_t incx, cl::sycl::buffer &y, std::int64_t incy, - cl::sycl::buffer &result); template <> void dot(cl::sycl::queue &queue, std::int64_t n, cl::sycl::buffer &x, std::int64_t incx, @@ -2954,10 +1915,6 @@ void dot(cl::sycl::queue &queue, std::int64 dot_postcondition(queue, n, x, incx, y, incy, result); } -template -static inline void dot(cl::sycl::queue &queue, std::int64_t n, cl::sycl::buffer &x, - std::int64_t incx, cl::sycl::buffer &y, std::int64_t incy, - cl::sycl::buffer &result); template <> void dot(cl::sycl::queue &queue, std::int64_t n, cl::sycl::buffer &x, std::int64_t incx, @@ -2968,10 +1925,6 @@ void dot(cl::sycl::queue &queue, std::int64 dot_postcondition(queue, n, x, incx, y, incy, result); } -template -static inline void dot(cl::sycl::queue &queue, std::int64_t n, cl::sycl::buffer &x, - std::int64_t incx, cl::sycl::buffer &y, std::int64_t incy, - cl::sycl::buffer &result); template <> void dot(cl::sycl::queue &queue, std::int64_t n, cl::sycl::buffer &x, std::int64_t incx, @@ -2982,11 +1935,6 @@ void dot(cl::sycl::queue 
&queue, std::int64 dot_postcondition(queue, n, x, incx, y, incy, result); } -template -static inline void symv(cl::sycl::queue &queue, uplo upper_lower, std::int64_t n, float alpha, - cl::sycl::buffer &a, std::int64_t lda, - cl::sycl::buffer &x, std::int64_t incx, float beta, - cl::sycl::buffer &y, std::int64_t incy); template <> void symv(cl::sycl::queue &queue, uplo upper_lower, std::int64_t n, float alpha, @@ -2999,11 +1947,6 @@ void symv(cl::sycl::queue &queue, uplo uppe symv_postcondition(queue, upper_lower, n, alpha, a, lda, x, incx, beta, y, incy); } -template -static inline void symv(cl::sycl::queue &queue, uplo upper_lower, std::int64_t n, double alpha, - cl::sycl::buffer &a, std::int64_t lda, - cl::sycl::buffer &x, std::int64_t incx, double beta, - cl::sycl::buffer &y, std::int64_t incy); template <> void symv(cl::sycl::queue &queue, uplo upper_lower, std::int64_t n, double alpha, @@ -3016,6 +1959,2068 @@ void symv(cl::sycl::queue &queue, uplo uppe symv_postcondition(queue, upper_lower, n, alpha, a, lda, x, incx, beta, y, incy); } +// USM APIs + +template <> +cl::sycl::event syr2( + cl::sycl::queue &queue, uplo upper_lower, std::int64_t n, float alpha, const float *x, + std::int64_t incx, const float *y, std::int64_t incy, float *a, std::int64_t lda, + const cl::sycl::vector_class &dependencies) { + syr2_precondition(queue, upper_lower, n, alpha, x, incx, y, incy, a, lda, dependencies); + auto done = + onemkl::cublas::syr2(queue, upper_lower, n, alpha, x, incx, y, incy, a, lda, dependencies); + syr2_postcondition(queue, upper_lower, n, alpha, x, incx, y, incy, a, lda, dependencies); + return done; +} + +template <> +cl::sycl::event syr2( + cl::sycl::queue &queue, uplo upper_lower, std::int64_t n, double alpha, const double *x, + std::int64_t incx, const double *y, std::int64_t incy, double *a, std::int64_t lda, + const cl::sycl::vector_class &dependencies) { + syr2_precondition(queue, upper_lower, n, alpha, x, incx, y, incy, a, lda, dependencies); + auto 
done = + onemkl::cublas::syr2(queue, upper_lower, n, alpha, x, incx, y, incy, a, lda, dependencies); + syr2_postcondition(queue, upper_lower, n, alpha, x, incx, y, incy, a, lda, dependencies); + return done; +} + +template <> +cl::sycl::event scal( + cl::sycl::queue &queue, std::int64_t n, float alpha, float *x, std::int64_t incx, + const cl::sycl::vector_class &dependencies) { + scal_precondition(queue, n, alpha, x, incx, dependencies); + auto done = onemkl::cublas::scal(queue, n, alpha, x, incx, dependencies); + scal_postcondition(queue, n, alpha, x, incx, dependencies); + return done; +} + +template <> +cl::sycl::event scal( + cl::sycl::queue &queue, std::int64_t n, double alpha, double *x, std::int64_t incx, + const cl::sycl::vector_class &dependencies) { + scal_precondition(queue, n, alpha, x, incx, dependencies); + auto done = onemkl::cublas::scal(queue, n, alpha, x, incx, dependencies); + scal_postcondition(queue, n, alpha, x, incx, dependencies); + return done; +} + +template <> +cl::sycl::event scal( + cl::sycl::queue &queue, std::int64_t n, std::complex alpha, std::complex *x, + std::int64_t incx, const cl::sycl::vector_class &dependencies) { + scal_precondition(queue, n, alpha, x, incx, dependencies); + auto done = onemkl::cublas::scal(queue, n, alpha, x, incx, dependencies); + scal_postcondition(queue, n, alpha, x, incx, dependencies); + return done; +} + +template <> +cl::sycl::event scal( + cl::sycl::queue &queue, std::int64_t n, std::complex alpha, std::complex *x, + std::int64_t incx, const cl::sycl::vector_class &dependencies) { + scal_precondition(queue, n, alpha, x, incx, dependencies); + auto done = onemkl::cublas::scal(queue, n, alpha, x, incx, dependencies); + scal_postcondition(queue, n, alpha, x, incx, dependencies); + return done; +} + +template <> +cl::sycl::event scal( + cl::sycl::queue &queue, std::int64_t n, float alpha, std::complex *x, std::int64_t incx, + const cl::sycl::vector_class &dependencies) { + scal_precondition(queue, n, 
alpha, x, incx, dependencies); + auto done = onemkl::cublas::scal(queue, n, alpha, x, incx, dependencies); + scal_postcondition(queue, n, alpha, x, incx, dependencies); + return done; +} + +template <> +cl::sycl::event scal( + cl::sycl::queue &queue, std::int64_t n, double alpha, std::complex *x, + std::int64_t incx, const cl::sycl::vector_class &dependencies) { + scal_precondition(queue, n, alpha, x, incx, dependencies); + auto done = onemkl::cublas::scal(queue, n, alpha, x, incx, dependencies); + scal_postcondition(queue, n, alpha, x, incx, dependencies); + return done; +} + +template <> +cl::sycl::event trmv( + cl::sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag, std::int64_t n, + const float *a, std::int64_t lda, float *x, std::int64_t incx, + const cl::sycl::vector_class &dependencies) { + trmv_precondition(queue, upper_lower, trans, unit_diag, n, a, lda, x, incx, dependencies); + auto done = onemkl::cublas::trmv(queue, upper_lower, trans, unit_diag, n, a, lda, x, incx, + dependencies); + trmv_postcondition(queue, upper_lower, trans, unit_diag, n, a, lda, x, incx, dependencies); + return done; +} + +template <> +cl::sycl::event trmv( + cl::sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag, std::int64_t n, + const double *a, std::int64_t lda, double *x, std::int64_t incx, + const cl::sycl::vector_class &dependencies) { + trmv_precondition(queue, upper_lower, trans, unit_diag, n, a, lda, x, incx, dependencies); + auto done = onemkl::cublas::trmv(queue, upper_lower, trans, unit_diag, n, a, lda, x, incx, + dependencies); + trmv_postcondition(queue, upper_lower, trans, unit_diag, n, a, lda, x, incx, dependencies); + return done; +} + +template <> +cl::sycl::event trmv( + cl::sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag, std::int64_t n, + const std::complex *a, std::int64_t lda, std::complex *x, std::int64_t incx, + const cl::sycl::vector_class &dependencies) { + trmv_precondition(queue, upper_lower, 
trans, unit_diag, n, a, lda, x, incx, dependencies); + auto done = onemkl::cublas::trmv(queue, upper_lower, trans, unit_diag, n, a, lda, x, incx, + dependencies); + trmv_postcondition(queue, upper_lower, trans, unit_diag, n, a, lda, x, incx, dependencies); + return done; +} + +template <> +cl::sycl::event trmv( + cl::sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag, std::int64_t n, + const std::complex *a, std::int64_t lda, std::complex *x, std::int64_t incx, + const cl::sycl::vector_class &dependencies) { + trmv_precondition(queue, upper_lower, trans, unit_diag, n, a, lda, x, incx, dependencies); + auto done = onemkl::cublas::trmv(queue, upper_lower, trans, unit_diag, n, a, lda, x, incx, + dependencies); + trmv_postcondition(queue, upper_lower, trans, unit_diag, n, a, lda, x, incx, dependencies); + return done; +} + +template <> +cl::sycl::event tpmv( + cl::sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag, std::int64_t n, + const float *a, float *x, std::int64_t incx, + const cl::sycl::vector_class &dependencies) { + tpmv_precondition(queue, upper_lower, trans, unit_diag, n, a, x, incx, dependencies); + auto done = + onemkl::cublas::tpmv(queue, upper_lower, trans, unit_diag, n, a, x, incx, dependencies); + tpmv_postcondition(queue, upper_lower, trans, unit_diag, n, a, x, incx, dependencies); + return done; +} + +template <> +cl::sycl::event tpmv( + cl::sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag, std::int64_t n, + const double *a, double *x, std::int64_t incx, + const cl::sycl::vector_class &dependencies) { + tpmv_precondition(queue, upper_lower, trans, unit_diag, n, a, x, incx, dependencies); + auto done = + onemkl::cublas::tpmv(queue, upper_lower, trans, unit_diag, n, a, x, incx, dependencies); + tpmv_postcondition(queue, upper_lower, trans, unit_diag, n, a, x, incx, dependencies); + return done; +} + +template <> +cl::sycl::event tpmv( + cl::sycl::queue &queue, uplo upper_lower, transpose 
trans, diag unit_diag, std::int64_t n, + const std::complex *a, std::complex *x, std::int64_t incx, + const cl::sycl::vector_class &dependencies) { + tpmv_precondition(queue, upper_lower, trans, unit_diag, n, a, x, incx, dependencies); + auto done = + onemkl::cublas::tpmv(queue, upper_lower, trans, unit_diag, n, a, x, incx, dependencies); + tpmv_postcondition(queue, upper_lower, trans, unit_diag, n, a, x, incx, dependencies); + return done; +} + +template <> +cl::sycl::event tpmv( + cl::sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag, std::int64_t n, + const std::complex *a, std::complex *x, std::int64_t incx, + const cl::sycl::vector_class &dependencies) { + tpmv_precondition(queue, upper_lower, trans, unit_diag, n, a, x, incx, dependencies); + auto done = + onemkl::cublas::tpmv(queue, upper_lower, trans, unit_diag, n, a, x, incx, dependencies); + tpmv_postcondition(queue, upper_lower, trans, unit_diag, n, a, x, incx, dependencies); + return done; +} + +template <> +cl::sycl::event spr( + cl::sycl::queue &queue, uplo upper_lower, std::int64_t n, float alpha, const float *x, + std::int64_t incx, float *a, const cl::sycl::vector_class &dependencies) { + spr_precondition(queue, upper_lower, n, alpha, x, incx, a, dependencies); + auto done = onemkl::cublas::spr(queue, upper_lower, n, alpha, x, incx, a, dependencies); + spr_postcondition(queue, upper_lower, n, alpha, x, incx, a, dependencies); + return done; +} + +template <> +cl::sycl::event spr( + cl::sycl::queue &queue, uplo upper_lower, std::int64_t n, double alpha, const double *x, + std::int64_t incx, double *a, const cl::sycl::vector_class &dependencies) { + spr_precondition(queue, upper_lower, n, alpha, x, incx, a, dependencies); + auto done = onemkl::cublas::spr(queue, upper_lower, n, alpha, x, incx, a, dependencies); + spr_postcondition(queue, upper_lower, n, alpha, x, incx, a, dependencies); + return done; +} + +template <> +cl::sycl::event hpmv( + cl::sycl::queue &queue, uplo 
upper_lower, std::int64_t n, std::complex alpha, + const std::complex *a, const std::complex *x, std::int64_t incx, + std::complex beta, std::complex *y, std::int64_t incy, + const cl::sycl::vector_class &dependencies) { + hpmv_precondition(queue, upper_lower, n, alpha, a, x, incx, beta, y, incy, dependencies); + auto done = + onemkl::cublas::hpmv(queue, upper_lower, n, alpha, a, x, incx, beta, y, incy, dependencies); + hpmv_postcondition(queue, upper_lower, n, alpha, a, x, incx, beta, y, incy, dependencies); + return done; +} + +template <> +cl::sycl::event hpmv( + cl::sycl::queue &queue, uplo upper_lower, std::int64_t n, std::complex alpha, + const std::complex *a, const std::complex *x, std::int64_t incx, + std::complex beta, std::complex *y, std::int64_t incy, + const cl::sycl::vector_class &dependencies) { + hpmv_precondition(queue, upper_lower, n, alpha, a, x, incx, beta, y, incy, dependencies); + auto done = + onemkl::cublas::hpmv(queue, upper_lower, n, alpha, a, x, incx, beta, y, incy, dependencies); + hpmv_postcondition(queue, upper_lower, n, alpha, a, x, incx, beta, y, incy, dependencies); + return done; +} + +template <> +cl::sycl::event syrk( + cl::sycl::queue &queue, uplo upper_lower, transpose trans, std::int64_t n, std::int64_t k, + float alpha, const float *a, std::int64_t lda, float beta, float *c, std::int64_t ldc, + const cl::sycl::vector_class &dependencies) { + syrk_precondition(queue, upper_lower, trans, n, k, alpha, a, lda, beta, c, ldc, dependencies); + auto done = onemkl::cublas::syrk(queue, upper_lower, trans, n, k, alpha, a, lda, beta, c, ldc, + dependencies); + syrk_postcondition(queue, upper_lower, trans, n, k, alpha, a, lda, beta, c, ldc, dependencies); + return done; +} + +template <> +cl::sycl::event syrk( + cl::sycl::queue &queue, uplo upper_lower, transpose trans, std::int64_t n, std::int64_t k, + double alpha, const double *a, std::int64_t lda, double beta, double *c, std::int64_t ldc, + const cl::sycl::vector_class &dependencies) 
{ + syrk_precondition(queue, upper_lower, trans, n, k, alpha, a, lda, beta, c, ldc, dependencies); + auto done = onemkl::cublas::syrk(queue, upper_lower, trans, n, k, alpha, a, lda, beta, c, ldc, + dependencies); + syrk_postcondition(queue, upper_lower, trans, n, k, alpha, a, lda, beta, c, ldc, dependencies); + return done; +} + +template <> +cl::sycl::event syrk( + cl::sycl::queue &queue, uplo upper_lower, transpose trans, std::int64_t n, std::int64_t k, + std::complex alpha, const std::complex *a, std::int64_t lda, + std::complex beta, std::complex *c, std::int64_t ldc, + const cl::sycl::vector_class &dependencies) { + syrk_precondition(queue, upper_lower, trans, n, k, alpha, a, lda, beta, c, ldc, dependencies); + auto done = onemkl::cublas::syrk(queue, upper_lower, trans, n, k, alpha, a, lda, beta, c, ldc, + dependencies); + syrk_postcondition(queue, upper_lower, trans, n, k, alpha, a, lda, beta, c, ldc, dependencies); + return done; +} + +template <> +cl::sycl::event syrk( + cl::sycl::queue &queue, uplo upper_lower, transpose trans, std::int64_t n, std::int64_t k, + std::complex alpha, const std::complex *a, std::int64_t lda, + std::complex beta, std::complex *c, std::int64_t ldc, + const cl::sycl::vector_class &dependencies) { + syrk_precondition(queue, upper_lower, trans, n, k, alpha, a, lda, beta, c, ldc, dependencies); + auto done = onemkl::cublas::syrk(queue, upper_lower, trans, n, k, alpha, a, lda, beta, c, ldc, + dependencies); + syrk_postcondition(queue, upper_lower, trans, n, k, alpha, a, lda, beta, c, ldc, dependencies); + return done; +} + +template <> +cl::sycl::event her2( + cl::sycl::queue &queue, uplo upper_lower, std::int64_t n, std::complex alpha, + const std::complex *x, std::int64_t incx, const std::complex *y, + std::int64_t incy, std::complex *a, std::int64_t lda, + const cl::sycl::vector_class &dependencies) { + her2_precondition(queue, upper_lower, n, alpha, x, incx, y, incy, a, lda, dependencies); + auto done = + 
onemkl::cublas::her2(queue, upper_lower, n, alpha, x, incx, y, incy, a, lda, dependencies); + her2_postcondition(queue, upper_lower, n, alpha, x, incx, y, incy, a, lda, dependencies); + return done; +} + +template <> +cl::sycl::event her2( + cl::sycl::queue &queue, uplo upper_lower, std::int64_t n, std::complex alpha, + const std::complex *x, std::int64_t incx, const std::complex *y, + std::int64_t incy, std::complex *a, std::int64_t lda, + const cl::sycl::vector_class &dependencies) { + her2_precondition(queue, upper_lower, n, alpha, x, incx, y, incy, a, lda, dependencies); + auto done = + onemkl::cublas::her2(queue, upper_lower, n, alpha, x, incx, y, incy, a, lda, dependencies); + her2_postcondition(queue, upper_lower, n, alpha, x, incx, y, incy, a, lda, dependencies); + return done; +} + +template <> +cl::sycl::event hbmv( + cl::sycl::queue &queue, uplo upper_lower, std::int64_t n, std::int64_t k, + std::complex alpha, const std::complex *a, std::int64_t lda, + const std::complex *x, std::int64_t incx, std::complex beta, + std::complex *y, std::int64_t incy, + const cl::sycl::vector_class &dependencies) { + hbmv_precondition(queue, upper_lower, n, k, alpha, a, lda, x, incx, beta, y, incy, + dependencies); + auto done = onemkl::cublas::hbmv(queue, upper_lower, n, k, alpha, a, lda, x, incx, beta, y, + incy, dependencies); + hbmv_postcondition(queue, upper_lower, n, k, alpha, a, lda, x, incx, beta, y, incy, + dependencies); + return done; +} + +template <> +cl::sycl::event hbmv( + cl::sycl::queue &queue, uplo upper_lower, std::int64_t n, std::int64_t k, + std::complex alpha, const std::complex *a, std::int64_t lda, + const std::complex *x, std::int64_t incx, std::complex beta, + std::complex *y, std::int64_t incy, + const cl::sycl::vector_class &dependencies) { + hbmv_precondition(queue, upper_lower, n, k, alpha, a, lda, x, incx, beta, y, incy, + dependencies); + auto done = onemkl::cublas::hbmv(queue, upper_lower, n, k, alpha, a, lda, x, incx, beta, y, + incy, 
dependencies); + hbmv_postcondition(queue, upper_lower, n, k, alpha, a, lda, x, incx, beta, y, incy, + dependencies); + return done; +} + +template <> +cl::sycl::event rot( + cl::sycl::queue &queue, std::int64_t n, std::complex *x, std::int64_t incx, + std::complex *y, std::int64_t incy, float c, float s, + const cl::sycl::vector_class &dependencies) { + rot_precondition(queue, n, x, incx, y, incy, c, s, dependencies); + auto done = onemkl::cublas::rot(queue, n, x, incx, y, incy, c, s, dependencies); + rot_postcondition(queue, n, x, incx, y, incy, c, s, dependencies); + return done; +} + +template <> +cl::sycl::event rot( + cl::sycl::queue &queue, std::int64_t n, std::complex *x, std::int64_t incx, + std::complex *y, std::int64_t incy, double c, double s, + const cl::sycl::vector_class &dependencies) { + rot_precondition(queue, n, x, incx, y, incy, c, s, dependencies); + auto done = onemkl::cublas::rot(queue, n, x, incx, y, incy, c, s, dependencies); + rot_postcondition(queue, n, x, incx, y, incy, c, s, dependencies); + return done; +} + +template <> +cl::sycl::event rot( + cl::sycl::queue &queue, std::int64_t n, float *x, std::int64_t incx, float *y, + std::int64_t incy, float c, float s, + const cl::sycl::vector_class &dependencies) { + rot_precondition(queue, n, x, incx, y, incy, c, s, dependencies); + auto done = onemkl::cublas::rot(queue, n, x, incx, y, incy, c, s, dependencies); + rot_postcondition(queue, n, x, incx, y, incy, c, s, dependencies); + return done; +} + +template <> +cl::sycl::event rot( + cl::sycl::queue &queue, std::int64_t n, double *x, std::int64_t incx, double *y, + std::int64_t incy, double c, double s, + const cl::sycl::vector_class &dependencies) { + rot_precondition(queue, n, x, incx, y, incy, c, s, dependencies); + auto done = onemkl::cublas::rot(queue, n, x, incx, y, incy, c, s, dependencies); + rot_postcondition(queue, n, x, incx, y, incy, c, s, dependencies); + return done; +} + +template <> +cl::sycl::event axpy( + cl::sycl::queue 
&queue, std::int64_t n, float alpha, const float *x, std::int64_t incx, + float *y, std::int64_t incy, const cl::sycl::vector_class &dependencies) { + axpy_precondition(queue, n, alpha, x, incx, y, incy, dependencies); + auto done = onemkl::cublas::axpy(queue, n, alpha, x, incx, y, incy, dependencies); + axpy_postcondition(queue, n, alpha, x, incx, y, incy, dependencies); + return done; +} + +template <> +cl::sycl::event axpy( + cl::sycl::queue &queue, std::int64_t n, double alpha, const double *x, std::int64_t incx, + double *y, std::int64_t incy, const cl::sycl::vector_class &dependencies) { + axpy_precondition(queue, n, alpha, x, incx, y, incy, dependencies); + auto done = onemkl::cublas::axpy(queue, n, alpha, x, incx, y, incy, dependencies); + axpy_postcondition(queue, n, alpha, x, incx, y, incy, dependencies); + return done; +} + +template <> +cl::sycl::event axpy( + cl::sycl::queue &queue, std::int64_t n, std::complex alpha, const std::complex *x, + std::int64_t incx, std::complex *y, std::int64_t incy, + const cl::sycl::vector_class &dependencies) { + axpy_precondition(queue, n, alpha, x, incx, y, incy, dependencies); + auto done = onemkl::cublas::axpy(queue, n, alpha, x, incx, y, incy, dependencies); + axpy_postcondition(queue, n, alpha, x, incx, y, incy, dependencies); + return done; +} + +template <> +cl::sycl::event axpy( + cl::sycl::queue &queue, std::int64_t n, std::complex alpha, + const std::complex *x, std::int64_t incx, std::complex *y, std::int64_t incy, + const cl::sycl::vector_class &dependencies) { + axpy_precondition(queue, n, alpha, x, incx, y, incy, dependencies); + auto done = onemkl::cublas::axpy(queue, n, alpha, x, incx, y, incy, dependencies); + axpy_postcondition(queue, n, alpha, x, incx, y, incy, dependencies); + return done; +} + +template <> +cl::sycl::event axpy_batch( + cl::sycl::queue &queue, std::int64_t *n, float *alpha, const float **x, std::int64_t *incx, + float **y, std::int64_t *incy, std::int64_t group_count, std::int64_t 
*group_size, + const cl::sycl::vector_class &dependencies) { + axpy_batch_precondition(queue, n, alpha, x, incx, y, incy, group_count, group_size, + dependencies); + auto done = onemkl::cublas::axpy_batch(queue, n, alpha, x, incx, y, incy, group_count, + group_size, dependencies); + axpy_batch_postcondition(queue, n, alpha, x, incx, y, incy, group_count, group_size, + dependencies); + return done; +} + +template <> +cl::sycl::event axpy_batch( + cl::sycl::queue &queue, std::int64_t *n, double *alpha, const double **x, std::int64_t *incx, + double **y, std::int64_t *incy, std::int64_t group_count, std::int64_t *group_size, + const cl::sycl::vector_class &dependencies) { + axpy_batch_precondition(queue, n, alpha, x, incx, y, incy, group_count, group_size, + dependencies); + auto done = onemkl::cublas::axpy_batch(queue, n, alpha, x, incx, y, incy, group_count, + group_size, dependencies); + axpy_batch_postcondition(queue, n, alpha, x, incx, y, incy, group_count, group_size, + dependencies); + return done; +} + +template <> +cl::sycl::event axpy_batch( + cl::sycl::queue &queue, std::int64_t *n, std::complex *alpha, + const std::complex **x, std::int64_t *incx, std::complex **y, std::int64_t *incy, + std::int64_t group_count, std::int64_t *group_size, + const cl::sycl::vector_class &dependencies) { + axpy_batch_precondition(queue, n, alpha, x, incx, y, incy, group_count, group_size, + dependencies); + auto done = onemkl::cublas::axpy_batch(queue, n, alpha, x, incx, y, incy, group_count, + group_size, dependencies); + axpy_batch_postcondition(queue, n, alpha, x, incx, y, incy, group_count, group_size, + dependencies); + return done; +} + +template <> +cl::sycl::event axpy_batch( + cl::sycl::queue &queue, std::int64_t *n, std::complex *alpha, + const std::complex **x, std::int64_t *incx, std::complex **y, + std::int64_t *incy, std::int64_t group_count, std::int64_t *group_size, + const cl::sycl::vector_class &dependencies) { + axpy_batch_precondition(queue, n, alpha, x, 
incx, y, incy, group_count, group_size, + dependencies); + auto done = onemkl::cublas::axpy_batch(queue, n, alpha, x, incx, y, incy, group_count, + group_size, dependencies); + axpy_batch_postcondition(queue, n, alpha, x, incx, y, incy, group_count, group_size, + dependencies); + return done; +} + +template <> +cl::sycl::event gerc( + cl::sycl::queue &queue, std::int64_t m, std::int64_t n, std::complex alpha, + const std::complex *x, std::int64_t incx, const std::complex *y, + std::int64_t incy, std::complex *a, std::int64_t lda, + const cl::sycl::vector_class &dependencies) { + gerc_precondition(queue, m, n, alpha, x, incx, y, incy, a, lda, dependencies); + auto done = onemkl::cublas::gerc(queue, m, n, alpha, x, incx, y, incy, a, lda, dependencies); + gerc_postcondition(queue, m, n, alpha, x, incx, y, incy, a, lda, dependencies); + return done; +} + +template <> +cl::sycl::event gerc( + cl::sycl::queue &queue, std::int64_t m, std::int64_t n, std::complex alpha, + const std::complex *x, std::int64_t incx, const std::complex *y, + std::int64_t incy, std::complex *a, std::int64_t lda, + const cl::sycl::vector_class &dependencies) { + gerc_precondition(queue, m, n, alpha, x, incx, y, incy, a, lda, dependencies); + auto done = onemkl::cublas::gerc(queue, m, n, alpha, x, incx, y, incy, a, lda, dependencies); + gerc_postcondition(queue, m, n, alpha, x, incx, y, incy, a, lda, dependencies); + return done; +} + +template <> +cl::sycl::event syr2k( + cl::sycl::queue &queue, uplo upper_lower, transpose trans, std::int64_t n, std::int64_t k, + float alpha, const float *a, std::int64_t lda, const float *b, std::int64_t ldb, float beta, + float *c, std::int64_t ldc, const cl::sycl::vector_class &dependencies) { + syr2k_precondition(queue, upper_lower, trans, n, k, alpha, a, lda, b, ldb, beta, c, ldc, + dependencies); + auto done = onemkl::cublas::syr2k(queue, upper_lower, trans, n, k, alpha, a, lda, b, ldb, beta, + c, ldc, dependencies); + syr2k_postcondition(queue, 
upper_lower, trans, n, k, alpha, a, lda, b, ldb, beta, c, ldc, + dependencies); + return done; +} + +template <> +cl::sycl::event syr2k( + cl::sycl::queue &queue, uplo upper_lower, transpose trans, std::int64_t n, std::int64_t k, + double alpha, const double *a, std::int64_t lda, const double *b, std::int64_t ldb, double beta, + double *c, std::int64_t ldc, const cl::sycl::vector_class &dependencies) { + syr2k_precondition(queue, upper_lower, trans, n, k, alpha, a, lda, b, ldb, beta, c, ldc, + dependencies); + auto done = onemkl::cublas::syr2k(queue, upper_lower, trans, n, k, alpha, a, lda, b, ldb, beta, + c, ldc, dependencies); + syr2k_postcondition(queue, upper_lower, trans, n, k, alpha, a, lda, b, ldb, beta, c, ldc, + dependencies); + return done; +} + +template <> +cl::sycl::event syr2k( + cl::sycl::queue &queue, uplo upper_lower, transpose trans, std::int64_t n, std::int64_t k, + std::complex alpha, const std::complex *a, std::int64_t lda, + const std::complex *b, std::int64_t ldb, std::complex beta, + std::complex *c, std::int64_t ldc, + const cl::sycl::vector_class &dependencies) { + syr2k_precondition(queue, upper_lower, trans, n, k, alpha, a, lda, b, ldb, beta, c, ldc, + dependencies); + auto done = onemkl::cublas::syr2k(queue, upper_lower, trans, n, k, alpha, a, lda, b, ldb, beta, + c, ldc, dependencies); + syr2k_postcondition(queue, upper_lower, trans, n, k, alpha, a, lda, b, ldb, beta, c, ldc, + dependencies); + return done; +} + +template <> +cl::sycl::event syr2k( + cl::sycl::queue &queue, uplo upper_lower, transpose trans, std::int64_t n, std::int64_t k, + std::complex alpha, const std::complex *a, std::int64_t lda, + const std::complex *b, std::int64_t ldb, std::complex beta, + std::complex *c, std::int64_t ldc, + const cl::sycl::vector_class &dependencies) { + syr2k_precondition(queue, upper_lower, trans, n, k, alpha, a, lda, b, ldb, beta, c, ldc, + dependencies); + auto done = onemkl::cublas::syr2k(queue, upper_lower, trans, n, k, alpha, a, lda, 
b, ldb, beta, + c, ldc, dependencies); + syr2k_postcondition(queue, upper_lower, trans, n, k, alpha, a, lda, b, ldb, beta, c, ldc, + dependencies); + return done; +} + +template <> +cl::sycl::event gemv( + cl::sycl::queue &queue, transpose trans, std::int64_t m, std::int64_t n, float alpha, + const float *a, std::int64_t lda, const float *x, std::int64_t incx, float beta, float *y, + std::int64_t incy, const cl::sycl::vector_class &dependencies) { + gemv_precondition(queue, trans, m, n, alpha, a, lda, x, incx, beta, y, incy, dependencies); + auto done = onemkl::cublas::gemv(queue, trans, m, n, alpha, a, lda, x, incx, beta, y, incy, + dependencies); + gemv_postcondition(queue, trans, m, n, alpha, a, lda, x, incx, beta, y, incy, dependencies); + return done; +} + +template <> +cl::sycl::event gemv( + cl::sycl::queue &queue, transpose trans, std::int64_t m, std::int64_t n, double alpha, + const double *a, std::int64_t lda, const double *x, std::int64_t incx, double beta, double *y, + std::int64_t incy, const cl::sycl::vector_class &dependencies) { + gemv_precondition(queue, trans, m, n, alpha, a, lda, x, incx, beta, y, incy, dependencies); + auto done = onemkl::cublas::gemv(queue, trans, m, n, alpha, a, lda, x, incx, beta, y, incy, + dependencies); + gemv_postcondition(queue, trans, m, n, alpha, a, lda, x, incx, beta, y, incy, dependencies); + return done; +} + +template <> +cl::sycl::event gemv( + cl::sycl::queue &queue, transpose trans, std::int64_t m, std::int64_t n, + std::complex alpha, const std::complex *a, std::int64_t lda, + const std::complex *x, std::int64_t incx, std::complex beta, + std::complex *y, std::int64_t incy, + const cl::sycl::vector_class &dependencies) { + gemv_precondition(queue, trans, m, n, alpha, a, lda, x, incx, beta, y, incy, dependencies); + auto done = onemkl::cublas::gemv(queue, trans, m, n, alpha, a, lda, x, incx, beta, y, incy, + dependencies); + gemv_postcondition(queue, trans, m, n, alpha, a, lda, x, incx, beta, y, incy, 
dependencies); + return done; +} + +template <> +cl::sycl::event gemv( + cl::sycl::queue &queue, transpose trans, std::int64_t m, std::int64_t n, + std::complex alpha, const std::complex *a, std::int64_t lda, + const std::complex *x, std::int64_t incx, std::complex beta, + std::complex *y, std::int64_t incy, + const cl::sycl::vector_class &dependencies) { + gemv_precondition(queue, trans, m, n, alpha, a, lda, x, incx, beta, y, incy, dependencies); + auto done = onemkl::cublas::gemv(queue, trans, m, n, alpha, a, lda, x, incx, beta, y, incy, + dependencies); + gemv_postcondition(queue, trans, m, n, alpha, a, lda, x, incx, beta, y, incy, dependencies); + return done; +} + +template <> +cl::sycl::event her( + cl::sycl::queue &queue, uplo upper_lower, std::int64_t n, float alpha, + const std::complex *x, std::int64_t incx, std::complex *a, std::int64_t lda, + const cl::sycl::vector_class &dependencies) { + her_precondition(queue, upper_lower, n, alpha, x, incx, a, lda, dependencies); + auto done = onemkl::cublas::her(queue, upper_lower, n, alpha, x, incx, a, lda, dependencies); + her_postcondition(queue, upper_lower, n, alpha, x, incx, a, lda, dependencies); + return done; +} + +template <> +cl::sycl::event her( + cl::sycl::queue &queue, uplo upper_lower, std::int64_t n, double alpha, + const std::complex *x, std::int64_t incx, std::complex *a, std::int64_t lda, + const cl::sycl::vector_class &dependencies) { + her_precondition(queue, upper_lower, n, alpha, x, incx, a, lda, dependencies); + auto done = onemkl::cublas::her(queue, upper_lower, n, alpha, x, incx, a, lda, dependencies); + her_postcondition(queue, upper_lower, n, alpha, x, incx, a, lda, dependencies); + return done; +} + +template <> +cl::sycl::event hpr( + cl::sycl::queue &queue, uplo upper_lower, std::int64_t n, float alpha, + const std::complex *x, std::int64_t incx, std::complex *a, + const cl::sycl::vector_class &dependencies) { + hpr_precondition(queue, upper_lower, n, alpha, x, incx, a, dependencies); 
+ auto done = onemkl::cublas::hpr(queue, upper_lower, n, alpha, x, incx, a, dependencies); + hpr_postcondition(queue, upper_lower, n, alpha, x, incx, a, dependencies); + return done; +} + +template <> +cl::sycl::event hpr( + cl::sycl::queue &queue, uplo upper_lower, std::int64_t n, double alpha, + const std::complex *x, std::int64_t incx, std::complex *a, + const cl::sycl::vector_class &dependencies) { + hpr_precondition(queue, upper_lower, n, alpha, x, incx, a, dependencies); + auto done = onemkl::cublas::hpr(queue, upper_lower, n, alpha, x, incx, a, dependencies); + hpr_postcondition(queue, upper_lower, n, alpha, x, incx, a, dependencies); + return done; +} + +template <> +cl::sycl::event iamin( + cl::sycl::queue &queue, std::int64_t n, const float *x, std::int64_t incx, std::int64_t *result, + const cl::sycl::vector_class &dependencies) { + iamin_precondition(queue, n, x, incx, result, dependencies); + auto done = onemkl::cublas::iamin(queue, n, x, incx, result, dependencies); + iamin_postcondition(queue, n, x, incx, result, dependencies); + return done; +} + +template <> +cl::sycl::event iamin( + cl::sycl::queue &queue, std::int64_t n, const double *x, std::int64_t incx, + std::int64_t *result, const cl::sycl::vector_class &dependencies) { + iamin_precondition(queue, n, x, incx, result, dependencies); + auto done = onemkl::cublas::iamin(queue, n, x, incx, result, dependencies); + iamin_postcondition(queue, n, x, incx, result, dependencies); + return done; +} + +template <> +cl::sycl::event iamin( + cl::sycl::queue &queue, std::int64_t n, const std::complex *x, std::int64_t incx, + std::int64_t *result, const cl::sycl::vector_class &dependencies) { + iamin_precondition(queue, n, x, incx, result, dependencies); + auto done = onemkl::cublas::iamin(queue, n, x, incx, result, dependencies); + iamin_postcondition(queue, n, x, incx, result, dependencies); + return done; +} + +template <> +cl::sycl::event iamin( + cl::sycl::queue &queue, std::int64_t n, const 
std::complex *x, std::int64_t incx, + std::int64_t *result, const cl::sycl::vector_class &dependencies) { + iamin_precondition(queue, n, x, incx, result, dependencies); + auto done = onemkl::cublas::iamin(queue, n, x, incx, result, dependencies); + iamin_postcondition(queue, n, x, incx, result, dependencies); + return done; +} + +template <> +cl::sycl::event gemm_batch( + cl::sycl::queue &queue, transpose *transa, transpose *transb, std::int64_t *m, std::int64_t *n, + std::int64_t *k, float *alpha, const float **a, std::int64_t *lda, const float **b, + std::int64_t *ldb, float *beta, float **c, std::int64_t *ldc, std::int64_t group_count, + std::int64_t *group_size, const cl::sycl::vector_class &dependencies) { + gemm_batch_precondition(queue, transa, transb, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc, + group_count, group_size, dependencies); + auto done = onemkl::cublas::gemm_batch(queue, transa, transb, m, n, k, alpha, a, lda, b, ldb, + beta, c, ldc, group_count, group_size, dependencies); + gemm_batch_postcondition(queue, transa, transb, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc, + group_count, group_size, dependencies); + return done; +} + +template <> +cl::sycl::event gemm_batch( + cl::sycl::queue &queue, transpose *transa, transpose *transb, std::int64_t *m, std::int64_t *n, + std::int64_t *k, double *alpha, const double **a, std::int64_t *lda, const double **b, + std::int64_t *ldb, double *beta, double **c, std::int64_t *ldc, std::int64_t group_count, + std::int64_t *group_size, const cl::sycl::vector_class &dependencies) { + gemm_batch_precondition(queue, transa, transb, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc, + group_count, group_size, dependencies); + auto done = onemkl::cublas::gemm_batch(queue, transa, transb, m, n, k, alpha, a, lda, b, ldb, + beta, c, ldc, group_count, group_size, dependencies); + gemm_batch_postcondition(queue, transa, transb, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc, + group_count, group_size, dependencies); + return 
done; +} + +template <> +cl::sycl::event gemm_batch( + cl::sycl::queue &queue, transpose *transa, transpose *transb, std::int64_t *m, std::int64_t *n, + std::int64_t *k, std::complex *alpha, const std::complex **a, std::int64_t *lda, + const std::complex **b, std::int64_t *ldb, std::complex *beta, + std::complex **c, std::int64_t *ldc, std::int64_t group_count, std::int64_t *group_size, + const cl::sycl::vector_class &dependencies) { + gemm_batch_precondition(queue, transa, transb, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc, + group_count, group_size, dependencies); + auto done = onemkl::cublas::gemm_batch(queue, transa, transb, m, n, k, alpha, a, lda, b, ldb, + beta, c, ldc, group_count, group_size, dependencies); + gemm_batch_postcondition(queue, transa, transb, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc, + group_count, group_size, dependencies); + return done; +} + +template <> +cl::sycl::event gemm_batch( + cl::sycl::queue &queue, transpose *transa, transpose *transb, std::int64_t *m, std::int64_t *n, + std::int64_t *k, std::complex *alpha, const std::complex **a, std::int64_t *lda, + const std::complex **b, std::int64_t *ldb, std::complex *beta, + std::complex **c, std::int64_t *ldc, std::int64_t group_count, std::int64_t *group_size, + const cl::sycl::vector_class &dependencies) { + gemm_batch_precondition(queue, transa, transb, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc, + group_count, group_size, dependencies); + auto done = onemkl::cublas::gemm_batch(queue, transa, transb, m, n, k, alpha, a, lda, b, ldb, + beta, c, ldc, group_count, group_size, dependencies); + gemm_batch_postcondition(queue, transa, transb, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc, + group_count, group_size, dependencies); + return done; +} + +template <> +cl::sycl::event gemm_batch( + cl::sycl::queue &queue, transpose transa, transpose transb, std::int64_t m, std::int64_t n, + std::int64_t k, float alpha, const float *a, std::int64_t lda, std::int64_t stride_a, + const float 
*b, std::int64_t ldb, std::int64_t stride_b, float beta, float *c, std::int64_t ldc, + std::int64_t stride_c, std::int64_t batch_size, + const cl::sycl::vector_class &dependencies) { + gemm_batch_precondition(queue, transa, transb, m, n, k, alpha, a, lda, stride_a, b, ldb, + stride_b, beta, c, ldc, stride_c, batch_size, dependencies); + auto done = + onemkl::cublas::gemm_batch(queue, transa, transb, m, n, k, alpha, a, lda, stride_a, b, ldb, + stride_b, beta, c, ldc, stride_c, batch_size, dependencies); + gemm_batch_postcondition(queue, transa, transb, m, n, k, alpha, a, lda, stride_a, b, ldb, + stride_b, beta, c, ldc, stride_c, batch_size, dependencies); + return done; +} + +template <> +cl::sycl::event gemm_batch( + cl::sycl::queue &queue, transpose transa, transpose transb, std::int64_t m, std::int64_t n, + std::int64_t k, double alpha, const double *a, std::int64_t lda, std::int64_t stride_a, + const double *b, std::int64_t ldb, std::int64_t stride_b, double beta, double *c, + std::int64_t ldc, std::int64_t stride_c, std::int64_t batch_size, + const cl::sycl::vector_class &dependencies) { + gemm_batch_precondition(queue, transa, transb, m, n, k, alpha, a, lda, stride_a, b, ldb, + stride_b, beta, c, ldc, stride_c, batch_size, dependencies); + auto done = + onemkl::cublas::gemm_batch(queue, transa, transb, m, n, k, alpha, a, lda, stride_a, b, ldb, + stride_b, beta, c, ldc, stride_c, batch_size, dependencies); + gemm_batch_postcondition(queue, transa, transb, m, n, k, alpha, a, lda, stride_a, b, ldb, + stride_b, beta, c, ldc, stride_c, batch_size, dependencies); + return done; +} + +template <> +cl::sycl::event gemm_batch( + cl::sycl::queue &queue, transpose transa, transpose transb, std::int64_t m, std::int64_t n, + std::int64_t k, std::complex alpha, const std::complex *a, std::int64_t lda, + std::int64_t stride_a, const std::complex *b, std::int64_t ldb, std::int64_t stride_b, + std::complex beta, std::complex *c, std::int64_t ldc, std::int64_t stride_c, + 
std::int64_t batch_size, const cl::sycl::vector_class &dependencies) { + gemm_batch_precondition(queue, transa, transb, m, n, k, alpha, a, lda, stride_a, b, ldb, + stride_b, beta, c, ldc, stride_c, batch_size, dependencies); + auto done = + onemkl::cublas::gemm_batch(queue, transa, transb, m, n, k, alpha, a, lda, stride_a, b, ldb, + stride_b, beta, c, ldc, stride_c, batch_size, dependencies); + gemm_batch_postcondition(queue, transa, transb, m, n, k, alpha, a, lda, stride_a, b, ldb, + stride_b, beta, c, ldc, stride_c, batch_size, dependencies); + return done; +} + +template <> +cl::sycl::event gemm_batch( + cl::sycl::queue &queue, transpose transa, transpose transb, std::int64_t m, std::int64_t n, + std::int64_t k, std::complex alpha, const std::complex *a, std::int64_t lda, + std::int64_t stride_a, const std::complex *b, std::int64_t ldb, std::int64_t stride_b, + std::complex beta, std::complex *c, std::int64_t ldc, std::int64_t stride_c, + std::int64_t batch_size, const cl::sycl::vector_class &dependencies) { + gemm_batch_precondition(queue, transa, transb, m, n, k, alpha, a, lda, stride_a, b, ldb, + stride_b, beta, c, ldc, stride_c, batch_size, dependencies); + auto done = + onemkl::cublas::gemm_batch(queue, transa, transb, m, n, k, alpha, a, lda, stride_a, b, ldb, + stride_b, beta, c, ldc, stride_c, batch_size, dependencies); + gemm_batch_postcondition(queue, transa, transb, m, n, k, alpha, a, lda, stride_a, b, ldb, + stride_b, beta, c, ldc, stride_c, batch_size, dependencies); + return done; +} + +template <> +cl::sycl::event spmv( + cl::sycl::queue &queue, uplo upper_lower, std::int64_t n, float alpha, const float *a, + const float *x, std::int64_t incx, float beta, float *y, std::int64_t incy, + const cl::sycl::vector_class &dependencies) { + spmv_precondition(queue, upper_lower, n, alpha, a, x, incx, beta, y, incy, dependencies); + auto done = + onemkl::cublas::spmv(queue, upper_lower, n, alpha, a, x, incx, beta, y, incy, dependencies); + 
spmv_postcondition(queue, upper_lower, n, alpha, a, x, incx, beta, y, incy, dependencies); + return done; +} + +template <> +cl::sycl::event spmv( + cl::sycl::queue &queue, uplo upper_lower, std::int64_t n, double alpha, const double *a, + const double *x, std::int64_t incx, double beta, double *y, std::int64_t incy, + const cl::sycl::vector_class &dependencies) { + spmv_precondition(queue, upper_lower, n, alpha, a, x, incx, beta, y, incy, dependencies); + auto done = + onemkl::cublas::spmv(queue, upper_lower, n, alpha, a, x, incx, beta, y, incy, dependencies); + spmv_postcondition(queue, upper_lower, n, alpha, a, x, incx, beta, y, incy, dependencies); + return done; +} + +template <> +cl::sycl::event swap( + cl::sycl::queue &queue, std::int64_t n, float *x, std::int64_t incx, float *y, + std::int64_t incy, const cl::sycl::vector_class &dependencies) { + swap_precondition(queue, n, x, incx, y, incy, dependencies); + auto done = onemkl::cublas::swap(queue, n, x, incx, y, incy, dependencies); + swap_postcondition(queue, n, x, incx, y, incy, dependencies); + return done; +} + +template <> +cl::sycl::event swap( + cl::sycl::queue &queue, std::int64_t n, double *x, std::int64_t incx, double *y, + std::int64_t incy, const cl::sycl::vector_class &dependencies) { + swap_precondition(queue, n, x, incx, y, incy, dependencies); + auto done = onemkl::cublas::swap(queue, n, x, incx, y, incy, dependencies); + swap_postcondition(queue, n, x, incx, y, incy, dependencies); + return done; +} + +template <> +cl::sycl::event swap( + cl::sycl::queue &queue, std::int64_t n, std::complex *x, std::int64_t incx, + std::complex *y, std::int64_t incy, + const cl::sycl::vector_class &dependencies) { + swap_precondition(queue, n, x, incx, y, incy, dependencies); + auto done = onemkl::cublas::swap(queue, n, x, incx, y, incy, dependencies); + swap_postcondition(queue, n, x, incx, y, incy, dependencies); + return done; +} + +template <> +cl::sycl::event swap( + cl::sycl::queue &queue, 
std::int64_t n, std::complex *x, std::int64_t incx, + std::complex *y, std::int64_t incy, + const cl::sycl::vector_class &dependencies) { + swap_precondition(queue, n, x, incx, y, incy, dependencies); + auto done = onemkl::cublas::swap(queue, n, x, incx, y, incy, dependencies); + swap_postcondition(queue, n, x, incx, y, incy, dependencies); + return done; +} + +template <> +cl::sycl::event geru( + cl::sycl::queue &queue, std::int64_t m, std::int64_t n, std::complex alpha, + const std::complex *x, std::int64_t incx, const std::complex *y, + std::int64_t incy, std::complex *a, std::int64_t lda, + const cl::sycl::vector_class &dependencies) { + geru_precondition(queue, m, n, alpha, x, incx, y, incy, a, lda, dependencies); + auto done = onemkl::cublas::geru(queue, m, n, alpha, x, incx, y, incy, a, lda, dependencies); + geru_postcondition(queue, m, n, alpha, x, incx, y, incy, a, lda, dependencies); + return done; +} + +template <> +cl::sycl::event geru( + cl::sycl::queue &queue, std::int64_t m, std::int64_t n, std::complex alpha, + const std::complex *x, std::int64_t incx, const std::complex *y, + std::int64_t incy, std::complex *a, std::int64_t lda, + const cl::sycl::vector_class &dependencies) { + geru_precondition(queue, m, n, alpha, x, incx, y, incy, a, lda, dependencies); + auto done = onemkl::cublas::geru(queue, m, n, alpha, x, incx, y, incy, a, lda, dependencies); + geru_postcondition(queue, m, n, alpha, x, incx, y, incy, a, lda, dependencies); + return done; +} + +template <> +cl::sycl::event nrm2( + cl::sycl::queue &queue, std::int64_t n, const std::complex *x, std::int64_t incx, + float *result, const cl::sycl::vector_class &dependencies) { + nrm2_precondition(queue, n, x, incx, result, dependencies); + auto done = onemkl::cublas::nrm2(queue, n, x, incx, result, dependencies); + nrm2_postcondition(queue, n, x, incx, result, dependencies); + return done; +} + +template <> +cl::sycl::event nrm2( + cl::sycl::queue &queue, std::int64_t n, const std::complex *x, 
std::int64_t incx, + double *result, const cl::sycl::vector_class &dependencies) { + nrm2_precondition(queue, n, x, incx, result, dependencies); + auto done = onemkl::cublas::nrm2(queue, n, x, incx, result, dependencies); + nrm2_postcondition(queue, n, x, incx, result, dependencies); + return done; +} + +template <> +cl::sycl::event nrm2( + cl::sycl::queue &queue, std::int64_t n, const float *x, std::int64_t incx, float *result, + const cl::sycl::vector_class &dependencies) { + nrm2_precondition(queue, n, x, incx, result, dependencies); + auto done = onemkl::cublas::nrm2(queue, n, x, incx, result, dependencies); + nrm2_postcondition(queue, n, x, incx, result, dependencies); + return done; +} + +template <> +cl::sycl::event nrm2( + cl::sycl::queue &queue, std::int64_t n, const double *x, std::int64_t incx, double *result, + const cl::sycl::vector_class &dependencies) { + nrm2_precondition(queue, n, x, incx, result, dependencies); + auto done = onemkl::cublas::nrm2(queue, n, x, incx, result, dependencies); + nrm2_postcondition(queue, n, x, incx, result, dependencies); + return done; +} + +template <> +cl::sycl::event gemm( + cl::sycl::queue &queue, transpose transa, transpose transb, std::int64_t m, std::int64_t n, + std::int64_t k, float alpha, const float *a, std::int64_t lda, const float *b, std::int64_t ldb, + float beta, float *c, std::int64_t ldc, + const cl::sycl::vector_class &dependencies) { + gemm_precondition(queue, transa, transb, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc, + dependencies); + auto done = onemkl::cublas::gemm(queue, transa, transb, m, n, k, alpha, a, lda, b, ldb, beta, c, + ldc, dependencies); + gemm_postcondition(queue, transa, transb, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc, + dependencies); + return done; +} + +template <> +cl::sycl::event gemm( + cl::sycl::queue &queue, transpose transa, transpose transb, std::int64_t m, std::int64_t n, + std::int64_t k, double alpha, const double *a, std::int64_t lda, const double *b, + 
std::int64_t ldb, double beta, double *c, std::int64_t ldc, + const cl::sycl::vector_class &dependencies) { + gemm_precondition(queue, transa, transb, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc, + dependencies); + auto done = onemkl::cublas::gemm(queue, transa, transb, m, n, k, alpha, a, lda, b, ldb, beta, c, + ldc, dependencies); + gemm_postcondition(queue, transa, transb, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc, + dependencies); + return done; +} + +template <> +cl::sycl::event gemm( + cl::sycl::queue &queue, transpose transa, transpose transb, std::int64_t m, std::int64_t n, + std::int64_t k, std::complex alpha, const std::complex *a, std::int64_t lda, + const std::complex *b, std::int64_t ldb, std::complex beta, + std::complex *c, std::int64_t ldc, + const cl::sycl::vector_class &dependencies) { + gemm_precondition(queue, transa, transb, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc, + dependencies); + auto done = onemkl::cublas::gemm(queue, transa, transb, m, n, k, alpha, a, lda, b, ldb, beta, c, + ldc, dependencies); + gemm_postcondition(queue, transa, transb, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc, + dependencies); + return done; +} + +template <> +cl::sycl::event gemm( + cl::sycl::queue &queue, transpose transa, transpose transb, std::int64_t m, std::int64_t n, + std::int64_t k, std::complex alpha, const std::complex *a, std::int64_t lda, + const std::complex *b, std::int64_t ldb, std::complex beta, + std::complex *c, std::int64_t ldc, + const cl::sycl::vector_class &dependencies) { + gemm_precondition(queue, transa, transb, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc, + dependencies); + auto done = onemkl::cublas::gemm(queue, transa, transb, m, n, k, alpha, a, lda, b, ldb, beta, c, + ldc, dependencies); + gemm_postcondition(queue, transa, transb, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc, + dependencies); + return done; +} + +template <> +cl::sycl::event herk( + cl::sycl::queue &queue, uplo upper_lower, transpose trans, std::int64_t n, 
std::int64_t k, + float alpha, const std::complex *a, std::int64_t lda, float beta, std::complex *c, + std::int64_t ldc, const cl::sycl::vector_class &dependencies) { + herk_precondition(queue, upper_lower, trans, n, k, alpha, a, lda, beta, c, ldc, dependencies); + auto done = onemkl::cublas::herk(queue, upper_lower, trans, n, k, alpha, a, lda, beta, c, ldc, + dependencies); + herk_postcondition(queue, upper_lower, trans, n, k, alpha, a, lda, beta, c, ldc, dependencies); + return done; +} + +template <> +cl::sycl::event herk( + cl::sycl::queue &queue, uplo upper_lower, transpose trans, std::int64_t n, std::int64_t k, + double alpha, const std::complex *a, std::int64_t lda, double beta, + std::complex *c, std::int64_t ldc, + const cl::sycl::vector_class &dependencies) { + herk_precondition(queue, upper_lower, trans, n, k, alpha, a, lda, beta, c, ldc, dependencies); + auto done = onemkl::cublas::herk(queue, upper_lower, trans, n, k, alpha, a, lda, beta, c, ldc, + dependencies); + herk_postcondition(queue, upper_lower, trans, n, k, alpha, a, lda, beta, c, ldc, dependencies); + return done; +} + +template <> +cl::sycl::event ger( + cl::sycl::queue &queue, std::int64_t m, std::int64_t n, float alpha, const float *x, + std::int64_t incx, const float *y, std::int64_t incy, float *a, std::int64_t lda, + const cl::sycl::vector_class &dependencies) { + ger_precondition(queue, m, n, alpha, x, incx, y, incy, a, lda, dependencies); + auto done = onemkl::cublas::ger(queue, m, n, alpha, x, incx, y, incy, a, lda, dependencies); + ger_postcondition(queue, m, n, alpha, x, incx, y, incy, a, lda, dependencies); + return done; +} + +template <> +cl::sycl::event ger( + cl::sycl::queue &queue, std::int64_t m, std::int64_t n, double alpha, const double *x, + std::int64_t incx, const double *y, std::int64_t incy, double *a, std::int64_t lda, + const cl::sycl::vector_class &dependencies) { + ger_precondition(queue, m, n, alpha, x, incx, y, incy, a, lda, dependencies); + auto done = 
onemkl::cublas::ger(queue, m, n, alpha, x, incx, y, incy, a, lda, dependencies); + ger_postcondition(queue, m, n, alpha, x, incx, y, incy, a, lda, dependencies); + return done; +} + +template <> +cl::sycl::event trsm( + cl::sycl::queue &queue, side left_right, uplo upper_lower, transpose trans, diag unit_diag, + std::int64_t m, std::int64_t n, float alpha, const float *a, std::int64_t lda, float *b, + std::int64_t ldb, const cl::sycl::vector_class &dependencies) { + trsm_precondition(queue, left_right, upper_lower, trans, unit_diag, m, n, alpha, a, lda, b, ldb, + dependencies); + auto done = onemkl::cublas::trsm(queue, left_right, upper_lower, trans, unit_diag, m, n, alpha, + a, lda, b, ldb, dependencies); + trsm_postcondition(queue, left_right, upper_lower, trans, unit_diag, m, n, alpha, a, lda, b, + ldb, dependencies); + return done; +} + +template <> +cl::sycl::event trsm( + cl::sycl::queue &queue, side left_right, uplo upper_lower, transpose trans, diag unit_diag, + std::int64_t m, std::int64_t n, double alpha, const double *a, std::int64_t lda, double *b, + std::int64_t ldb, const cl::sycl::vector_class &dependencies) { + trsm_precondition(queue, left_right, upper_lower, trans, unit_diag, m, n, alpha, a, lda, b, ldb, + dependencies); + auto done = onemkl::cublas::trsm(queue, left_right, upper_lower, trans, unit_diag, m, n, alpha, + a, lda, b, ldb, dependencies); + trsm_postcondition(queue, left_right, upper_lower, trans, unit_diag, m, n, alpha, a, lda, b, + ldb, dependencies); + return done; +} + +template <> +cl::sycl::event trsm( + cl::sycl::queue &queue, side left_right, uplo upper_lower, transpose trans, diag unit_diag, + std::int64_t m, std::int64_t n, std::complex alpha, const std::complex *a, + std::int64_t lda, std::complex *b, std::int64_t ldb, + const cl::sycl::vector_class &dependencies) { + trsm_precondition(queue, left_right, upper_lower, trans, unit_diag, m, n, alpha, a, lda, b, ldb, + dependencies); + auto done = onemkl::cublas::trsm(queue, 
left_right, upper_lower, trans, unit_diag, m, n, alpha, + a, lda, b, ldb, dependencies); + trsm_postcondition(queue, left_right, upper_lower, trans, unit_diag, m, n, alpha, a, lda, b, + ldb, dependencies); + return done; +} + +template <> +cl::sycl::event trsm( + cl::sycl::queue &queue, side left_right, uplo upper_lower, transpose trans, diag unit_diag, + std::int64_t m, std::int64_t n, std::complex alpha, const std::complex *a, + std::int64_t lda, std::complex *b, std::int64_t ldb, + const cl::sycl::vector_class &dependencies) { + trsm_precondition(queue, left_right, upper_lower, trans, unit_diag, m, n, alpha, a, lda, b, ldb, + dependencies); + auto done = onemkl::cublas::trsm(queue, left_right, upper_lower, trans, unit_diag, m, n, alpha, + a, lda, b, ldb, dependencies); + trsm_postcondition(queue, left_right, upper_lower, trans, unit_diag, m, n, alpha, a, lda, b, + ldb, dependencies); + return done; +} + +template <> +cl::sycl::event dotu( + cl::sycl::queue &queue, std::int64_t n, const std::complex *x, std::int64_t incx, + const std::complex *y, std::int64_t incy, std::complex *result, + const cl::sycl::vector_class &dependencies) { + dotu_precondition(queue, n, x, incx, y, incy, result, dependencies); + auto done = onemkl::cublas::dotu(queue, n, x, incx, y, incy, result, dependencies); + dotu_postcondition(queue, n, x, incx, y, incy, result, dependencies); + return done; +} + +template <> +cl::sycl::event dotu( + cl::sycl::queue &queue, std::int64_t n, const std::complex *x, std::int64_t incx, + const std::complex *y, std::int64_t incy, std::complex *result, + const cl::sycl::vector_class &dependencies) { + dotu_precondition(queue, n, x, incx, y, incy, result, dependencies); + auto done = onemkl::cublas::dotu(queue, n, x, incx, y, incy, result, dependencies); + dotu_postcondition(queue, n, x, incx, y, incy, result, dependencies); + return done; +} + +template <> +cl::sycl::event hemm( + cl::sycl::queue &queue, side left_right, uplo upper_lower, std::int64_t m, 
std::int64_t n, + std::complex alpha, const std::complex *a, std::int64_t lda, + const std::complex *b, std::int64_t ldb, std::complex beta, + std::complex *c, std::int64_t ldc, + const cl::sycl::vector_class &dependencies) { + hemm_precondition(queue, left_right, upper_lower, m, n, alpha, a, lda, b, ldb, beta, c, ldc, + dependencies); + auto done = onemkl::cublas::hemm(queue, left_right, upper_lower, m, n, alpha, a, lda, b, ldb, + beta, c, ldc, dependencies); + hemm_postcondition(queue, left_right, upper_lower, m, n, alpha, a, lda, b, ldb, beta, c, ldc, + dependencies); + return done; +} + +template <> +cl::sycl::event hemm( + cl::sycl::queue &queue, side left_right, uplo upper_lower, std::int64_t m, std::int64_t n, + std::complex alpha, const std::complex *a, std::int64_t lda, + const std::complex *b, std::int64_t ldb, std::complex beta, + std::complex *c, std::int64_t ldc, + const cl::sycl::vector_class &dependencies) { + hemm_precondition(queue, left_right, upper_lower, m, n, alpha, a, lda, b, ldb, beta, c, ldc, + dependencies); + auto done = onemkl::cublas::hemm(queue, left_right, upper_lower, m, n, alpha, a, lda, b, ldb, + beta, c, ldc, dependencies); + hemm_postcondition(queue, left_right, upper_lower, m, n, alpha, a, lda, b, ldb, beta, c, ldc, + dependencies); + return done; +} + +template <> +cl::sycl::event hpr2( + cl::sycl::queue &queue, uplo upper_lower, std::int64_t n, std::complex alpha, + const std::complex *x, std::int64_t incx, const std::complex *y, + std::int64_t incy, std::complex *a, + const cl::sycl::vector_class &dependencies) { + hpr2_precondition(queue, upper_lower, n, alpha, x, incx, y, incy, a, dependencies); + auto done = + onemkl::cublas::hpr2(queue, upper_lower, n, alpha, x, incx, y, incy, a, dependencies); + hpr2_postcondition(queue, upper_lower, n, alpha, x, incx, y, incy, a, dependencies); + return done; +} + +template <> +cl::sycl::event hpr2( + cl::sycl::queue &queue, uplo upper_lower, std::int64_t n, std::complex alpha, + const 
std::complex *x, std::int64_t incx, const std::complex *y, + std::int64_t incy, std::complex *a, + const cl::sycl::vector_class &dependencies) { + hpr2_precondition(queue, upper_lower, n, alpha, x, incx, y, incy, a, dependencies); + auto done = + onemkl::cublas::hpr2(queue, upper_lower, n, alpha, x, incx, y, incy, a, dependencies); + hpr2_postcondition(queue, upper_lower, n, alpha, x, incx, y, incy, a, dependencies); + return done; +} + +template <> +cl::sycl::event gbmv( + cl::sycl::queue &queue, transpose trans, std::int64_t m, std::int64_t n, std::int64_t kl, + std::int64_t ku, float alpha, const float *a, std::int64_t lda, const float *x, + std::int64_t incx, float beta, float *y, std::int64_t incy, + const cl::sycl::vector_class &dependencies) { + gbmv_precondition(queue, trans, m, n, kl, ku, alpha, a, lda, x, incx, beta, y, incy, + dependencies); + auto done = onemkl::cublas::gbmv(queue, trans, m, n, kl, ku, alpha, a, lda, x, incx, beta, y, + incy, dependencies); + gbmv_postcondition(queue, trans, m, n, kl, ku, alpha, a, lda, x, incx, beta, y, incy, + dependencies); + return done; +} + +template <> +cl::sycl::event gbmv( + cl::sycl::queue &queue, transpose trans, std::int64_t m, std::int64_t n, std::int64_t kl, + std::int64_t ku, double alpha, const double *a, std::int64_t lda, const double *x, + std::int64_t incx, double beta, double *y, std::int64_t incy, + const cl::sycl::vector_class &dependencies) { + gbmv_precondition(queue, trans, m, n, kl, ku, alpha, a, lda, x, incx, beta, y, incy, + dependencies); + auto done = onemkl::cublas::gbmv(queue, trans, m, n, kl, ku, alpha, a, lda, x, incx, beta, y, + incy, dependencies); + gbmv_postcondition(queue, trans, m, n, kl, ku, alpha, a, lda, x, incx, beta, y, incy, + dependencies); + return done; +} + +template <> +cl::sycl::event gbmv( + cl::sycl::queue &queue, transpose trans, std::int64_t m, std::int64_t n, std::int64_t kl, + std::int64_t ku, std::complex alpha, const std::complex *a, std::int64_t lda, + const 
std::complex *x, std::int64_t incx, std::complex beta, + std::complex *y, std::int64_t incy, + const cl::sycl::vector_class &dependencies) { + gbmv_precondition(queue, trans, m, n, kl, ku, alpha, a, lda, x, incx, beta, y, incy, + dependencies); + auto done = onemkl::cublas::gbmv(queue, trans, m, n, kl, ku, alpha, a, lda, x, incx, beta, y, + incy, dependencies); + gbmv_postcondition(queue, trans, m, n, kl, ku, alpha, a, lda, x, incx, beta, y, incy, + dependencies); + return done; +} + +template <> +cl::sycl::event gbmv( + cl::sycl::queue &queue, transpose trans, std::int64_t m, std::int64_t n, std::int64_t kl, + std::int64_t ku, std::complex alpha, const std::complex *a, std::int64_t lda, + const std::complex *x, std::int64_t incx, std::complex beta, + std::complex *y, std::int64_t incy, + const cl::sycl::vector_class &dependencies) { + gbmv_precondition(queue, trans, m, n, kl, ku, alpha, a, lda, x, incx, beta, y, incy, + dependencies); + auto done = onemkl::cublas::gbmv(queue, trans, m, n, kl, ku, alpha, a, lda, x, incx, beta, y, + incy, dependencies); + gbmv_postcondition(queue, trans, m, n, kl, ku, alpha, a, lda, x, incx, beta, y, incy, + dependencies); + return done; +} + +template <> +cl::sycl::event tbmv( + cl::sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag, std::int64_t n, + std::int64_t k, const float *a, std::int64_t lda, float *x, std::int64_t incx, + const cl::sycl::vector_class &dependencies) { + tbmv_precondition(queue, upper_lower, trans, unit_diag, n, k, a, lda, x, incx, dependencies); + auto done = onemkl::cublas::tbmv(queue, upper_lower, trans, unit_diag, n, k, a, lda, x, incx, + dependencies); + tbmv_postcondition(queue, upper_lower, trans, unit_diag, n, k, a, lda, x, incx, dependencies); + return done; +} + +template <> +cl::sycl::event tbmv( + cl::sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag, std::int64_t n, + std::int64_t k, const double *a, std::int64_t lda, double *x, std::int64_t incx, + const 
cl::sycl::vector_class &dependencies) { + tbmv_precondition(queue, upper_lower, trans, unit_diag, n, k, a, lda, x, incx, dependencies); + auto done = onemkl::cublas::tbmv(queue, upper_lower, trans, unit_diag, n, k, a, lda, x, incx, + dependencies); + tbmv_postcondition(queue, upper_lower, trans, unit_diag, n, k, a, lda, x, incx, dependencies); + return done; +} + +template <> +cl::sycl::event tbmv( + cl::sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag, std::int64_t n, + std::int64_t k, const std::complex *a, std::int64_t lda, std::complex *x, + std::int64_t incx, const cl::sycl::vector_class &dependencies) { + tbmv_precondition(queue, upper_lower, trans, unit_diag, n, k, a, lda, x, incx, dependencies); + auto done = onemkl::cublas::tbmv(queue, upper_lower, trans, unit_diag, n, k, a, lda, x, incx, + dependencies); + tbmv_postcondition(queue, upper_lower, trans, unit_diag, n, k, a, lda, x, incx, dependencies); + return done; +} + +template <> +cl::sycl::event tbmv( + cl::sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag, std::int64_t n, + std::int64_t k, const std::complex *a, std::int64_t lda, std::complex *x, + std::int64_t incx, const cl::sycl::vector_class &dependencies) { + tbmv_precondition(queue, upper_lower, trans, unit_diag, n, k, a, lda, x, incx, dependencies); + auto done = onemkl::cublas::tbmv(queue, upper_lower, trans, unit_diag, n, k, a, lda, x, incx, + dependencies); + tbmv_postcondition(queue, upper_lower, trans, unit_diag, n, k, a, lda, x, incx, dependencies); + return done; +} + +template <> +cl::sycl::event symm( + cl::sycl::queue &queue, side left_right, uplo upper_lower, std::int64_t m, std::int64_t n, + float alpha, const float *a, std::int64_t lda, const float *b, std::int64_t ldb, float beta, + float *c, std::int64_t ldc, const cl::sycl::vector_class &dependencies) { + symm_precondition(queue, left_right, upper_lower, m, n, alpha, a, lda, b, ldb, beta, c, ldc, + dependencies); + auto done = 
onemkl::cublas::symm(queue, left_right, upper_lower, m, n, alpha, a, lda, b, ldb, + beta, c, ldc, dependencies); + symm_postcondition(queue, left_right, upper_lower, m, n, alpha, a, lda, b, ldb, beta, c, ldc, + dependencies); + return done; +} + +template <> +cl::sycl::event symm( + cl::sycl::queue &queue, side left_right, uplo upper_lower, std::int64_t m, std::int64_t n, + double alpha, const double *a, std::int64_t lda, const double *b, std::int64_t ldb, double beta, + double *c, std::int64_t ldc, const cl::sycl::vector_class &dependencies) { + symm_precondition(queue, left_right, upper_lower, m, n, alpha, a, lda, b, ldb, beta, c, ldc, + dependencies); + auto done = onemkl::cublas::symm(queue, left_right, upper_lower, m, n, alpha, a, lda, b, ldb, + beta, c, ldc, dependencies); + symm_postcondition(queue, left_right, upper_lower, m, n, alpha, a, lda, b, ldb, beta, c, ldc, + dependencies); + return done; +} + +template <> +cl::sycl::event symm( + cl::sycl::queue &queue, side left_right, uplo upper_lower, std::int64_t m, std::int64_t n, + std::complex alpha, const std::complex *a, std::int64_t lda, + const std::complex *b, std::int64_t ldb, std::complex beta, + std::complex *c, std::int64_t ldc, + const cl::sycl::vector_class &dependencies) { + symm_precondition(queue, left_right, upper_lower, m, n, alpha, a, lda, b, ldb, beta, c, ldc, + dependencies); + auto done = onemkl::cublas::symm(queue, left_right, upper_lower, m, n, alpha, a, lda, b, ldb, + beta, c, ldc, dependencies); + symm_postcondition(queue, left_right, upper_lower, m, n, alpha, a, lda, b, ldb, beta, c, ldc, + dependencies); + return done; +} + +template <> +cl::sycl::event symm( + cl::sycl::queue &queue, side left_right, uplo upper_lower, std::int64_t m, std::int64_t n, + std::complex alpha, const std::complex *a, std::int64_t lda, + const std::complex *b, std::int64_t ldb, std::complex beta, + std::complex *c, std::int64_t ldc, + const cl::sycl::vector_class &dependencies) { + symm_precondition(queue, 
left_right, upper_lower, m, n, alpha, a, lda, b, ldb, beta, c, ldc, + dependencies); + auto done = onemkl::cublas::symm(queue, left_right, upper_lower, m, n, alpha, a, lda, b, ldb, + beta, c, ldc, dependencies); + symm_postcondition(queue, left_right, upper_lower, m, n, alpha, a, lda, b, ldb, beta, c, ldc, + dependencies); + return done; +} + +template <> +cl::sycl::event dotc( + cl::sycl::queue &queue, std::int64_t n, const std::complex *x, std::int64_t incx, + const std::complex *y, std::int64_t incy, std::complex *result, + const cl::sycl::vector_class &dependencies) { + dotc_precondition(queue, n, x, incx, y, incy, result, dependencies); + auto done = onemkl::cublas::dotc(queue, n, x, incx, y, incy, result, dependencies); + dotc_postcondition(queue, n, x, incx, y, incy, result, dependencies); + return done; +} + +template <> +cl::sycl::event dotc( + cl::sycl::queue &queue, std::int64_t n, const std::complex *x, std::int64_t incx, + const std::complex *y, std::int64_t incy, std::complex *result, + const cl::sycl::vector_class &dependencies) { + dotc_precondition(queue, n, x, incx, y, incy, result, dependencies); + auto done = onemkl::cublas::dotc(queue, n, x, incx, y, incy, result, dependencies); + dotc_postcondition(queue, n, x, incx, y, incy, result, dependencies); + return done; +} + +template <> +cl::sycl::event syr( + cl::sycl::queue &queue, uplo upper_lower, std::int64_t n, float alpha, const float *x, + std::int64_t incx, float *a, std::int64_t lda, + const cl::sycl::vector_class &dependencies) { + syr_precondition(queue, upper_lower, n, alpha, x, incx, a, lda, dependencies); + auto done = onemkl::cublas::syr(queue, upper_lower, n, alpha, x, incx, a, lda, dependencies); + syr_postcondition(queue, upper_lower, n, alpha, x, incx, a, lda, dependencies); + return done; +} + +template <> +cl::sycl::event syr( + cl::sycl::queue &queue, uplo upper_lower, std::int64_t n, double alpha, const double *x, + std::int64_t incx, double *a, std::int64_t lda, + const 
cl::sycl::vector_class &dependencies) { + syr_precondition(queue, upper_lower, n, alpha, x, incx, a, lda, dependencies); + auto done = onemkl::cublas::syr(queue, upper_lower, n, alpha, x, incx, a, lda, dependencies); + syr_postcondition(queue, upper_lower, n, alpha, x, incx, a, lda, dependencies); + return done; +} + +template <> +cl::sycl::event trmm( + cl::sycl::queue &queue, side left_right, uplo upper_lower, transpose trans, diag unit_diag, + std::int64_t m, std::int64_t n, float alpha, const float *a, std::int64_t lda, float *b, + std::int64_t ldb, const cl::sycl::vector_class &dependencies) { + trmm_precondition(queue, left_right, upper_lower, trans, unit_diag, m, n, alpha, a, lda, b, ldb, + dependencies); + auto done = onemkl::cublas::trmm(queue, left_right, upper_lower, trans, unit_diag, m, n, alpha, + a, lda, b, ldb, dependencies); + trmm_postcondition(queue, left_right, upper_lower, trans, unit_diag, m, n, alpha, a, lda, b, + ldb, dependencies); + return done; +} + +template <> +cl::sycl::event trmm( + cl::sycl::queue &queue, side left_right, uplo upper_lower, transpose trans, diag unit_diag, + std::int64_t m, std::int64_t n, double alpha, const double *a, std::int64_t lda, double *b, + std::int64_t ldb, const cl::sycl::vector_class &dependencies) { + trmm_precondition(queue, left_right, upper_lower, trans, unit_diag, m, n, alpha, a, lda, b, ldb, + dependencies); + auto done = onemkl::cublas::trmm(queue, left_right, upper_lower, trans, unit_diag, m, n, alpha, + a, lda, b, ldb, dependencies); + trmm_postcondition(queue, left_right, upper_lower, trans, unit_diag, m, n, alpha, a, lda, b, + ldb, dependencies); + return done; +} + +template <> +cl::sycl::event trmm( + cl::sycl::queue &queue, side left_right, uplo upper_lower, transpose trans, diag unit_diag, + std::int64_t m, std::int64_t n, std::complex alpha, const std::complex *a, + std::int64_t lda, std::complex *b, std::int64_t ldb, + const cl::sycl::vector_class &dependencies) { + 
trmm_precondition(queue, left_right, upper_lower, trans, unit_diag, m, n, alpha, a, lda, b, ldb, + dependencies); + auto done = onemkl::cublas::trmm(queue, left_right, upper_lower, trans, unit_diag, m, n, alpha, + a, lda, b, ldb, dependencies); + trmm_postcondition(queue, left_right, upper_lower, trans, unit_diag, m, n, alpha, a, lda, b, + ldb, dependencies); + return done; +} + +template <> +cl::sycl::event trmm( + cl::sycl::queue &queue, side left_right, uplo upper_lower, transpose trans, diag unit_diag, + std::int64_t m, std::int64_t n, std::complex alpha, const std::complex *a, + std::int64_t lda, std::complex *b, std::int64_t ldb, + const cl::sycl::vector_class &dependencies) { + trmm_precondition(queue, left_right, upper_lower, trans, unit_diag, m, n, alpha, a, lda, b, ldb, + dependencies); + auto done = onemkl::cublas::trmm(queue, left_right, upper_lower, trans, unit_diag, m, n, alpha, + a, lda, b, ldb, dependencies); + trmm_postcondition(queue, left_right, upper_lower, trans, unit_diag, m, n, alpha, a, lda, b, + ldb, dependencies); + return done; +} + +template <> +cl::sycl::event rotmg( + cl::sycl::queue &queue, float *d1, float *d2, float *x1, float y1, float *param, + const cl::sycl::vector_class &dependencies) { + rotmg_precondition(queue, d1, d2, x1, y1, param, dependencies); + auto done = onemkl::cublas::rotmg(queue, d1, d2, x1, y1, param, dependencies); + rotmg_postcondition(queue, d1, d2, x1, y1, param, dependencies); + return done; +} + +template <> +cl::sycl::event rotmg( + cl::sycl::queue &queue, double *d1, double *d2, double *x1, double y1, double *param, + const cl::sycl::vector_class &dependencies) { + rotmg_precondition(queue, d1, d2, x1, y1, param, dependencies); + auto done = onemkl::cublas::rotmg(queue, d1, d2, x1, y1, param, dependencies); + rotmg_postcondition(queue, d1, d2, x1, y1, param, dependencies); + return done; +} + +template <> +cl::sycl::event tpsv( + cl::sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag, 
std::int64_t n, + const float *a, float *x, std::int64_t incx, + const cl::sycl::vector_class &dependencies) { + tpsv_precondition(queue, upper_lower, trans, unit_diag, n, a, x, incx, dependencies); + auto done = + onemkl::cublas::tpsv(queue, upper_lower, trans, unit_diag, n, a, x, incx, dependencies); + tpsv_postcondition(queue, upper_lower, trans, unit_diag, n, a, x, incx, dependencies); + return done; +} + +template <> +cl::sycl::event tpsv( + cl::sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag, std::int64_t n, + const double *a, double *x, std::int64_t incx, + const cl::sycl::vector_class &dependencies) { + tpsv_precondition(queue, upper_lower, trans, unit_diag, n, a, x, incx, dependencies); + auto done = + onemkl::cublas::tpsv(queue, upper_lower, trans, unit_diag, n, a, x, incx, dependencies); + tpsv_postcondition(queue, upper_lower, trans, unit_diag, n, a, x, incx, dependencies); + return done; +} + +template <> +cl::sycl::event tpsv( + cl::sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag, std::int64_t n, + const std::complex *a, std::complex *x, std::int64_t incx, + const cl::sycl::vector_class &dependencies) { + tpsv_precondition(queue, upper_lower, trans, unit_diag, n, a, x, incx, dependencies); + auto done = + onemkl::cublas::tpsv(queue, upper_lower, trans, unit_diag, n, a, x, incx, dependencies); + tpsv_postcondition(queue, upper_lower, trans, unit_diag, n, a, x, incx, dependencies); + return done; +} + +template <> +cl::sycl::event tpsv( + cl::sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag, std::int64_t n, + const std::complex *a, std::complex *x, std::int64_t incx, + const cl::sycl::vector_class &dependencies) { + tpsv_precondition(queue, upper_lower, trans, unit_diag, n, a, x, incx, dependencies); + auto done = + onemkl::cublas::tpsv(queue, upper_lower, trans, unit_diag, n, a, x, incx, dependencies); + tpsv_postcondition(queue, upper_lower, trans, unit_diag, n, a, x, incx, 
dependencies); + return done; +} + +template <> +cl::sycl::event trsv( + cl::sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag, std::int64_t n, + const float *a, std::int64_t lda, float *x, std::int64_t incx, + const cl::sycl::vector_class &dependencies) { + trsv_precondition(queue, upper_lower, trans, unit_diag, n, a, lda, x, incx, dependencies); + auto done = onemkl::cublas::trsv(queue, upper_lower, trans, unit_diag, n, a, lda, x, incx, + dependencies); + trsv_postcondition(queue, upper_lower, trans, unit_diag, n, a, lda, x, incx, dependencies); + return done; +} + +template <> +cl::sycl::event trsv( + cl::sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag, std::int64_t n, + const double *a, std::int64_t lda, double *x, std::int64_t incx, + const cl::sycl::vector_class &dependencies) { + trsv_precondition(queue, upper_lower, trans, unit_diag, n, a, lda, x, incx, dependencies); + auto done = onemkl::cublas::trsv(queue, upper_lower, trans, unit_diag, n, a, lda, x, incx, + dependencies); + trsv_postcondition(queue, upper_lower, trans, unit_diag, n, a, lda, x, incx, dependencies); + return done; +} + +template <> +cl::sycl::event trsv( + cl::sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag, std::int64_t n, + const std::complex *a, std::int64_t lda, std::complex *x, std::int64_t incx, + const cl::sycl::vector_class &dependencies) { + trsv_precondition(queue, upper_lower, trans, unit_diag, n, a, lda, x, incx, dependencies); + auto done = onemkl::cublas::trsv(queue, upper_lower, trans, unit_diag, n, a, lda, x, incx, + dependencies); + trsv_postcondition(queue, upper_lower, trans, unit_diag, n, a, lda, x, incx, dependencies); + return done; +} + +template <> +cl::sycl::event trsv( + cl::sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag, std::int64_t n, + const std::complex *a, std::int64_t lda, std::complex *x, std::int64_t incx, + const cl::sycl::vector_class &dependencies) { + 
trsv_precondition(queue, upper_lower, trans, unit_diag, n, a, lda, x, incx, dependencies); + auto done = onemkl::cublas::trsv(queue, upper_lower, trans, unit_diag, n, a, lda, x, incx, + dependencies); + trsv_postcondition(queue, upper_lower, trans, unit_diag, n, a, lda, x, incx, dependencies); + return done; +} + +template <> +cl::sycl::event copy( + cl::sycl::queue &queue, std::int64_t n, const float *x, std::int64_t incx, float *y, + std::int64_t incy, const cl::sycl::vector_class &dependencies) { + copy_precondition(queue, n, x, incx, y, incy, dependencies); + auto done = onemkl::cublas::copy(queue, n, x, incx, y, incy, dependencies); + copy_postcondition(queue, n, x, incx, y, incy, dependencies); + return done; +} + +template <> +cl::sycl::event copy( + cl::sycl::queue &queue, std::int64_t n, const double *x, std::int64_t incx, double *y, + std::int64_t incy, const cl::sycl::vector_class &dependencies) { + copy_precondition(queue, n, x, incx, y, incy, dependencies); + auto done = onemkl::cublas::copy(queue, n, x, incx, y, incy, dependencies); + copy_postcondition(queue, n, x, incx, y, incy, dependencies); + return done; +} + +template <> +cl::sycl::event copy( + cl::sycl::queue &queue, std::int64_t n, const std::complex *x, std::int64_t incx, + std::complex *y, std::int64_t incy, + const cl::sycl::vector_class &dependencies) { + copy_precondition(queue, n, x, incx, y, incy, dependencies); + auto done = onemkl::cublas::copy(queue, n, x, incx, y, incy, dependencies); + copy_postcondition(queue, n, x, incx, y, incy, dependencies); + return done; +} + +template <> +cl::sycl::event copy( + cl::sycl::queue &queue, std::int64_t n, const std::complex *x, std::int64_t incx, + std::complex *y, std::int64_t incy, + const cl::sycl::vector_class &dependencies) { + copy_precondition(queue, n, x, incx, y, incy, dependencies); + auto done = onemkl::cublas::copy(queue, n, x, incx, y, incy, dependencies); + copy_postcondition(queue, n, x, incx, y, incy, dependencies); + return 
done; +} + +template <> +cl::sycl::event hemv( + cl::sycl::queue &queue, uplo upper_lower, std::int64_t n, std::complex alpha, + const std::complex *a, std::int64_t lda, const std::complex *x, std::int64_t incx, + std::complex beta, std::complex *y, std::int64_t incy, + const cl::sycl::vector_class &dependencies) { + hemv_precondition(queue, upper_lower, n, alpha, a, lda, x, incx, beta, y, incy, dependencies); + auto done = onemkl::cublas::hemv(queue, upper_lower, n, alpha, a, lda, x, incx, beta, y, incy, + dependencies); + hemv_postcondition(queue, upper_lower, n, alpha, a, lda, x, incx, beta, y, incy, dependencies); + return done; +} + +template <> +cl::sycl::event hemv( + cl::sycl::queue &queue, uplo upper_lower, std::int64_t n, std::complex alpha, + const std::complex *a, std::int64_t lda, const std::complex *x, + std::int64_t incx, std::complex beta, std::complex *y, std::int64_t incy, + const cl::sycl::vector_class &dependencies) { + hemv_precondition(queue, upper_lower, n, alpha, a, lda, x, incx, beta, y, incy, dependencies); + auto done = onemkl::cublas::hemv(queue, upper_lower, n, alpha, a, lda, x, incx, beta, y, incy, + dependencies); + hemv_postcondition(queue, upper_lower, n, alpha, a, lda, x, incx, beta, y, incy, dependencies); + return done; +} + +template <> +cl::sycl::event gemmt( + cl::sycl::queue &queue, uplo upper_lower, transpose transa, transpose transb, std::int64_t n, + std::int64_t k, float alpha, const float *a, std::int64_t lda, const float *b, std::int64_t ldb, + float beta, float *c, std::int64_t ldc, + const cl::sycl::vector_class &dependencies) { + gemmt_precondition(queue, upper_lower, transa, transb, n, k, alpha, a, lda, b, ldb, beta, c, + ldc, dependencies); + auto done = onemkl::cublas::gemmt(queue, upper_lower, transa, transb, n, k, alpha, a, lda, b, + ldb, beta, c, ldc, dependencies); + gemmt_postcondition(queue, upper_lower, transa, transb, n, k, alpha, a, lda, b, ldb, beta, c, + ldc, dependencies); + return done; +} + +template 
<> +cl::sycl::event gemmt( + cl::sycl::queue &queue, uplo upper_lower, transpose transa, transpose transb, std::int64_t n, + std::int64_t k, double alpha, const double *a, std::int64_t lda, const double *b, + std::int64_t ldb, double beta, double *c, std::int64_t ldc, + const cl::sycl::vector_class &dependencies) { + gemmt_precondition(queue, upper_lower, transa, transb, n, k, alpha, a, lda, b, ldb, beta, c, + ldc, dependencies); + auto done = onemkl::cublas::gemmt(queue, upper_lower, transa, transb, n, k, alpha, a, lda, b, + ldb, beta, c, ldc, dependencies); + gemmt_postcondition(queue, upper_lower, transa, transb, n, k, alpha, a, lda, b, ldb, beta, c, + ldc, dependencies); + return done; +} + +template <> +cl::sycl::event gemmt( + cl::sycl::queue &queue, uplo upper_lower, transpose transa, transpose transb, std::int64_t n, + std::int64_t k, std::complex alpha, const std::complex *a, std::int64_t lda, + const std::complex *b, std::int64_t ldb, std::complex beta, + std::complex *c, std::int64_t ldc, + const cl::sycl::vector_class &dependencies) { + gemmt_precondition(queue, upper_lower, transa, transb, n, k, alpha, a, lda, b, ldb, beta, c, + ldc, dependencies); + auto done = onemkl::cublas::gemmt(queue, upper_lower, transa, transb, n, k, alpha, a, lda, b, + ldb, beta, c, ldc, dependencies); + gemmt_postcondition(queue, upper_lower, transa, transb, n, k, alpha, a, lda, b, ldb, beta, c, + ldc, dependencies); + return done; +} + +template <> +cl::sycl::event gemmt( + cl::sycl::queue &queue, uplo upper_lower, transpose transa, transpose transb, std::int64_t n, + std::int64_t k, std::complex alpha, const std::complex *a, std::int64_t lda, + const std::complex *b, std::int64_t ldb, std::complex beta, + std::complex *c, std::int64_t ldc, + const cl::sycl::vector_class &dependencies) { + gemmt_precondition(queue, upper_lower, transa, transb, n, k, alpha, a, lda, b, ldb, beta, c, + ldc, dependencies); + auto done = onemkl::cublas::gemmt(queue, upper_lower, transa, transb, 
n, k, alpha, a, lda, b, + ldb, beta, c, ldc, dependencies); + gemmt_postcondition(queue, upper_lower, transa, transb, n, k, alpha, a, lda, b, ldb, beta, c, + ldc, dependencies); + return done; +} + +template <> +cl::sycl::event sbmv( + cl::sycl::queue &queue, uplo upper_lower, std::int64_t n, std::int64_t k, float alpha, + const float *a, std::int64_t lda, const float *x, std::int64_t incx, float beta, float *y, + std::int64_t incy, const cl::sycl::vector_class &dependencies) { + sbmv_precondition(queue, upper_lower, n, k, alpha, a, lda, x, incx, beta, y, incy, + dependencies); + auto done = onemkl::cublas::sbmv(queue, upper_lower, n, k, alpha, a, lda, x, incx, beta, y, + incy, dependencies); + sbmv_postcondition(queue, upper_lower, n, k, alpha, a, lda, x, incx, beta, y, incy, + dependencies); + return done; +} + +template <> +cl::sycl::event sbmv( + cl::sycl::queue &queue, uplo upper_lower, std::int64_t n, std::int64_t k, double alpha, + const double *a, std::int64_t lda, const double *x, std::int64_t incx, double beta, double *y, + std::int64_t incy, const cl::sycl::vector_class &dependencies) { + sbmv_precondition(queue, upper_lower, n, k, alpha, a, lda, x, incx, beta, y, incy, + dependencies); + auto done = onemkl::cublas::sbmv(queue, upper_lower, n, k, alpha, a, lda, x, incx, beta, y, + incy, dependencies); + sbmv_postcondition(queue, upper_lower, n, k, alpha, a, lda, x, incx, beta, y, incy, + dependencies); + return done; +} + +template <> +cl::sycl::event asum( + cl::sycl::queue &queue, std::int64_t n, const std::complex *x, std::int64_t incx, + float *result, const cl::sycl::vector_class &dependencies) { + asum_precondition(queue, n, x, incx, result, dependencies); + auto done = onemkl::cublas::asum(queue, n, x, incx, result, dependencies); + asum_postcondition(queue, n, x, incx, result, dependencies); + return done; +} + +template <> +cl::sycl::event asum( + cl::sycl::queue &queue, std::int64_t n, const std::complex *x, std::int64_t incx, + double *result, 
const cl::sycl::vector_class &dependencies) { + asum_precondition(queue, n, x, incx, result, dependencies); + auto done = onemkl::cublas::asum(queue, n, x, incx, result, dependencies); + asum_postcondition(queue, n, x, incx, result, dependencies); + return done; +} + +template <> +cl::sycl::event asum( + cl::sycl::queue &queue, std::int64_t n, const float *x, std::int64_t incx, float *result, + const cl::sycl::vector_class &dependencies) { + asum_precondition(queue, n, x, incx, result, dependencies); + auto done = onemkl::cublas::asum(queue, n, x, incx, result, dependencies); + asum_postcondition(queue, n, x, incx, result, dependencies); + return done; +} + +template <> +cl::sycl::event asum( + cl::sycl::queue &queue, std::int64_t n, const double *x, std::int64_t incx, double *result, + const cl::sycl::vector_class &dependencies) { + asum_precondition(queue, n, x, incx, result, dependencies); + auto done = onemkl::cublas::asum(queue, n, x, incx, result, dependencies); + asum_postcondition(queue, n, x, incx, result, dependencies); + return done; +} + +template <> +cl::sycl::event tbsv( + cl::sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag, std::int64_t n, + std::int64_t k, const float *a, std::int64_t lda, float *x, std::int64_t incx, + const cl::sycl::vector_class &dependencies) { + tbsv_precondition(queue, upper_lower, trans, unit_diag, n, k, a, lda, x, incx, dependencies); + auto done = onemkl::cublas::tbsv(queue, upper_lower, trans, unit_diag, n, k, a, lda, x, incx, + dependencies); + tbsv_postcondition(queue, upper_lower, trans, unit_diag, n, k, a, lda, x, incx, dependencies); + return done; +} + +template <> +cl::sycl::event tbsv( + cl::sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag, std::int64_t n, + std::int64_t k, const double *a, std::int64_t lda, double *x, std::int64_t incx, + const cl::sycl::vector_class &dependencies) { + tbsv_precondition(queue, upper_lower, trans, unit_diag, n, k, a, lda, x, incx, 
dependencies); + auto done = onemkl::cublas::tbsv(queue, upper_lower, trans, unit_diag, n, k, a, lda, x, incx, + dependencies); + tbsv_postcondition(queue, upper_lower, trans, unit_diag, n, k, a, lda, x, incx, dependencies); + return done; +} + +template <> +cl::sycl::event tbsv( + cl::sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag, std::int64_t n, + std::int64_t k, const std::complex *a, std::int64_t lda, std::complex *x, + std::int64_t incx, const cl::sycl::vector_class &dependencies) { + tbsv_precondition(queue, upper_lower, trans, unit_diag, n, k, a, lda, x, incx, dependencies); + auto done = onemkl::cublas::tbsv(queue, upper_lower, trans, unit_diag, n, k, a, lda, x, incx, + dependencies); + tbsv_postcondition(queue, upper_lower, trans, unit_diag, n, k, a, lda, x, incx, dependencies); + return done; +} + +template <> +cl::sycl::event tbsv( + cl::sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag, std::int64_t n, + std::int64_t k, const std::complex *a, std::int64_t lda, std::complex *x, + std::int64_t incx, const cl::sycl::vector_class &dependencies) { + tbsv_precondition(queue, upper_lower, trans, unit_diag, n, k, a, lda, x, incx, dependencies); + auto done = onemkl::cublas::tbsv(queue, upper_lower, trans, unit_diag, n, k, a, lda, x, incx, + dependencies); + tbsv_postcondition(queue, upper_lower, trans, unit_diag, n, k, a, lda, x, incx, dependencies); + return done; +} + +template <> +cl::sycl::event spr2( + cl::sycl::queue &queue, uplo upper_lower, std::int64_t n, float alpha, const float *x, + std::int64_t incx, const float *y, std::int64_t incy, float *a, + const cl::sycl::vector_class &dependencies) { + spr2_precondition(queue, upper_lower, n, alpha, x, incx, y, incy, a, dependencies); + auto done = + onemkl::cublas::spr2(queue, upper_lower, n, alpha, x, incx, y, incy, a, dependencies); + spr2_postcondition(queue, upper_lower, n, alpha, x, incx, y, incy, a, dependencies); + return done; +} + +template <> 
+cl::sycl::event spr2( + cl::sycl::queue &queue, uplo upper_lower, std::int64_t n, double alpha, const double *x, + std::int64_t incx, const double *y, std::int64_t incy, double *a, + const cl::sycl::vector_class &dependencies) { + spr2_precondition(queue, upper_lower, n, alpha, x, incx, y, incy, a, dependencies); + auto done = + onemkl::cublas::spr2(queue, upper_lower, n, alpha, x, incx, y, incy, a, dependencies); + spr2_postcondition(queue, upper_lower, n, alpha, x, incx, y, incy, a, dependencies); + return done; +} + +template <> +cl::sycl::event iamax( + cl::sycl::queue &queue, std::int64_t n, const float *x, std::int64_t incx, std::int64_t *result, + const cl::sycl::vector_class &dependencies) { + iamax_precondition(queue, n, x, incx, result, dependencies); + auto done = onemkl::cublas::iamax(queue, n, x, incx, result, dependencies); + iamax_postcondition(queue, n, x, incx, result, dependencies); + return done; +} + +template <> +cl::sycl::event iamax( + cl::sycl::queue &queue, std::int64_t n, const double *x, std::int64_t incx, + std::int64_t *result, const cl::sycl::vector_class &dependencies) { + iamax_precondition(queue, n, x, incx, result, dependencies); + auto done = onemkl::cublas::iamax(queue, n, x, incx, result, dependencies); + iamax_postcondition(queue, n, x, incx, result, dependencies); + return done; +} + +template <> +cl::sycl::event iamax( + cl::sycl::queue &queue, std::int64_t n, const std::complex *x, std::int64_t incx, + std::int64_t *result, const cl::sycl::vector_class &dependencies) { + iamax_precondition(queue, n, x, incx, result, dependencies); + auto done = onemkl::cublas::iamax(queue, n, x, incx, result, dependencies); + iamax_postcondition(queue, n, x, incx, result, dependencies); + return done; +} + +template <> +cl::sycl::event iamax( + cl::sycl::queue &queue, std::int64_t n, const std::complex *x, std::int64_t incx, + std::int64_t *result, const cl::sycl::vector_class &dependencies) { + iamax_precondition(queue, n, x, incx, result, 
dependencies); + auto done = onemkl::cublas::iamax(queue, n, x, incx, result, dependencies); + iamax_postcondition(queue, n, x, incx, result, dependencies); + return done; +} + +template <> +cl::sycl::event rotm( + cl::sycl::queue &queue, std::int64_t n, float *x, std::int64_t incx, float *y, + std::int64_t incy, float *param, const cl::sycl::vector_class &dependencies) { + rotm_precondition(queue, n, x, incx, y, incy, param, dependencies); + auto done = onemkl::cublas::rotm(queue, n, x, incx, y, incy, param, dependencies); + rotm_postcondition(queue, n, x, incx, y, incy, param, dependencies); + return done; +} + +template <> +cl::sycl::event rotm( + cl::sycl::queue &queue, std::int64_t n, double *x, std::int64_t incx, double *y, + std::int64_t incy, double *param, const cl::sycl::vector_class &dependencies) { + rotm_precondition(queue, n, x, incx, y, incy, param, dependencies); + auto done = onemkl::cublas::rotm(queue, n, x, incx, y, incy, param, dependencies); + rotm_postcondition(queue, n, x, incx, y, incy, param, dependencies); + return done; +} + +template <> +cl::sycl::event rotg( + cl::sycl::queue &queue, float *a, float *b, float *c, float *s, + const cl::sycl::vector_class &dependencies) { + rotg_precondition(queue, a, b, c, s, dependencies); + auto done = onemkl::cublas::rotg(queue, a, b, c, s, dependencies); + rotg_postcondition(queue, a, b, c, s, dependencies); + return done; +} + +template <> +cl::sycl::event rotg( + cl::sycl::queue &queue, double *a, double *b, double *c, double *s, + const cl::sycl::vector_class &dependencies) { + rotg_precondition(queue, a, b, c, s, dependencies); + auto done = onemkl::cublas::rotg(queue, a, b, c, s, dependencies); + rotg_postcondition(queue, a, b, c, s, dependencies); + return done; +} + +template <> +cl::sycl::event rotg( + cl::sycl::queue &queue, std::complex *a, std::complex *b, float *c, + std::complex *s, const cl::sycl::vector_class &dependencies) { + rotg_precondition(queue, a, b, c, s, dependencies); + auto 
done = onemkl::cublas::rotg(queue, a, b, c, s, dependencies); + rotg_postcondition(queue, a, b, c, s, dependencies); + return done; +} + +template <> +cl::sycl::event rotg( + cl::sycl::queue &queue, std::complex *a, std::complex *b, double *c, + std::complex *s, const cl::sycl::vector_class &dependencies) { + rotg_precondition(queue, a, b, c, s, dependencies); + auto done = onemkl::cublas::rotg(queue, a, b, c, s, dependencies); + rotg_postcondition(queue, a, b, c, s, dependencies); + return done; +} + +template <> +cl::sycl::event sdsdot( + cl::sycl::queue &queue, std::int64_t n, float sb, const float *x, std::int64_t incx, + const float *y, std::int64_t incy, float *result, + const cl::sycl::vector_class &dependencies) { + sdsdot_precondition(queue, n, sb, x, incx, y, incy, result, dependencies); + auto done = onemkl::cublas::sdsdot(queue, n, sb, x, incx, y, incy, result, dependencies); + sdsdot_postcondition(queue, n, sb, x, incx, y, incy, result, dependencies); + return done; +} + +template <> +cl::sycl::event her2k( + cl::sycl::queue &queue, uplo upper_lower, transpose trans, std::int64_t n, std::int64_t k, + std::complex alpha, const std::complex *a, std::int64_t lda, + const std::complex *b, std::int64_t ldb, float beta, std::complex *c, + std::int64_t ldc, const cl::sycl::vector_class &dependencies) { + her2k_precondition(queue, upper_lower, trans, n, k, alpha, a, lda, b, ldb, beta, c, ldc, + dependencies); + auto done = onemkl::cublas::her2k(queue, upper_lower, trans, n, k, alpha, a, lda, b, ldb, beta, + c, ldc, dependencies); + her2k_postcondition(queue, upper_lower, trans, n, k, alpha, a, lda, b, ldb, beta, c, ldc, + dependencies); + return done; +} + +template <> +cl::sycl::event her2k( + cl::sycl::queue &queue, uplo upper_lower, transpose trans, std::int64_t n, std::int64_t k, + std::complex alpha, const std::complex *a, std::int64_t lda, + const std::complex *b, std::int64_t ldb, double beta, std::complex *c, + std::int64_t ldc, const 
cl::sycl::vector_class &dependencies) { + her2k_precondition(queue, upper_lower, trans, n, k, alpha, a, lda, b, ldb, beta, c, ldc, + dependencies); + auto done = onemkl::cublas::her2k(queue, upper_lower, trans, n, k, alpha, a, lda, b, ldb, beta, + c, ldc, dependencies); + her2k_postcondition(queue, upper_lower, trans, n, k, alpha, a, lda, b, ldb, beta, c, ldc, + dependencies); + return done; +} + +template <> +cl::sycl::event dot( + cl::sycl::queue &queue, std::int64_t n, const float *x, std::int64_t incx, const float *y, + std::int64_t incy, float *result, const cl::sycl::vector_class &dependencies) { + dot_precondition(queue, n, x, incx, y, incy, result, dependencies); + auto done = onemkl::cublas::dot(queue, n, x, incx, y, incy, result, dependencies); + dot_postcondition(queue, n, x, incx, y, incy, result, dependencies); + return done; +} + +template <> +cl::sycl::event dot( + cl::sycl::queue &queue, std::int64_t n, const double *x, std::int64_t incx, const double *y, + std::int64_t incy, double *result, + const cl::sycl::vector_class &dependencies) { + dot_precondition(queue, n, x, incx, y, incy, result, dependencies); + auto done = onemkl::cublas::dot(queue, n, x, incx, y, incy, result, dependencies); + dot_postcondition(queue, n, x, incx, y, incy, result, dependencies); + return done; +} + +template <> +cl::sycl::event dot( + cl::sycl::queue &queue, std::int64_t n, const float *x, std::int64_t incx, const float *y, + std::int64_t incy, double *result, + const cl::sycl::vector_class &dependencies) { + dot_precondition(queue, n, x, incx, y, incy, result, dependencies); + auto done = onemkl::cublas::dot(queue, n, x, incx, y, incy, result, dependencies); + dot_postcondition(queue, n, x, incx, y, incy, result, dependencies); + return done; +} + +template <> +cl::sycl::event symv( + cl::sycl::queue &queue, uplo upper_lower, std::int64_t n, float alpha, const float *a, + std::int64_t lda, const float *x, std::int64_t incx, float beta, float *y, std::int64_t incy, + 
const cl::sycl::vector_class &dependencies) { + symv_precondition(queue, upper_lower, n, alpha, a, lda, x, incx, beta, y, incy, dependencies); + auto done = onemkl::cublas::symv(queue, upper_lower, n, alpha, a, lda, x, incx, beta, y, incy, + dependencies); + symv_postcondition(queue, upper_lower, n, alpha, a, lda, x, incx, beta, y, incy, dependencies); + return done; +} + +template <> +cl::sycl::event symv( + cl::sycl::queue &queue, uplo upper_lower, std::int64_t n, double alpha, const double *a, + std::int64_t lda, const double *x, std::int64_t incx, double beta, double *y, std::int64_t incy, + const cl::sycl::vector_class &dependencies) { + symv_precondition(queue, upper_lower, n, alpha, a, lda, x, incx, beta, y, incy, dependencies); + auto done = onemkl::cublas::symv(queue, upper_lower, n, alpha, a, lda, x, incx, beta, y, incy, + dependencies); + symv_postcondition(queue, upper_lower, n, alpha, a, lda, x, incx, beta, y, incy, dependencies); + return done; +} + } //namespace blas } //namespace onemkl diff --git a/include/onemkl/blas/detail/cublas/onemkl_blas_cublas.hpp b/include/onemkl/blas/detail/cublas/onemkl_blas_cublas.hpp index 2336bbbd4..91fcfefcb 100644 --- a/include/onemkl/blas/detail/cublas/onemkl_blas_cublas.hpp +++ b/include/onemkl/blas/detail/cublas/onemkl_blas_cublas.hpp @@ -31,7 +31,8 @@ using onemkl::side; using onemkl::transpose; using onemkl::uplo; namespace cublas { -// Level 1 + +// Buffer APIs void asum(cl::sycl::queue &queue, std::int64_t n, cl::sycl::buffer, 1> &x, std::int64_t incx, cl::sycl::buffer &result); @@ -206,8 +207,6 @@ void swap(cl::sycl::queue &queue, std::int64_t n, cl::sycl::buffer, 1> &x, std::int64_t incx, cl::sycl::buffer, 1> &y, std::int64_t incy); -// Level 2 - void gbmv(cl::sycl::queue &queue, transpose trans, std::int64_t m, std::int64_t n, std::int64_t kl, std::int64_t ku, float alpha, cl::sycl::buffer &a, std::int64_t lda, cl::sycl::buffer &x, std::int64_t incx, float beta, @@ -491,8 +490,6 @@ void trsv(cl::sycl::queue 
&queue, uplo upper_lower, transpose trans, diag unit_d cl::sycl::buffer, 1> &a, std::int64_t lda, cl::sycl::buffer, 1> &x, std::int64_t incx); -// Level 3 - void gemm(cl::sycl::queue &queue, transpose transa, transpose transb, std::int64_t m, std::int64_t n, std::int64_t k, float alpha, cl::sycl::buffer &a, std::int64_t lda, cl::sycl::buffer &b, std::int64_t ldb, float beta, @@ -647,7 +644,6 @@ void trsm(cl::sycl::queue &queue, side left_right, uplo upper_lower, transpose t diag unit_diag, std::int64_t m, std::int64_t n, std::complex alpha, cl::sycl::buffer, 1> &a, std::int64_t lda, cl::sycl::buffer, 1> &b, std::int64_t ldb); -// Batch API void gemm_batch(cl::sycl::queue &queue, cl::sycl::buffer &transa, cl::sycl::buffer &transb, cl::sycl::buffer &m, @@ -776,8 +772,6 @@ void trsm_batch(cl::sycl::queue &queue, side left_right, uplo upper_lower, trans std::int64_t stride_a, cl::sycl::buffer, 1> &b, std::int64_t ldb, std::int64_t stride_b, std::int64_t batch_size); -// BLAS-like extensions - void gemmt(cl::sycl::queue &queue, uplo upper_lower, transpose transa, transpose transb, std::int64_t n, std::int64_t k, float alpha, cl::sycl::buffer &a, std::int64_t lda, cl::sycl::buffer &b, std::int64_t ldb, float beta, @@ -841,6 +835,843 @@ void gemm_ext(cl::sycl::queue &queue, transpose transa, transpose transb, std::i std::int64_t lda, cl::sycl::buffer &b, std::int64_t ldb, half beta, cl::sycl::buffer &c, std::int64_t ldc); +// USM APIs + +cl::sycl::event asum(cl::sycl::queue &queue, std::int64_t n, const std::complex *x, + std::int64_t incx, float *result, + const cl::sycl::vector_class &dependencies = {}); + +cl::sycl::event asum(cl::sycl::queue &queue, std::int64_t n, const std::complex *x, + std::int64_t incx, double *result, + const cl::sycl::vector_class &dependencies = {}); + +cl::sycl::event asum(cl::sycl::queue &queue, std::int64_t n, const float *x, std::int64_t incx, + float *result, + const cl::sycl::vector_class &dependencies = {}); + +cl::sycl::event 
asum(cl::sycl::queue &queue, std::int64_t n, const double *x, std::int64_t incx, + double *result, + const cl::sycl::vector_class &dependencies = {}); + +cl::sycl::event axpy(cl::sycl::queue &queue, std::int64_t n, float alpha, const float *x, + std::int64_t incx, float *y, std::int64_t incy, + const cl::sycl::vector_class &dependencies = {}); + +cl::sycl::event axpy(cl::sycl::queue &queue, std::int64_t n, double alpha, const double *x, + std::int64_t incx, double *y, std::int64_t incy, + const cl::sycl::vector_class &dependencies = {}); + +cl::sycl::event axpy(cl::sycl::queue &queue, std::int64_t n, std::complex alpha, + const std::complex *x, std::int64_t incx, std::complex *y, + std::int64_t incy, + const cl::sycl::vector_class &dependencies = {}); + +cl::sycl::event axpy_batch(cl::sycl::queue &queue, std::int64_t *n, float *alpha, const float **x, + std::int64_t *incx, float **y, std::int64_t *incy, + std::int64_t group_count, std::int64_t *group_size, + const cl::sycl::vector_class &dependencies = {}); + +cl::sycl::event axpy_batch(cl::sycl::queue &queue, std::int64_t *n, double *alpha, const double **x, + std::int64_t *incx, double **y, std::int64_t *incy, + std::int64_t group_count, std::int64_t *group_size, + const cl::sycl::vector_class &dependencies = {}); + +cl::sycl::event axpy_batch(cl::sycl::queue &queue, std::int64_t *n, std::complex *alpha, + const std::complex **x, std::int64_t *incx, + std::complex **y, std::int64_t *incy, std::int64_t group_count, + std::int64_t *group_size, + const cl::sycl::vector_class &dependencies = {}); + +cl::sycl::event axpy_batch(cl::sycl::queue &queue, std::int64_t *n, std::complex *alpha, + const std::complex **x, std::int64_t *incx, + std::complex **y, std::int64_t *incy, std::int64_t group_count, + std::int64_t *group_size, + const cl::sycl::vector_class &dependencies = {}); + +cl::sycl::event axpy(cl::sycl::queue &queue, std::int64_t n, std::complex alpha, + const std::complex *x, std::int64_t incx, std::complex *y, 
+ std::int64_t incy, + const cl::sycl::vector_class &dependencies = {}); + +cl::sycl::event copy(cl::sycl::queue &queue, std::int64_t n, const float *x, std::int64_t incx, + float *y, std::int64_t incy, + const cl::sycl::vector_class &dependencies = {}); + +cl::sycl::event copy(cl::sycl::queue &queue, std::int64_t n, const double *x, std::int64_t incx, + double *y, std::int64_t incy, + const cl::sycl::vector_class &dependencies = {}); + +cl::sycl::event copy(cl::sycl::queue &queue, std::int64_t n, const std::complex *x, + std::int64_t incx, std::complex *y, std::int64_t incy, + const cl::sycl::vector_class &dependencies = {}); + +cl::sycl::event copy(cl::sycl::queue &queue, std::int64_t n, const std::complex *x, + std::int64_t incx, std::complex *y, std::int64_t incy, + const cl::sycl::vector_class &dependencies = {}); + +cl::sycl::event dot(cl::sycl::queue &queue, std::int64_t n, const float *x, std::int64_t incx, + const float *y, std::int64_t incy, float *result, + const cl::sycl::vector_class &dependencies = {}); + +cl::sycl::event dot(cl::sycl::queue &queue, std::int64_t n, const double *x, std::int64_t incx, + const double *y, std::int64_t incy, double *result, + const cl::sycl::vector_class &dependencies = {}); + +cl::sycl::event dot(cl::sycl::queue &queue, std::int64_t n, const float *x, std::int64_t incx, + const float *y, std::int64_t incy, double *result, + const cl::sycl::vector_class &dependencies = {}); + +cl::sycl::event dotc(cl::sycl::queue &queue, std::int64_t n, const std::complex *x, + std::int64_t incx, const std::complex *y, std::int64_t incy, + std::complex *result, + const cl::sycl::vector_class &dependencies = {}); + +cl::sycl::event dotc(cl::sycl::queue &queue, std::int64_t n, const std::complex *x, + std::int64_t incx, const std::complex *y, std::int64_t incy, + std::complex *result, + const cl::sycl::vector_class &dependencies = {}); + +cl::sycl::event dotu(cl::sycl::queue &queue, std::int64_t n, const std::complex *x, + std::int64_t 
incx, const std::complex *y, std::int64_t incy, + std::complex *result, + const cl::sycl::vector_class &dependencies = {}); + +cl::sycl::event dotu(cl::sycl::queue &queue, std::int64_t n, const std::complex *x, + std::int64_t incx, const std::complex *y, std::int64_t incy, + std::complex *result, + const cl::sycl::vector_class &dependencies = {}); + +cl::sycl::event iamin(cl::sycl::queue &queue, std::int64_t n, const float *x, std::int64_t incx, + std::int64_t *result, + const cl::sycl::vector_class &dependencies = {}); + +cl::sycl::event iamin(cl::sycl::queue &queue, std::int64_t n, const double *x, std::int64_t incx, + std::int64_t *result, + const cl::sycl::vector_class &dependencies = {}); + +cl::sycl::event iamin(cl::sycl::queue &queue, std::int64_t n, const std::complex *x, + std::int64_t incx, std::int64_t *result, + const cl::sycl::vector_class &dependencies = {}); + +cl::sycl::event iamin(cl::sycl::queue &queue, std::int64_t n, const std::complex *x, + std::int64_t incx, std::int64_t *result, + const cl::sycl::vector_class &dependencies = {}); + +cl::sycl::event iamax(cl::sycl::queue &queue, std::int64_t n, const float *x, std::int64_t incx, + std::int64_t *result, + const cl::sycl::vector_class &dependencies = {}); + +cl::sycl::event iamax(cl::sycl::queue &queue, std::int64_t n, const double *x, std::int64_t incx, + std::int64_t *result, + const cl::sycl::vector_class &dependencies = {}); + +cl::sycl::event iamax(cl::sycl::queue &queue, std::int64_t n, const std::complex *x, + std::int64_t incx, std::int64_t *result, + const cl::sycl::vector_class &dependencies = {}); + +cl::sycl::event iamax(cl::sycl::queue &queue, std::int64_t n, const std::complex *x, + std::int64_t incx, std::int64_t *result, + const cl::sycl::vector_class &dependencies = {}); + +cl::sycl::event nrm2(cl::sycl::queue &queue, std::int64_t n, const std::complex *x, + std::int64_t incx, float *result, + const cl::sycl::vector_class &dependencies = {}); + +cl::sycl::event 
nrm2(cl::sycl::queue &queue, std::int64_t n, const std::complex *x, + std::int64_t incx, double *result, + const cl::sycl::vector_class &dependencies = {}); + +cl::sycl::event nrm2(cl::sycl::queue &queue, std::int64_t n, const float *x, std::int64_t incx, + float *result, + const cl::sycl::vector_class &dependencies = {}); + +cl::sycl::event nrm2(cl::sycl::queue &queue, std::int64_t n, const double *x, std::int64_t incx, + double *result, + const cl::sycl::vector_class &dependencies = {}); + +cl::sycl::event rot(cl::sycl::queue &queue, std::int64_t n, std::complex *x, + std::int64_t incx, std::complex *y, std::int64_t incy, float c, float s, + const cl::sycl::vector_class &dependencies = {}); + +cl::sycl::event rot(cl::sycl::queue &queue, std::int64_t n, std::complex *x, + std::int64_t incx, std::complex *y, std::int64_t incy, double c, + double s, const cl::sycl::vector_class &dependencies = {}); + +cl::sycl::event rot(cl::sycl::queue &queue, std::int64_t n, float *x, std::int64_t incx, float *y, + std::int64_t incy, float c, float s, + const cl::sycl::vector_class &dependencies = {}); + +cl::sycl::event rot(cl::sycl::queue &queue, std::int64_t n, double *x, std::int64_t incx, double *y, + std::int64_t incy, double c, double s, + const cl::sycl::vector_class &dependencies = {}); + +cl::sycl::event rotg(cl::sycl::queue &queue, float *a, float *b, float *c, float *s, + const cl::sycl::vector_class &dependencies = {}); + +cl::sycl::event rotg(cl::sycl::queue &queue, double *a, double *b, double *c, double *s, + const cl::sycl::vector_class &dependencies = {}); + +cl::sycl::event rotg(cl::sycl::queue &queue, std::complex *a, std::complex *b, + float *c, std::complex *s, + const cl::sycl::vector_class &dependencies = {}); + +cl::sycl::event rotg(cl::sycl::queue &queue, std::complex *a, std::complex *b, + double *c, std::complex *s, + const cl::sycl::vector_class &dependencies = {}); + +cl::sycl::event rotm(cl::sycl::queue &queue, std::int64_t n, float *x, std::int64_t 
incx, float *y, + std::int64_t incy, float *param, + const cl::sycl::vector_class &dependencies = {}); + +cl::sycl::event rotm(cl::sycl::queue &queue, std::int64_t n, double *x, std::int64_t incx, + double *y, std::int64_t incy, double *param, + const cl::sycl::vector_class &dependencies = {}); + +cl::sycl::event rotmg(cl::sycl::queue &queue, float *d1, float *d2, float *x1, float y1, + float *param, + const cl::sycl::vector_class &dependencies = {}); + +cl::sycl::event rotmg(cl::sycl::queue &queue, double *d1, double *d2, double *x1, double y1, + double *param, + const cl::sycl::vector_class &dependencies = {}); + +cl::sycl::event scal(cl::sycl::queue &queue, std::int64_t n, float alpha, float *x, + std::int64_t incx, + const cl::sycl::vector_class &dependencies = {}); + +cl::sycl::event scal(cl::sycl::queue &queue, std::int64_t n, double alpha, double *x, + std::int64_t incx, + const cl::sycl::vector_class &dependencies = {}); + +cl::sycl::event scal(cl::sycl::queue &queue, std::int64_t n, std::complex alpha, + std::complex *x, std::int64_t incx, + const cl::sycl::vector_class &dependencies = {}); + +cl::sycl::event scal(cl::sycl::queue &queue, std::int64_t n, std::complex alpha, + std::complex *x, std::int64_t incx, + const cl::sycl::vector_class &dependencies = {}); + +cl::sycl::event scal(cl::sycl::queue &queue, std::int64_t n, float alpha, std::complex *x, + std::int64_t incx, + const cl::sycl::vector_class &dependencies = {}); + +cl::sycl::event scal(cl::sycl::queue &queue, std::int64_t n, double alpha, std::complex *x, + std::int64_t incx, + const cl::sycl::vector_class &dependencies = {}); + +cl::sycl::event sdsdot(cl::sycl::queue &queue, std::int64_t n, float sb, const float *x, + std::int64_t incx, const float *y, std::int64_t incy, float *result, + const cl::sycl::vector_class &dependencies = {}); + +cl::sycl::event swap(cl::sycl::queue &queue, std::int64_t n, float *x, std::int64_t incx, float *y, + std::int64_t incy, + const cl::sycl::vector_class 
&dependencies = {}); + +cl::sycl::event swap(cl::sycl::queue &queue, std::int64_t n, double *x, std::int64_t incx, + double *y, std::int64_t incy, + const cl::sycl::vector_class &dependencies = {}); + +cl::sycl::event swap(cl::sycl::queue &queue, std::int64_t n, std::complex *x, + std::int64_t incx, std::complex *y, std::int64_t incy, + const cl::sycl::vector_class &dependencies = {}); + +cl::sycl::event swap(cl::sycl::queue &queue, std::int64_t n, std::complex *x, + std::int64_t incx, std::complex *y, std::int64_t incy, + const cl::sycl::vector_class &dependencies = {}); + +cl::sycl::event gbmv(cl::sycl::queue &queue, transpose trans, std::int64_t m, std::int64_t n, + std::int64_t kl, std::int64_t ku, float alpha, const float *a, + std::int64_t lda, const float *x, std::int64_t incx, float beta, float *y, + std::int64_t incy, + const cl::sycl::vector_class &dependencies = {}); + +cl::sycl::event gbmv(cl::sycl::queue &queue, transpose trans, std::int64_t m, std::int64_t n, + std::int64_t kl, std::int64_t ku, double alpha, const double *a, + std::int64_t lda, const double *x, std::int64_t incx, double beta, double *y, + std::int64_t incy, + const cl::sycl::vector_class &dependencies = {}); + +cl::sycl::event gbmv(cl::sycl::queue &queue, transpose trans, std::int64_t m, std::int64_t n, + std::int64_t kl, std::int64_t ku, std::complex alpha, + const std::complex *a, std::int64_t lda, const std::complex *x, + std::int64_t incx, std::complex beta, std::complex *y, + std::int64_t incy, + const cl::sycl::vector_class &dependencies = {}); + +cl::sycl::event gbmv(cl::sycl::queue &queue, transpose trans, std::int64_t m, std::int64_t n, + std::int64_t kl, std::int64_t ku, std::complex alpha, + const std::complex *a, std::int64_t lda, const std::complex *x, + std::int64_t incx, std::complex beta, std::complex *y, + std::int64_t incy, + const cl::sycl::vector_class &dependencies = {}); + +cl::sycl::event gemv(cl::sycl::queue &queue, transpose trans, std::int64_t m, std::int64_t 
n, + float alpha, const float *a, std::int64_t lda, const float *x, + std::int64_t incx, float beta, float *y, std::int64_t incy, + const cl::sycl::vector_class &dependencies = {}); + +cl::sycl::event gemv(cl::sycl::queue &queue, transpose trans, std::int64_t m, std::int64_t n, + double alpha, const double *a, std::int64_t lda, const double *x, + std::int64_t incx, double beta, double *y, std::int64_t incy, + const cl::sycl::vector_class &dependencies = {}); + +cl::sycl::event gemv(cl::sycl::queue &queue, transpose trans, std::int64_t m, std::int64_t n, + std::complex alpha, const std::complex *a, std::int64_t lda, + const std::complex *x, std::int64_t incx, std::complex beta, + std::complex *y, std::int64_t incy, + const cl::sycl::vector_class &dependencies = {}); + +cl::sycl::event gemv(cl::sycl::queue &queue, transpose trans, std::int64_t m, std::int64_t n, + std::complex alpha, const std::complex *a, std::int64_t lda, + const std::complex *x, std::int64_t incx, std::complex beta, + std::complex *y, std::int64_t incy, + const cl::sycl::vector_class &dependencies = {}); + +cl::sycl::event ger(cl::sycl::queue &queue, std::int64_t m, std::int64_t n, float alpha, + const float *x, std::int64_t incx, const float *y, std::int64_t incy, float *a, + std::int64_t lda, + const cl::sycl::vector_class &dependencies = {}); + +cl::sycl::event ger(cl::sycl::queue &queue, std::int64_t m, std::int64_t n, double alpha, + const double *x, std::int64_t incx, const double *y, std::int64_t incy, + double *a, std::int64_t lda, + const cl::sycl::vector_class &dependencies = {}); + +cl::sycl::event gerc(cl::sycl::queue &queue, std::int64_t m, std::int64_t n, + std::complex alpha, const std::complex *x, std::int64_t incx, + const std::complex *y, std::int64_t incy, std::complex *a, + std::int64_t lda, + const cl::sycl::vector_class &dependencies = {}); + +cl::sycl::event gerc(cl::sycl::queue &queue, std::int64_t m, std::int64_t n, + std::complex alpha, const std::complex *x, std::int64_t 
incx, + const std::complex *y, std::int64_t incy, std::complex *a, + std::int64_t lda, + const cl::sycl::vector_class &dependencies = {}); + +cl::sycl::event geru(cl::sycl::queue &queue, std::int64_t m, std::int64_t n, + std::complex alpha, const std::complex *x, std::int64_t incx, + const std::complex *y, std::int64_t incy, std::complex *a, + std::int64_t lda, + const cl::sycl::vector_class &dependencies = {}); + +cl::sycl::event geru(cl::sycl::queue &queue, std::int64_t m, std::int64_t n, + std::complex alpha, const std::complex *x, std::int64_t incx, + const std::complex *y, std::int64_t incy, std::complex *a, + std::int64_t lda, + const cl::sycl::vector_class &dependencies = {}); + +cl::sycl::event hbmv(cl::sycl::queue &queue, uplo upper_lower, std::int64_t n, std::int64_t k, + std::complex alpha, const std::complex *a, std::int64_t lda, + const std::complex *x, std::int64_t incx, std::complex beta, + std::complex *y, std::int64_t incy, + const cl::sycl::vector_class &dependencies = {}); + +cl::sycl::event hbmv(cl::sycl::queue &queue, uplo upper_lower, std::int64_t n, std::int64_t k, + std::complex alpha, const std::complex *a, std::int64_t lda, + const std::complex *x, std::int64_t incx, std::complex beta, + std::complex *y, std::int64_t incy, + const cl::sycl::vector_class &dependencies = {}); + +cl::sycl::event hemv(cl::sycl::queue &queue, uplo upper_lower, std::int64_t n, + std::complex alpha, const std::complex *a, std::int64_t lda, + const std::complex *x, std::int64_t incx, std::complex beta, + std::complex *y, std::int64_t incy, + const cl::sycl::vector_class &dependencies = {}); + +cl::sycl::event hemv(cl::sycl::queue &queue, uplo upper_lower, std::int64_t n, + std::complex alpha, const std::complex *a, std::int64_t lda, + const std::complex *x, std::int64_t incx, std::complex beta, + std::complex *y, std::int64_t incy, + const cl::sycl::vector_class &dependencies = {}); + +cl::sycl::event her(cl::sycl::queue &queue, uplo upper_lower, std::int64_t n, 
float alpha, + const std::complex *x, std::int64_t incx, std::complex *a, + std::int64_t lda, + const cl::sycl::vector_class &dependencies = {}); + +cl::sycl::event her(cl::sycl::queue &queue, uplo upper_lower, std::int64_t n, double alpha, + const std::complex *x, std::int64_t incx, std::complex *a, + std::int64_t lda, + const cl::sycl::vector_class &dependencies = {}); + +cl::sycl::event her2(cl::sycl::queue &queue, uplo upper_lower, std::int64_t n, + std::complex alpha, const std::complex *x, std::int64_t incx, + const std::complex *y, std::int64_t incy, std::complex *a, + std::int64_t lda, + const cl::sycl::vector_class &dependencies = {}); + +cl::sycl::event her2(cl::sycl::queue &queue, uplo upper_lower, std::int64_t n, + std::complex alpha, const std::complex *x, std::int64_t incx, + const std::complex *y, std::int64_t incy, std::complex *a, + std::int64_t lda, + const cl::sycl::vector_class &dependencies = {}); + +cl::sycl::event hpmv(cl::sycl::queue &queue, uplo upper_lower, std::int64_t n, + std::complex alpha, const std::complex *a, + const std::complex *x, std::int64_t incx, std::complex beta, + std::complex *y, std::int64_t incy, + const cl::sycl::vector_class &dependencies = {}); + +cl::sycl::event hpmv(cl::sycl::queue &queue, uplo upper_lower, std::int64_t n, + std::complex alpha, const std::complex *a, + const std::complex *x, std::int64_t incx, std::complex beta, + std::complex *y, std::int64_t incy, + const cl::sycl::vector_class &dependencies = {}); + +cl::sycl::event hpr(cl::sycl::queue &queue, uplo upper_lower, std::int64_t n, float alpha, + const std::complex *x, std::int64_t incx, std::complex *a, + const cl::sycl::vector_class &dependencies = {}); + +cl::sycl::event hpr(cl::sycl::queue &queue, uplo upper_lower, std::int64_t n, double alpha, + const std::complex *x, std::int64_t incx, std::complex *a, + const cl::sycl::vector_class &dependencies = {}); + +cl::sycl::event hpr2(cl::sycl::queue &queue, uplo upper_lower, std::int64_t n, + 
std::complex alpha, const std::complex *x, std::int64_t incx, + const std::complex *y, std::int64_t incy, std::complex *a, + const cl::sycl::vector_class &dependencies = {}); + +cl::sycl::event hpr2(cl::sycl::queue &queue, uplo upper_lower, std::int64_t n, + std::complex alpha, const std::complex *x, std::int64_t incx, + const std::complex *y, std::int64_t incy, std::complex *a, + const cl::sycl::vector_class &dependencies = {}); + +cl::sycl::event sbmv(cl::sycl::queue &queue, uplo upper_lower, std::int64_t n, std::int64_t k, + float alpha, const float *a, std::int64_t lda, const float *x, + std::int64_t incx, float beta, float *y, std::int64_t incy, + const cl::sycl::vector_class &dependencies = {}); + +cl::sycl::event sbmv(cl::sycl::queue &queue, uplo upper_lower, std::int64_t n, std::int64_t k, + double alpha, const double *a, std::int64_t lda, const double *x, + std::int64_t incx, double beta, double *y, std::int64_t incy, + const cl::sycl::vector_class &dependencies = {}); + +cl::sycl::event spmv(cl::sycl::queue &queue, uplo upper_lower, std::int64_t n, float alpha, + const float *a, const float *x, std::int64_t incx, float beta, float *y, + std::int64_t incy, + const cl::sycl::vector_class &dependencies = {}); + +cl::sycl::event spmv(cl::sycl::queue &queue, uplo upper_lower, std::int64_t n, double alpha, + const double *a, const double *x, std::int64_t incx, double beta, double *y, + std::int64_t incy, + const cl::sycl::vector_class &dependencies = {}); + +cl::sycl::event spr(cl::sycl::queue &queue, uplo upper_lower, std::int64_t n, float alpha, + const float *x, std::int64_t incx, float *a, + const cl::sycl::vector_class &dependencies = {}); + +cl::sycl::event spr(cl::sycl::queue &queue, uplo upper_lower, std::int64_t n, double alpha, + const double *x, std::int64_t incx, double *a, + const cl::sycl::vector_class &dependencies = {}); + +cl::sycl::event spr2(cl::sycl::queue &queue, uplo upper_lower, std::int64_t n, float alpha, + const float *x, std::int64_t 
incx, const float *y, std::int64_t incy, float *a, + const cl::sycl::vector_class &dependencies = {}); + +cl::sycl::event spr2(cl::sycl::queue &queue, uplo upper_lower, std::int64_t n, double alpha, + const double *x, std::int64_t incx, const double *y, std::int64_t incy, + double *a, const cl::sycl::vector_class &dependencies = {}); + +cl::sycl::event symv(cl::sycl::queue &queue, uplo upper_lower, std::int64_t n, float alpha, + const float *a, std::int64_t lda, const float *x, std::int64_t incx, + float beta, float *y, std::int64_t incy, + const cl::sycl::vector_class &dependencies = {}); + +cl::sycl::event symv(cl::sycl::queue &queue, uplo upper_lower, std::int64_t n, double alpha, + const double *a, std::int64_t lda, const double *x, std::int64_t incx, + double beta, double *y, std::int64_t incy, + const cl::sycl::vector_class &dependencies = {}); + +cl::sycl::event syr(cl::sycl::queue &queue, uplo upper_lower, std::int64_t n, float alpha, + const float *x, std::int64_t incx, float *a, std::int64_t lda, + const cl::sycl::vector_class &dependencies = {}); + +cl::sycl::event syr(cl::sycl::queue &queue, uplo upper_lower, std::int64_t n, double alpha, + const double *x, std::int64_t incx, double *a, std::int64_t lda, + const cl::sycl::vector_class &dependencies = {}); + +cl::sycl::event syr2(cl::sycl::queue &queue, uplo upper_lower, std::int64_t n, float alpha, + const float *x, std::int64_t incx, const float *y, std::int64_t incy, float *a, + std::int64_t lda, + const cl::sycl::vector_class &dependencies = {}); + +cl::sycl::event syr2(cl::sycl::queue &queue, uplo upper_lower, std::int64_t n, double alpha, + const double *x, std::int64_t incx, const double *y, std::int64_t incy, + double *a, std::int64_t lda, + const cl::sycl::vector_class &dependencies = {}); + +cl::sycl::event tbmv(cl::sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag, + std::int64_t n, std::int64_t k, const float *a, std::int64_t lda, float *x, + std::int64_t incx, + const 
cl::sycl::vector_class &dependencies = {}); + +cl::sycl::event tbmv(cl::sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag, + std::int64_t n, std::int64_t k, const double *a, std::int64_t lda, double *x, + std::int64_t incx, + const cl::sycl::vector_class &dependencies = {}); + +cl::sycl::event tbmv(cl::sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag, + std::int64_t n, std::int64_t k, const std::complex *a, std::int64_t lda, + std::complex *x, std::int64_t incx, + const cl::sycl::vector_class &dependencies = {}); + +cl::sycl::event tbmv(cl::sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag, + std::int64_t n, std::int64_t k, const std::complex *a, + std::int64_t lda, std::complex *x, std::int64_t incx, + const cl::sycl::vector_class &dependencies = {}); + +cl::sycl::event tbsv(cl::sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag, + std::int64_t n, std::int64_t k, const float *a, std::int64_t lda, float *x, + std::int64_t incx, + const cl::sycl::vector_class &dependencies = {}); + +cl::sycl::event tbsv(cl::sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag, + std::int64_t n, std::int64_t k, const double *a, std::int64_t lda, double *x, + std::int64_t incx, + const cl::sycl::vector_class &dependencies = {}); + +cl::sycl::event tbsv(cl::sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag, + std::int64_t n, std::int64_t k, const std::complex *a, std::int64_t lda, + std::complex *x, std::int64_t incx, + const cl::sycl::vector_class &dependencies = {}); + +cl::sycl::event tbsv(cl::sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag, + std::int64_t n, std::int64_t k, const std::complex *a, + std::int64_t lda, std::complex *x, std::int64_t incx, + const cl::sycl::vector_class &dependencies = {}); + +cl::sycl::event tpmv(cl::sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag, + std::int64_t n, const float *a, float *x, 
std::int64_t incx, + const cl::sycl::vector_class &dependencies = {}); + +cl::sycl::event tpmv(cl::sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag, + std::int64_t n, const double *a, double *x, std::int64_t incx, + const cl::sycl::vector_class &dependencies = {}); + +cl::sycl::event tpmv(cl::sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag, + std::int64_t n, const std::complex *a, std::complex *x, + std::int64_t incx, + const cl::sycl::vector_class &dependencies = {}); + +cl::sycl::event tpmv(cl::sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag, + std::int64_t n, const std::complex *a, std::complex *x, + std::int64_t incx, + const cl::sycl::vector_class &dependencies = {}); + +cl::sycl::event tpsv(cl::sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag, + std::int64_t n, const float *a, float *x, std::int64_t incx, + const cl::sycl::vector_class &dependencies = {}); + +cl::sycl::event tpsv(cl::sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag, + std::int64_t n, const double *a, double *x, std::int64_t incx, + const cl::sycl::vector_class &dependencies = {}); + +cl::sycl::event tpsv(cl::sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag, + std::int64_t n, const std::complex *a, std::complex *x, + std::int64_t incx, + const cl::sycl::vector_class &dependencies = {}); + +cl::sycl::event tpsv(cl::sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag, + std::int64_t n, const std::complex *a, std::complex *x, + std::int64_t incx, + const cl::sycl::vector_class &dependencies = {}); + +cl::sycl::event trmv(cl::sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag, + std::int64_t n, const float *a, std::int64_t lda, float *x, std::int64_t incx, + const cl::sycl::vector_class &dependencies = {}); + +cl::sycl::event trmv(cl::sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag, + std::int64_t n, const double 
*a, std::int64_t lda, double *x, + std::int64_t incx, + const cl::sycl::vector_class &dependencies = {}); + +cl::sycl::event trmv(cl::sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag, + std::int64_t n, const std::complex *a, std::int64_t lda, + std::complex *x, std::int64_t incx, + const cl::sycl::vector_class &dependencies = {}); + +cl::sycl::event trmv(cl::sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag, + std::int64_t n, const std::complex *a, std::int64_t lda, + std::complex *x, std::int64_t incx, + const cl::sycl::vector_class &dependencies = {}); + +cl::sycl::event trsv(cl::sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag, + std::int64_t n, const float *a, std::int64_t lda, float *x, std::int64_t incx, + const cl::sycl::vector_class &dependencies = {}); + +cl::sycl::event trsv(cl::sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag, + std::int64_t n, const double *a, std::int64_t lda, double *x, + std::int64_t incx, + const cl::sycl::vector_class &dependencies = {}); + +cl::sycl::event trsv(cl::sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag, + std::int64_t n, const std::complex *a, std::int64_t lda, + std::complex *x, std::int64_t incx, + const cl::sycl::vector_class &dependencies = {}); + +cl::sycl::event trsv(cl::sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag, + std::int64_t n, const std::complex *a, std::int64_t lda, + std::complex *x, std::int64_t incx, + const cl::sycl::vector_class &dependencies = {}); + +cl::sycl::event gemm(cl::sycl::queue &queue, transpose transa, transpose transb, std::int64_t m, + std::int64_t n, std::int64_t k, float alpha, const float *a, std::int64_t lda, + const float *b, std::int64_t ldb, float beta, float *c, std::int64_t ldc, + const cl::sycl::vector_class &dependencies = {}); + +cl::sycl::event gemm(cl::sycl::queue &queue, transpose transa, transpose transb, std::int64_t m, + std::int64_t n, 
std::int64_t k, double alpha, const double *a, + std::int64_t lda, const double *b, std::int64_t ldb, double beta, double *c, + std::int64_t ldc, + const cl::sycl::vector_class &dependencies = {}); + +cl::sycl::event gemm(cl::sycl::queue &queue, transpose transa, transpose transb, std::int64_t m, + std::int64_t n, std::int64_t k, std::complex alpha, + const std::complex *a, std::int64_t lda, const std::complex *b, + std::int64_t ldb, std::complex beta, std::complex *c, + std::int64_t ldc, + const cl::sycl::vector_class &dependencies = {}); + +cl::sycl::event gemm(cl::sycl::queue &queue, transpose transa, transpose transb, std::int64_t m, + std::int64_t n, std::int64_t k, std::complex alpha, + const std::complex *a, std::int64_t lda, const std::complex *b, + std::int64_t ldb, std::complex beta, std::complex *c, + std::int64_t ldc, + const cl::sycl::vector_class &dependencies = {}); + +cl::sycl::event hemm(cl::sycl::queue &queue, side left_right, uplo upper_lower, std::int64_t m, + std::int64_t n, std::complex alpha, const std::complex *a, + std::int64_t lda, const std::complex *b, std::int64_t ldb, + std::complex beta, std::complex *c, std::int64_t ldc, + const cl::sycl::vector_class &dependencies = {}); + +cl::sycl::event hemm(cl::sycl::queue &queue, side left_right, uplo upper_lower, std::int64_t m, + std::int64_t n, std::complex alpha, const std::complex *a, + std::int64_t lda, const std::complex *b, std::int64_t ldb, + std::complex beta, std::complex *c, std::int64_t ldc, + const cl::sycl::vector_class &dependencies = {}); + +cl::sycl::event herk(cl::sycl::queue &queue, uplo upper_lower, transpose trans, std::int64_t n, + std::int64_t k, float alpha, const std::complex *a, std::int64_t lda, + float beta, std::complex *c, std::int64_t ldc, + const cl::sycl::vector_class &dependencies = {}); + +cl::sycl::event herk(cl::sycl::queue &queue, uplo upper_lower, transpose trans, std::int64_t n, + std::int64_t k, double alpha, const std::complex *a, std::int64_t lda, + 
double beta, std::complex *c, std::int64_t ldc, + const cl::sycl::vector_class &dependencies = {}); + +cl::sycl::event her2k(cl::sycl::queue &queue, uplo upper_lower, transpose trans, std::int64_t n, + std::int64_t k, std::complex alpha, const std::complex *a, + std::int64_t lda, const std::complex *b, std::int64_t ldb, float beta, + std::complex *c, std::int64_t ldc, + const cl::sycl::vector_class &dependencies = {}); + +cl::sycl::event her2k(cl::sycl::queue &queue, uplo upper_lower, transpose trans, std::int64_t n, + std::int64_t k, std::complex alpha, const std::complex *a, + std::int64_t lda, const std::complex *b, std::int64_t ldb, + double beta, std::complex *c, std::int64_t ldc, + const cl::sycl::vector_class &dependencies = {}); + +cl::sycl::event symm(cl::sycl::queue &queue, side left_right, uplo upper_lower, std::int64_t m, + std::int64_t n, float alpha, const float *a, std::int64_t lda, const float *b, + std::int64_t ldb, float beta, float *c, std::int64_t ldc, + const cl::sycl::vector_class &dependencies = {}); + +cl::sycl::event symm(cl::sycl::queue &queue, side left_right, uplo upper_lower, std::int64_t m, + std::int64_t n, double alpha, const double *a, std::int64_t lda, + const double *b, std::int64_t ldb, double beta, double *c, std::int64_t ldc, + const cl::sycl::vector_class &dependencies = {}); + +cl::sycl::event symm(cl::sycl::queue &queue, side left_right, uplo upper_lower, std::int64_t m, + std::int64_t n, std::complex alpha, const std::complex *a, + std::int64_t lda, const std::complex *b, std::int64_t ldb, + std::complex beta, std::complex *c, std::int64_t ldc, + const cl::sycl::vector_class &dependencies = {}); + +cl::sycl::event symm(cl::sycl::queue &queue, side left_right, uplo upper_lower, std::int64_t m, + std::int64_t n, std::complex alpha, const std::complex *a, + std::int64_t lda, const std::complex *b, std::int64_t ldb, + std::complex beta, std::complex *c, std::int64_t ldc, + const cl::sycl::vector_class &dependencies = {}); + 
+cl::sycl::event syrk(cl::sycl::queue &queue, uplo upper_lower, transpose trans, std::int64_t n, + std::int64_t k, float alpha, const float *a, std::int64_t lda, float beta, + float *c, std::int64_t ldc, + const cl::sycl::vector_class &dependencies = {}); + +cl::sycl::event syrk(cl::sycl::queue &queue, uplo upper_lower, transpose trans, std::int64_t n, + std::int64_t k, double alpha, const double *a, std::int64_t lda, double beta, + double *c, std::int64_t ldc, + const cl::sycl::vector_class &dependencies = {}); + +cl::sycl::event syrk(cl::sycl::queue &queue, uplo upper_lower, transpose trans, std::int64_t n, + std::int64_t k, std::complex alpha, const std::complex *a, + std::int64_t lda, std::complex beta, std::complex *c, + std::int64_t ldc, + const cl::sycl::vector_class &dependencies = {}); + +cl::sycl::event syrk(cl::sycl::queue &queue, uplo upper_lower, transpose trans, std::int64_t n, + std::int64_t k, std::complex alpha, const std::complex *a, + std::int64_t lda, std::complex beta, std::complex *c, + std::int64_t ldc, + const cl::sycl::vector_class &dependencies = {}); + +cl::sycl::event syr2k(cl::sycl::queue &queue, uplo upper_lower, transpose trans, std::int64_t n, + std::int64_t k, float alpha, const float *a, std::int64_t lda, const float *b, + std::int64_t ldb, float beta, float *c, std::int64_t ldc, + const cl::sycl::vector_class &dependencies = {}); + +cl::sycl::event syr2k(cl::sycl::queue &queue, uplo upper_lower, transpose trans, std::int64_t n, + std::int64_t k, double alpha, const double *a, std::int64_t lda, + const double *b, std::int64_t ldb, double beta, double *c, std::int64_t ldc, + const cl::sycl::vector_class &dependencies = {}); + +cl::sycl::event syr2k(cl::sycl::queue &queue, uplo upper_lower, transpose trans, std::int64_t n, + std::int64_t k, std::complex alpha, const std::complex *a, + std::int64_t lda, const std::complex *b, std::int64_t ldb, + std::complex beta, std::complex *c, std::int64_t ldc, + const cl::sycl::vector_class 
&dependencies = {}); + +cl::sycl::event syr2k(cl::sycl::queue &queue, uplo upper_lower, transpose trans, std::int64_t n, + std::int64_t k, std::complex alpha, const std::complex *a, + std::int64_t lda, const std::complex *b, std::int64_t ldb, + std::complex beta, std::complex *c, std::int64_t ldc, + const cl::sycl::vector_class &dependencies = {}); + +cl::sycl::event trmm(cl::sycl::queue &queue, side left_right, uplo upper_lower, transpose trans, + diag unit_diag, std::int64_t m, std::int64_t n, float alpha, const float *a, + std::int64_t lda, float *b, std::int64_t ldb, + const cl::sycl::vector_class &dependencies = {}); + +cl::sycl::event trmm(cl::sycl::queue &queue, side left_right, uplo upper_lower, transpose trans, + diag unit_diag, std::int64_t m, std::int64_t n, double alpha, const double *a, + std::int64_t lda, double *b, std::int64_t ldb, + const cl::sycl::vector_class &dependencies = {}); + +cl::sycl::event trmm(cl::sycl::queue &queue, side left_right, uplo upper_lower, transpose trans, + diag unit_diag, std::int64_t m, std::int64_t n, std::complex alpha, + const std::complex *a, std::int64_t lda, std::complex *b, + std::int64_t ldb, + const cl::sycl::vector_class &dependencies = {}); + +cl::sycl::event trmm(cl::sycl::queue &queue, side left_right, uplo upper_lower, transpose trans, + diag unit_diag, std::int64_t m, std::int64_t n, std::complex alpha, + const std::complex *a, std::int64_t lda, std::complex *b, + std::int64_t ldb, + const cl::sycl::vector_class &dependencies = {}); + +cl::sycl::event trsm(cl::sycl::queue &queue, side left_right, uplo upper_lower, transpose trans, + diag unit_diag, std::int64_t m, std::int64_t n, float alpha, const float *a, + std::int64_t lda, float *b, std::int64_t ldb, + const cl::sycl::vector_class &dependencies = {}); + +cl::sycl::event trsm(cl::sycl::queue &queue, side left_right, uplo upper_lower, transpose trans, + diag unit_diag, std::int64_t m, std::int64_t n, double alpha, const double *a, + std::int64_t lda, 
double *b, std::int64_t ldb, + const cl::sycl::vector_class &dependencies = {}); + +cl::sycl::event trsm(cl::sycl::queue &queue, side left_right, uplo upper_lower, transpose trans, + diag unit_diag, std::int64_t m, std::int64_t n, std::complex alpha, + const std::complex *a, std::int64_t lda, std::complex *b, + std::int64_t ldb, + const cl::sycl::vector_class &dependencies = {}); + +cl::sycl::event trsm(cl::sycl::queue &queue, side left_right, uplo upper_lower, transpose trans, + diag unit_diag, std::int64_t m, std::int64_t n, std::complex alpha, + const std::complex *a, std::int64_t lda, std::complex *b, + std::int64_t ldb, + const cl::sycl::vector_class &dependencies = {}); + +cl::sycl::event gemm_batch(cl::sycl::queue &queue, transpose *transa, transpose *transb, + std::int64_t *m, std::int64_t *n, std::int64_t *k, float *alpha, + const float **a, std::int64_t *lda, const float **b, std::int64_t *ldb, + float *beta, float **c, std::int64_t *ldc, std::int64_t group_count, + std::int64_t *group_size, + const cl::sycl::vector_class &dependencies = {}); + +cl::sycl::event gemm_batch(cl::sycl::queue &queue, transpose *transa, transpose *transb, + std::int64_t *m, std::int64_t *n, std::int64_t *k, double *alpha, + const double **a, std::int64_t *lda, const double **b, std::int64_t *ldb, + double *beta, double **c, std::int64_t *ldc, std::int64_t group_count, + std::int64_t *group_size, + const cl::sycl::vector_class &dependencies = {}); + +cl::sycl::event gemm_batch(cl::sycl::queue &queue, transpose *transa, transpose *transb, + std::int64_t *m, std::int64_t *n, std::int64_t *k, + std::complex *alpha, const std::complex **a, + std::int64_t *lda, const std::complex **b, std::int64_t *ldb, + std::complex *beta, std::complex **c, std::int64_t *ldc, + std::int64_t group_count, std::int64_t *group_size, + const cl::sycl::vector_class &dependencies = {}); + +cl::sycl::event gemm_batch(cl::sycl::queue &queue, transpose *transa, transpose *transb, + std::int64_t *m, 
std::int64_t *n, std::int64_t *k, + std::complex *alpha, const std::complex **a, + std::int64_t *lda, const std::complex **b, std::int64_t *ldb, + std::complex *beta, std::complex **c, std::int64_t *ldc, + std::int64_t group_count, std::int64_t *group_size, + const cl::sycl::vector_class &dependencies = {}); + +cl::sycl::event gemm_batch(cl::sycl::queue &queue, transpose transa, transpose transb, + std::int64_t m, std::int64_t n, std::int64_t k, float alpha, + const float *a, std::int64_t lda, std::int64_t stride_a, const float *b, + std::int64_t ldb, std::int64_t stride_b, float beta, float *c, + std::int64_t ldc, std::int64_t stride_c, std::int64_t batch_size, + const cl::sycl::vector_class &dependencies = {}); + +cl::sycl::event gemm_batch(cl::sycl::queue &queue, transpose transa, transpose transb, + std::int64_t m, std::int64_t n, std::int64_t k, double alpha, + const double *a, std::int64_t lda, std::int64_t stride_a, + const double *b, std::int64_t ldb, std::int64_t stride_b, double beta, + double *c, std::int64_t ldc, std::int64_t stride_c, + std::int64_t batch_size, + const cl::sycl::vector_class &dependencies = {}); + +cl::sycl::event gemm_batch(cl::sycl::queue &queue, transpose transa, transpose transb, + std::int64_t m, std::int64_t n, std::int64_t k, + std::complex alpha, const std::complex *a, + std::int64_t lda, std::int64_t stride_a, const std::complex *b, + std::int64_t ldb, std::int64_t stride_b, std::complex beta, + std::complex *c, std::int64_t ldc, std::int64_t stride_c, + std::int64_t batch_size, + const cl::sycl::vector_class &dependencies = {}); + +cl::sycl::event gemm_batch(cl::sycl::queue &queue, transpose transa, transpose transb, + std::int64_t m, std::int64_t n, std::int64_t k, + std::complex alpha, const std::complex *a, + std::int64_t lda, std::int64_t stride_a, const std::complex *b, + std::int64_t ldb, std::int64_t stride_b, std::complex beta, + std::complex *c, std::int64_t ldc, std::int64_t stride_c, + std::int64_t batch_size, + 
const cl::sycl::vector_class &dependencies = {}); + +cl::sycl::event gemmt(cl::sycl::queue &queue, uplo upper_lower, transpose transa, transpose transb, + std::int64_t n, std::int64_t k, float alpha, const float *a, std::int64_t lda, + const float *b, std::int64_t ldb, float beta, float *c, std::int64_t ldc, + const cl::sycl::vector_class &dependencies = {}); + +cl::sycl::event gemmt(cl::sycl::queue &queue, uplo upper_lower, transpose transa, transpose transb, + std::int64_t n, std::int64_t k, double alpha, const double *a, + std::int64_t lda, const double *b, std::int64_t ldb, double beta, double *c, + std::int64_t ldc, + const cl::sycl::vector_class &dependencies = {}); + +cl::sycl::event gemmt(cl::sycl::queue &queue, uplo upper_lower, transpose transa, transpose transb, + std::int64_t n, std::int64_t k, std::complex alpha, + const std::complex *a, std::int64_t lda, const std::complex *b, + std::int64_t ldb, std::complex beta, std::complex *c, + std::int64_t ldc, + const cl::sycl::vector_class &dependencies = {}); + +cl::sycl::event gemmt(cl::sycl::queue &queue, uplo upper_lower, transpose transa, transpose transb, + std::int64_t n, std::int64_t k, std::complex alpha, + const std::complex *a, std::int64_t lda, + const std::complex *b, std::int64_t ldb, std::complex beta, + std::complex *c, std::int64_t ldc, + const cl::sycl::vector_class &dependencies = {}); + } // namespace cublas } // namespace onemkl diff --git a/include/onemkl/blas/detail/mklcpu/blas_ct.hpp b/include/onemkl/blas/detail/mklcpu/blas_ct.hpp index 663f3d989..d816e2af6 100644 --- a/include/onemkl/blas/detail/mklcpu/blas_ct.hpp +++ b/include/onemkl/blas/detail/mklcpu/blas_ct.hpp @@ -33,14 +33,13 @@ #include "onemkl_blas_mklcpu.hpp" +#include "onemkl/blas/detail/blas_ct_templates.hpp" + namespace onemkl { namespace blas { -template -static inline void syr2(cl::sycl::queue &queue, uplo upper_lower, std::int64_t n, float alpha, - cl::sycl::buffer &x, std::int64_t incx, - cl::sycl::buffer &y, 
std::int64_t incy, - cl::sycl::buffer &a, std::int64_t lda); +// Buffer APIs + template <> void syr2(cl::sycl::queue &queue, uplo upper_lower, std::int64_t n, float alpha, @@ -52,11 +51,6 @@ void syr2(cl::sycl::queue &queue, uplo upp syr2_postcondition(queue, upper_lower, n, alpha, x, incx, y, incy, a, lda); } -template -static inline void syr2(cl::sycl::queue &queue, uplo upper_lower, std::int64_t n, double alpha, - cl::sycl::buffer &x, std::int64_t incx, - cl::sycl::buffer &y, std::int64_t incy, - cl::sycl::buffer &a, std::int64_t lda); template <> void syr2(cl::sycl::queue &queue, uplo upper_lower, std::int64_t n, double alpha, @@ -68,9 +62,6 @@ void syr2(cl::sycl::queue &queue, uplo upp syr2_postcondition(queue, upper_lower, n, alpha, x, incx, y, incy, a, lda); } -template -static inline void scal(cl::sycl::queue &queue, std::int64_t n, float alpha, - cl::sycl::buffer &x, std::int64_t incx); template <> void scal(cl::sycl::queue &queue, std::int64_t n, float alpha, cl::sycl::buffer &x, std::int64_t incx) { @@ -79,9 +70,6 @@ void scal(cl::sycl::queue &queue, std::int scal_postcondition(queue, n, alpha, x, incx); } -template -static inline void scal(cl::sycl::queue &queue, std::int64_t n, double alpha, - cl::sycl::buffer &x, std::int64_t incx); template <> void scal(cl::sycl::queue &queue, std::int64_t n, double alpha, cl::sycl::buffer &x, @@ -91,9 +79,6 @@ void scal(cl::sycl::queue &queue, std::int scal_postcondition(queue, n, alpha, x, incx); } -template -static inline void scal(cl::sycl::queue &queue, std::int64_t n, std::complex alpha, - cl::sycl::buffer, 1> &x, std::int64_t incx); template <> void scal(cl::sycl::queue &queue, std::int64_t n, std::complex alpha, @@ -104,9 +89,6 @@ void scal(cl::sycl::queue &queue, std::int scal_postcondition(queue, n, alpha, x, incx); } -template -static inline void scal(cl::sycl::queue &queue, std::int64_t n, std::complex alpha, - cl::sycl::buffer, 1> &x, std::int64_t incx); template <> void scal(cl::sycl::queue &queue, 
std::int64_t n, std::complex alpha, @@ -117,9 +99,6 @@ void scal(cl::sycl::queue &queue, std::int scal_postcondition(queue, n, alpha, x, incx); } -template -static inline void scal(cl::sycl::queue &queue, std::int64_t n, float alpha, - cl::sycl::buffer, 1> &x, std::int64_t incx); template <> void scal(cl::sycl::queue &queue, std::int64_t n, float alpha, cl::sycl::buffer, 1> &x, @@ -129,9 +108,6 @@ void scal(cl::sycl::queue &queue, std::int scal_postcondition(queue, n, alpha, x, incx); } -template -static inline void scal(cl::sycl::queue &queue, std::int64_t n, double alpha, - cl::sycl::buffer, 1> &x, std::int64_t incx); template <> void scal(cl::sycl::queue &queue, std::int64_t n, double alpha, @@ -142,10 +118,6 @@ void scal(cl::sycl::queue &queue, std::int scal_postcondition(queue, n, alpha, x, incx); } -template -static inline void trmv(cl::sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag, - std::int64_t n, cl::sycl::buffer &a, std::int64_t lda, - cl::sycl::buffer &x, std::int64_t incx); template <> void trmv(cl::sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag, std::int64_t n, @@ -156,10 +128,6 @@ void trmv(cl::sycl::queue &queue, uplo upp trmv_postcondition(queue, upper_lower, trans, unit_diag, n, a, lda, x, incx); } -template -static inline void trmv(cl::sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag, - std::int64_t n, cl::sycl::buffer &a, std::int64_t lda, - cl::sycl::buffer &x, std::int64_t incx); template <> void trmv(cl::sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag, std::int64_t n, @@ -170,11 +138,6 @@ void trmv(cl::sycl::queue &queue, uplo upp trmv_postcondition(queue, upper_lower, trans, unit_diag, n, a, lda, x, incx); } -template -static inline void trmv(cl::sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag, - std::int64_t n, cl::sycl::buffer, 1> &a, - std::int64_t lda, cl::sycl::buffer, 1> &x, - std::int64_t incx); template <> void 
trmv(cl::sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag, std::int64_t n, @@ -187,11 +150,6 @@ void trmv(cl::sycl::queue &queue, uplo upp trmv_postcondition(queue, upper_lower, trans, unit_diag, n, a, lda, x, incx); } -template -static inline void trmv(cl::sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag, - std::int64_t n, cl::sycl::buffer, 1> &a, - std::int64_t lda, cl::sycl::buffer, 1> &x, - std::int64_t incx); template <> void trmv(cl::sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag, std::int64_t n, @@ -204,10 +162,6 @@ void trmv(cl::sycl::queue &queue, uplo upp trmv_postcondition(queue, upper_lower, trans, unit_diag, n, a, lda, x, incx); } -template -static inline void tpmv(cl::sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag, - std::int64_t n, cl::sycl::buffer &a, - cl::sycl::buffer &x, std::int64_t incx); template <> void tpmv(cl::sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag, std::int64_t n, @@ -218,10 +172,6 @@ void tpmv(cl::sycl::queue &queue, uplo upp tpmv_postcondition(queue, upper_lower, trans, unit_diag, n, a, x, incx); } -template -static inline void tpmv(cl::sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag, - std::int64_t n, cl::sycl::buffer &a, - cl::sycl::buffer &x, std::int64_t incx); template <> void tpmv(cl::sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag, std::int64_t n, @@ -232,10 +182,6 @@ void tpmv(cl::sycl::queue &queue, uplo upp tpmv_postcondition(queue, upper_lower, trans, unit_diag, n, a, x, incx); } -template -static inline void tpmv(cl::sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag, - std::int64_t n, cl::sycl::buffer, 1> &a, - cl::sycl::buffer, 1> &x, std::int64_t incx); template <> void tpmv(cl::sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag, std::int64_t n, @@ -247,10 +193,6 @@ void tpmv(cl::sycl::queue &queue, uplo upp 
tpmv_postcondition(queue, upper_lower, trans, unit_diag, n, a, x, incx); } -template -static inline void tpmv(cl::sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag, - std::int64_t n, cl::sycl::buffer, 1> &a, - cl::sycl::buffer, 1> &x, std::int64_t incx); template <> void tpmv(cl::sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag, std::int64_t n, @@ -262,10 +204,6 @@ void tpmv(cl::sycl::queue &queue, uplo upp tpmv_postcondition(queue, upper_lower, trans, unit_diag, n, a, x, incx); } -template -static inline void spr(cl::sycl::queue &queue, uplo upper_lower, std::int64_t n, float alpha, - cl::sycl::buffer &x, std::int64_t incx, - cl::sycl::buffer &a); template <> void spr(cl::sycl::queue &queue, uplo upper_lower, std::int64_t n, float alpha, @@ -276,10 +214,6 @@ void spr(cl::sycl::queue &queue, uplo uppe spr_postcondition(queue, upper_lower, n, alpha, x, incx, a); } -template -static inline void spr(cl::sycl::queue &queue, uplo upper_lower, std::int64_t n, double alpha, - cl::sycl::buffer &x, std::int64_t incx, - cl::sycl::buffer &a); template <> void spr(cl::sycl::queue &queue, uplo upper_lower, std::int64_t n, double alpha, @@ -290,12 +224,6 @@ void spr(cl::sycl::queue &queue, uplo uppe spr_postcondition(queue, upper_lower, n, alpha, x, incx, a); } -template -static inline void hpmv(cl::sycl::queue &queue, uplo upper_lower, std::int64_t n, - std::complex alpha, cl::sycl::buffer, 1> &a, - cl::sycl::buffer, 1> &x, std::int64_t incx, - std::complex beta, cl::sycl::buffer, 1> &y, - std::int64_t incy); template <> void hpmv(cl::sycl::queue &queue, uplo upper_lower, std::int64_t n, std::complex alpha, @@ -309,12 +237,6 @@ void hpmv(cl::sycl::queue &queue, uplo upp hpmv_postcondition(queue, upper_lower, n, alpha, a, x, incx, beta, y, incy); } -template -static inline void hpmv(cl::sycl::queue &queue, uplo upper_lower, std::int64_t n, - std::complex alpha, cl::sycl::buffer, 1> &a, - cl::sycl::buffer, 1> &x, std::int64_t incx, - 
std::complex beta, cl::sycl::buffer, 1> &y, - std::int64_t incy); template <> void hpmv(cl::sycl::queue &queue, uplo upper_lower, std::int64_t n, std::complex alpha, @@ -328,11 +250,6 @@ void hpmv(cl::sycl::queue &queue, uplo upp hpmv_postcondition(queue, upper_lower, n, alpha, a, x, incx, beta, y, incy); } -template -static inline void syrk(cl::sycl::queue &queue, uplo upper_lower, transpose trans, std::int64_t n, - std::int64_t k, float alpha, cl::sycl::buffer &a, - std::int64_t lda, float beta, cl::sycl::buffer &c, - std::int64_t ldc); template <> void syrk(cl::sycl::queue &queue, uplo upper_lower, transpose trans, std::int64_t n, std::int64_t k, @@ -344,11 +261,6 @@ void syrk(cl::sycl::queue &queue, uplo upp syrk_postcondition(queue, upper_lower, trans, n, k, alpha, a, lda, beta, c, ldc); } -template -static inline void syrk(cl::sycl::queue &queue, uplo upper_lower, transpose trans, std::int64_t n, - std::int64_t k, double alpha, cl::sycl::buffer &a, - std::int64_t lda, double beta, cl::sycl::buffer &c, - std::int64_t ldc); template <> void syrk(cl::sycl::queue &queue, uplo upper_lower, transpose trans, std::int64_t n, std::int64_t k, @@ -360,12 +272,6 @@ void syrk(cl::sycl::queue &queue, uplo upp syrk_postcondition(queue, upper_lower, trans, n, k, alpha, a, lda, beta, c, ldc); } -template -static inline void syrk(cl::sycl::queue &queue, uplo upper_lower, transpose trans, std::int64_t n, - std::int64_t k, std::complex alpha, - cl::sycl::buffer, 1> &a, std::int64_t lda, - std::complex beta, cl::sycl::buffer, 1> &c, - std::int64_t ldc); template <> void syrk( cl::sycl::queue &queue, uplo upper_lower, transpose trans, std::int64_t n, std::int64_t k, @@ -376,12 +282,6 @@ void syrk( syrk_postcondition(queue, upper_lower, trans, n, k, alpha, a, lda, beta, c, ldc); } -template -static inline void syrk(cl::sycl::queue &queue, uplo upper_lower, transpose trans, std::int64_t n, - std::int64_t k, std::complex alpha, - cl::sycl::buffer, 1> &a, std::int64_t lda, - 
std::complex beta, cl::sycl::buffer, 1> &c, - std::int64_t ldc); template <> void syrk( cl::sycl::queue &queue, uplo upper_lower, transpose trans, std::int64_t n, std::int64_t k, @@ -392,12 +292,6 @@ void syrk( syrk_postcondition(queue, upper_lower, trans, n, k, alpha, a, lda, beta, c, ldc); } -template -static inline void her2(cl::sycl::queue &queue, uplo upper_lower, std::int64_t n, - std::complex alpha, cl::sycl::buffer, 1> &x, - std::int64_t incx, cl::sycl::buffer, 1> &y, - std::int64_t incy, cl::sycl::buffer, 1> &a, - std::int64_t lda); template <> void her2( cl::sycl::queue &queue, uplo upper_lower, std::int64_t n, std::complex alpha, @@ -409,12 +303,6 @@ void her2( her2_postcondition(queue, upper_lower, n, alpha, x, incx, y, incy, a, lda); } -template -static inline void her2(cl::sycl::queue &queue, uplo upper_lower, std::int64_t n, - std::complex alpha, cl::sycl::buffer, 1> &x, - std::int64_t incx, cl::sycl::buffer, 1> &y, - std::int64_t incy, cl::sycl::buffer, 1> &a, - std::int64_t lda); template <> void her2( cl::sycl::queue &queue, uplo upper_lower, std::int64_t n, std::complex alpha, @@ -426,12 +314,6 @@ void her2( her2_postcondition(queue, upper_lower, n, alpha, x, incx, y, incy, a, lda); } -template -static inline void hbmv(cl::sycl::queue &queue, uplo upper_lower, std::int64_t n, std::int64_t k, - std::complex alpha, cl::sycl::buffer, 1> &a, - std::int64_t lda, cl::sycl::buffer, 1> &x, - std::int64_t incx, std::complex beta, - cl::sycl::buffer, 1> &y, std::int64_t incy); template <> void hbmv( cl::sycl::queue &queue, uplo upper_lower, std::int64_t n, std::int64_t k, @@ -443,12 +325,6 @@ void hbmv( hbmv_postcondition(queue, upper_lower, n, k, alpha, a, lda, x, incx, beta, y, incy); } -template -static inline void hbmv(cl::sycl::queue &queue, uplo upper_lower, std::int64_t n, std::int64_t k, - std::complex alpha, cl::sycl::buffer, 1> &a, - std::int64_t lda, cl::sycl::buffer, 1> &x, - std::int64_t incx, std::complex beta, - cl::sycl::buffer, 1> &y, 
std::int64_t incy); template <> void hbmv( cl::sycl::queue &queue, uplo upper_lower, std::int64_t n, std::int64_t k, @@ -460,11 +336,6 @@ void hbmv( hbmv_postcondition(queue, upper_lower, n, k, alpha, a, lda, x, incx, beta, y, incy); } -template -static inline void rot(cl::sycl::queue &queue, std::int64_t n, - cl::sycl::buffer, 1> &x, std::int64_t incx, - cl::sycl::buffer, 1> &y, std::int64_t incy, float c, - float s); template <> void rot(cl::sycl::queue &queue, std::int64_t n, cl::sycl::buffer, 1> &x, @@ -476,11 +347,6 @@ void rot(cl::sycl::queue &queue, std::int6 rot_postcondition(queue, n, x, incx, y, incy, c, s); } -template -static inline void rot(cl::sycl::queue &queue, std::int64_t n, - cl::sycl::buffer, 1> &x, std::int64_t incx, - cl::sycl::buffer, 1> &y, std::int64_t incy, double c, - double s); template <> void rot(cl::sycl::queue &queue, std::int64_t n, cl::sycl::buffer, 1> &x, @@ -492,10 +358,6 @@ void rot(cl::sycl::queue &queue, std::int6 rot_postcondition(queue, n, x, incx, y, incy, c, s); } -template -static inline void rot(cl::sycl::queue &queue, std::int64_t n, cl::sycl::buffer &x, - std::int64_t incx, cl::sycl::buffer &y, std::int64_t incy, float c, - float s); template <> void rot(cl::sycl::queue &queue, std::int64_t n, cl::sycl::buffer &x, std::int64_t incx, @@ -506,10 +368,6 @@ void rot(cl::sycl::queue &queue, std::int6 rot_postcondition(queue, n, x, incx, y, incy, c, s); } -template -static inline void rot(cl::sycl::queue &queue, std::int64_t n, cl::sycl::buffer &x, - std::int64_t incx, cl::sycl::buffer &y, std::int64_t incy, - double c, double s); template <> void rot(cl::sycl::queue &queue, std::int64_t n, cl::sycl::buffer &x, std::int64_t incx, @@ -520,10 +378,6 @@ void rot(cl::sycl::queue &queue, std::int6 rot_postcondition(queue, n, x, incx, y, incy, c, s); } -template -static inline void axpy(cl::sycl::queue &queue, std::int64_t n, float alpha, - cl::sycl::buffer &x, std::int64_t incx, - cl::sycl::buffer &y, std::int64_t incy); template 
<> void axpy(cl::sycl::queue &queue, std::int64_t n, float alpha, cl::sycl::buffer &x, std::int64_t incx, @@ -533,10 +387,6 @@ void axpy(cl::sycl::queue &queue, std::int axpy_postcondition(queue, n, alpha, x, incx, y, incy); } -template -static inline void axpy(cl::sycl::queue &queue, std::int64_t n, double alpha, - cl::sycl::buffer &x, std::int64_t incx, - cl::sycl::buffer &y, std::int64_t incy); template <> void axpy(cl::sycl::queue &queue, std::int64_t n, double alpha, cl::sycl::buffer &x, @@ -547,10 +397,6 @@ void axpy(cl::sycl::queue &queue, std::int axpy_postcondition(queue, n, alpha, x, incx, y, incy); } -template -static inline void axpy(cl::sycl::queue &queue, std::int64_t n, std::complex alpha, - cl::sycl::buffer, 1> &x, std::int64_t incx, - cl::sycl::buffer, 1> &y, std::int64_t incy); template <> void axpy(cl::sycl::queue &queue, std::int64_t n, std::complex alpha, @@ -563,10 +409,6 @@ void axpy(cl::sycl::queue &queue, std::int axpy_postcondition(queue, n, alpha, x, incx, y, incy); } -template -static inline void axpy(cl::sycl::queue &queue, std::int64_t n, std::complex alpha, - cl::sycl::buffer, 1> &x, std::int64_t incx, - cl::sycl::buffer, 1> &y, std::int64_t incy); template <> void axpy(cl::sycl::queue &queue, std::int64_t n, std::complex alpha, @@ -579,12 +421,6 @@ void axpy(cl::sycl::queue &queue, std::int axpy_postcondition(queue, n, alpha, x, incx, y, incy); } -template -static inline void gerc(cl::sycl::queue &queue, std::int64_t m, std::int64_t n, - std::complex alpha, cl::sycl::buffer, 1> &x, - std::int64_t incx, cl::sycl::buffer, 1> &y, - std::int64_t incy, cl::sycl::buffer, 1> &a, - std::int64_t lda); template <> void gerc( cl::sycl::queue &queue, std::int64_t m, std::int64_t n, std::complex alpha, @@ -596,12 +432,6 @@ void gerc( gerc_postcondition(queue, m, n, alpha, x, incx, y, incy, a, lda); } -template -static inline void gerc(cl::sycl::queue &queue, std::int64_t m, std::int64_t n, - std::complex alpha, cl::sycl::buffer, 1> &x, - 
std::int64_t incx, cl::sycl::buffer, 1> &y, - std::int64_t incy, cl::sycl::buffer, 1> &a, - std::int64_t lda); template <> void gerc( cl::sycl::queue &queue, std::int64_t m, std::int64_t n, std::complex alpha, @@ -613,11 +443,6 @@ void gerc( gerc_postcondition(queue, m, n, alpha, x, incx, y, incy, a, lda); } -template -static inline void syr2k(cl::sycl::queue &queue, uplo upper_lower, transpose trans, std::int64_t n, - std::int64_t k, float alpha, cl::sycl::buffer &a, - std::int64_t lda, cl::sycl::buffer &b, std::int64_t ldb, - float beta, cl::sycl::buffer &c, std::int64_t ldc); template <> void syr2k(cl::sycl::queue &queue, uplo upper_lower, transpose trans, std::int64_t n, std::int64_t k, @@ -630,11 +455,6 @@ void syr2k(cl::sycl::queue &queue, uplo up syr2k_postcondition(queue, upper_lower, trans, n, k, alpha, a, lda, b, ldb, beta, c, ldc); } -template -static inline void syr2k(cl::sycl::queue &queue, uplo upper_lower, transpose trans, std::int64_t n, - std::int64_t k, double alpha, cl::sycl::buffer &a, - std::int64_t lda, cl::sycl::buffer &b, std::int64_t ldb, - double beta, cl::sycl::buffer &c, std::int64_t ldc); template <> void syr2k(cl::sycl::queue &queue, uplo upper_lower, transpose trans, std::int64_t n, std::int64_t k, @@ -647,13 +467,6 @@ void syr2k(cl::sycl::queue &queue, uplo up syr2k_postcondition(queue, upper_lower, trans, n, k, alpha, a, lda, b, ldb, beta, c, ldc); } -template -static inline void syr2k(cl::sycl::queue &queue, uplo upper_lower, transpose trans, std::int64_t n, - std::int64_t k, std::complex alpha, - cl::sycl::buffer, 1> &a, std::int64_t lda, - cl::sycl::buffer, 1> &b, std::int64_t ldb, - std::complex beta, cl::sycl::buffer, 1> &c, - std::int64_t ldc); template <> void syr2k( cl::sycl::queue &queue, uplo upper_lower, transpose trans, std::int64_t n, std::int64_t k, @@ -665,13 +478,6 @@ void syr2k( syr2k_postcondition(queue, upper_lower, trans, n, k, alpha, a, lda, b, ldb, beta, c, ldc); } -template -static inline void 
syr2k(cl::sycl::queue &queue, uplo upper_lower, transpose trans, std::int64_t n, - std::int64_t k, std::complex alpha, - cl::sycl::buffer, 1> &a, std::int64_t lda, - cl::sycl::buffer, 1> &b, std::int64_t ldb, - std::complex beta, cl::sycl::buffer, 1> &c, - std::int64_t ldc); template <> void syr2k( cl::sycl::queue &queue, uplo upper_lower, transpose trans, std::int64_t n, std::int64_t k, @@ -683,11 +489,6 @@ void syr2k( syr2k_postcondition(queue, upper_lower, trans, n, k, alpha, a, lda, b, ldb, beta, c, ldc); } -template -static inline void gemv(cl::sycl::queue &queue, transpose trans, std::int64_t m, std::int64_t n, - float alpha, cl::sycl::buffer &a, std::int64_t lda, - cl::sycl::buffer &x, std::int64_t incx, float beta, - cl::sycl::buffer &y, std::int64_t incy); template <> void gemv(cl::sycl::queue &queue, transpose trans, std::int64_t m, std::int64_t n, float alpha, @@ -700,11 +501,6 @@ void gemv(cl::sycl::queue &queue, transpos gemv_postcondition(queue, trans, m, n, alpha, a, lda, x, incx, beta, y, incy); } -template -static inline void gemv(cl::sycl::queue &queue, transpose trans, std::int64_t m, std::int64_t n, - double alpha, cl::sycl::buffer &a, std::int64_t lda, - cl::sycl::buffer &x, std::int64_t incx, double beta, - cl::sycl::buffer &y, std::int64_t incy); template <> void gemv(cl::sycl::queue &queue, transpose trans, std::int64_t m, std::int64_t n, double alpha, @@ -717,12 +513,6 @@ void gemv(cl::sycl::queue &queue, transpos gemv_postcondition(queue, trans, m, n, alpha, a, lda, x, incx, beta, y, incy); } -template -static inline void gemv(cl::sycl::queue &queue, transpose trans, std::int64_t m, std::int64_t n, - std::complex alpha, cl::sycl::buffer, 1> &a, - std::int64_t lda, cl::sycl::buffer, 1> &x, - std::int64_t incx, std::complex beta, - cl::sycl::buffer, 1> &y, std::int64_t incy); template <> void gemv( cl::sycl::queue &queue, transpose trans, std::int64_t m, std::int64_t n, @@ -734,12 +524,6 @@ void gemv( gemv_postcondition(queue, trans, m, n, 
alpha, a, lda, x, incx, beta, y, incy); } -template -static inline void gemv(cl::sycl::queue &queue, transpose trans, std::int64_t m, std::int64_t n, - std::complex alpha, cl::sycl::buffer, 1> &a, - std::int64_t lda, cl::sycl::buffer, 1> &x, - std::int64_t incx, std::complex beta, - cl::sycl::buffer, 1> &y, std::int64_t incy); template <> void gemv( cl::sycl::queue &queue, transpose trans, std::int64_t m, std::int64_t n, @@ -751,10 +535,6 @@ void gemv( gemv_postcondition(queue, trans, m, n, alpha, a, lda, x, incx, beta, y, incy); } -template -static inline void her(cl::sycl::queue &queue, uplo upper_lower, std::int64_t n, float alpha, - cl::sycl::buffer, 1> &x, std::int64_t incx, - cl::sycl::buffer, 1> &a, std::int64_t lda); template <> void her(cl::sycl::queue &queue, uplo upper_lower, std::int64_t n, float alpha, @@ -767,10 +547,6 @@ void her(cl::sycl::queue &queue, uplo uppe her_postcondition(queue, upper_lower, n, alpha, x, incx, a, lda); } -template -static inline void her(cl::sycl::queue &queue, uplo upper_lower, std::int64_t n, double alpha, - cl::sycl::buffer, 1> &x, std::int64_t incx, - cl::sycl::buffer, 1> &a, std::int64_t lda); template <> void her(cl::sycl::queue &queue, uplo upper_lower, std::int64_t n, double alpha, @@ -783,10 +559,6 @@ void her(cl::sycl::queue &queue, uplo uppe her_postcondition(queue, upper_lower, n, alpha, x, incx, a, lda); } -template -static inline void hpr(cl::sycl::queue &queue, uplo upper_lower, std::int64_t n, float alpha, - cl::sycl::buffer, 1> &x, std::int64_t incx, - cl::sycl::buffer, 1> &a); template <> void hpr(cl::sycl::queue &queue, uplo upper_lower, std::int64_t n, float alpha, @@ -798,10 +570,6 @@ void hpr(cl::sycl::queue &queue, uplo uppe hpr_postcondition(queue, upper_lower, n, alpha, x, incx, a); } -template -static inline void hpr(cl::sycl::queue &queue, uplo upper_lower, std::int64_t n, double alpha, - cl::sycl::buffer, 1> &x, std::int64_t incx, - cl::sycl::buffer, 1> &a); template <> void hpr(cl::sycl::queue 
&queue, uplo upper_lower, std::int64_t n, double alpha, @@ -813,9 +581,6 @@ void hpr(cl::sycl::queue &queue, uplo uppe hpr_postcondition(queue, upper_lower, n, alpha, x, incx, a); } -template -static inline void iamin(cl::sycl::queue &queue, std::int64_t n, cl::sycl::buffer &x, - std::int64_t incx, cl::sycl::buffer &result); template <> void iamin(cl::sycl::queue &queue, std::int64_t n, cl::sycl::buffer &x, std::int64_t incx, @@ -825,9 +590,6 @@ void iamin(cl::sycl::queue &queue, std::in iamin_postcondition(queue, n, x, incx, result); } -template -static inline void iamin(cl::sycl::queue &queue, std::int64_t n, cl::sycl::buffer &x, - std::int64_t incx, cl::sycl::buffer &result); template <> void iamin(cl::sycl::queue &queue, std::int64_t n, cl::sycl::buffer &x, std::int64_t incx, @@ -837,10 +599,6 @@ void iamin(cl::sycl::queue &queue, std::in iamin_postcondition(queue, n, x, incx, result); } -template -static inline void iamin(cl::sycl::queue &queue, std::int64_t n, - cl::sycl::buffer, 1> &x, std::int64_t incx, - cl::sycl::buffer &result); template <> void iamin(cl::sycl::queue &queue, std::int64_t n, cl::sycl::buffer, 1> &x, @@ -851,10 +609,6 @@ void iamin(cl::sycl::queue &queue, std::in iamin_postcondition(queue, n, x, incx, result); } -template -static inline void iamin(cl::sycl::queue &queue, std::int64_t n, - cl::sycl::buffer, 1> &x, std::int64_t incx, - cl::sycl::buffer &result); template <> void iamin(cl::sycl::queue &queue, std::int64_t n, cl::sycl::buffer, 1> &x, @@ -865,128 +619,6 @@ void iamin(cl::sycl::queue &queue, std::in iamin_postcondition(queue, n, x, incx, result); } -template -static inline void gemm_batch(cl::sycl::queue &queue, cl::sycl::buffer &transa, - cl::sycl::buffer &transb, - cl::sycl::buffer &m, - cl::sycl::buffer &n, - cl::sycl::buffer &k, - cl::sycl::buffer &alpha, cl::sycl::buffer &a, - cl::sycl::buffer &lda, cl::sycl::buffer &b, - cl::sycl::buffer &ldb, - cl::sycl::buffer &beta, cl::sycl::buffer &c, - cl::sycl::buffer &ldc, 
std::int64_t group_count, - cl::sycl::buffer &group_size); -template <> -void gemm_batch( - cl::sycl::queue &queue, cl::sycl::buffer &transa, - cl::sycl::buffer &transb, cl::sycl::buffer &m, - cl::sycl::buffer &n, cl::sycl::buffer &k, - cl::sycl::buffer &alpha, cl::sycl::buffer &a, - cl::sycl::buffer &lda, cl::sycl::buffer &b, - cl::sycl::buffer &ldb, cl::sycl::buffer &beta, - cl::sycl::buffer &c, cl::sycl::buffer &ldc, std::int64_t group_count, - cl::sycl::buffer &group_size) { - gemm_batch_precondition(queue, transa, transb, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc, - group_count, group_size); - onemkl::mklcpu::gemm_batch(queue, transa, transb, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc, - group_count, group_size); - gemm_batch_postcondition(queue, transa, transb, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc, - group_count, group_size); -} - -template -static inline void gemm_batch( - cl::sycl::queue &queue, cl::sycl::buffer &transa, - cl::sycl::buffer &transb, cl::sycl::buffer &m, - cl::sycl::buffer &n, cl::sycl::buffer &k, - cl::sycl::buffer &alpha, cl::sycl::buffer &a, - cl::sycl::buffer &lda, cl::sycl::buffer &b, - cl::sycl::buffer &ldb, cl::sycl::buffer &beta, - cl::sycl::buffer &c, cl::sycl::buffer &ldc, - std::int64_t group_count, cl::sycl::buffer &group_size); -template <> -void gemm_batch( - cl::sycl::queue &queue, cl::sycl::buffer &transa, - cl::sycl::buffer &transb, cl::sycl::buffer &m, - cl::sycl::buffer &n, cl::sycl::buffer &k, - cl::sycl::buffer &alpha, cl::sycl::buffer &a, - cl::sycl::buffer &lda, cl::sycl::buffer &b, - cl::sycl::buffer &ldb, cl::sycl::buffer &beta, - cl::sycl::buffer &c, cl::sycl::buffer &ldc, - std::int64_t group_count, cl::sycl::buffer &group_size) { - gemm_batch_precondition(queue, transa, transb, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc, - group_count, group_size); - onemkl::mklcpu::gemm_batch(queue, transa, transb, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc, - group_count, group_size); - 
gemm_batch_postcondition(queue, transa, transb, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc, - group_count, group_size); -} - -template -static inline void gemm_batch( - cl::sycl::queue &queue, cl::sycl::buffer &transa, - cl::sycl::buffer &transb, cl::sycl::buffer &m, - cl::sycl::buffer &n, cl::sycl::buffer &k, - cl::sycl::buffer, 1> &alpha, cl::sycl::buffer, 1> &a, - cl::sycl::buffer &lda, cl::sycl::buffer, 1> &b, - cl::sycl::buffer &ldb, cl::sycl::buffer, 1> &beta, - cl::sycl::buffer, 1> &c, cl::sycl::buffer &ldc, - std::int64_t group_count, cl::sycl::buffer &group_size); -template <> -void gemm_batch( - cl::sycl::queue &queue, cl::sycl::buffer &transa, - cl::sycl::buffer &transb, cl::sycl::buffer &m, - cl::sycl::buffer &n, cl::sycl::buffer &k, - cl::sycl::buffer, 1> &alpha, cl::sycl::buffer, 1> &a, - cl::sycl::buffer &lda, cl::sycl::buffer, 1> &b, - cl::sycl::buffer &ldb, cl::sycl::buffer, 1> &beta, - cl::sycl::buffer, 1> &c, cl::sycl::buffer &ldc, - std::int64_t group_count, cl::sycl::buffer &group_size) { - gemm_batch_precondition(queue, transa, transb, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc, - group_count, group_size); - onemkl::mklcpu::gemm_batch(queue, transa, transb, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc, - group_count, group_size); - gemm_batch_postcondition(queue, transa, transb, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc, - group_count, group_size); -} - -template -static inline void gemm_batch( - cl::sycl::queue &queue, cl::sycl::buffer &transa, - cl::sycl::buffer &transb, cl::sycl::buffer &m, - cl::sycl::buffer &n, cl::sycl::buffer &k, - cl::sycl::buffer, 1> &alpha, cl::sycl::buffer, 1> &a, - cl::sycl::buffer &lda, cl::sycl::buffer, 1> &b, - cl::sycl::buffer &ldb, cl::sycl::buffer, 1> &beta, - cl::sycl::buffer, 1> &c, cl::sycl::buffer &ldc, - std::int64_t group_count, cl::sycl::buffer &group_size); -template <> -void gemm_batch( - cl::sycl::queue &queue, cl::sycl::buffer &transa, - cl::sycl::buffer &transb, cl::sycl::buffer &m, - 
cl::sycl::buffer &n, cl::sycl::buffer &k, - cl::sycl::buffer, 1> &alpha, cl::sycl::buffer, 1> &a, - cl::sycl::buffer &lda, cl::sycl::buffer, 1> &b, - cl::sycl::buffer &ldb, cl::sycl::buffer, 1> &beta, - cl::sycl::buffer, 1> &c, cl::sycl::buffer &ldc, - std::int64_t group_count, cl::sycl::buffer &group_size) { - gemm_batch_precondition(queue, transa, transb, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc, - group_count, group_size); - onemkl::mklcpu::gemm_batch(queue, transa, transb, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc, - group_count, group_size); - gemm_batch_postcondition(queue, transa, transb, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc, - group_count, group_size); -} - -template -static inline void gemm_batch(cl::sycl::queue &queue, transpose transa, transpose transb, - std::int64_t m, std::int64_t n, std::int64_t k, float alpha, - cl::sycl::buffer &a, std::int64_t lda, - std::int64_t stride_a, cl::sycl::buffer &b, - std::int64_t ldb, std::int64_t stride_b, float beta, - cl::sycl::buffer &c, std::int64_t ldc, - std::int64_t stride_c, std::int64_t batch_size); template <> void gemm_batch( cl::sycl::queue &queue, transpose transa, transpose transb, std::int64_t m, std::int64_t n, @@ -1002,14 +634,6 @@ void gemm_batch( stride_b, beta, c, ldc, stride_c, batch_size); } -template -static inline void gemm_batch(cl::sycl::queue &queue, transpose transa, transpose transb, - std::int64_t m, std::int64_t n, std::int64_t k, double alpha, - cl::sycl::buffer &a, std::int64_t lda, - std::int64_t stride_a, cl::sycl::buffer &b, - std::int64_t ldb, std::int64_t stride_b, double beta, - cl::sycl::buffer &c, std::int64_t ldc, - std::int64_t stride_c, std::int64_t batch_size); template <> void gemm_batch( cl::sycl::queue &queue, transpose transa, transpose transb, std::int64_t m, std::int64_t n, @@ -1025,15 +649,6 @@ void gemm_batch( stride_b, beta, c, ldc, stride_c, batch_size); } -template -static inline void gemm_batch(cl::sycl::queue &queue, transpose transa, transpose 
transb, - std::int64_t m, std::int64_t n, std::int64_t k, - std::complex alpha, - cl::sycl::buffer, 1> &a, std::int64_t lda, - std::int64_t stride_a, cl::sycl::buffer, 1> &b, - std::int64_t ldb, std::int64_t stride_b, std::complex beta, - cl::sycl::buffer, 1> &c, std::int64_t ldc, - std::int64_t stride_c, std::int64_t batch_size); template <> void gemm_batch( cl::sycl::queue &queue, transpose transa, transpose transb, std::int64_t m, std::int64_t n, @@ -1050,15 +665,6 @@ void gemm_batch( stride_b, beta, c, ldc, stride_c, batch_size); } -template -static inline void gemm_batch(cl::sycl::queue &queue, transpose transa, transpose transb, - std::int64_t m, std::int64_t n, std::int64_t k, - std::complex alpha, - cl::sycl::buffer, 1> &a, std::int64_t lda, - std::int64_t stride_a, cl::sycl::buffer, 1> &b, - std::int64_t ldb, std::int64_t stride_b, std::complex beta, - cl::sycl::buffer, 1> &c, std::int64_t ldc, - std::int64_t stride_c, std::int64_t batch_size); template <> void gemm_batch( cl::sycl::queue &queue, transpose transa, transpose transb, std::int64_t m, std::int64_t n, @@ -1075,11 +681,6 @@ void gemm_batch( stride_b, beta, c, ldc, stride_c, batch_size); } -template -static inline void spmv(cl::sycl::queue &queue, uplo upper_lower, std::int64_t n, float alpha, - cl::sycl::buffer &a, cl::sycl::buffer &x, - std::int64_t incx, float beta, cl::sycl::buffer &y, - std::int64_t incy); template <> void spmv(cl::sycl::queue &queue, uplo upper_lower, std::int64_t n, float alpha, @@ -1092,11 +693,6 @@ void spmv(cl::sycl::queue &queue, uplo upp spmv_postcondition(queue, upper_lower, n, alpha, a, x, incx, beta, y, incy); } -template -static inline void spmv(cl::sycl::queue &queue, uplo upper_lower, std::int64_t n, double alpha, - cl::sycl::buffer &a, cl::sycl::buffer &x, - std::int64_t incx, double beta, cl::sycl::buffer &y, - std::int64_t incy); template <> void spmv(cl::sycl::queue &queue, uplo upper_lower, std::int64_t n, double alpha, @@ -1109,12 +705,6 @@ void 
spmv(cl::sycl::queue &queue, uplo upp spmv_postcondition(queue, upper_lower, n, alpha, a, x, incx, beta, y, incy); } -template -static inline void gemm_ext(cl::sycl::queue &queue, transpose transa, transpose transb, - std::int64_t m, std::int64_t n, std::int64_t k, float alpha, - cl::sycl::buffer &a, std::int64_t lda, - cl::sycl::buffer &b, std::int64_t ldb, float beta, - cl::sycl::buffer &c, std::int64_t ldc); template <> void gemm_ext(cl::sycl::queue &queue, transpose transa, transpose transb, std::int64_t m, @@ -1128,13 +718,6 @@ void gemm_ext(cl::sycl::queue &queue, tran gemm_ext_postcondition(queue, transa, transb, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc); } -template -static inline void gemm_ext(cl::sycl::queue &queue, transpose transa, transpose transb, - offset offsetc, std::int64_t m, std::int64_t n, std::int64_t k, - float alpha, cl::sycl::buffer &a, std::int64_t lda, - int8_t ao, cl::sycl::buffer &b, std::int64_t ldb, - uint8_t bo, float beta, cl::sycl::buffer &c, - std::int64_t ldc, cl::sycl::buffer &co); template <> void gemm_ext( cl::sycl::queue &queue, transpose transa, transpose transb, offset offsetc, std::int64_t m, @@ -1149,12 +732,6 @@ void gemm_ext( beta, c, ldc, co); } -template -static inline void gemm_ext(cl::sycl::queue &queue, transpose transa, transpose transb, - std::int64_t m, std::int64_t n, std::int64_t k, float alpha, - cl::sycl::buffer &a, std::int64_t lda, - cl::sycl::buffer &b, std::int64_t ldb, float beta, - cl::sycl::buffer &c, std::int64_t ldc); template <> void gemm_ext(cl::sycl::queue &queue, transpose transa, transpose transb, std::int64_t m, @@ -1168,12 +745,6 @@ void gemm_ext(cl::sycl::queue &queue, tran gemm_ext_postcondition(queue, transa, transb, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc); } -template -static inline void gemm_ext(cl::sycl::queue &queue, transpose transa, transpose transb, - std::int64_t m, std::int64_t n, std::int64_t k, double alpha, - cl::sycl::buffer &a, std::int64_t lda, - cl::sycl::buffer 
&b, std::int64_t ldb, double beta, - cl::sycl::buffer &c, std::int64_t ldc); template <> void gemm_ext( cl::sycl::queue &queue, transpose transa, transpose transb, std::int64_t m, std::int64_t n, @@ -1185,13 +756,6 @@ void gemm_ext( gemm_ext_postcondition(queue, transa, transb, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc); } -template -static inline void gemm_ext(cl::sycl::queue &queue, transpose transa, transpose transb, - std::int64_t m, std::int64_t n, std::int64_t k, - std::complex alpha, cl::sycl::buffer, 1> &a, - std::int64_t lda, cl::sycl::buffer, 1> &b, - std::int64_t ldb, std::complex beta, - cl::sycl::buffer, 1> &c, std::int64_t ldc); template <> void gemm_ext( cl::sycl::queue &queue, transpose transa, transpose transb, std::int64_t m, std::int64_t n, @@ -1203,14 +767,6 @@ void gemm_ext( gemm_ext_postcondition(queue, transa, transb, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc); } -template -static inline void gemm_ext(cl::sycl::queue &queue, transpose transa, transpose transb, - std::int64_t m, std::int64_t n, std::int64_t k, - std::complex alpha, - cl::sycl::buffer, 1> &a, std::int64_t lda, - cl::sycl::buffer, 1> &b, std::int64_t ldb, - std::complex beta, cl::sycl::buffer, 1> &c, - std::int64_t ldc); template <> void gemm_ext( cl::sycl::queue &queue, transpose transa, transpose transb, std::int64_t m, std::int64_t n, @@ -1222,12 +778,6 @@ void gemm_ext( gemm_ext_postcondition(queue, transa, transb, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc); } -template -static inline void gemm_ext(cl::sycl::queue &queue, transpose transa, transpose transb, - std::int64_t m, std::int64_t n, std::int64_t k, half alpha, - cl::sycl::buffer &a, std::int64_t lda, - cl::sycl::buffer &b, std::int64_t ldb, half beta, - cl::sycl::buffer &c, std::int64_t ldc); template <> void gemm_ext(cl::sycl::queue &queue, transpose transa, transpose transb, std::int64_t m, @@ -1241,9 +791,6 @@ void gemm_ext(cl::sycl::queue &queue, tran gemm_ext_postcondition(queue, transa, transb, m, n, 
k, alpha, a, lda, b, ldb, beta, c, ldc); } -template -static inline void swap(cl::sycl::queue &queue, std::int64_t n, cl::sycl::buffer &x, - std::int64_t incx, cl::sycl::buffer &y, std::int64_t incy); template <> void swap(cl::sycl::queue &queue, std::int64_t n, cl::sycl::buffer &x, std::int64_t incx, @@ -1253,9 +800,6 @@ void swap(cl::sycl::queue &queue, std::int swap_postcondition(queue, n, x, incx, y, incy); } -template -static inline void swap(cl::sycl::queue &queue, std::int64_t n, cl::sycl::buffer &x, - std::int64_t incx, cl::sycl::buffer &y, std::int64_t incy); template <> void swap(cl::sycl::queue &queue, std::int64_t n, cl::sycl::buffer &x, std::int64_t incx, @@ -1265,10 +809,6 @@ void swap(cl::sycl::queue &queue, std::int swap_postcondition(queue, n, x, incx, y, incy); } -template -static inline void swap(cl::sycl::queue &queue, std::int64_t n, - cl::sycl::buffer, 1> &x, std::int64_t incx, - cl::sycl::buffer, 1> &y, std::int64_t incy); template <> void swap(cl::sycl::queue &queue, std::int64_t n, cl::sycl::buffer, 1> &x, @@ -1280,10 +820,6 @@ void swap(cl::sycl::queue &queue, std::int swap_postcondition(queue, n, x, incx, y, incy); } -template -static inline void swap(cl::sycl::queue &queue, std::int64_t n, - cl::sycl::buffer, 1> &x, std::int64_t incx, - cl::sycl::buffer, 1> &y, std::int64_t incy); template <> void swap(cl::sycl::queue &queue, std::int64_t n, cl::sycl::buffer, 1> &x, @@ -1295,12 +831,6 @@ void swap(cl::sycl::queue &queue, std::int swap_postcondition(queue, n, x, incx, y, incy); } -template -static inline void geru(cl::sycl::queue &queue, std::int64_t m, std::int64_t n, - std::complex alpha, cl::sycl::buffer, 1> &x, - std::int64_t incx, cl::sycl::buffer, 1> &y, - std::int64_t incy, cl::sycl::buffer, 1> &a, - std::int64_t lda); template <> void geru( cl::sycl::queue &queue, std::int64_t m, std::int64_t n, std::complex alpha, @@ -1312,12 +842,6 @@ void geru( geru_postcondition(queue, m, n, alpha, x, incx, y, incy, a, lda); } -template 
-static inline void geru(cl::sycl::queue &queue, std::int64_t m, std::int64_t n, - std::complex alpha, cl::sycl::buffer, 1> &x, - std::int64_t incx, cl::sycl::buffer, 1> &y, - std::int64_t incy, cl::sycl::buffer, 1> &a, - std::int64_t lda); template <> void geru( cl::sycl::queue &queue, std::int64_t m, std::int64_t n, std::complex alpha, @@ -1329,10 +853,6 @@ void geru( geru_postcondition(queue, m, n, alpha, x, incx, y, incy, a, lda); } -template -static inline void nrm2(cl::sycl::queue &queue, std::int64_t n, - cl::sycl::buffer, 1> &x, std::int64_t incx, - cl::sycl::buffer &result); template <> void nrm2(cl::sycl::queue &queue, std::int64_t n, cl::sycl::buffer, 1> &x, @@ -1343,10 +863,6 @@ void nrm2(cl::sycl::queue &queue, std::int nrm2_postcondition(queue, n, x, incx, result); } -template -static inline void nrm2(cl::sycl::queue &queue, std::int64_t n, - cl::sycl::buffer, 1> &x, std::int64_t incx, - cl::sycl::buffer &result); template <> void nrm2(cl::sycl::queue &queue, std::int64_t n, cl::sycl::buffer, 1> &x, @@ -1357,9 +873,6 @@ void nrm2(cl::sycl::queue &queue, std::int nrm2_postcondition(queue, n, x, incx, result); } -template -static inline void nrm2(cl::sycl::queue &queue, std::int64_t n, cl::sycl::buffer &x, - std::int64_t incx, cl::sycl::buffer &result); template <> void nrm2(cl::sycl::queue &queue, std::int64_t n, cl::sycl::buffer &x, std::int64_t incx, @@ -1369,9 +882,6 @@ void nrm2(cl::sycl::queue &queue, std::int nrm2_postcondition(queue, n, x, incx, result); } -template -static inline void nrm2(cl::sycl::queue &queue, std::int64_t n, cl::sycl::buffer &x, - std::int64_t incx, cl::sycl::buffer &result); template <> void nrm2(cl::sycl::queue &queue, std::int64_t n, cl::sycl::buffer &x, std::int64_t incx, @@ -1381,11 +891,6 @@ void nrm2(cl::sycl::queue &queue, std::int nrm2_postcondition(queue, n, x, incx, result); } -template -static inline void gemm(cl::sycl::queue &queue, transpose transa, transpose transb, std::int64_t m, - std::int64_t n, 
std::int64_t k, float alpha, cl::sycl::buffer &a, - std::int64_t lda, cl::sycl::buffer &b, std::int64_t ldb, - float beta, cl::sycl::buffer &c, std::int64_t ldc); template <> void gemm(cl::sycl::queue &queue, transpose transa, transpose transb, std::int64_t m, std::int64_t n, @@ -1399,12 +904,6 @@ void gemm(cl::sycl::queue &queue, transpos gemm_postcondition(queue, transa, transb, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc); } -template -static inline void gemm(cl::sycl::queue &queue, transpose transa, transpose transb, std::int64_t m, - std::int64_t n, std::int64_t k, double alpha, - cl::sycl::buffer &a, std::int64_t lda, - cl::sycl::buffer &b, std::int64_t ldb, double beta, - cl::sycl::buffer &c, std::int64_t ldc); template <> void gemm(cl::sycl::queue &queue, transpose transa, transpose transb, std::int64_t m, std::int64_t n, @@ -1418,13 +917,6 @@ void gemm(cl::sycl::queue &queue, transpos gemm_postcondition(queue, transa, transb, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc); } -template -static inline void gemm(cl::sycl::queue &queue, transpose transa, transpose transb, std::int64_t m, - std::int64_t n, std::int64_t k, std::complex alpha, - cl::sycl::buffer, 1> &a, std::int64_t lda, - cl::sycl::buffer, 1> &b, std::int64_t ldb, - std::complex beta, cl::sycl::buffer, 1> &c, - std::int64_t ldc); template <> void gemm( cl::sycl::queue &queue, transpose transa, transpose transb, std::int64_t m, std::int64_t n, @@ -1436,13 +928,6 @@ void gemm( gemm_postcondition(queue, transa, transb, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc); } -template -static inline void gemm(cl::sycl::queue &queue, transpose transa, transpose transb, std::int64_t m, - std::int64_t n, std::int64_t k, std::complex alpha, - cl::sycl::buffer, 1> &a, std::int64_t lda, - cl::sycl::buffer, 1> &b, std::int64_t ldb, - std::complex beta, cl::sycl::buffer, 1> &c, - std::int64_t ldc); template <> void gemm( cl::sycl::queue &queue, transpose transa, transpose transb, std::int64_t m, std::int64_t n, @@ 
-1454,11 +939,6 @@ void gemm( gemm_postcondition(queue, transa, transb, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc); } -template -static inline void gemm(cl::sycl::queue &queue, transpose transa, transpose transb, std::int64_t m, - std::int64_t n, std::int64_t k, half alpha, cl::sycl::buffer &a, - std::int64_t lda, cl::sycl::buffer &b, std::int64_t ldb, half beta, - cl::sycl::buffer &c, std::int64_t ldc); template <> void gemm(cl::sycl::queue &queue, transpose transa, transpose transb, std::int64_t m, std::int64_t n, @@ -1472,11 +952,6 @@ void gemm(cl::sycl::queue &queue, transpos gemm_postcondition(queue, transa, transb, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc); } -template -static inline void herk(cl::sycl::queue &queue, uplo upper_lower, transpose trans, std::int64_t n, - std::int64_t k, float alpha, cl::sycl::buffer, 1> &a, - std::int64_t lda, float beta, cl::sycl::buffer, 1> &c, - std::int64_t ldc); template <> void herk( cl::sycl::queue &queue, uplo upper_lower, transpose trans, std::int64_t n, std::int64_t k, @@ -1487,11 +962,6 @@ void herk( herk_postcondition(queue, upper_lower, trans, n, k, alpha, a, lda, beta, c, ldc); } -template -static inline void herk(cl::sycl::queue &queue, uplo upper_lower, transpose trans, std::int64_t n, - std::int64_t k, double alpha, cl::sycl::buffer, 1> &a, - std::int64_t lda, double beta, cl::sycl::buffer, 1> &c, - std::int64_t ldc); template <> void herk( cl::sycl::queue &queue, uplo upper_lower, transpose trans, std::int64_t n, std::int64_t k, @@ -1502,11 +972,6 @@ void herk( herk_postcondition(queue, upper_lower, trans, n, k, alpha, a, lda, beta, c, ldc); } -template -static inline void ger(cl::sycl::queue &queue, std::int64_t m, std::int64_t n, float alpha, - cl::sycl::buffer &x, std::int64_t incx, - cl::sycl::buffer &y, std::int64_t incy, - cl::sycl::buffer &a, std::int64_t lda); template <> void ger(cl::sycl::queue &queue, std::int64_t m, std::int64_t n, float alpha, @@ -1518,11 +983,6 @@ void ger(cl::sycl::queue 
&queue, std::int6 ger_postcondition(queue, m, n, alpha, x, incx, y, incy, a, lda); } -template -static inline void ger(cl::sycl::queue &queue, std::int64_t m, std::int64_t n, double alpha, - cl::sycl::buffer &x, std::int64_t incx, - cl::sycl::buffer &y, std::int64_t incy, - cl::sycl::buffer &a, std::int64_t lda); template <> void ger(cl::sycl::queue &queue, std::int64_t m, std::int64_t n, double alpha, @@ -1534,11 +994,6 @@ void ger(cl::sycl::queue &queue, std::int6 ger_postcondition(queue, m, n, alpha, x, incx, y, incy, a, lda); } -template -static inline void trsm(cl::sycl::queue &queue, side left_right, uplo upper_lower, transpose trans, - diag unit_diag, std::int64_t m, std::int64_t n, float alpha, - cl::sycl::buffer &a, std::int64_t lda, - cl::sycl::buffer &b, std::int64_t ldb); template <> void trsm(cl::sycl::queue &queue, side left_right, uplo upper_lower, transpose trans, diag unit_diag, @@ -1553,11 +1008,6 @@ void trsm(cl::sycl::queue &queue, side lef ldb); } -template -static inline void trsm(cl::sycl::queue &queue, side left_right, uplo upper_lower, transpose trans, - diag unit_diag, std::int64_t m, std::int64_t n, double alpha, - cl::sycl::buffer &a, std::int64_t lda, - cl::sycl::buffer &b, std::int64_t ldb); template <> void trsm(cl::sycl::queue &queue, side left_right, uplo upper_lower, transpose trans, diag unit_diag, @@ -1572,11 +1022,6 @@ void trsm(cl::sycl::queue &queue, side lef ldb); } -template -static inline void trsm(cl::sycl::queue &queue, side left_right, uplo upper_lower, transpose trans, - diag unit_diag, std::int64_t m, std::int64_t n, std::complex alpha, - cl::sycl::buffer, 1> &a, std::int64_t lda, - cl::sycl::buffer, 1> &b, std::int64_t ldb); template <> void trsm( cl::sycl::queue &queue, side left_right, uplo upper_lower, transpose trans, diag unit_diag, @@ -1591,11 +1036,6 @@ void trsm( ldb); } -template -static inline void trsm(cl::sycl::queue &queue, side left_right, uplo upper_lower, transpose trans, - diag unit_diag, std::int64_t 
m, std::int64_t n, std::complex alpha, - cl::sycl::buffer, 1> &a, std::int64_t lda, - cl::sycl::buffer, 1> &b, std::int64_t ldb); template <> void trsm( cl::sycl::queue &queue, side left_right, uplo upper_lower, transpose trans, diag unit_diag, @@ -1610,11 +1050,6 @@ void trsm( ldb); } -template -static inline void dotu(cl::sycl::queue &queue, std::int64_t n, - cl::sycl::buffer, 1> &x, std::int64_t incx, - cl::sycl::buffer, 1> &y, std::int64_t incy, - cl::sycl::buffer, 1> &result); template <> void dotu(cl::sycl::queue &queue, std::int64_t n, cl::sycl::buffer, 1> &x, @@ -1627,11 +1062,6 @@ void dotu(cl::sycl::queue &queue, std::int dotu_postcondition(queue, n, x, incx, y, incy, result); } -template -static inline void dotu(cl::sycl::queue &queue, std::int64_t n, - cl::sycl::buffer, 1> &x, std::int64_t incx, - cl::sycl::buffer, 1> &y, std::int64_t incy, - cl::sycl::buffer, 1> &result); template <> void dotu(cl::sycl::queue &queue, std::int64_t n, cl::sycl::buffer, 1> &x, @@ -1644,13 +1074,6 @@ void dotu(cl::sycl::queue &queue, std::int dotu_postcondition(queue, n, x, incx, y, incy, result); } -template -static inline void hemm(cl::sycl::queue &queue, side left_right, uplo upper_lower, std::int64_t m, - std::int64_t n, std::complex alpha, - cl::sycl::buffer, 1> &a, std::int64_t lda, - cl::sycl::buffer, 1> &b, std::int64_t ldb, - std::complex beta, cl::sycl::buffer, 1> &c, - std::int64_t ldc); template <> void hemm( cl::sycl::queue &queue, side left_right, uplo upper_lower, std::int64_t m, std::int64_t n, @@ -1662,13 +1085,6 @@ void hemm( hemm_postcondition(queue, left_right, upper_lower, m, n, alpha, a, lda, b, ldb, beta, c, ldc); } -template -static inline void hemm(cl::sycl::queue &queue, side left_right, uplo upper_lower, std::int64_t m, - std::int64_t n, std::complex alpha, - cl::sycl::buffer, 1> &a, std::int64_t lda, - cl::sycl::buffer, 1> &b, std::int64_t ldb, - std::complex beta, cl::sycl::buffer, 1> &c, - std::int64_t ldc); template <> void hemm( 
cl::sycl::queue &queue, side left_right, uplo upper_lower, std::int64_t m, std::int64_t n, @@ -1680,11 +1096,6 @@ void hemm( hemm_postcondition(queue, left_right, upper_lower, m, n, alpha, a, lda, b, ldb, beta, c, ldc); } -template -static inline void hpr2(cl::sycl::queue &queue, uplo upper_lower, std::int64_t n, - std::complex alpha, cl::sycl::buffer, 1> &x, - std::int64_t incx, cl::sycl::buffer, 1> &y, - std::int64_t incy, cl::sycl::buffer, 1> &a); template <> void hpr2(cl::sycl::queue &queue, uplo upper_lower, std::int64_t n, std::complex alpha, @@ -1698,11 +1109,6 @@ void hpr2(cl::sycl::queue &queue, uplo upp hpr2_postcondition(queue, upper_lower, n, alpha, x, incx, y, incy, a); } -template -static inline void hpr2(cl::sycl::queue &queue, uplo upper_lower, std::int64_t n, - std::complex alpha, cl::sycl::buffer, 1> &x, - std::int64_t incx, cl::sycl::buffer, 1> &y, - std::int64_t incy, cl::sycl::buffer, 1> &a); template <> void hpr2(cl::sycl::queue &queue, uplo upper_lower, std::int64_t n, std::complex alpha, @@ -1716,12 +1122,6 @@ void hpr2(cl::sycl::queue &queue, uplo upp hpr2_postcondition(queue, upper_lower, n, alpha, x, incx, y, incy, a); } -template -static inline void gbmv(cl::sycl::queue &queue, transpose trans, std::int64_t m, std::int64_t n, - std::int64_t kl, std::int64_t ku, float alpha, - cl::sycl::buffer &a, std::int64_t lda, - cl::sycl::buffer &x, std::int64_t incx, float beta, - cl::sycl::buffer &y, std::int64_t incy); template <> void gbmv(cl::sycl::queue &queue, transpose trans, std::int64_t m, std::int64_t n, std::int64_t kl, @@ -1735,12 +1135,6 @@ void gbmv(cl::sycl::queue &queue, transpos gbmv_postcondition(queue, trans, m, n, kl, ku, alpha, a, lda, x, incx, beta, y, incy); } -template -static inline void gbmv(cl::sycl::queue &queue, transpose trans, std::int64_t m, std::int64_t n, - std::int64_t kl, std::int64_t ku, double alpha, - cl::sycl::buffer &a, std::int64_t lda, - cl::sycl::buffer &x, std::int64_t incx, double beta, - 
cl::sycl::buffer &y, std::int64_t incy); template <> void gbmv(cl::sycl::queue &queue, transpose trans, std::int64_t m, std::int64_t n, std::int64_t kl, @@ -1754,13 +1148,6 @@ void gbmv(cl::sycl::queue &queue, transpos gbmv_postcondition(queue, trans, m, n, kl, ku, alpha, a, lda, x, incx, beta, y, incy); } -template -static inline void gbmv(cl::sycl::queue &queue, transpose trans, std::int64_t m, std::int64_t n, - std::int64_t kl, std::int64_t ku, std::complex alpha, - cl::sycl::buffer, 1> &a, std::int64_t lda, - cl::sycl::buffer, 1> &x, std::int64_t incx, - std::complex beta, cl::sycl::buffer, 1> &y, - std::int64_t incy); template <> void gbmv( cl::sycl::queue &queue, transpose trans, std::int64_t m, std::int64_t n, std::int64_t kl, @@ -1772,13 +1159,6 @@ void gbmv( gbmv_postcondition(queue, trans, m, n, kl, ku, alpha, a, lda, x, incx, beta, y, incy); } -template -static inline void gbmv(cl::sycl::queue &queue, transpose trans, std::int64_t m, std::int64_t n, - std::int64_t kl, std::int64_t ku, std::complex alpha, - cl::sycl::buffer, 1> &a, std::int64_t lda, - cl::sycl::buffer, 1> &x, std::int64_t incx, - std::complex beta, cl::sycl::buffer, 1> &y, - std::int64_t incy); template <> void gbmv( cl::sycl::queue &queue, transpose trans, std::int64_t m, std::int64_t n, std::int64_t kl, @@ -1790,10 +1170,6 @@ void gbmv( gbmv_postcondition(queue, trans, m, n, kl, ku, alpha, a, lda, x, incx, beta, y, incy); } -template -static inline void tbmv(cl::sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag, - std::int64_t n, std::int64_t k, cl::sycl::buffer &a, - std::int64_t lda, cl::sycl::buffer &x, std::int64_t incx); template <> void tbmv(cl::sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag, std::int64_t n, @@ -1805,10 +1181,6 @@ void tbmv(cl::sycl::queue &queue, uplo upp tbmv_postcondition(queue, upper_lower, trans, unit_diag, n, k, a, lda, x, incx); } -template -static inline void tbmv(cl::sycl::queue &queue, uplo upper_lower, 
transpose trans, diag unit_diag, - std::int64_t n, std::int64_t k, cl::sycl::buffer &a, - std::int64_t lda, cl::sycl::buffer &x, std::int64_t incx); template <> void tbmv(cl::sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag, std::int64_t n, @@ -1820,11 +1192,6 @@ void tbmv(cl::sycl::queue &queue, uplo upp tbmv_postcondition(queue, upper_lower, trans, unit_diag, n, k, a, lda, x, incx); } -template -static inline void tbmv(cl::sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag, - std::int64_t n, std::int64_t k, cl::sycl::buffer, 1> &a, - std::int64_t lda, cl::sycl::buffer, 1> &x, - std::int64_t incx); template <> void tbmv( cl::sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag, std::int64_t n, @@ -1835,11 +1202,6 @@ void tbmv( tbmv_postcondition(queue, upper_lower, trans, unit_diag, n, k, a, lda, x, incx); } -template -static inline void tbmv(cl::sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag, - std::int64_t n, std::int64_t k, - cl::sycl::buffer, 1> &a, std::int64_t lda, - cl::sycl::buffer, 1> &x, std::int64_t incx); template <> void tbmv( cl::sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag, std::int64_t n, @@ -1850,11 +1212,6 @@ void tbmv( tbmv_postcondition(queue, upper_lower, trans, unit_diag, n, k, a, lda, x, incx); } -template -static inline void symm(cl::sycl::queue &queue, side left_right, uplo upper_lower, std::int64_t m, - std::int64_t n, float alpha, cl::sycl::buffer &a, - std::int64_t lda, cl::sycl::buffer &b, std::int64_t ldb, - float beta, cl::sycl::buffer &c, std::int64_t ldc); template <> void symm(cl::sycl::queue &queue, side left_right, uplo upper_lower, std::int64_t m, std::int64_t n, @@ -1867,11 +1224,6 @@ void symm(cl::sycl::queue &queue, side lef symm_postcondition(queue, left_right, upper_lower, m, n, alpha, a, lda, b, ldb, beta, c, ldc); } -template -static inline void symm(cl::sycl::queue &queue, side left_right, uplo upper_lower, 
std::int64_t m, - std::int64_t n, double alpha, cl::sycl::buffer &a, - std::int64_t lda, cl::sycl::buffer &b, std::int64_t ldb, - double beta, cl::sycl::buffer &c, std::int64_t ldc); template <> void symm(cl::sycl::queue &queue, side left_right, uplo upper_lower, std::int64_t m, std::int64_t n, @@ -1884,13 +1236,6 @@ void symm(cl::sycl::queue &queue, side lef symm_postcondition(queue, left_right, upper_lower, m, n, alpha, a, lda, b, ldb, beta, c, ldc); } -template -static inline void symm(cl::sycl::queue &queue, side left_right, uplo upper_lower, std::int64_t m, - std::int64_t n, std::complex alpha, - cl::sycl::buffer, 1> &a, std::int64_t lda, - cl::sycl::buffer, 1> &b, std::int64_t ldb, - std::complex beta, cl::sycl::buffer, 1> &c, - std::int64_t ldc); template <> void symm( cl::sycl::queue &queue, side left_right, uplo upper_lower, std::int64_t m, std::int64_t n, @@ -1902,13 +1247,6 @@ void symm( symm_postcondition(queue, left_right, upper_lower, m, n, alpha, a, lda, b, ldb, beta, c, ldc); } -template -static inline void symm(cl::sycl::queue &queue, side left_right, uplo upper_lower, std::int64_t m, - std::int64_t n, std::complex alpha, - cl::sycl::buffer, 1> &a, std::int64_t lda, - cl::sycl::buffer, 1> &b, std::int64_t ldb, - std::complex beta, cl::sycl::buffer, 1> &c, - std::int64_t ldc); template <> void symm( cl::sycl::queue &queue, side left_right, uplo upper_lower, std::int64_t m, std::int64_t n, @@ -1920,11 +1258,6 @@ void symm( symm_postcondition(queue, left_right, upper_lower, m, n, alpha, a, lda, b, ldb, beta, c, ldc); } -template -static inline void dotc(cl::sycl::queue &queue, std::int64_t n, - cl::sycl::buffer, 1> &x, std::int64_t incx, - cl::sycl::buffer, 1> &y, std::int64_t incy, - cl::sycl::buffer, 1> &result); template <> void dotc(cl::sycl::queue &queue, std::int64_t n, cl::sycl::buffer, 1> &x, @@ -1937,11 +1270,6 @@ void dotc(cl::sycl::queue &queue, std::int dotc_postcondition(queue, n, x, incx, y, incy, result); } -template -static inline void 
dotc(cl::sycl::queue &queue, std::int64_t n, - cl::sycl::buffer, 1> &x, std::int64_t incx, - cl::sycl::buffer, 1> &y, std::int64_t incy, - cl::sycl::buffer, 1> &result); template <> void dotc(cl::sycl::queue &queue, std::int64_t n, cl::sycl::buffer, 1> &x, @@ -1954,10 +1282,6 @@ void dotc(cl::sycl::queue &queue, std::int dotc_postcondition(queue, n, x, incx, y, incy, result); } -template -static inline void syr(cl::sycl::queue &queue, uplo upper_lower, std::int64_t n, float alpha, - cl::sycl::buffer &x, std::int64_t incx, - cl::sycl::buffer &a, std::int64_t lda); template <> void syr(cl::sycl::queue &queue, uplo upper_lower, std::int64_t n, float alpha, @@ -1968,10 +1292,6 @@ void syr(cl::sycl::queue &queue, uplo uppe syr_postcondition(queue, upper_lower, n, alpha, x, incx, a, lda); } -template -static inline void syr(cl::sycl::queue &queue, uplo upper_lower, std::int64_t n, double alpha, - cl::sycl::buffer &x, std::int64_t incx, - cl::sycl::buffer &a, std::int64_t lda); template <> void syr(cl::sycl::queue &queue, uplo upper_lower, std::int64_t n, double alpha, @@ -1982,11 +1302,6 @@ void syr(cl::sycl::queue &queue, uplo uppe syr_postcondition(queue, upper_lower, n, alpha, x, incx, a, lda); } -template -static inline void trmm(cl::sycl::queue &queue, side left_right, uplo upper_lower, transpose trans, - diag unit_diag, std::int64_t m, std::int64_t n, float alpha, - cl::sycl::buffer &a, std::int64_t lda, - cl::sycl::buffer &b, std::int64_t ldb); template <> void trmm(cl::sycl::queue &queue, side left_right, uplo upper_lower, transpose trans, diag unit_diag, @@ -2001,11 +1316,6 @@ void trmm(cl::sycl::queue &queue, side lef ldb); } -template -static inline void trmm(cl::sycl::queue &queue, side left_right, uplo upper_lower, transpose trans, - diag unit_diag, std::int64_t m, std::int64_t n, double alpha, - cl::sycl::buffer &a, std::int64_t lda, - cl::sycl::buffer &b, std::int64_t ldb); template <> void trmm(cl::sycl::queue &queue, side left_right, uplo upper_lower, 
transpose trans, diag unit_diag, @@ -2020,11 +1330,6 @@ void trmm(cl::sycl::queue &queue, side lef ldb); } -template -static inline void trmm(cl::sycl::queue &queue, side left_right, uplo upper_lower, transpose trans, - diag unit_diag, std::int64_t m, std::int64_t n, std::complex alpha, - cl::sycl::buffer, 1> &a, std::int64_t lda, - cl::sycl::buffer, 1> &b, std::int64_t ldb); template <> void trmm( cl::sycl::queue &queue, side left_right, uplo upper_lower, transpose trans, diag unit_diag, @@ -2039,11 +1344,6 @@ void trmm( ldb); } -template -static inline void trmm(cl::sycl::queue &queue, side left_right, uplo upper_lower, transpose trans, - diag unit_diag, std::int64_t m, std::int64_t n, std::complex alpha, - cl::sycl::buffer, 1> &a, std::int64_t lda, - cl::sycl::buffer, 1> &b, std::int64_t ldb); template <> void trmm( cl::sycl::queue &queue, side left_right, uplo upper_lower, transpose trans, diag unit_diag, @@ -2058,10 +1358,6 @@ void trmm( ldb); } -template -static inline void rotmg(cl::sycl::queue &queue, cl::sycl::buffer &d1, - cl::sycl::buffer &d2, cl::sycl::buffer &x1, float y1, - cl::sycl::buffer ¶m); template <> void rotmg(cl::sycl::queue &queue, cl::sycl::buffer &d1, @@ -2073,10 +1369,6 @@ void rotmg(cl::sycl::queue &queue, rotmg_postcondition(queue, d1, d2, x1, y1, param); } -template -static inline void rotmg(cl::sycl::queue &queue, cl::sycl::buffer &d1, - cl::sycl::buffer &d2, cl::sycl::buffer &x1, - double y1, cl::sycl::buffer ¶m); template <> void rotmg(cl::sycl::queue &queue, cl::sycl::buffer &d1, @@ -2088,10 +1380,6 @@ void rotmg(cl::sycl::queue &queue, rotmg_postcondition(queue, d1, d2, x1, y1, param); } -template -static inline void tpsv(cl::sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag, - std::int64_t n, cl::sycl::buffer &a, - cl::sycl::buffer &x, std::int64_t incx); template <> void tpsv(cl::sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag, std::int64_t n, @@ -2102,10 +1390,6 @@ void 
tpsv(cl::sycl::queue &queue, uplo upp tpsv_postcondition(queue, upper_lower, trans, unit_diag, n, a, x, incx); } -template -static inline void tpsv(cl::sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag, - std::int64_t n, cl::sycl::buffer &a, - cl::sycl::buffer &x, std::int64_t incx); template <> void tpsv(cl::sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag, std::int64_t n, @@ -2116,10 +1400,6 @@ void tpsv(cl::sycl::queue &queue, uplo upp tpsv_postcondition(queue, upper_lower, trans, unit_diag, n, a, x, incx); } -template -static inline void tpsv(cl::sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag, - std::int64_t n, cl::sycl::buffer, 1> &a, - cl::sycl::buffer, 1> &x, std::int64_t incx); template <> void tpsv(cl::sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag, std::int64_t n, @@ -2131,10 +1411,6 @@ void tpsv(cl::sycl::queue &queue, uplo upp tpsv_postcondition(queue, upper_lower, trans, unit_diag, n, a, x, incx); } -template -static inline void tpsv(cl::sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag, - std::int64_t n, cl::sycl::buffer, 1> &a, - cl::sycl::buffer, 1> &x, std::int64_t incx); template <> void tpsv(cl::sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag, std::int64_t n, @@ -2146,10 +1422,6 @@ void tpsv(cl::sycl::queue &queue, uplo upp tpsv_postcondition(queue, upper_lower, trans, unit_diag, n, a, x, incx); } -template -static inline void trsv(cl::sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag, - std::int64_t n, cl::sycl::buffer &a, std::int64_t lda, - cl::sycl::buffer &x, std::int64_t incx); template <> void trsv(cl::sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag, std::int64_t n, @@ -2160,10 +1432,6 @@ void trsv(cl::sycl::queue &queue, uplo upp trsv_postcondition(queue, upper_lower, trans, unit_diag, n, a, lda, x, incx); } -template -static inline void trsv(cl::sycl::queue &queue, uplo 
upper_lower, transpose trans, diag unit_diag, - std::int64_t n, cl::sycl::buffer &a, std::int64_t lda, - cl::sycl::buffer &x, std::int64_t incx); template <> void trsv(cl::sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag, std::int64_t n, @@ -2174,11 +1442,6 @@ void trsv(cl::sycl::queue &queue, uplo upp trsv_postcondition(queue, upper_lower, trans, unit_diag, n, a, lda, x, incx); } -template -static inline void trsv(cl::sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag, - std::int64_t n, cl::sycl::buffer, 1> &a, - std::int64_t lda, cl::sycl::buffer, 1> &x, - std::int64_t incx); template <> void trsv(cl::sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag, std::int64_t n, @@ -2191,11 +1454,6 @@ void trsv(cl::sycl::queue &queue, uplo upp trsv_postcondition(queue, upper_lower, trans, unit_diag, n, a, lda, x, incx); } -template -static inline void trsv(cl::sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag, - std::int64_t n, cl::sycl::buffer, 1> &a, - std::int64_t lda, cl::sycl::buffer, 1> &x, - std::int64_t incx); template <> void trsv(cl::sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag, std::int64_t n, @@ -2208,9 +1466,6 @@ void trsv(cl::sycl::queue &queue, uplo upp trsv_postcondition(queue, upper_lower, trans, unit_diag, n, a, lda, x, incx); } -template -static inline void copy(cl::sycl::queue &queue, std::int64_t n, cl::sycl::buffer &x, - std::int64_t incx, cl::sycl::buffer &y, std::int64_t incy); template <> void copy(cl::sycl::queue &queue, std::int64_t n, cl::sycl::buffer &x, std::int64_t incx, @@ -2220,9 +1475,6 @@ void copy(cl::sycl::queue &queue, std::int copy_postcondition(queue, n, x, incx, y, incy); } -template -static inline void copy(cl::sycl::queue &queue, std::int64_t n, cl::sycl::buffer &x, - std::int64_t incx, cl::sycl::buffer &y, std::int64_t incy); template <> void copy(cl::sycl::queue &queue, std::int64_t n, cl::sycl::buffer &x, std::int64_t incx, @@ 
-2232,10 +1484,6 @@ void copy(cl::sycl::queue &queue, std::int copy_postcondition(queue, n, x, incx, y, incy); } -template -static inline void copy(cl::sycl::queue &queue, std::int64_t n, - cl::sycl::buffer, 1> &x, std::int64_t incx, - cl::sycl::buffer, 1> &y, std::int64_t incy); template <> void copy(cl::sycl::queue &queue, std::int64_t n, cl::sycl::buffer, 1> &x, @@ -2247,10 +1495,6 @@ void copy(cl::sycl::queue &queue, std::int copy_postcondition(queue, n, x, incx, y, incy); } -template -static inline void copy(cl::sycl::queue &queue, std::int64_t n, - cl::sycl::buffer, 1> &x, std::int64_t incx, - cl::sycl::buffer, 1> &y, std::int64_t incy); template <> void copy(cl::sycl::queue &queue, std::int64_t n, cl::sycl::buffer, 1> &x, @@ -2262,12 +1506,6 @@ void copy(cl::sycl::queue &queue, std::int copy_postcondition(queue, n, x, incx, y, incy); } -template -static inline void hemv(cl::sycl::queue &queue, uplo upper_lower, std::int64_t n, - std::complex alpha, cl::sycl::buffer, 1> &a, - std::int64_t lda, cl::sycl::buffer, 1> &x, - std::int64_t incx, std::complex beta, - cl::sycl::buffer, 1> &y, std::int64_t incy); template <> void hemv( cl::sycl::queue &queue, uplo upper_lower, std::int64_t n, std::complex alpha, @@ -2279,12 +1517,6 @@ void hemv( hemv_postcondition(queue, upper_lower, n, alpha, a, lda, x, incx, beta, y, incy); } -template -static inline void hemv(cl::sycl::queue &queue, uplo upper_lower, std::int64_t n, - std::complex alpha, cl::sycl::buffer, 1> &a, - std::int64_t lda, cl::sycl::buffer, 1> &x, - std::int64_t incx, std::complex beta, - cl::sycl::buffer, 1> &y, std::int64_t incy); template <> void hemv( cl::sycl::queue &queue, uplo upper_lower, std::int64_t n, std::complex alpha, @@ -2296,12 +1528,6 @@ void hemv( hemv_postcondition(queue, upper_lower, n, alpha, a, lda, x, incx, beta, y, incy); } -template -static inline void gemmt(cl::sycl::queue &queue, uplo upper_lower, transpose transa, - transpose transb, std::int64_t n, std::int64_t k, float alpha, - 
cl::sycl::buffer &a, std::int64_t lda, - cl::sycl::buffer &b, std::int64_t ldb, float beta, - cl::sycl::buffer &c, std::int64_t ldc); template <> void gemmt(cl::sycl::queue &queue, uplo upper_lower, transpose transa, transpose transb, std::int64_t n, @@ -2318,12 +1544,6 @@ void gemmt(cl::sycl::queue &queue, uplo up ldc); } -template -static inline void gemmt(cl::sycl::queue &queue, uplo upper_lower, transpose transa, - transpose transb, std::int64_t n, std::int64_t k, double alpha, - cl::sycl::buffer &a, std::int64_t lda, - cl::sycl::buffer &b, std::int64_t ldb, double beta, - cl::sycl::buffer &c, std::int64_t ldc); template <> void gemmt(cl::sycl::queue &queue, uplo upper_lower, transpose transa, transpose transb, std::int64_t n, @@ -2340,13 +1560,6 @@ void gemmt(cl::sycl::queue &queue, uplo up ldc); } -template -static inline void gemmt(cl::sycl::queue &queue, uplo upper_lower, transpose transa, - transpose transb, std::int64_t n, std::int64_t k, - std::complex alpha, cl::sycl::buffer, 1> &a, - std::int64_t lda, cl::sycl::buffer, 1> &b, - std::int64_t ldb, std::complex beta, - cl::sycl::buffer, 1> &c, std::int64_t ldc); template <> void gemmt( cl::sycl::queue &queue, uplo upper_lower, transpose transa, transpose transb, std::int64_t n, @@ -2361,13 +1574,6 @@ void gemmt( ldc); } -template -static inline void gemmt(cl::sycl::queue &queue, uplo upper_lower, transpose transa, - transpose transb, std::int64_t n, std::int64_t k, - std::complex alpha, cl::sycl::buffer, 1> &a, - std::int64_t lda, cl::sycl::buffer, 1> &b, - std::int64_t ldb, std::complex beta, - cl::sycl::buffer, 1> &c, std::int64_t ldc); template <> void gemmt( cl::sycl::queue &queue, uplo upper_lower, transpose transa, transpose transb, std::int64_t n, @@ -2382,11 +1588,6 @@ void gemmt( ldc); } -template -static inline void sbmv(cl::sycl::queue &queue, uplo upper_lower, std::int64_t n, std::int64_t k, - float alpha, cl::sycl::buffer &a, std::int64_t lda, - cl::sycl::buffer &x, std::int64_t incx, float 
beta, - cl::sycl::buffer &y, std::int64_t incy); template <> void sbmv(cl::sycl::queue &queue, uplo upper_lower, std::int64_t n, std::int64_t k, float alpha, @@ -2399,11 +1600,6 @@ void sbmv(cl::sycl::queue &queue, uplo upp sbmv_postcondition(queue, upper_lower, n, k, alpha, a, lda, x, incx, beta, y, incy); } -template -static inline void sbmv(cl::sycl::queue &queue, uplo upper_lower, std::int64_t n, std::int64_t k, - double alpha, cl::sycl::buffer &a, std::int64_t lda, - cl::sycl::buffer &x, std::int64_t incx, double beta, - cl::sycl::buffer &y, std::int64_t incy); template <> void sbmv(cl::sycl::queue &queue, uplo upper_lower, std::int64_t n, std::int64_t k, double alpha, @@ -2416,10 +1612,6 @@ void sbmv(cl::sycl::queue &queue, uplo upp sbmv_postcondition(queue, upper_lower, n, k, alpha, a, lda, x, incx, beta, y, incy); } -template -static inline void asum(cl::sycl::queue &queue, std::int64_t n, - cl::sycl::buffer, 1> &x, std::int64_t incx, - cl::sycl::buffer &result); template <> void asum(cl::sycl::queue &queue, std::int64_t n, cl::sycl::buffer, 1> &x, @@ -2430,10 +1622,6 @@ void asum(cl::sycl::queue &queue, std::int asum_postcondition(queue, n, x, incx, result); } -template -static inline void asum(cl::sycl::queue &queue, std::int64_t n, - cl::sycl::buffer, 1> &x, std::int64_t incx, - cl::sycl::buffer &result); template <> void asum(cl::sycl::queue &queue, std::int64_t n, cl::sycl::buffer, 1> &x, @@ -2444,9 +1632,6 @@ void asum(cl::sycl::queue &queue, std::int asum_postcondition(queue, n, x, incx, result); } -template -static inline void asum(cl::sycl::queue &queue, std::int64_t n, cl::sycl::buffer &x, - std::int64_t incx, cl::sycl::buffer &result); template <> void asum(cl::sycl::queue &queue, std::int64_t n, cl::sycl::buffer &x, std::int64_t incx, @@ -2456,9 +1641,6 @@ void asum(cl::sycl::queue &queue, std::int asum_postcondition(queue, n, x, incx, result); } -template -static inline void asum(cl::sycl::queue &queue, std::int64_t n, cl::sycl::buffer &x, - 
std::int64_t incx, cl::sycl::buffer &result); template <> void asum(cl::sycl::queue &queue, std::int64_t n, cl::sycl::buffer &x, std::int64_t incx, @@ -2468,10 +1650,6 @@ void asum(cl::sycl::queue &queue, std::int asum_postcondition(queue, n, x, incx, result); } -template -static inline void tbsv(cl::sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag, - std::int64_t n, std::int64_t k, cl::sycl::buffer &a, - std::int64_t lda, cl::sycl::buffer &x, std::int64_t incx); template <> void tbsv(cl::sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag, std::int64_t n, @@ -2483,10 +1661,6 @@ void tbsv(cl::sycl::queue &queue, uplo upp tbsv_postcondition(queue, upper_lower, trans, unit_diag, n, k, a, lda, x, incx); } -template -static inline void tbsv(cl::sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag, - std::int64_t n, std::int64_t k, cl::sycl::buffer &a, - std::int64_t lda, cl::sycl::buffer &x, std::int64_t incx); template <> void tbsv(cl::sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag, std::int64_t n, @@ -2498,11 +1672,6 @@ void tbsv(cl::sycl::queue &queue, uplo upp tbsv_postcondition(queue, upper_lower, trans, unit_diag, n, k, a, lda, x, incx); } -template -static inline void tbsv(cl::sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag, - std::int64_t n, std::int64_t k, cl::sycl::buffer, 1> &a, - std::int64_t lda, cl::sycl::buffer, 1> &x, - std::int64_t incx); template <> void tbsv( cl::sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag, std::int64_t n, @@ -2513,11 +1682,6 @@ void tbsv( tbsv_postcondition(queue, upper_lower, trans, unit_diag, n, k, a, lda, x, incx); } -template -static inline void tbsv(cl::sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag, - std::int64_t n, std::int64_t k, - cl::sycl::buffer, 1> &a, std::int64_t lda, - cl::sycl::buffer, 1> &x, std::int64_t incx); template <> void tbsv( cl::sycl::queue &queue, uplo 
upper_lower, transpose trans, diag unit_diag, std::int64_t n, @@ -2528,11 +1692,6 @@ void tbsv( tbsv_postcondition(queue, upper_lower, trans, unit_diag, n, k, a, lda, x, incx); } -template -static inline void spr2(cl::sycl::queue &queue, uplo upper_lower, std::int64_t n, float alpha, - cl::sycl::buffer &x, std::int64_t incx, - cl::sycl::buffer &y, std::int64_t incy, - cl::sycl::buffer &a); template <> void spr2(cl::sycl::queue &queue, uplo upper_lower, std::int64_t n, float alpha, @@ -2544,11 +1703,6 @@ void spr2(cl::sycl::queue &queue, uplo upp spr2_postcondition(queue, upper_lower, n, alpha, x, incx, y, incy, a); } -template -static inline void spr2(cl::sycl::queue &queue, uplo upper_lower, std::int64_t n, double alpha, - cl::sycl::buffer &x, std::int64_t incx, - cl::sycl::buffer &y, std::int64_t incy, - cl::sycl::buffer &a); template <> void spr2(cl::sycl::queue &queue, uplo upper_lower, std::int64_t n, double alpha, @@ -2560,9 +1714,6 @@ void spr2(cl::sycl::queue &queue, uplo upp spr2_postcondition(queue, upper_lower, n, alpha, x, incx, y, incy, a); } -template -static inline void iamax(cl::sycl::queue &queue, std::int64_t n, cl::sycl::buffer &x, - std::int64_t incx, cl::sycl::buffer &result); template <> void iamax(cl::sycl::queue &queue, std::int64_t n, cl::sycl::buffer &x, std::int64_t incx, @@ -2572,9 +1723,6 @@ void iamax(cl::sycl::queue &queue, std::in iamax_postcondition(queue, n, x, incx, result); } -template -static inline void iamax(cl::sycl::queue &queue, std::int64_t n, cl::sycl::buffer &x, - std::int64_t incx, cl::sycl::buffer &result); template <> void iamax(cl::sycl::queue &queue, std::int64_t n, cl::sycl::buffer &x, std::int64_t incx, @@ -2584,10 +1732,6 @@ void iamax(cl::sycl::queue &queue, std::in iamax_postcondition(queue, n, x, incx, result); } -template -static inline void iamax(cl::sycl::queue &queue, std::int64_t n, - cl::sycl::buffer, 1> &x, std::int64_t incx, - cl::sycl::buffer &result); template <> void iamax(cl::sycl::queue &queue, 
std::int64_t n, cl::sycl::buffer, 1> &x, @@ -2598,10 +1742,6 @@ void iamax(cl::sycl::queue &queue, std::in iamax_postcondition(queue, n, x, incx, result); } -template -static inline void iamax(cl::sycl::queue &queue, std::int64_t n, - cl::sycl::buffer, 1> &x, std::int64_t incx, - cl::sycl::buffer &result); template <> void iamax(cl::sycl::queue &queue, std::int64_t n, cl::sycl::buffer, 1> &x, @@ -2612,118 +1752,6 @@ void iamax(cl::sycl::queue &queue, std::in iamax_postcondition(queue, n, x, incx, result); } -template -static inline void trsm_batch(cl::sycl::queue &queue, cl::sycl::buffer &left_right, - cl::sycl::buffer &upper_lower, - cl::sycl::buffer &trans, - cl::sycl::buffer &unit_diag, - cl::sycl::buffer &m, - cl::sycl::buffer &n, - cl::sycl::buffer &alpha, cl::sycl::buffer &a, - cl::sycl::buffer &lda, cl::sycl::buffer &b, - cl::sycl::buffer &ldb, std::int64_t group_count, - cl::sycl::buffer &group_size); -template <> -void trsm_batch( - cl::sycl::queue &queue, cl::sycl::buffer &left_right, - cl::sycl::buffer &upper_lower, cl::sycl::buffer &trans, - cl::sycl::buffer &unit_diag, cl::sycl::buffer &m, - cl::sycl::buffer &n, cl::sycl::buffer &alpha, - cl::sycl::buffer &a, cl::sycl::buffer &lda, - cl::sycl::buffer &b, cl::sycl::buffer &ldb, std::int64_t group_count, - cl::sycl::buffer &group_size) { - trsm_batch_precondition(queue, left_right, upper_lower, trans, unit_diag, m, n, alpha, a, lda, - b, ldb, group_count, group_size); - onemkl::mklcpu::trsm_batch(queue, left_right, upper_lower, trans, unit_diag, m, n, alpha, a, - lda, b, ldb, group_count, group_size); - trsm_batch_postcondition(queue, left_right, upper_lower, trans, unit_diag, m, n, alpha, a, lda, - b, ldb, group_count, group_size); -} - -template -static inline void trsm_batch( - cl::sycl::queue &queue, cl::sycl::buffer &left_right, - cl::sycl::buffer &upper_lower, cl::sycl::buffer &trans, - cl::sycl::buffer &unit_diag, cl::sycl::buffer &m, - cl::sycl::buffer &n, cl::sycl::buffer &alpha, - 
cl::sycl::buffer &a, cl::sycl::buffer &lda, - cl::sycl::buffer &b, cl::sycl::buffer &ldb, - std::int64_t group_count, cl::sycl::buffer &group_size); -template <> -void trsm_batch( - cl::sycl::queue &queue, cl::sycl::buffer &left_right, - cl::sycl::buffer &upper_lower, cl::sycl::buffer &trans, - cl::sycl::buffer &unit_diag, cl::sycl::buffer &m, - cl::sycl::buffer &n, cl::sycl::buffer &alpha, - cl::sycl::buffer &a, cl::sycl::buffer &lda, - cl::sycl::buffer &b, cl::sycl::buffer &ldb, - std::int64_t group_count, cl::sycl::buffer &group_size) { - trsm_batch_precondition(queue, left_right, upper_lower, trans, unit_diag, m, n, alpha, a, lda, - b, ldb, group_count, group_size); - onemkl::mklcpu::trsm_batch(queue, left_right, upper_lower, trans, unit_diag, m, n, alpha, a, - lda, b, ldb, group_count, group_size); - trsm_batch_postcondition(queue, left_right, upper_lower, trans, unit_diag, m, n, alpha, a, lda, - b, ldb, group_count, group_size); -} - -template -static inline void trsm_batch( - cl::sycl::queue &queue, cl::sycl::buffer &left_right, - cl::sycl::buffer &upper_lower, cl::sycl::buffer &trans, - cl::sycl::buffer &unit_diag, cl::sycl::buffer &m, - cl::sycl::buffer &n, cl::sycl::buffer, 1> &alpha, - cl::sycl::buffer, 1> &a, cl::sycl::buffer &lda, - cl::sycl::buffer, 1> &b, cl::sycl::buffer &ldb, - std::int64_t group_count, cl::sycl::buffer &group_size); -template <> -void trsm_batch( - cl::sycl::queue &queue, cl::sycl::buffer &left_right, - cl::sycl::buffer &upper_lower, cl::sycl::buffer &trans, - cl::sycl::buffer &unit_diag, cl::sycl::buffer &m, - cl::sycl::buffer &n, cl::sycl::buffer, 1> &alpha, - cl::sycl::buffer, 1> &a, cl::sycl::buffer &lda, - cl::sycl::buffer, 1> &b, cl::sycl::buffer &ldb, - std::int64_t group_count, cl::sycl::buffer &group_size) { - trsm_batch_precondition(queue, left_right, upper_lower, trans, unit_diag, m, n, alpha, a, lda, - b, ldb, group_count, group_size); - onemkl::mklcpu::trsm_batch(queue, left_right, upper_lower, trans, unit_diag, m, n, 
alpha, a, - lda, b, ldb, group_count, group_size); - trsm_batch_postcondition(queue, left_right, upper_lower, trans, unit_diag, m, n, alpha, a, lda, - b, ldb, group_count, group_size); -} - -template -static inline void trsm_batch( - cl::sycl::queue &queue, cl::sycl::buffer &left_right, - cl::sycl::buffer &upper_lower, cl::sycl::buffer &trans, - cl::sycl::buffer &unit_diag, cl::sycl::buffer &m, - cl::sycl::buffer &n, cl::sycl::buffer, 1> &alpha, - cl::sycl::buffer, 1> &a, cl::sycl::buffer &lda, - cl::sycl::buffer, 1> &b, cl::sycl::buffer &ldb, - std::int64_t group_count, cl::sycl::buffer &group_size); -template <> -void trsm_batch( - cl::sycl::queue &queue, cl::sycl::buffer &left_right, - cl::sycl::buffer &upper_lower, cl::sycl::buffer &trans, - cl::sycl::buffer &unit_diag, cl::sycl::buffer &m, - cl::sycl::buffer &n, cl::sycl::buffer, 1> &alpha, - cl::sycl::buffer, 1> &a, cl::sycl::buffer &lda, - cl::sycl::buffer, 1> &b, cl::sycl::buffer &ldb, - std::int64_t group_count, cl::sycl::buffer &group_size) { - trsm_batch_precondition(queue, left_right, upper_lower, trans, unit_diag, m, n, alpha, a, lda, - b, ldb, group_count, group_size); - onemkl::mklcpu::trsm_batch(queue, left_right, upper_lower, trans, unit_diag, m, n, alpha, a, - lda, b, ldb, group_count, group_size); - trsm_batch_postcondition(queue, left_right, upper_lower, trans, unit_diag, m, n, alpha, a, lda, - b, ldb, group_count, group_size); -} - -template -static inline void trsm_batch(cl::sycl::queue &queue, side left_right, uplo upper_lower, - transpose trans, diag unit_diag, std::int64_t m, std::int64_t n, - float alpha, cl::sycl::buffer &a, std::int64_t lda, - std::int64_t stride_a, cl::sycl::buffer &b, - std::int64_t ldb, std::int64_t stride_b, std::int64_t batch_size); template <> void trsm_batch( cl::sycl::queue &queue, side left_right, uplo upper_lower, transpose trans, diag unit_diag, @@ -2738,12 +1766,6 @@ void trsm_batch( stride_a, b, ldb, stride_b, batch_size); } -template -static inline void 
trsm_batch(cl::sycl::queue &queue, side left_right, uplo upper_lower, - transpose trans, diag unit_diag, std::int64_t m, std::int64_t n, - double alpha, cl::sycl::buffer &a, std::int64_t lda, - std::int64_t stride_a, cl::sycl::buffer &b, - std::int64_t ldb, std::int64_t stride_b, std::int64_t batch_size); template <> void trsm_batch( cl::sycl::queue &queue, side left_right, uplo upper_lower, transpose trans, diag unit_diag, @@ -2758,13 +1780,6 @@ void trsm_batch( stride_a, b, ldb, stride_b, batch_size); } -template -static inline void trsm_batch(cl::sycl::queue &queue, side left_right, uplo upper_lower, - transpose trans, diag unit_diag, std::int64_t m, std::int64_t n, - std::complex alpha, - cl::sycl::buffer, 1> &a, std::int64_t lda, - std::int64_t stride_a, cl::sycl::buffer, 1> &b, - std::int64_t ldb, std::int64_t stride_b, std::int64_t batch_size); template <> void trsm_batch( cl::sycl::queue &queue, side left_right, uplo upper_lower, transpose trans, diag unit_diag, @@ -2780,13 +1795,6 @@ void trsm_batch( stride_a, b, ldb, stride_b, batch_size); } -template -static inline void trsm_batch(cl::sycl::queue &queue, side left_right, uplo upper_lower, - transpose trans, diag unit_diag, std::int64_t m, std::int64_t n, - std::complex alpha, - cl::sycl::buffer, 1> &a, std::int64_t lda, - std::int64_t stride_a, cl::sycl::buffer, 1> &b, - std::int64_t ldb, std::int64_t stride_b, std::int64_t batch_size); template <> void trsm_batch( cl::sycl::queue &queue, side left_right, uplo upper_lower, transpose trans, diag unit_diag, @@ -2802,10 +1810,6 @@ void trsm_batch( stride_a, b, ldb, stride_b, batch_size); } -template -static inline void rotm(cl::sycl::queue &queue, std::int64_t n, cl::sycl::buffer &x, - std::int64_t incx, cl::sycl::buffer &y, std::int64_t incy, - cl::sycl::buffer ¶m); template <> void rotm(cl::sycl::queue &queue, std::int64_t n, cl::sycl::buffer &x, std::int64_t incx, @@ -2816,10 +1820,6 @@ void rotm(cl::sycl::queue &queue, std::int rotm_postcondition(queue, 
n, x, incx, y, incy, param); } -template -static inline void rotm(cl::sycl::queue &queue, std::int64_t n, cl::sycl::buffer &x, - std::int64_t incx, cl::sycl::buffer &y, std::int64_t incy, - cl::sycl::buffer ¶m); template <> void rotm(cl::sycl::queue &queue, std::int64_t n, cl::sycl::buffer &x, std::int64_t incx, @@ -2830,10 +1830,6 @@ void rotm(cl::sycl::queue &queue, std::int rotm_postcondition(queue, n, x, incx, y, incy, param); } -template -static inline void rotg(cl::sycl::queue &queue, cl::sycl::buffer &a, - cl::sycl::buffer &b, cl::sycl::buffer &c, - cl::sycl::buffer &s); template <> void rotg(cl::sycl::queue &queue, cl::sycl::buffer &a, @@ -2845,10 +1841,6 @@ void rotg(cl::sycl::queue &queue, rotg_postcondition(queue, a, b, c, s); } -template -static inline void rotg(cl::sycl::queue &queue, cl::sycl::buffer &a, - cl::sycl::buffer &b, cl::sycl::buffer &c, - cl::sycl::buffer &s); template <> void rotg(cl::sycl::queue &queue, cl::sycl::buffer &a, @@ -2860,10 +1852,6 @@ void rotg(cl::sycl::queue &queue, rotg_postcondition(queue, a, b, c, s); } -template -static inline void rotg(cl::sycl::queue &queue, cl::sycl::buffer, 1> &a, - cl::sycl::buffer, 1> &b, cl::sycl::buffer &c, - cl::sycl::buffer, 1> &s); template <> void rotg(cl::sycl::queue &queue, cl::sycl::buffer, 1> &a, @@ -2875,11 +1863,6 @@ void rotg(cl::sycl::queue &queue, rotg_postcondition(queue, a, b, c, s); } -template -static inline void rotg(cl::sycl::queue &queue, cl::sycl::buffer, 1> &a, - cl::sycl::buffer, 1> &b, - cl::sycl::buffer &c, - cl::sycl::buffer, 1> &s); template <> void rotg(cl::sycl::queue &queue, cl::sycl::buffer, 1> &a, @@ -2891,11 +1874,6 @@ void rotg(cl::sycl::queue &queue, rotg_postcondition(queue, a, b, c, s); } -template -static inline void sdsdot(cl::sycl::queue &queue, std::int64_t n, float sb, - cl::sycl::buffer &x, std::int64_t incx, - cl::sycl::buffer &y, std::int64_t incy, - cl::sycl::buffer &result); template <> void sdsdot(cl::sycl::queue &queue, std::int64_t n, float sb, 
cl::sycl::buffer &x, std::int64_t incx, @@ -2906,12 +1884,6 @@ void sdsdot(cl::sycl::queue &queue, std::i sdsdot_postcondition(queue, n, sb, x, incx, y, incy, result); } -template -static inline void her2k(cl::sycl::queue &queue, uplo upper_lower, transpose trans, std::int64_t n, - std::int64_t k, std::complex alpha, - cl::sycl::buffer, 1> &a, std::int64_t lda, - cl::sycl::buffer, 1> &b, std::int64_t ldb, float beta, - cl::sycl::buffer, 1> &c, std::int64_t ldc); template <> void her2k( cl::sycl::queue &queue, uplo upper_lower, transpose trans, std::int64_t n, std::int64_t k, @@ -2923,13 +1895,6 @@ void her2k( her2k_postcondition(queue, upper_lower, trans, n, k, alpha, a, lda, b, ldb, beta, c, ldc); } -template -static inline void her2k(cl::sycl::queue &queue, uplo upper_lower, transpose trans, std::int64_t n, - std::int64_t k, std::complex alpha, - cl::sycl::buffer, 1> &a, std::int64_t lda, - cl::sycl::buffer, 1> &b, std::int64_t ldb, - double beta, cl::sycl::buffer, 1> &c, - std::int64_t ldc); template <> void her2k( cl::sycl::queue &queue, uplo upper_lower, transpose trans, std::int64_t n, std::int64_t k, @@ -2941,10 +1906,6 @@ void her2k( her2k_postcondition(queue, upper_lower, trans, n, k, alpha, a, lda, b, ldb, beta, c, ldc); } -template -static inline void dot(cl::sycl::queue &queue, std::int64_t n, cl::sycl::buffer &x, - std::int64_t incx, cl::sycl::buffer &y, std::int64_t incy, - cl::sycl::buffer &result); template <> void dot(cl::sycl::queue &queue, std::int64_t n, cl::sycl::buffer &x, std::int64_t incx, @@ -2955,10 +1916,6 @@ void dot(cl::sycl::queue &queue, std::int6 dot_postcondition(queue, n, x, incx, y, incy, result); } -template -static inline void dot(cl::sycl::queue &queue, std::int64_t n, cl::sycl::buffer &x, - std::int64_t incx, cl::sycl::buffer &y, std::int64_t incy, - cl::sycl::buffer &result); template <> void dot(cl::sycl::queue &queue, std::int64_t n, cl::sycl::buffer &x, std::int64_t incx, @@ -2969,10 +1926,6 @@ void dot(cl::sycl::queue 
&queue, std::int6 dot_postcondition(queue, n, x, incx, y, incy, result); } -template -static inline void dot(cl::sycl::queue &queue, std::int64_t n, cl::sycl::buffer &x, - std::int64_t incx, cl::sycl::buffer &y, std::int64_t incy, - cl::sycl::buffer &result); template <> void dot(cl::sycl::queue &queue, std::int64_t n, cl::sycl::buffer &x, std::int64_t incx, @@ -2983,11 +1936,6 @@ void dot(cl::sycl::queue &queue, std::int6 dot_postcondition(queue, n, x, incx, y, incy, result); } -template -static inline void symv(cl::sycl::queue &queue, uplo upper_lower, std::int64_t n, float alpha, - cl::sycl::buffer &a, std::int64_t lda, - cl::sycl::buffer &x, std::int64_t incx, float beta, - cl::sycl::buffer &y, std::int64_t incy); template <> void symv(cl::sycl::queue &queue, uplo upper_lower, std::int64_t n, float alpha, @@ -3000,11 +1948,6 @@ void symv(cl::sycl::queue &queue, uplo upp symv_postcondition(queue, upper_lower, n, alpha, a, lda, x, incx, beta, y, incy); } -template -static inline void symv(cl::sycl::queue &queue, uplo upper_lower, std::int64_t n, double alpha, - cl::sycl::buffer &a, std::int64_t lda, - cl::sycl::buffer &x, std::int64_t incx, double beta, - cl::sycl::buffer &y, std::int64_t incy); template <> void symv(cl::sycl::queue &queue, uplo upper_lower, std::int64_t n, double alpha, @@ -3017,6 +1960,2068 @@ void symv(cl::sycl::queue &queue, uplo upp symv_postcondition(queue, upper_lower, n, alpha, a, lda, x, incx, beta, y, incy); } +// USM APIs + +template <> +cl::sycl::event syr2( + cl::sycl::queue &queue, uplo upper_lower, std::int64_t n, float alpha, const float *x, + std::int64_t incx, const float *y, std::int64_t incy, float *a, std::int64_t lda, + const cl::sycl::vector_class &dependencies) { + syr2_precondition(queue, upper_lower, n, alpha, x, incx, y, incy, a, lda, dependencies); + auto done = + onemkl::mklcpu::syr2(queue, upper_lower, n, alpha, x, incx, y, incy, a, lda, dependencies); + syr2_postcondition(queue, upper_lower, n, alpha, x, incx, y, 
incy, a, lda, dependencies); + return done; +} + +template <> +cl::sycl::event syr2( + cl::sycl::queue &queue, uplo upper_lower, std::int64_t n, double alpha, const double *x, + std::int64_t incx, const double *y, std::int64_t incy, double *a, std::int64_t lda, + const cl::sycl::vector_class &dependencies) { + syr2_precondition(queue, upper_lower, n, alpha, x, incx, y, incy, a, lda, dependencies); + auto done = + onemkl::mklcpu::syr2(queue, upper_lower, n, alpha, x, incx, y, incy, a, lda, dependencies); + syr2_postcondition(queue, upper_lower, n, alpha, x, incx, y, incy, a, lda, dependencies); + return done; +} + +template <> +cl::sycl::event scal( + cl::sycl::queue &queue, std::int64_t n, float alpha, float *x, std::int64_t incx, + const cl::sycl::vector_class &dependencies) { + scal_precondition(queue, n, alpha, x, incx, dependencies); + auto done = onemkl::mklcpu::scal(queue, n, alpha, x, incx, dependencies); + scal_postcondition(queue, n, alpha, x, incx, dependencies); + return done; +} + +template <> +cl::sycl::event scal( + cl::sycl::queue &queue, std::int64_t n, double alpha, double *x, std::int64_t incx, + const cl::sycl::vector_class &dependencies) { + scal_precondition(queue, n, alpha, x, incx, dependencies); + auto done = onemkl::mklcpu::scal(queue, n, alpha, x, incx, dependencies); + scal_postcondition(queue, n, alpha, x, incx, dependencies); + return done; +} + +template <> +cl::sycl::event scal( + cl::sycl::queue &queue, std::int64_t n, std::complex alpha, std::complex *x, + std::int64_t incx, const cl::sycl::vector_class &dependencies) { + scal_precondition(queue, n, alpha, x, incx, dependencies); + auto done = onemkl::mklcpu::scal(queue, n, alpha, x, incx, dependencies); + scal_postcondition(queue, n, alpha, x, incx, dependencies); + return done; +} + +template <> +cl::sycl::event scal( + cl::sycl::queue &queue, std::int64_t n, std::complex alpha, std::complex *x, + std::int64_t incx, const cl::sycl::vector_class &dependencies) { + 
scal_precondition(queue, n, alpha, x, incx, dependencies); + auto done = onemkl::mklcpu::scal(queue, n, alpha, x, incx, dependencies); + scal_postcondition(queue, n, alpha, x, incx, dependencies); + return done; +} + +template <> +cl::sycl::event scal( + cl::sycl::queue &queue, std::int64_t n, float alpha, std::complex *x, std::int64_t incx, + const cl::sycl::vector_class &dependencies) { + scal_precondition(queue, n, alpha, x, incx, dependencies); + auto done = onemkl::mklcpu::scal(queue, n, alpha, x, incx, dependencies); + scal_postcondition(queue, n, alpha, x, incx, dependencies); + return done; +} + +template <> +cl::sycl::event scal( + cl::sycl::queue &queue, std::int64_t n, double alpha, std::complex *x, + std::int64_t incx, const cl::sycl::vector_class &dependencies) { + scal_precondition(queue, n, alpha, x, incx, dependencies); + auto done = onemkl::mklcpu::scal(queue, n, alpha, x, incx, dependencies); + scal_postcondition(queue, n, alpha, x, incx, dependencies); + return done; +} + +template <> +cl::sycl::event trmv( + cl::sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag, std::int64_t n, + const float *a, std::int64_t lda, float *x, std::int64_t incx, + const cl::sycl::vector_class &dependencies) { + trmv_precondition(queue, upper_lower, trans, unit_diag, n, a, lda, x, incx, dependencies); + auto done = onemkl::mklcpu::trmv(queue, upper_lower, trans, unit_diag, n, a, lda, x, incx, + dependencies); + trmv_postcondition(queue, upper_lower, trans, unit_diag, n, a, lda, x, incx, dependencies); + return done; +} + +template <> +cl::sycl::event trmv( + cl::sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag, std::int64_t n, + const double *a, std::int64_t lda, double *x, std::int64_t incx, + const cl::sycl::vector_class &dependencies) { + trmv_precondition(queue, upper_lower, trans, unit_diag, n, a, lda, x, incx, dependencies); + auto done = onemkl::mklcpu::trmv(queue, upper_lower, trans, unit_diag, n, a, lda, x, incx, + 
dependencies); + trmv_postcondition(queue, upper_lower, trans, unit_diag, n, a, lda, x, incx, dependencies); + return done; +} + +template <> +cl::sycl::event trmv( + cl::sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag, std::int64_t n, + const std::complex *a, std::int64_t lda, std::complex *x, std::int64_t incx, + const cl::sycl::vector_class &dependencies) { + trmv_precondition(queue, upper_lower, trans, unit_diag, n, a, lda, x, incx, dependencies); + auto done = onemkl::mklcpu::trmv(queue, upper_lower, trans, unit_diag, n, a, lda, x, incx, + dependencies); + trmv_postcondition(queue, upper_lower, trans, unit_diag, n, a, lda, x, incx, dependencies); + return done; +} + +template <> +cl::sycl::event trmv( + cl::sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag, std::int64_t n, + const std::complex *a, std::int64_t lda, std::complex *x, std::int64_t incx, + const cl::sycl::vector_class &dependencies) { + trmv_precondition(queue, upper_lower, trans, unit_diag, n, a, lda, x, incx, dependencies); + auto done = onemkl::mklcpu::trmv(queue, upper_lower, trans, unit_diag, n, a, lda, x, incx, + dependencies); + trmv_postcondition(queue, upper_lower, trans, unit_diag, n, a, lda, x, incx, dependencies); + return done; +} + +template <> +cl::sycl::event tpmv( + cl::sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag, std::int64_t n, + const float *a, float *x, std::int64_t incx, + const cl::sycl::vector_class &dependencies) { + tpmv_precondition(queue, upper_lower, trans, unit_diag, n, a, x, incx, dependencies); + auto done = + onemkl::mklcpu::tpmv(queue, upper_lower, trans, unit_diag, n, a, x, incx, dependencies); + tpmv_postcondition(queue, upper_lower, trans, unit_diag, n, a, x, incx, dependencies); + return done; +} + +template <> +cl::sycl::event tpmv( + cl::sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag, std::int64_t n, + const double *a, double *x, std::int64_t incx, + const 
cl::sycl::vector_class &dependencies) { + tpmv_precondition(queue, upper_lower, trans, unit_diag, n, a, x, incx, dependencies); + auto done = + onemkl::mklcpu::tpmv(queue, upper_lower, trans, unit_diag, n, a, x, incx, dependencies); + tpmv_postcondition(queue, upper_lower, trans, unit_diag, n, a, x, incx, dependencies); + return done; +} + +template <> +cl::sycl::event tpmv( + cl::sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag, std::int64_t n, + const std::complex *a, std::complex *x, std::int64_t incx, + const cl::sycl::vector_class &dependencies) { + tpmv_precondition(queue, upper_lower, trans, unit_diag, n, a, x, incx, dependencies); + auto done = + onemkl::mklcpu::tpmv(queue, upper_lower, trans, unit_diag, n, a, x, incx, dependencies); + tpmv_postcondition(queue, upper_lower, trans, unit_diag, n, a, x, incx, dependencies); + return done; +} + +template <> +cl::sycl::event tpmv( + cl::sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag, std::int64_t n, + const std::complex *a, std::complex *x, std::int64_t incx, + const cl::sycl::vector_class &dependencies) { + tpmv_precondition(queue, upper_lower, trans, unit_diag, n, a, x, incx, dependencies); + auto done = + onemkl::mklcpu::tpmv(queue, upper_lower, trans, unit_diag, n, a, x, incx, dependencies); + tpmv_postcondition(queue, upper_lower, trans, unit_diag, n, a, x, incx, dependencies); + return done; +} + +template <> +cl::sycl::event spr( + cl::sycl::queue &queue, uplo upper_lower, std::int64_t n, float alpha, const float *x, + std::int64_t incx, float *a, const cl::sycl::vector_class &dependencies) { + spr_precondition(queue, upper_lower, n, alpha, x, incx, a, dependencies); + auto done = onemkl::mklcpu::spr(queue, upper_lower, n, alpha, x, incx, a, dependencies); + spr_postcondition(queue, upper_lower, n, alpha, x, incx, a, dependencies); + return done; +} + +template <> +cl::sycl::event spr( + cl::sycl::queue &queue, uplo upper_lower, std::int64_t n, double alpha, 
const double *x, + std::int64_t incx, double *a, const cl::sycl::vector_class &dependencies) { + spr_precondition(queue, upper_lower, n, alpha, x, incx, a, dependencies); + auto done = onemkl::mklcpu::spr(queue, upper_lower, n, alpha, x, incx, a, dependencies); + spr_postcondition(queue, upper_lower, n, alpha, x, incx, a, dependencies); + return done; +} + +template <> +cl::sycl::event hpmv( + cl::sycl::queue &queue, uplo upper_lower, std::int64_t n, std::complex alpha, + const std::complex *a, const std::complex *x, std::int64_t incx, + std::complex beta, std::complex *y, std::int64_t incy, + const cl::sycl::vector_class &dependencies) { + hpmv_precondition(queue, upper_lower, n, alpha, a, x, incx, beta, y, incy, dependencies); + auto done = + onemkl::mklcpu::hpmv(queue, upper_lower, n, alpha, a, x, incx, beta, y, incy, dependencies); + hpmv_postcondition(queue, upper_lower, n, alpha, a, x, incx, beta, y, incy, dependencies); + return done; +} + +template <> +cl::sycl::event hpmv( + cl::sycl::queue &queue, uplo upper_lower, std::int64_t n, std::complex alpha, + const std::complex *a, const std::complex *x, std::int64_t incx, + std::complex beta, std::complex *y, std::int64_t incy, + const cl::sycl::vector_class &dependencies) { + hpmv_precondition(queue, upper_lower, n, alpha, a, x, incx, beta, y, incy, dependencies); + auto done = + onemkl::mklcpu::hpmv(queue, upper_lower, n, alpha, a, x, incx, beta, y, incy, dependencies); + hpmv_postcondition(queue, upper_lower, n, alpha, a, x, incx, beta, y, incy, dependencies); + return done; +} + +template <> +cl::sycl::event syrk( + cl::sycl::queue &queue, uplo upper_lower, transpose trans, std::int64_t n, std::int64_t k, + float alpha, const float *a, std::int64_t lda, float beta, float *c, std::int64_t ldc, + const cl::sycl::vector_class &dependencies) { + syrk_precondition(queue, upper_lower, trans, n, k, alpha, a, lda, beta, c, ldc, dependencies); + auto done = onemkl::mklcpu::syrk(queue, upper_lower, trans, n, k, 
alpha, a, lda, beta, c, ldc, + dependencies); + syrk_postcondition(queue, upper_lower, trans, n, k, alpha, a, lda, beta, c, ldc, dependencies); + return done; +} + +template <> +cl::sycl::event syrk( + cl::sycl::queue &queue, uplo upper_lower, transpose trans, std::int64_t n, std::int64_t k, + double alpha, const double *a, std::int64_t lda, double beta, double *c, std::int64_t ldc, + const cl::sycl::vector_class &dependencies) { + syrk_precondition(queue, upper_lower, trans, n, k, alpha, a, lda, beta, c, ldc, dependencies); + auto done = onemkl::mklcpu::syrk(queue, upper_lower, trans, n, k, alpha, a, lda, beta, c, ldc, + dependencies); + syrk_postcondition(queue, upper_lower, trans, n, k, alpha, a, lda, beta, c, ldc, dependencies); + return done; +} + +template <> +cl::sycl::event syrk( + cl::sycl::queue &queue, uplo upper_lower, transpose trans, std::int64_t n, std::int64_t k, + std::complex alpha, const std::complex *a, std::int64_t lda, + std::complex beta, std::complex *c, std::int64_t ldc, + const cl::sycl::vector_class &dependencies) { + syrk_precondition(queue, upper_lower, trans, n, k, alpha, a, lda, beta, c, ldc, dependencies); + auto done = onemkl::mklcpu::syrk(queue, upper_lower, trans, n, k, alpha, a, lda, beta, c, ldc, + dependencies); + syrk_postcondition(queue, upper_lower, trans, n, k, alpha, a, lda, beta, c, ldc, dependencies); + return done; +} + +template <> +cl::sycl::event syrk( + cl::sycl::queue &queue, uplo upper_lower, transpose trans, std::int64_t n, std::int64_t k, + std::complex alpha, const std::complex *a, std::int64_t lda, + std::complex beta, std::complex *c, std::int64_t ldc, + const cl::sycl::vector_class &dependencies) { + syrk_precondition(queue, upper_lower, trans, n, k, alpha, a, lda, beta, c, ldc, dependencies); + auto done = onemkl::mklcpu::syrk(queue, upper_lower, trans, n, k, alpha, a, lda, beta, c, ldc, + dependencies); + syrk_postcondition(queue, upper_lower, trans, n, k, alpha, a, lda, beta, c, ldc, dependencies); + 
return done; +} + +template <> +cl::sycl::event her2( + cl::sycl::queue &queue, uplo upper_lower, std::int64_t n, std::complex alpha, + const std::complex *x, std::int64_t incx, const std::complex *y, + std::int64_t incy, std::complex *a, std::int64_t lda, + const cl::sycl::vector_class &dependencies) { + her2_precondition(queue, upper_lower, n, alpha, x, incx, y, incy, a, lda, dependencies); + auto done = + onemkl::mklcpu::her2(queue, upper_lower, n, alpha, x, incx, y, incy, a, lda, dependencies); + her2_postcondition(queue, upper_lower, n, alpha, x, incx, y, incy, a, lda, dependencies); + return done; +} + +template <> +cl::sycl::event her2( + cl::sycl::queue &queue, uplo upper_lower, std::int64_t n, std::complex alpha, + const std::complex *x, std::int64_t incx, const std::complex *y, + std::int64_t incy, std::complex *a, std::int64_t lda, + const cl::sycl::vector_class &dependencies) { + her2_precondition(queue, upper_lower, n, alpha, x, incx, y, incy, a, lda, dependencies); + auto done = + onemkl::mklcpu::her2(queue, upper_lower, n, alpha, x, incx, y, incy, a, lda, dependencies); + her2_postcondition(queue, upper_lower, n, alpha, x, incx, y, incy, a, lda, dependencies); + return done; +} + +template <> +cl::sycl::event hbmv( + cl::sycl::queue &queue, uplo upper_lower, std::int64_t n, std::int64_t k, + std::complex alpha, const std::complex *a, std::int64_t lda, + const std::complex *x, std::int64_t incx, std::complex beta, + std::complex *y, std::int64_t incy, + const cl::sycl::vector_class &dependencies) { + hbmv_precondition(queue, upper_lower, n, k, alpha, a, lda, x, incx, beta, y, incy, + dependencies); + auto done = onemkl::mklcpu::hbmv(queue, upper_lower, n, k, alpha, a, lda, x, incx, beta, y, + incy, dependencies); + hbmv_postcondition(queue, upper_lower, n, k, alpha, a, lda, x, incx, beta, y, incy, + dependencies); + return done; +} + +template <> +cl::sycl::event hbmv( + cl::sycl::queue &queue, uplo upper_lower, std::int64_t n, std::int64_t k, + 
std::complex alpha, const std::complex *a, std::int64_t lda, + const std::complex *x, std::int64_t incx, std::complex beta, + std::complex *y, std::int64_t incy, + const cl::sycl::vector_class &dependencies) { + hbmv_precondition(queue, upper_lower, n, k, alpha, a, lda, x, incx, beta, y, incy, + dependencies); + auto done = onemkl::mklcpu::hbmv(queue, upper_lower, n, k, alpha, a, lda, x, incx, beta, y, + incy, dependencies); + hbmv_postcondition(queue, upper_lower, n, k, alpha, a, lda, x, incx, beta, y, incy, + dependencies); + return done; +} + +template <> +cl::sycl::event rot( + cl::sycl::queue &queue, std::int64_t n, std::complex *x, std::int64_t incx, + std::complex *y, std::int64_t incy, float c, float s, + const cl::sycl::vector_class &dependencies) { + rot_precondition(queue, n, x, incx, y, incy, c, s, dependencies); + auto done = onemkl::mklcpu::rot(queue, n, x, incx, y, incy, c, s, dependencies); + rot_postcondition(queue, n, x, incx, y, incy, c, s, dependencies); + return done; +} + +template <> +cl::sycl::event rot( + cl::sycl::queue &queue, std::int64_t n, std::complex *x, std::int64_t incx, + std::complex *y, std::int64_t incy, double c, double s, + const cl::sycl::vector_class &dependencies) { + rot_precondition(queue, n, x, incx, y, incy, c, s, dependencies); + auto done = onemkl::mklcpu::rot(queue, n, x, incx, y, incy, c, s, dependencies); + rot_postcondition(queue, n, x, incx, y, incy, c, s, dependencies); + return done; +} + +template <> +cl::sycl::event rot( + cl::sycl::queue &queue, std::int64_t n, float *x, std::int64_t incx, float *y, + std::int64_t incy, float c, float s, + const cl::sycl::vector_class &dependencies) { + rot_precondition(queue, n, x, incx, y, incy, c, s, dependencies); + auto done = onemkl::mklcpu::rot(queue, n, x, incx, y, incy, c, s, dependencies); + rot_postcondition(queue, n, x, incx, y, incy, c, s, dependencies); + return done; +} + +template <> +cl::sycl::event rot( + cl::sycl::queue &queue, std::int64_t n, double *x, 
std::int64_t incx, double *y, + std::int64_t incy, double c, double s, + const cl::sycl::vector_class &dependencies) { + rot_precondition(queue, n, x, incx, y, incy, c, s, dependencies); + auto done = onemkl::mklcpu::rot(queue, n, x, incx, y, incy, c, s, dependencies); + rot_postcondition(queue, n, x, incx, y, incy, c, s, dependencies); + return done; +} + +template <> +cl::sycl::event axpy( + cl::sycl::queue &queue, std::int64_t n, float alpha, const float *x, std::int64_t incx, + float *y, std::int64_t incy, const cl::sycl::vector_class &dependencies) { + axpy_precondition(queue, n, alpha, x, incx, y, incy, dependencies); + auto done = onemkl::mklcpu::axpy(queue, n, alpha, x, incx, y, incy, dependencies); + axpy_postcondition(queue, n, alpha, x, incx, y, incy, dependencies); + return done; +} + +template <> +cl::sycl::event axpy( + cl::sycl::queue &queue, std::int64_t n, double alpha, const double *x, std::int64_t incx, + double *y, std::int64_t incy, const cl::sycl::vector_class &dependencies) { + axpy_precondition(queue, n, alpha, x, incx, y, incy, dependencies); + auto done = onemkl::mklcpu::axpy(queue, n, alpha, x, incx, y, incy, dependencies); + axpy_postcondition(queue, n, alpha, x, incx, y, incy, dependencies); + return done; +} + +template <> +cl::sycl::event axpy( + cl::sycl::queue &queue, std::int64_t n, std::complex alpha, const std::complex *x, + std::int64_t incx, std::complex *y, std::int64_t incy, + const cl::sycl::vector_class &dependencies) { + axpy_precondition(queue, n, alpha, x, incx, y, incy, dependencies); + auto done = onemkl::mklcpu::axpy(queue, n, alpha, x, incx, y, incy, dependencies); + axpy_postcondition(queue, n, alpha, x, incx, y, incy, dependencies); + return done; +} + +template <> +cl::sycl::event axpy( + cl::sycl::queue &queue, std::int64_t n, std::complex alpha, + const std::complex *x, std::int64_t incx, std::complex *y, std::int64_t incy, + const cl::sycl::vector_class &dependencies) { + axpy_precondition(queue, n, alpha, x, 
incx, y, incy, dependencies); + auto done = onemkl::mklcpu::axpy(queue, n, alpha, x, incx, y, incy, dependencies); + axpy_postcondition(queue, n, alpha, x, incx, y, incy, dependencies); + return done; +} + +template <> +cl::sycl::event axpy_batch( + cl::sycl::queue &queue, std::int64_t *n, float *alpha, const float **x, std::int64_t *incx, + float **y, std::int64_t *incy, std::int64_t group_count, std::int64_t *group_size, + const cl::sycl::vector_class &dependencies) { + axpy_batch_precondition(queue, n, alpha, x, incx, y, incy, group_count, group_size, + dependencies); + auto done = onemkl::mklcpu::axpy_batch(queue, n, alpha, x, incx, y, incy, group_count, + group_size, dependencies); + axpy_batch_postcondition(queue, n, alpha, x, incx, y, incy, group_count, group_size, + dependencies); + return done; +} + +template <> +cl::sycl::event axpy_batch( + cl::sycl::queue &queue, std::int64_t *n, double *alpha, const double **x, std::int64_t *incx, + double **y, std::int64_t *incy, std::int64_t group_count, std::int64_t *group_size, + const cl::sycl::vector_class &dependencies) { + axpy_batch_precondition(queue, n, alpha, x, incx, y, incy, group_count, group_size, + dependencies); + auto done = onemkl::mklcpu::axpy_batch(queue, n, alpha, x, incx, y, incy, group_count, + group_size, dependencies); + axpy_batch_postcondition(queue, n, alpha, x, incx, y, incy, group_count, group_size, + dependencies); + return done; +} + +template <> +cl::sycl::event axpy_batch( + cl::sycl::queue &queue, std::int64_t *n, std::complex *alpha, + const std::complex **x, std::int64_t *incx, std::complex **y, std::int64_t *incy, + std::int64_t group_count, std::int64_t *group_size, + const cl::sycl::vector_class &dependencies) { + axpy_batch_precondition(queue, n, alpha, x, incx, y, incy, group_count, group_size, + dependencies); + auto done = onemkl::mklcpu::axpy_batch(queue, n, alpha, x, incx, y, incy, group_count, + group_size, dependencies); + axpy_batch_postcondition(queue, n, alpha, x, 
incx, y, incy, group_count, group_size, + dependencies); + return done; +} + +template <> +cl::sycl::event axpy_batch( + cl::sycl::queue &queue, std::int64_t *n, std::complex *alpha, + const std::complex **x, std::int64_t *incx, std::complex **y, + std::int64_t *incy, std::int64_t group_count, std::int64_t *group_size, + const cl::sycl::vector_class &dependencies) { + axpy_batch_precondition(queue, n, alpha, x, incx, y, incy, group_count, group_size, + dependencies); + auto done = onemkl::mklcpu::axpy_batch(queue, n, alpha, x, incx, y, incy, group_count, + group_size, dependencies); + axpy_batch_postcondition(queue, n, alpha, x, incx, y, incy, group_count, group_size, + dependencies); + return done; +} + +template <> +cl::sycl::event gerc( + cl::sycl::queue &queue, std::int64_t m, std::int64_t n, std::complex alpha, + const std::complex *x, std::int64_t incx, const std::complex *y, + std::int64_t incy, std::complex *a, std::int64_t lda, + const cl::sycl::vector_class &dependencies) { + gerc_precondition(queue, m, n, alpha, x, incx, y, incy, a, lda, dependencies); + auto done = onemkl::mklcpu::gerc(queue, m, n, alpha, x, incx, y, incy, a, lda, dependencies); + gerc_postcondition(queue, m, n, alpha, x, incx, y, incy, a, lda, dependencies); + return done; +} + +template <> +cl::sycl::event gerc( + cl::sycl::queue &queue, std::int64_t m, std::int64_t n, std::complex alpha, + const std::complex *x, std::int64_t incx, const std::complex *y, + std::int64_t incy, std::complex *a, std::int64_t lda, + const cl::sycl::vector_class &dependencies) { + gerc_precondition(queue, m, n, alpha, x, incx, y, incy, a, lda, dependencies); + auto done = onemkl::mklcpu::gerc(queue, m, n, alpha, x, incx, y, incy, a, lda, dependencies); + gerc_postcondition(queue, m, n, alpha, x, incx, y, incy, a, lda, dependencies); + return done; +} + +template <> +cl::sycl::event syr2k( + cl::sycl::queue &queue, uplo upper_lower, transpose trans, std::int64_t n, std::int64_t k, + float alpha, const float 
*a, std::int64_t lda, const float *b, std::int64_t ldb, float beta, + float *c, std::int64_t ldc, const cl::sycl::vector_class &dependencies) { + syr2k_precondition(queue, upper_lower, trans, n, k, alpha, a, lda, b, ldb, beta, c, ldc, + dependencies); + auto done = onemkl::mklcpu::syr2k(queue, upper_lower, trans, n, k, alpha, a, lda, b, ldb, beta, + c, ldc, dependencies); + syr2k_postcondition(queue, upper_lower, trans, n, k, alpha, a, lda, b, ldb, beta, c, ldc, + dependencies); + return done; +} + +template <> +cl::sycl::event syr2k( + cl::sycl::queue &queue, uplo upper_lower, transpose trans, std::int64_t n, std::int64_t k, + double alpha, const double *a, std::int64_t lda, const double *b, std::int64_t ldb, double beta, + double *c, std::int64_t ldc, const cl::sycl::vector_class &dependencies) { + syr2k_precondition(queue, upper_lower, trans, n, k, alpha, a, lda, b, ldb, beta, c, ldc, + dependencies); + auto done = onemkl::mklcpu::syr2k(queue, upper_lower, trans, n, k, alpha, a, lda, b, ldb, beta, + c, ldc, dependencies); + syr2k_postcondition(queue, upper_lower, trans, n, k, alpha, a, lda, b, ldb, beta, c, ldc, + dependencies); + return done; +} + +template <> +cl::sycl::event syr2k( + cl::sycl::queue &queue, uplo upper_lower, transpose trans, std::int64_t n, std::int64_t k, + std::complex alpha, const std::complex *a, std::int64_t lda, + const std::complex *b, std::int64_t ldb, std::complex beta, + std::complex *c, std::int64_t ldc, + const cl::sycl::vector_class &dependencies) { + syr2k_precondition(queue, upper_lower, trans, n, k, alpha, a, lda, b, ldb, beta, c, ldc, + dependencies); + auto done = onemkl::mklcpu::syr2k(queue, upper_lower, trans, n, k, alpha, a, lda, b, ldb, beta, + c, ldc, dependencies); + syr2k_postcondition(queue, upper_lower, trans, n, k, alpha, a, lda, b, ldb, beta, c, ldc, + dependencies); + return done; +} + +template <> +cl::sycl::event syr2k( + cl::sycl::queue &queue, uplo upper_lower, transpose trans, std::int64_t n, std::int64_t k, 
+ std::complex alpha, const std::complex *a, std::int64_t lda, + const std::complex *b, std::int64_t ldb, std::complex beta, + std::complex *c, std::int64_t ldc, + const cl::sycl::vector_class &dependencies) { + syr2k_precondition(queue, upper_lower, trans, n, k, alpha, a, lda, b, ldb, beta, c, ldc, + dependencies); + auto done = onemkl::mklcpu::syr2k(queue, upper_lower, trans, n, k, alpha, a, lda, b, ldb, beta, + c, ldc, dependencies); + syr2k_postcondition(queue, upper_lower, trans, n, k, alpha, a, lda, b, ldb, beta, c, ldc, + dependencies); + return done; +} + +template <> +cl::sycl::event gemv( + cl::sycl::queue &queue, transpose trans, std::int64_t m, std::int64_t n, float alpha, + const float *a, std::int64_t lda, const float *x, std::int64_t incx, float beta, float *y, + std::int64_t incy, const cl::sycl::vector_class &dependencies) { + gemv_precondition(queue, trans, m, n, alpha, a, lda, x, incx, beta, y, incy, dependencies); + auto done = onemkl::mklcpu::gemv(queue, trans, m, n, alpha, a, lda, x, incx, beta, y, incy, + dependencies); + gemv_postcondition(queue, trans, m, n, alpha, a, lda, x, incx, beta, y, incy, dependencies); + return done; +} + +template <> +cl::sycl::event gemv( + cl::sycl::queue &queue, transpose trans, std::int64_t m, std::int64_t n, double alpha, + const double *a, std::int64_t lda, const double *x, std::int64_t incx, double beta, double *y, + std::int64_t incy, const cl::sycl::vector_class &dependencies) { + gemv_precondition(queue, trans, m, n, alpha, a, lda, x, incx, beta, y, incy, dependencies); + auto done = onemkl::mklcpu::gemv(queue, trans, m, n, alpha, a, lda, x, incx, beta, y, incy, + dependencies); + gemv_postcondition(queue, trans, m, n, alpha, a, lda, x, incx, beta, y, incy, dependencies); + return done; +} + +template <> +cl::sycl::event gemv( + cl::sycl::queue &queue, transpose trans, std::int64_t m, std::int64_t n, + std::complex alpha, const std::complex *a, std::int64_t lda, + const std::complex *x, std::int64_t 
incx, std::complex beta, + std::complex *y, std::int64_t incy, + const cl::sycl::vector_class &dependencies) { + gemv_precondition(queue, trans, m, n, alpha, a, lda, x, incx, beta, y, incy, dependencies); + auto done = onemkl::mklcpu::gemv(queue, trans, m, n, alpha, a, lda, x, incx, beta, y, incy, + dependencies); + gemv_postcondition(queue, trans, m, n, alpha, a, lda, x, incx, beta, y, incy, dependencies); + return done; +} + +template <> +cl::sycl::event gemv( + cl::sycl::queue &queue, transpose trans, std::int64_t m, std::int64_t n, + std::complex alpha, const std::complex *a, std::int64_t lda, + const std::complex *x, std::int64_t incx, std::complex beta, + std::complex *y, std::int64_t incy, + const cl::sycl::vector_class &dependencies) { + gemv_precondition(queue, trans, m, n, alpha, a, lda, x, incx, beta, y, incy, dependencies); + auto done = onemkl::mklcpu::gemv(queue, trans, m, n, alpha, a, lda, x, incx, beta, y, incy, + dependencies); + gemv_postcondition(queue, trans, m, n, alpha, a, lda, x, incx, beta, y, incy, dependencies); + return done; +} + +template <> +cl::sycl::event her( + cl::sycl::queue &queue, uplo upper_lower, std::int64_t n, float alpha, + const std::complex *x, std::int64_t incx, std::complex *a, std::int64_t lda, + const cl::sycl::vector_class &dependencies) { + her_precondition(queue, upper_lower, n, alpha, x, incx, a, lda, dependencies); + auto done = onemkl::mklcpu::her(queue, upper_lower, n, alpha, x, incx, a, lda, dependencies); + her_postcondition(queue, upper_lower, n, alpha, x, incx, a, lda, dependencies); + return done; +} + +template <> +cl::sycl::event her( + cl::sycl::queue &queue, uplo upper_lower, std::int64_t n, double alpha, + const std::complex *x, std::int64_t incx, std::complex *a, std::int64_t lda, + const cl::sycl::vector_class &dependencies) { + her_precondition(queue, upper_lower, n, alpha, x, incx, a, lda, dependencies); + auto done = onemkl::mklcpu::her(queue, upper_lower, n, alpha, x, incx, a, lda, 
dependencies); + her_postcondition(queue, upper_lower, n, alpha, x, incx, a, lda, dependencies); + return done; +} + +template <> +cl::sycl::event hpr( + cl::sycl::queue &queue, uplo upper_lower, std::int64_t n, float alpha, + const std::complex *x, std::int64_t incx, std::complex *a, + const cl::sycl::vector_class &dependencies) { + hpr_precondition(queue, upper_lower, n, alpha, x, incx, a, dependencies); + auto done = onemkl::mklcpu::hpr(queue, upper_lower, n, alpha, x, incx, a, dependencies); + hpr_postcondition(queue, upper_lower, n, alpha, x, incx, a, dependencies); + return done; +} + +template <> +cl::sycl::event hpr( + cl::sycl::queue &queue, uplo upper_lower, std::int64_t n, double alpha, + const std::complex *x, std::int64_t incx, std::complex *a, + const cl::sycl::vector_class &dependencies) { + hpr_precondition(queue, upper_lower, n, alpha, x, incx, a, dependencies); + auto done = onemkl::mklcpu::hpr(queue, upper_lower, n, alpha, x, incx, a, dependencies); + hpr_postcondition(queue, upper_lower, n, alpha, x, incx, a, dependencies); + return done; +} + +template <> +cl::sycl::event iamin( + cl::sycl::queue &queue, std::int64_t n, const float *x, std::int64_t incx, std::int64_t *result, + const cl::sycl::vector_class &dependencies) { + iamin_precondition(queue, n, x, incx, result, dependencies); + auto done = onemkl::mklcpu::iamin(queue, n, x, incx, result, dependencies); + iamin_postcondition(queue, n, x, incx, result, dependencies); + return done; +} + +template <> +cl::sycl::event iamin( + cl::sycl::queue &queue, std::int64_t n, const double *x, std::int64_t incx, + std::int64_t *result, const cl::sycl::vector_class &dependencies) { + iamin_precondition(queue, n, x, incx, result, dependencies); + auto done = onemkl::mklcpu::iamin(queue, n, x, incx, result, dependencies); + iamin_postcondition(queue, n, x, incx, result, dependencies); + return done; +} + +template <> +cl::sycl::event iamin( + cl::sycl::queue &queue, std::int64_t n, const std::complex 
*x, std::int64_t incx, + std::int64_t *result, const cl::sycl::vector_class &dependencies) { + iamin_precondition(queue, n, x, incx, result, dependencies); + auto done = onemkl::mklcpu::iamin(queue, n, x, incx, result, dependencies); + iamin_postcondition(queue, n, x, incx, result, dependencies); + return done; +} + +template <> +cl::sycl::event iamin( + cl::sycl::queue &queue, std::int64_t n, const std::complex *x, std::int64_t incx, + std::int64_t *result, const cl::sycl::vector_class &dependencies) { + iamin_precondition(queue, n, x, incx, result, dependencies); + auto done = onemkl::mklcpu::iamin(queue, n, x, incx, result, dependencies); + iamin_postcondition(queue, n, x, incx, result, dependencies); + return done; +} + +template <> +cl::sycl::event gemm_batch( + cl::sycl::queue &queue, transpose *transa, transpose *transb, std::int64_t *m, std::int64_t *n, + std::int64_t *k, float *alpha, const float **a, std::int64_t *lda, const float **b, + std::int64_t *ldb, float *beta, float **c, std::int64_t *ldc, std::int64_t group_count, + std::int64_t *group_size, const cl::sycl::vector_class &dependencies) { + gemm_batch_precondition(queue, transa, transb, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc, + group_count, group_size, dependencies); + auto done = onemkl::mklcpu::gemm_batch(queue, transa, transb, m, n, k, alpha, a, lda, b, ldb, + beta, c, ldc, group_count, group_size, dependencies); + gemm_batch_postcondition(queue, transa, transb, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc, + group_count, group_size, dependencies); + return done; +} + +template <> +cl::sycl::event gemm_batch( + cl::sycl::queue &queue, transpose *transa, transpose *transb, std::int64_t *m, std::int64_t *n, + std::int64_t *k, double *alpha, const double **a, std::int64_t *lda, const double **b, + std::int64_t *ldb, double *beta, double **c, std::int64_t *ldc, std::int64_t group_count, + std::int64_t *group_size, const cl::sycl::vector_class &dependencies) { + gemm_batch_precondition(queue, 
transa, transb, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc, + group_count, group_size, dependencies); + auto done = onemkl::mklcpu::gemm_batch(queue, transa, transb, m, n, k, alpha, a, lda, b, ldb, + beta, c, ldc, group_count, group_size, dependencies); + gemm_batch_postcondition(queue, transa, transb, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc, + group_count, group_size, dependencies); + return done; +} + +template <> +cl::sycl::event gemm_batch( + cl::sycl::queue &queue, transpose *transa, transpose *transb, std::int64_t *m, std::int64_t *n, + std::int64_t *k, std::complex *alpha, const std::complex **a, std::int64_t *lda, + const std::complex **b, std::int64_t *ldb, std::complex *beta, + std::complex **c, std::int64_t *ldc, std::int64_t group_count, std::int64_t *group_size, + const cl::sycl::vector_class &dependencies) { + gemm_batch_precondition(queue, transa, transb, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc, + group_count, group_size, dependencies); + auto done = onemkl::mklcpu::gemm_batch(queue, transa, transb, m, n, k, alpha, a, lda, b, ldb, + beta, c, ldc, group_count, group_size, dependencies); + gemm_batch_postcondition(queue, transa, transb, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc, + group_count, group_size, dependencies); + return done; +} + +template <> +cl::sycl::event gemm_batch( + cl::sycl::queue &queue, transpose *transa, transpose *transb, std::int64_t *m, std::int64_t *n, + std::int64_t *k, std::complex *alpha, const std::complex **a, std::int64_t *lda, + const std::complex **b, std::int64_t *ldb, std::complex *beta, + std::complex **c, std::int64_t *ldc, std::int64_t group_count, std::int64_t *group_size, + const cl::sycl::vector_class &dependencies) { + gemm_batch_precondition(queue, transa, transb, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc, + group_count, group_size, dependencies); + auto done = onemkl::mklcpu::gemm_batch(queue, transa, transb, m, n, k, alpha, a, lda, b, ldb, + beta, c, ldc, group_count, group_size, 
dependencies); + gemm_batch_postcondition(queue, transa, transb, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc, + group_count, group_size, dependencies); + return done; +} + +template <> +cl::sycl::event gemm_batch( + cl::sycl::queue &queue, transpose transa, transpose transb, std::int64_t m, std::int64_t n, + std::int64_t k, float alpha, const float *a, std::int64_t lda, std::int64_t stride_a, + const float *b, std::int64_t ldb, std::int64_t stride_b, float beta, float *c, std::int64_t ldc, + std::int64_t stride_c, std::int64_t batch_size, + const cl::sycl::vector_class &dependencies) { + gemm_batch_precondition(queue, transa, transb, m, n, k, alpha, a, lda, stride_a, b, ldb, + stride_b, beta, c, ldc, stride_c, batch_size, dependencies); + auto done = + onemkl::mklcpu::gemm_batch(queue, transa, transb, m, n, k, alpha, a, lda, stride_a, b, ldb, + stride_b, beta, c, ldc, stride_c, batch_size, dependencies); + gemm_batch_postcondition(queue, transa, transb, m, n, k, alpha, a, lda, stride_a, b, ldb, + stride_b, beta, c, ldc, stride_c, batch_size, dependencies); + return done; +} + +template <> +cl::sycl::event gemm_batch( + cl::sycl::queue &queue, transpose transa, transpose transb, std::int64_t m, std::int64_t n, + std::int64_t k, double alpha, const double *a, std::int64_t lda, std::int64_t stride_a, + const double *b, std::int64_t ldb, std::int64_t stride_b, double beta, double *c, + std::int64_t ldc, std::int64_t stride_c, std::int64_t batch_size, + const cl::sycl::vector_class &dependencies) { + gemm_batch_precondition(queue, transa, transb, m, n, k, alpha, a, lda, stride_a, b, ldb, + stride_b, beta, c, ldc, stride_c, batch_size, dependencies); + auto done = + onemkl::mklcpu::gemm_batch(queue, transa, transb, m, n, k, alpha, a, lda, stride_a, b, ldb, + stride_b, beta, c, ldc, stride_c, batch_size, dependencies); + gemm_batch_postcondition(queue, transa, transb, m, n, k, alpha, a, lda, stride_a, b, ldb, + stride_b, beta, c, ldc, stride_c, batch_size, dependencies); 
+ return done; +} + +template <> +cl::sycl::event gemm_batch( + cl::sycl::queue &queue, transpose transa, transpose transb, std::int64_t m, std::int64_t n, + std::int64_t k, std::complex alpha, const std::complex *a, std::int64_t lda, + std::int64_t stride_a, const std::complex *b, std::int64_t ldb, std::int64_t stride_b, + std::complex beta, std::complex *c, std::int64_t ldc, std::int64_t stride_c, + std::int64_t batch_size, const cl::sycl::vector_class &dependencies) { + gemm_batch_precondition(queue, transa, transb, m, n, k, alpha, a, lda, stride_a, b, ldb, + stride_b, beta, c, ldc, stride_c, batch_size, dependencies); + auto done = + onemkl::mklcpu::gemm_batch(queue, transa, transb, m, n, k, alpha, a, lda, stride_a, b, ldb, + stride_b, beta, c, ldc, stride_c, batch_size, dependencies); + gemm_batch_postcondition(queue, transa, transb, m, n, k, alpha, a, lda, stride_a, b, ldb, + stride_b, beta, c, ldc, stride_c, batch_size, dependencies); + return done; +} + +template <> +cl::sycl::event gemm_batch( + cl::sycl::queue &queue, transpose transa, transpose transb, std::int64_t m, std::int64_t n, + std::int64_t k, std::complex alpha, const std::complex *a, std::int64_t lda, + std::int64_t stride_a, const std::complex *b, std::int64_t ldb, std::int64_t stride_b, + std::complex beta, std::complex *c, std::int64_t ldc, std::int64_t stride_c, + std::int64_t batch_size, const cl::sycl::vector_class &dependencies) { + gemm_batch_precondition(queue, transa, transb, m, n, k, alpha, a, lda, stride_a, b, ldb, + stride_b, beta, c, ldc, stride_c, batch_size, dependencies); + auto done = + onemkl::mklcpu::gemm_batch(queue, transa, transb, m, n, k, alpha, a, lda, stride_a, b, ldb, + stride_b, beta, c, ldc, stride_c, batch_size, dependencies); + gemm_batch_postcondition(queue, transa, transb, m, n, k, alpha, a, lda, stride_a, b, ldb, + stride_b, beta, c, ldc, stride_c, batch_size, dependencies); + return done; +} + +template <> +cl::sycl::event spmv( + cl::sycl::queue &queue, uplo 
upper_lower, std::int64_t n, float alpha, const float *a, + const float *x, std::int64_t incx, float beta, float *y, std::int64_t incy, + const cl::sycl::vector_class &dependencies) { + spmv_precondition(queue, upper_lower, n, alpha, a, x, incx, beta, y, incy, dependencies); + auto done = + onemkl::mklcpu::spmv(queue, upper_lower, n, alpha, a, x, incx, beta, y, incy, dependencies); + spmv_postcondition(queue, upper_lower, n, alpha, a, x, incx, beta, y, incy, dependencies); + return done; +} + +template <> +cl::sycl::event spmv( + cl::sycl::queue &queue, uplo upper_lower, std::int64_t n, double alpha, const double *a, + const double *x, std::int64_t incx, double beta, double *y, std::int64_t incy, + const cl::sycl::vector_class &dependencies) { + spmv_precondition(queue, upper_lower, n, alpha, a, x, incx, beta, y, incy, dependencies); + auto done = + onemkl::mklcpu::spmv(queue, upper_lower, n, alpha, a, x, incx, beta, y, incy, dependencies); + spmv_postcondition(queue, upper_lower, n, alpha, a, x, incx, beta, y, incy, dependencies); + return done; +} + +template <> +cl::sycl::event swap( + cl::sycl::queue &queue, std::int64_t n, float *x, std::int64_t incx, float *y, + std::int64_t incy, const cl::sycl::vector_class &dependencies) { + swap_precondition(queue, n, x, incx, y, incy, dependencies); + auto done = onemkl::mklcpu::swap(queue, n, x, incx, y, incy, dependencies); + swap_postcondition(queue, n, x, incx, y, incy, dependencies); + return done; +} + +template <> +cl::sycl::event swap( + cl::sycl::queue &queue, std::int64_t n, double *x, std::int64_t incx, double *y, + std::int64_t incy, const cl::sycl::vector_class &dependencies) { + swap_precondition(queue, n, x, incx, y, incy, dependencies); + auto done = onemkl::mklcpu::swap(queue, n, x, incx, y, incy, dependencies); + swap_postcondition(queue, n, x, incx, y, incy, dependencies); + return done; +} + +template <> +cl::sycl::event swap( + cl::sycl::queue &queue, std::int64_t n, std::complex *x, std::int64_t 
incx, + std::complex *y, std::int64_t incy, + const cl::sycl::vector_class &dependencies) { + swap_precondition(queue, n, x, incx, y, incy, dependencies); + auto done = onemkl::mklcpu::swap(queue, n, x, incx, y, incy, dependencies); + swap_postcondition(queue, n, x, incx, y, incy, dependencies); + return done; +} + +template <> +cl::sycl::event swap( + cl::sycl::queue &queue, std::int64_t n, std::complex *x, std::int64_t incx, + std::complex *y, std::int64_t incy, + const cl::sycl::vector_class &dependencies) { + swap_precondition(queue, n, x, incx, y, incy, dependencies); + auto done = onemkl::mklcpu::swap(queue, n, x, incx, y, incy, dependencies); + swap_postcondition(queue, n, x, incx, y, incy, dependencies); + return done; +} + +template <> +cl::sycl::event geru( + cl::sycl::queue &queue, std::int64_t m, std::int64_t n, std::complex alpha, + const std::complex *x, std::int64_t incx, const std::complex *y, + std::int64_t incy, std::complex *a, std::int64_t lda, + const cl::sycl::vector_class &dependencies) { + geru_precondition(queue, m, n, alpha, x, incx, y, incy, a, lda, dependencies); + auto done = onemkl::mklcpu::geru(queue, m, n, alpha, x, incx, y, incy, a, lda, dependencies); + geru_postcondition(queue, m, n, alpha, x, incx, y, incy, a, lda, dependencies); + return done; +} + +template <> +cl::sycl::event geru( + cl::sycl::queue &queue, std::int64_t m, std::int64_t n, std::complex alpha, + const std::complex *x, std::int64_t incx, const std::complex *y, + std::int64_t incy, std::complex *a, std::int64_t lda, + const cl::sycl::vector_class &dependencies) { + geru_precondition(queue, m, n, alpha, x, incx, y, incy, a, lda, dependencies); + auto done = onemkl::mklcpu::geru(queue, m, n, alpha, x, incx, y, incy, a, lda, dependencies); + geru_postcondition(queue, m, n, alpha, x, incx, y, incy, a, lda, dependencies); + return done; +} + +template <> +cl::sycl::event nrm2( + cl::sycl::queue &queue, std::int64_t n, const std::complex *x, std::int64_t incx, + float 
*result, const cl::sycl::vector_class &dependencies) { + nrm2_precondition(queue, n, x, incx, result, dependencies); + auto done = onemkl::mklcpu::nrm2(queue, n, x, incx, result, dependencies); + nrm2_postcondition(queue, n, x, incx, result, dependencies); + return done; +} + +template <> +cl::sycl::event nrm2( + cl::sycl::queue &queue, std::int64_t n, const std::complex *x, std::int64_t incx, + double *result, const cl::sycl::vector_class &dependencies) { + nrm2_precondition(queue, n, x, incx, result, dependencies); + auto done = onemkl::mklcpu::nrm2(queue, n, x, incx, result, dependencies); + nrm2_postcondition(queue, n, x, incx, result, dependencies); + return done; +} + +template <> +cl::sycl::event nrm2( + cl::sycl::queue &queue, std::int64_t n, const float *x, std::int64_t incx, float *result, + const cl::sycl::vector_class &dependencies) { + nrm2_precondition(queue, n, x, incx, result, dependencies); + auto done = onemkl::mklcpu::nrm2(queue, n, x, incx, result, dependencies); + nrm2_postcondition(queue, n, x, incx, result, dependencies); + return done; +} + +template <> +cl::sycl::event nrm2( + cl::sycl::queue &queue, std::int64_t n, const double *x, std::int64_t incx, double *result, + const cl::sycl::vector_class &dependencies) { + nrm2_precondition(queue, n, x, incx, result, dependencies); + auto done = onemkl::mklcpu::nrm2(queue, n, x, incx, result, dependencies); + nrm2_postcondition(queue, n, x, incx, result, dependencies); + return done; +} + +template <> +cl::sycl::event gemm( + cl::sycl::queue &queue, transpose transa, transpose transb, std::int64_t m, std::int64_t n, + std::int64_t k, float alpha, const float *a, std::int64_t lda, const float *b, std::int64_t ldb, + float beta, float *c, std::int64_t ldc, + const cl::sycl::vector_class &dependencies) { + gemm_precondition(queue, transa, transb, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc, + dependencies); + auto done = onemkl::mklcpu::gemm(queue, transa, transb, m, n, k, alpha, a, lda, b, ldb, 
beta, c, + ldc, dependencies); + gemm_postcondition(queue, transa, transb, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc, + dependencies); + return done; +} + +template <> +cl::sycl::event gemm( + cl::sycl::queue &queue, transpose transa, transpose transb, std::int64_t m, std::int64_t n, + std::int64_t k, double alpha, const double *a, std::int64_t lda, const double *b, + std::int64_t ldb, double beta, double *c, std::int64_t ldc, + const cl::sycl::vector_class &dependencies) { + gemm_precondition(queue, transa, transb, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc, + dependencies); + auto done = onemkl::mklcpu::gemm(queue, transa, transb, m, n, k, alpha, a, lda, b, ldb, beta, c, + ldc, dependencies); + gemm_postcondition(queue, transa, transb, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc, + dependencies); + return done; +} + +template <> +cl::sycl::event gemm( + cl::sycl::queue &queue, transpose transa, transpose transb, std::int64_t m, std::int64_t n, + std::int64_t k, std::complex alpha, const std::complex *a, std::int64_t lda, + const std::complex *b, std::int64_t ldb, std::complex beta, + std::complex *c, std::int64_t ldc, + const cl::sycl::vector_class &dependencies) { + gemm_precondition(queue, transa, transb, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc, + dependencies); + auto done = onemkl::mklcpu::gemm(queue, transa, transb, m, n, k, alpha, a, lda, b, ldb, beta, c, + ldc, dependencies); + gemm_postcondition(queue, transa, transb, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc, + dependencies); + return done; +} + +template <> +cl::sycl::event gemm( + cl::sycl::queue &queue, transpose transa, transpose transb, std::int64_t m, std::int64_t n, + std::int64_t k, std::complex alpha, const std::complex *a, std::int64_t lda, + const std::complex *b, std::int64_t ldb, std::complex beta, + std::complex *c, std::int64_t ldc, + const cl::sycl::vector_class &dependencies) { + gemm_precondition(queue, transa, transb, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc, + 
dependencies); + auto done = onemkl::mklcpu::gemm(queue, transa, transb, m, n, k, alpha, a, lda, b, ldb, beta, c, + ldc, dependencies); + gemm_postcondition(queue, transa, transb, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc, + dependencies); + return done; +} + +template <> +cl::sycl::event herk( + cl::sycl::queue &queue, uplo upper_lower, transpose trans, std::int64_t n, std::int64_t k, + float alpha, const std::complex *a, std::int64_t lda, float beta, std::complex *c, + std::int64_t ldc, const cl::sycl::vector_class &dependencies) { + herk_precondition(queue, upper_lower, trans, n, k, alpha, a, lda, beta, c, ldc, dependencies); + auto done = onemkl::mklcpu::herk(queue, upper_lower, trans, n, k, alpha, a, lda, beta, c, ldc, + dependencies); + herk_postcondition(queue, upper_lower, trans, n, k, alpha, a, lda, beta, c, ldc, dependencies); + return done; +} + +template <> +cl::sycl::event herk( + cl::sycl::queue &queue, uplo upper_lower, transpose trans, std::int64_t n, std::int64_t k, + double alpha, const std::complex *a, std::int64_t lda, double beta, + std::complex *c, std::int64_t ldc, + const cl::sycl::vector_class &dependencies) { + herk_precondition(queue, upper_lower, trans, n, k, alpha, a, lda, beta, c, ldc, dependencies); + auto done = onemkl::mklcpu::herk(queue, upper_lower, trans, n, k, alpha, a, lda, beta, c, ldc, + dependencies); + herk_postcondition(queue, upper_lower, trans, n, k, alpha, a, lda, beta, c, ldc, dependencies); + return done; +} + +template <> +cl::sycl::event ger( + cl::sycl::queue &queue, std::int64_t m, std::int64_t n, float alpha, const float *x, + std::int64_t incx, const float *y, std::int64_t incy, float *a, std::int64_t lda, + const cl::sycl::vector_class &dependencies) { + ger_precondition(queue, m, n, alpha, x, incx, y, incy, a, lda, dependencies); + auto done = onemkl::mklcpu::ger(queue, m, n, alpha, x, incx, y, incy, a, lda, dependencies); + ger_postcondition(queue, m, n, alpha, x, incx, y, incy, a, lda, dependencies); + 
return done; +} + +template <> +cl::sycl::event ger( + cl::sycl::queue &queue, std::int64_t m, std::int64_t n, double alpha, const double *x, + std::int64_t incx, const double *y, std::int64_t incy, double *a, std::int64_t lda, + const cl::sycl::vector_class &dependencies) { + ger_precondition(queue, m, n, alpha, x, incx, y, incy, a, lda, dependencies); + auto done = onemkl::mklcpu::ger(queue, m, n, alpha, x, incx, y, incy, a, lda, dependencies); + ger_postcondition(queue, m, n, alpha, x, incx, y, incy, a, lda, dependencies); + return done; +} + +template <> +cl::sycl::event trsm( + cl::sycl::queue &queue, side left_right, uplo upper_lower, transpose trans, diag unit_diag, + std::int64_t m, std::int64_t n, float alpha, const float *a, std::int64_t lda, float *b, + std::int64_t ldb, const cl::sycl::vector_class &dependencies) { + trsm_precondition(queue, left_right, upper_lower, trans, unit_diag, m, n, alpha, a, lda, b, ldb, + dependencies); + auto done = onemkl::mklcpu::trsm(queue, left_right, upper_lower, trans, unit_diag, m, n, alpha, + a, lda, b, ldb, dependencies); + trsm_postcondition(queue, left_right, upper_lower, trans, unit_diag, m, n, alpha, a, lda, b, + ldb, dependencies); + return done; +} + +template <> +cl::sycl::event trsm( + cl::sycl::queue &queue, side left_right, uplo upper_lower, transpose trans, diag unit_diag, + std::int64_t m, std::int64_t n, double alpha, const double *a, std::int64_t lda, double *b, + std::int64_t ldb, const cl::sycl::vector_class &dependencies) { + trsm_precondition(queue, left_right, upper_lower, trans, unit_diag, m, n, alpha, a, lda, b, ldb, + dependencies); + auto done = onemkl::mklcpu::trsm(queue, left_right, upper_lower, trans, unit_diag, m, n, alpha, + a, lda, b, ldb, dependencies); + trsm_postcondition(queue, left_right, upper_lower, trans, unit_diag, m, n, alpha, a, lda, b, + ldb, dependencies); + return done; +} + +template <> +cl::sycl::event trsm( + cl::sycl::queue &queue, side left_right, uplo upper_lower, 
transpose trans, diag unit_diag, + std::int64_t m, std::int64_t n, std::complex alpha, const std::complex *a, + std::int64_t lda, std::complex *b, std::int64_t ldb, + const cl::sycl::vector_class &dependencies) { + trsm_precondition(queue, left_right, upper_lower, trans, unit_diag, m, n, alpha, a, lda, b, ldb, + dependencies); + auto done = onemkl::mklcpu::trsm(queue, left_right, upper_lower, trans, unit_diag, m, n, alpha, + a, lda, b, ldb, dependencies); + trsm_postcondition(queue, left_right, upper_lower, trans, unit_diag, m, n, alpha, a, lda, b, + ldb, dependencies); + return done; +} + +template <> +cl::sycl::event trsm( + cl::sycl::queue &queue, side left_right, uplo upper_lower, transpose trans, diag unit_diag, + std::int64_t m, std::int64_t n, std::complex alpha, const std::complex *a, + std::int64_t lda, std::complex *b, std::int64_t ldb, + const cl::sycl::vector_class &dependencies) { + trsm_precondition(queue, left_right, upper_lower, trans, unit_diag, m, n, alpha, a, lda, b, ldb, + dependencies); + auto done = onemkl::mklcpu::trsm(queue, left_right, upper_lower, trans, unit_diag, m, n, alpha, + a, lda, b, ldb, dependencies); + trsm_postcondition(queue, left_right, upper_lower, trans, unit_diag, m, n, alpha, a, lda, b, + ldb, dependencies); + return done; +} + +template <> +cl::sycl::event dotu( + cl::sycl::queue &queue, std::int64_t n, const std::complex *x, std::int64_t incx, + const std::complex *y, std::int64_t incy, std::complex *result, + const cl::sycl::vector_class &dependencies) { + dotu_precondition(queue, n, x, incx, y, incy, result, dependencies); + auto done = onemkl::mklcpu::dotu(queue, n, x, incx, y, incy, result, dependencies); + dotu_postcondition(queue, n, x, incx, y, incy, result, dependencies); + return done; +} + +template <> +cl::sycl::event dotu( + cl::sycl::queue &queue, std::int64_t n, const std::complex *x, std::int64_t incx, + const std::complex *y, std::int64_t incy, std::complex *result, + const cl::sycl::vector_class 
&dependencies) { + dotu_precondition(queue, n, x, incx, y, incy, result, dependencies); + auto done = onemkl::mklcpu::dotu(queue, n, x, incx, y, incy, result, dependencies); + dotu_postcondition(queue, n, x, incx, y, incy, result, dependencies); + return done; +} + +template <> +cl::sycl::event hemm( + cl::sycl::queue &queue, side left_right, uplo upper_lower, std::int64_t m, std::int64_t n, + std::complex alpha, const std::complex *a, std::int64_t lda, + const std::complex *b, std::int64_t ldb, std::complex beta, + std::complex *c, std::int64_t ldc, + const cl::sycl::vector_class &dependencies) { + hemm_precondition(queue, left_right, upper_lower, m, n, alpha, a, lda, b, ldb, beta, c, ldc, + dependencies); + auto done = onemkl::mklcpu::hemm(queue, left_right, upper_lower, m, n, alpha, a, lda, b, ldb, + beta, c, ldc, dependencies); + hemm_postcondition(queue, left_right, upper_lower, m, n, alpha, a, lda, b, ldb, beta, c, ldc, + dependencies); + return done; +} + +template <> +cl::sycl::event hemm( + cl::sycl::queue &queue, side left_right, uplo upper_lower, std::int64_t m, std::int64_t n, + std::complex alpha, const std::complex *a, std::int64_t lda, + const std::complex *b, std::int64_t ldb, std::complex beta, + std::complex *c, std::int64_t ldc, + const cl::sycl::vector_class &dependencies) { + hemm_precondition(queue, left_right, upper_lower, m, n, alpha, a, lda, b, ldb, beta, c, ldc, + dependencies); + auto done = onemkl::mklcpu::hemm(queue, left_right, upper_lower, m, n, alpha, a, lda, b, ldb, + beta, c, ldc, dependencies); + hemm_postcondition(queue, left_right, upper_lower, m, n, alpha, a, lda, b, ldb, beta, c, ldc, + dependencies); + return done; +} + +template <> +cl::sycl::event hpr2( + cl::sycl::queue &queue, uplo upper_lower, std::int64_t n, std::complex alpha, + const std::complex *x, std::int64_t incx, const std::complex *y, + std::int64_t incy, std::complex *a, + const cl::sycl::vector_class &dependencies) { + hpr2_precondition(queue, upper_lower, n, 
alpha, x, incx, y, incy, a, dependencies); + auto done = + onemkl::mklcpu::hpr2(queue, upper_lower, n, alpha, x, incx, y, incy, a, dependencies); + hpr2_postcondition(queue, upper_lower, n, alpha, x, incx, y, incy, a, dependencies); + return done; +} + +template <> +cl::sycl::event hpr2( + cl::sycl::queue &queue, uplo upper_lower, std::int64_t n, std::complex alpha, + const std::complex *x, std::int64_t incx, const std::complex *y, + std::int64_t incy, std::complex *a, + const cl::sycl::vector_class &dependencies) { + hpr2_precondition(queue, upper_lower, n, alpha, x, incx, y, incy, a, dependencies); + auto done = + onemkl::mklcpu::hpr2(queue, upper_lower, n, alpha, x, incx, y, incy, a, dependencies); + hpr2_postcondition(queue, upper_lower, n, alpha, x, incx, y, incy, a, dependencies); + return done; +} + +template <> +cl::sycl::event gbmv( + cl::sycl::queue &queue, transpose trans, std::int64_t m, std::int64_t n, std::int64_t kl, + std::int64_t ku, float alpha, const float *a, std::int64_t lda, const float *x, + std::int64_t incx, float beta, float *y, std::int64_t incy, + const cl::sycl::vector_class &dependencies) { + gbmv_precondition(queue, trans, m, n, kl, ku, alpha, a, lda, x, incx, beta, y, incy, + dependencies); + auto done = onemkl::mklcpu::gbmv(queue, trans, m, n, kl, ku, alpha, a, lda, x, incx, beta, y, + incy, dependencies); + gbmv_postcondition(queue, trans, m, n, kl, ku, alpha, a, lda, x, incx, beta, y, incy, + dependencies); + return done; +} + +template <> +cl::sycl::event gbmv( + cl::sycl::queue &queue, transpose trans, std::int64_t m, std::int64_t n, std::int64_t kl, + std::int64_t ku, double alpha, const double *a, std::int64_t lda, const double *x, + std::int64_t incx, double beta, double *y, std::int64_t incy, + const cl::sycl::vector_class &dependencies) { + gbmv_precondition(queue, trans, m, n, kl, ku, alpha, a, lda, x, incx, beta, y, incy, + dependencies); + auto done = onemkl::mklcpu::gbmv(queue, trans, m, n, kl, ku, alpha, a, lda, x, 
incx, beta, y, + incy, dependencies); + gbmv_postcondition(queue, trans, m, n, kl, ku, alpha, a, lda, x, incx, beta, y, incy, + dependencies); + return done; +} + +template <> +cl::sycl::event gbmv( + cl::sycl::queue &queue, transpose trans, std::int64_t m, std::int64_t n, std::int64_t kl, + std::int64_t ku, std::complex alpha, const std::complex *a, std::int64_t lda, + const std::complex *x, std::int64_t incx, std::complex beta, + std::complex *y, std::int64_t incy, + const cl::sycl::vector_class &dependencies) { + gbmv_precondition(queue, trans, m, n, kl, ku, alpha, a, lda, x, incx, beta, y, incy, + dependencies); + auto done = onemkl::mklcpu::gbmv(queue, trans, m, n, kl, ku, alpha, a, lda, x, incx, beta, y, + incy, dependencies); + gbmv_postcondition(queue, trans, m, n, kl, ku, alpha, a, lda, x, incx, beta, y, incy, + dependencies); + return done; +} + +template <> +cl::sycl::event gbmv( + cl::sycl::queue &queue, transpose trans, std::int64_t m, std::int64_t n, std::int64_t kl, + std::int64_t ku, std::complex alpha, const std::complex *a, std::int64_t lda, + const std::complex *x, std::int64_t incx, std::complex beta, + std::complex *y, std::int64_t incy, + const cl::sycl::vector_class &dependencies) { + gbmv_precondition(queue, trans, m, n, kl, ku, alpha, a, lda, x, incx, beta, y, incy, + dependencies); + auto done = onemkl::mklcpu::gbmv(queue, trans, m, n, kl, ku, alpha, a, lda, x, incx, beta, y, + incy, dependencies); + gbmv_postcondition(queue, trans, m, n, kl, ku, alpha, a, lda, x, incx, beta, y, incy, + dependencies); + return done; +} + +template <> +cl::sycl::event tbmv( + cl::sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag, std::int64_t n, + std::int64_t k, const float *a, std::int64_t lda, float *x, std::int64_t incx, + const cl::sycl::vector_class &dependencies) { + tbmv_precondition(queue, upper_lower, trans, unit_diag, n, k, a, lda, x, incx, dependencies); + auto done = onemkl::mklcpu::tbmv(queue, upper_lower, trans, unit_diag, 
n, k, a, lda, x, incx, + dependencies); + tbmv_postcondition(queue, upper_lower, trans, unit_diag, n, k, a, lda, x, incx, dependencies); + return done; +} + +template <> +cl::sycl::event tbmv( + cl::sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag, std::int64_t n, + std::int64_t k, const double *a, std::int64_t lda, double *x, std::int64_t incx, + const cl::sycl::vector_class &dependencies) { + tbmv_precondition(queue, upper_lower, trans, unit_diag, n, k, a, lda, x, incx, dependencies); + auto done = onemkl::mklcpu::tbmv(queue, upper_lower, trans, unit_diag, n, k, a, lda, x, incx, + dependencies); + tbmv_postcondition(queue, upper_lower, trans, unit_diag, n, k, a, lda, x, incx, dependencies); + return done; +} + +template <> +cl::sycl::event tbmv( + cl::sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag, std::int64_t n, + std::int64_t k, const std::complex *a, std::int64_t lda, std::complex *x, + std::int64_t incx, const cl::sycl::vector_class &dependencies) { + tbmv_precondition(queue, upper_lower, trans, unit_diag, n, k, a, lda, x, incx, dependencies); + auto done = onemkl::mklcpu::tbmv(queue, upper_lower, trans, unit_diag, n, k, a, lda, x, incx, + dependencies); + tbmv_postcondition(queue, upper_lower, trans, unit_diag, n, k, a, lda, x, incx, dependencies); + return done; +} + +template <> +cl::sycl::event tbmv( + cl::sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag, std::int64_t n, + std::int64_t k, const std::complex *a, std::int64_t lda, std::complex *x, + std::int64_t incx, const cl::sycl::vector_class &dependencies) { + tbmv_precondition(queue, upper_lower, trans, unit_diag, n, k, a, lda, x, incx, dependencies); + auto done = onemkl::mklcpu::tbmv(queue, upper_lower, trans, unit_diag, n, k, a, lda, x, incx, + dependencies); + tbmv_postcondition(queue, upper_lower, trans, unit_diag, n, k, a, lda, x, incx, dependencies); + return done; +} + +template <> +cl::sycl::event symm( + cl::sycl::queue 
&queue, side left_right, uplo upper_lower, std::int64_t m, std::int64_t n, + float alpha, const float *a, std::int64_t lda, const float *b, std::int64_t ldb, float beta, + float *c, std::int64_t ldc, const cl::sycl::vector_class &dependencies) { + symm_precondition(queue, left_right, upper_lower, m, n, alpha, a, lda, b, ldb, beta, c, ldc, + dependencies); + auto done = onemkl::mklcpu::symm(queue, left_right, upper_lower, m, n, alpha, a, lda, b, ldb, + beta, c, ldc, dependencies); + symm_postcondition(queue, left_right, upper_lower, m, n, alpha, a, lda, b, ldb, beta, c, ldc, + dependencies); + return done; +} + +template <> +cl::sycl::event symm( + cl::sycl::queue &queue, side left_right, uplo upper_lower, std::int64_t m, std::int64_t n, + double alpha, const double *a, std::int64_t lda, const double *b, std::int64_t ldb, double beta, + double *c, std::int64_t ldc, const cl::sycl::vector_class &dependencies) { + symm_precondition(queue, left_right, upper_lower, m, n, alpha, a, lda, b, ldb, beta, c, ldc, + dependencies); + auto done = onemkl::mklcpu::symm(queue, left_right, upper_lower, m, n, alpha, a, lda, b, ldb, + beta, c, ldc, dependencies); + symm_postcondition(queue, left_right, upper_lower, m, n, alpha, a, lda, b, ldb, beta, c, ldc, + dependencies); + return done; +} + +template <> +cl::sycl::event symm( + cl::sycl::queue &queue, side left_right, uplo upper_lower, std::int64_t m, std::int64_t n, + std::complex alpha, const std::complex *a, std::int64_t lda, + const std::complex *b, std::int64_t ldb, std::complex beta, + std::complex *c, std::int64_t ldc, + const cl::sycl::vector_class &dependencies) { + symm_precondition(queue, left_right, upper_lower, m, n, alpha, a, lda, b, ldb, beta, c, ldc, + dependencies); + auto done = onemkl::mklcpu::symm(queue, left_right, upper_lower, m, n, alpha, a, lda, b, ldb, + beta, c, ldc, dependencies); + symm_postcondition(queue, left_right, upper_lower, m, n, alpha, a, lda, b, ldb, beta, c, ldc, + dependencies); + return 
done; +} + +template <> +cl::sycl::event symm( + cl::sycl::queue &queue, side left_right, uplo upper_lower, std::int64_t m, std::int64_t n, + std::complex alpha, const std::complex *a, std::int64_t lda, + const std::complex *b, std::int64_t ldb, std::complex beta, + std::complex *c, std::int64_t ldc, + const cl::sycl::vector_class &dependencies) { + symm_precondition(queue, left_right, upper_lower, m, n, alpha, a, lda, b, ldb, beta, c, ldc, + dependencies); + auto done = onemkl::mklcpu::symm(queue, left_right, upper_lower, m, n, alpha, a, lda, b, ldb, + beta, c, ldc, dependencies); + symm_postcondition(queue, left_right, upper_lower, m, n, alpha, a, lda, b, ldb, beta, c, ldc, + dependencies); + return done; +} + +template <> +cl::sycl::event dotc( + cl::sycl::queue &queue, std::int64_t n, const std::complex *x, std::int64_t incx, + const std::complex *y, std::int64_t incy, std::complex *result, + const cl::sycl::vector_class &dependencies) { + dotc_precondition(queue, n, x, incx, y, incy, result, dependencies); + auto done = onemkl::mklcpu::dotc(queue, n, x, incx, y, incy, result, dependencies); + dotc_postcondition(queue, n, x, incx, y, incy, result, dependencies); + return done; +} + +template <> +cl::sycl::event dotc( + cl::sycl::queue &queue, std::int64_t n, const std::complex *x, std::int64_t incx, + const std::complex *y, std::int64_t incy, std::complex *result, + const cl::sycl::vector_class &dependencies) { + dotc_precondition(queue, n, x, incx, y, incy, result, dependencies); + auto done = onemkl::mklcpu::dotc(queue, n, x, incx, y, incy, result, dependencies); + dotc_postcondition(queue, n, x, incx, y, incy, result, dependencies); + return done; +} + +template <> +cl::sycl::event syr( + cl::sycl::queue &queue, uplo upper_lower, std::int64_t n, float alpha, const float *x, + std::int64_t incx, float *a, std::int64_t lda, + const cl::sycl::vector_class &dependencies) { + syr_precondition(queue, upper_lower, n, alpha, x, incx, a, lda, dependencies); + auto 
done = onemkl::mklcpu::syr(queue, upper_lower, n, alpha, x, incx, a, lda, dependencies); + syr_postcondition(queue, upper_lower, n, alpha, x, incx, a, lda, dependencies); + return done; +} + +template <> +cl::sycl::event syr( + cl::sycl::queue &queue, uplo upper_lower, std::int64_t n, double alpha, const double *x, + std::int64_t incx, double *a, std::int64_t lda, + const cl::sycl::vector_class &dependencies) { + syr_precondition(queue, upper_lower, n, alpha, x, incx, a, lda, dependencies); + auto done = onemkl::mklcpu::syr(queue, upper_lower, n, alpha, x, incx, a, lda, dependencies); + syr_postcondition(queue, upper_lower, n, alpha, x, incx, a, lda, dependencies); + return done; +} + +template <> +cl::sycl::event trmm( + cl::sycl::queue &queue, side left_right, uplo upper_lower, transpose trans, diag unit_diag, + std::int64_t m, std::int64_t n, float alpha, const float *a, std::int64_t lda, float *b, + std::int64_t ldb, const cl::sycl::vector_class &dependencies) { + trmm_precondition(queue, left_right, upper_lower, trans, unit_diag, m, n, alpha, a, lda, b, ldb, + dependencies); + auto done = onemkl::mklcpu::trmm(queue, left_right, upper_lower, trans, unit_diag, m, n, alpha, + a, lda, b, ldb, dependencies); + trmm_postcondition(queue, left_right, upper_lower, trans, unit_diag, m, n, alpha, a, lda, b, + ldb, dependencies); + return done; +} + +template <> +cl::sycl::event trmm( + cl::sycl::queue &queue, side left_right, uplo upper_lower, transpose trans, diag unit_diag, + std::int64_t m, std::int64_t n, double alpha, const double *a, std::int64_t lda, double *b, + std::int64_t ldb, const cl::sycl::vector_class &dependencies) { + trmm_precondition(queue, left_right, upper_lower, trans, unit_diag, m, n, alpha, a, lda, b, ldb, + dependencies); + auto done = onemkl::mklcpu::trmm(queue, left_right, upper_lower, trans, unit_diag, m, n, alpha, + a, lda, b, ldb, dependencies); + trmm_postcondition(queue, left_right, upper_lower, trans, unit_diag, m, n, alpha, a, lda, b, + 
ldb, dependencies); + return done; +} + +template <> +cl::sycl::event trmm( + cl::sycl::queue &queue, side left_right, uplo upper_lower, transpose trans, diag unit_diag, + std::int64_t m, std::int64_t n, std::complex alpha, const std::complex *a, + std::int64_t lda, std::complex *b, std::int64_t ldb, + const cl::sycl::vector_class &dependencies) { + trmm_precondition(queue, left_right, upper_lower, trans, unit_diag, m, n, alpha, a, lda, b, ldb, + dependencies); + auto done = onemkl::mklcpu::trmm(queue, left_right, upper_lower, trans, unit_diag, m, n, alpha, + a, lda, b, ldb, dependencies); + trmm_postcondition(queue, left_right, upper_lower, trans, unit_diag, m, n, alpha, a, lda, b, + ldb, dependencies); + return done; +} + +template <> +cl::sycl::event trmm( + cl::sycl::queue &queue, side left_right, uplo upper_lower, transpose trans, diag unit_diag, + std::int64_t m, std::int64_t n, std::complex alpha, const std::complex *a, + std::int64_t lda, std::complex *b, std::int64_t ldb, + const cl::sycl::vector_class &dependencies) { + trmm_precondition(queue, left_right, upper_lower, trans, unit_diag, m, n, alpha, a, lda, b, ldb, + dependencies); + auto done = onemkl::mklcpu::trmm(queue, left_right, upper_lower, trans, unit_diag, m, n, alpha, + a, lda, b, ldb, dependencies); + trmm_postcondition(queue, left_right, upper_lower, trans, unit_diag, m, n, alpha, a, lda, b, + ldb, dependencies); + return done; +} + +template <> +cl::sycl::event rotmg( + cl::sycl::queue &queue, float *d1, float *d2, float *x1, float y1, float *param, + const cl::sycl::vector_class &dependencies) { + rotmg_precondition(queue, d1, d2, x1, y1, param, dependencies); + auto done = onemkl::mklcpu::rotmg(queue, d1, d2, x1, y1, param, dependencies); + rotmg_postcondition(queue, d1, d2, x1, y1, param, dependencies); + return done; +} + +template <> +cl::sycl::event rotmg( + cl::sycl::queue &queue, double *d1, double *d2, double *x1, double y1, double *param, + const cl::sycl::vector_class 
&dependencies) { + rotmg_precondition(queue, d1, d2, x1, y1, param, dependencies); + auto done = onemkl::mklcpu::rotmg(queue, d1, d2, x1, y1, param, dependencies); + rotmg_postcondition(queue, d1, d2, x1, y1, param, dependencies); + return done; +} + +template <> +cl::sycl::event tpsv( + cl::sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag, std::int64_t n, + const float *a, float *x, std::int64_t incx, + const cl::sycl::vector_class &dependencies) { + tpsv_precondition(queue, upper_lower, trans, unit_diag, n, a, x, incx, dependencies); + auto done = + onemkl::mklcpu::tpsv(queue, upper_lower, trans, unit_diag, n, a, x, incx, dependencies); + tpsv_postcondition(queue, upper_lower, trans, unit_diag, n, a, x, incx, dependencies); + return done; +} + +template <> +cl::sycl::event tpsv( + cl::sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag, std::int64_t n, + const double *a, double *x, std::int64_t incx, + const cl::sycl::vector_class &dependencies) { + tpsv_precondition(queue, upper_lower, trans, unit_diag, n, a, x, incx, dependencies); + auto done = + onemkl::mklcpu::tpsv(queue, upper_lower, trans, unit_diag, n, a, x, incx, dependencies); + tpsv_postcondition(queue, upper_lower, trans, unit_diag, n, a, x, incx, dependencies); + return done; +} + +template <> +cl::sycl::event tpsv( + cl::sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag, std::int64_t n, + const std::complex *a, std::complex *x, std::int64_t incx, + const cl::sycl::vector_class &dependencies) { + tpsv_precondition(queue, upper_lower, trans, unit_diag, n, a, x, incx, dependencies); + auto done = + onemkl::mklcpu::tpsv(queue, upper_lower, trans, unit_diag, n, a, x, incx, dependencies); + tpsv_postcondition(queue, upper_lower, trans, unit_diag, n, a, x, incx, dependencies); + return done; +} + +template <> +cl::sycl::event tpsv( + cl::sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag, std::int64_t n, + const std::complex *a, 
std::complex *x, std::int64_t incx, + const cl::sycl::vector_class &dependencies) { + tpsv_precondition(queue, upper_lower, trans, unit_diag, n, a, x, incx, dependencies); + auto done = + onemkl::mklcpu::tpsv(queue, upper_lower, trans, unit_diag, n, a, x, incx, dependencies); + tpsv_postcondition(queue, upper_lower, trans, unit_diag, n, a, x, incx, dependencies); + return done; +} + +template <> +cl::sycl::event trsv( + cl::sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag, std::int64_t n, + const float *a, std::int64_t lda, float *x, std::int64_t incx, + const cl::sycl::vector_class &dependencies) { + trsv_precondition(queue, upper_lower, trans, unit_diag, n, a, lda, x, incx, dependencies); + auto done = onemkl::mklcpu::trsv(queue, upper_lower, trans, unit_diag, n, a, lda, x, incx, + dependencies); + trsv_postcondition(queue, upper_lower, trans, unit_diag, n, a, lda, x, incx, dependencies); + return done; +} + +template <> +cl::sycl::event trsv( + cl::sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag, std::int64_t n, + const double *a, std::int64_t lda, double *x, std::int64_t incx, + const cl::sycl::vector_class &dependencies) { + trsv_precondition(queue, upper_lower, trans, unit_diag, n, a, lda, x, incx, dependencies); + auto done = onemkl::mklcpu::trsv(queue, upper_lower, trans, unit_diag, n, a, lda, x, incx, + dependencies); + trsv_postcondition(queue, upper_lower, trans, unit_diag, n, a, lda, x, incx, dependencies); + return done; +} + +template <> +cl::sycl::event trsv( + cl::sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag, std::int64_t n, + const std::complex *a, std::int64_t lda, std::complex *x, std::int64_t incx, + const cl::sycl::vector_class &dependencies) { + trsv_precondition(queue, upper_lower, trans, unit_diag, n, a, lda, x, incx, dependencies); + auto done = onemkl::mklcpu::trsv(queue, upper_lower, trans, unit_diag, n, a, lda, x, incx, + dependencies); + trsv_postcondition(queue, 
upper_lower, trans, unit_diag, n, a, lda, x, incx, dependencies); + return done; +} + +template <> +cl::sycl::event trsv( + cl::sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag, std::int64_t n, + const std::complex *a, std::int64_t lda, std::complex *x, std::int64_t incx, + const cl::sycl::vector_class &dependencies) { + trsv_precondition(queue, upper_lower, trans, unit_diag, n, a, lda, x, incx, dependencies); + auto done = onemkl::mklcpu::trsv(queue, upper_lower, trans, unit_diag, n, a, lda, x, incx, + dependencies); + trsv_postcondition(queue, upper_lower, trans, unit_diag, n, a, lda, x, incx, dependencies); + return done; +} + +template <> +cl::sycl::event copy( + cl::sycl::queue &queue, std::int64_t n, const float *x, std::int64_t incx, float *y, + std::int64_t incy, const cl::sycl::vector_class &dependencies) { + copy_precondition(queue, n, x, incx, y, incy, dependencies); + auto done = onemkl::mklcpu::copy(queue, n, x, incx, y, incy, dependencies); + copy_postcondition(queue, n, x, incx, y, incy, dependencies); + return done; +} + +template <> +cl::sycl::event copy( + cl::sycl::queue &queue, std::int64_t n, const double *x, std::int64_t incx, double *y, + std::int64_t incy, const cl::sycl::vector_class &dependencies) { + copy_precondition(queue, n, x, incx, y, incy, dependencies); + auto done = onemkl::mklcpu::copy(queue, n, x, incx, y, incy, dependencies); + copy_postcondition(queue, n, x, incx, y, incy, dependencies); + return done; +} + +template <> +cl::sycl::event copy( + cl::sycl::queue &queue, std::int64_t n, const std::complex *x, std::int64_t incx, + std::complex *y, std::int64_t incy, + const cl::sycl::vector_class &dependencies) { + copy_precondition(queue, n, x, incx, y, incy, dependencies); + auto done = onemkl::mklcpu::copy(queue, n, x, incx, y, incy, dependencies); + copy_postcondition(queue, n, x, incx, y, incy, dependencies); + return done; +} + +template <> +cl::sycl::event copy( + cl::sycl::queue &queue, std::int64_t n, 
const std::complex *x, std::int64_t incx, + std::complex *y, std::int64_t incy, + const cl::sycl::vector_class &dependencies) { + copy_precondition(queue, n, x, incx, y, incy, dependencies); + auto done = onemkl::mklcpu::copy(queue, n, x, incx, y, incy, dependencies); + copy_postcondition(queue, n, x, incx, y, incy, dependencies); + return done; +} + +template <> +cl::sycl::event hemv( + cl::sycl::queue &queue, uplo upper_lower, std::int64_t n, std::complex alpha, + const std::complex *a, std::int64_t lda, const std::complex *x, std::int64_t incx, + std::complex beta, std::complex *y, std::int64_t incy, + const cl::sycl::vector_class &dependencies) { + hemv_precondition(queue, upper_lower, n, alpha, a, lda, x, incx, beta, y, incy, dependencies); + auto done = onemkl::mklcpu::hemv(queue, upper_lower, n, alpha, a, lda, x, incx, beta, y, incy, + dependencies); + hemv_postcondition(queue, upper_lower, n, alpha, a, lda, x, incx, beta, y, incy, dependencies); + return done; +} + +template <> +cl::sycl::event hemv( + cl::sycl::queue &queue, uplo upper_lower, std::int64_t n, std::complex alpha, + const std::complex *a, std::int64_t lda, const std::complex *x, + std::int64_t incx, std::complex beta, std::complex *y, std::int64_t incy, + const cl::sycl::vector_class &dependencies) { + hemv_precondition(queue, upper_lower, n, alpha, a, lda, x, incx, beta, y, incy, dependencies); + auto done = onemkl::mklcpu::hemv(queue, upper_lower, n, alpha, a, lda, x, incx, beta, y, incy, + dependencies); + hemv_postcondition(queue, upper_lower, n, alpha, a, lda, x, incx, beta, y, incy, dependencies); + return done; +} + +template <> +cl::sycl::event gemmt( + cl::sycl::queue &queue, uplo upper_lower, transpose transa, transpose transb, std::int64_t n, + std::int64_t k, float alpha, const float *a, std::int64_t lda, const float *b, std::int64_t ldb, + float beta, float *c, std::int64_t ldc, + const cl::sycl::vector_class &dependencies) { + gemmt_precondition(queue, upper_lower, transa, 
transb, n, k, alpha, a, lda, b, ldb, beta, c, + ldc, dependencies); + auto done = onemkl::mklcpu::gemmt(queue, upper_lower, transa, transb, n, k, alpha, a, lda, b, + ldb, beta, c, ldc, dependencies); + gemmt_postcondition(queue, upper_lower, transa, transb, n, k, alpha, a, lda, b, ldb, beta, c, + ldc, dependencies); + return done; +} + +template <> +cl::sycl::event gemmt( + cl::sycl::queue &queue, uplo upper_lower, transpose transa, transpose transb, std::int64_t n, + std::int64_t k, double alpha, const double *a, std::int64_t lda, const double *b, + std::int64_t ldb, double beta, double *c, std::int64_t ldc, + const cl::sycl::vector_class &dependencies) { + gemmt_precondition(queue, upper_lower, transa, transb, n, k, alpha, a, lda, b, ldb, beta, c, + ldc, dependencies); + auto done = onemkl::mklcpu::gemmt(queue, upper_lower, transa, transb, n, k, alpha, a, lda, b, + ldb, beta, c, ldc, dependencies); + gemmt_postcondition(queue, upper_lower, transa, transb, n, k, alpha, a, lda, b, ldb, beta, c, + ldc, dependencies); + return done; +} + +template <> +cl::sycl::event gemmt( + cl::sycl::queue &queue, uplo upper_lower, transpose transa, transpose transb, std::int64_t n, + std::int64_t k, std::complex alpha, const std::complex *a, std::int64_t lda, + const std::complex *b, std::int64_t ldb, std::complex beta, + std::complex *c, std::int64_t ldc, + const cl::sycl::vector_class &dependencies) { + gemmt_precondition(queue, upper_lower, transa, transb, n, k, alpha, a, lda, b, ldb, beta, c, + ldc, dependencies); + auto done = onemkl::mklcpu::gemmt(queue, upper_lower, transa, transb, n, k, alpha, a, lda, b, + ldb, beta, c, ldc, dependencies); + gemmt_postcondition(queue, upper_lower, transa, transb, n, k, alpha, a, lda, b, ldb, beta, c, + ldc, dependencies); + return done; +} + +template <> +cl::sycl::event gemmt( + cl::sycl::queue &queue, uplo upper_lower, transpose transa, transpose transb, std::int64_t n, + std::int64_t k, std::complex alpha, const std::complex *a, 
std::int64_t lda, + const std::complex *b, std::int64_t ldb, std::complex beta, + std::complex *c, std::int64_t ldc, + const cl::sycl::vector_class &dependencies) { + gemmt_precondition(queue, upper_lower, transa, transb, n, k, alpha, a, lda, b, ldb, beta, c, + ldc, dependencies); + auto done = onemkl::mklcpu::gemmt(queue, upper_lower, transa, transb, n, k, alpha, a, lda, b, + ldb, beta, c, ldc, dependencies); + gemmt_postcondition(queue, upper_lower, transa, transb, n, k, alpha, a, lda, b, ldb, beta, c, + ldc, dependencies); + return done; +} + +template <> +cl::sycl::event sbmv( + cl::sycl::queue &queue, uplo upper_lower, std::int64_t n, std::int64_t k, float alpha, + const float *a, std::int64_t lda, const float *x, std::int64_t incx, float beta, float *y, + std::int64_t incy, const cl::sycl::vector_class &dependencies) { + sbmv_precondition(queue, upper_lower, n, k, alpha, a, lda, x, incx, beta, y, incy, + dependencies); + auto done = onemkl::mklcpu::sbmv(queue, upper_lower, n, k, alpha, a, lda, x, incx, beta, y, + incy, dependencies); + sbmv_postcondition(queue, upper_lower, n, k, alpha, a, lda, x, incx, beta, y, incy, + dependencies); + return done; +} + +template <> +cl::sycl::event sbmv( + cl::sycl::queue &queue, uplo upper_lower, std::int64_t n, std::int64_t k, double alpha, + const double *a, std::int64_t lda, const double *x, std::int64_t incx, double beta, double *y, + std::int64_t incy, const cl::sycl::vector_class &dependencies) { + sbmv_precondition(queue, upper_lower, n, k, alpha, a, lda, x, incx, beta, y, incy, + dependencies); + auto done = onemkl::mklcpu::sbmv(queue, upper_lower, n, k, alpha, a, lda, x, incx, beta, y, + incy, dependencies); + sbmv_postcondition(queue, upper_lower, n, k, alpha, a, lda, x, incx, beta, y, incy, + dependencies); + return done; +} + +template <> +cl::sycl::event asum( + cl::sycl::queue &queue, std::int64_t n, const std::complex *x, std::int64_t incx, + float *result, const cl::sycl::vector_class &dependencies) { + 
asum_precondition(queue, n, x, incx, result, dependencies); + auto done = onemkl::mklcpu::asum(queue, n, x, incx, result, dependencies); + asum_postcondition(queue, n, x, incx, result, dependencies); + return done; +} + +template <> +cl::sycl::event asum( + cl::sycl::queue &queue, std::int64_t n, const std::complex *x, std::int64_t incx, + double *result, const cl::sycl::vector_class &dependencies) { + asum_precondition(queue, n, x, incx, result, dependencies); + auto done = onemkl::mklcpu::asum(queue, n, x, incx, result, dependencies); + asum_postcondition(queue, n, x, incx, result, dependencies); + return done; +} + +template <> +cl::sycl::event asum( + cl::sycl::queue &queue, std::int64_t n, const float *x, std::int64_t incx, float *result, + const cl::sycl::vector_class &dependencies) { + asum_precondition(queue, n, x, incx, result, dependencies); + auto done = onemkl::mklcpu::asum(queue, n, x, incx, result, dependencies); + asum_postcondition(queue, n, x, incx, result, dependencies); + return done; +} + +template <> +cl::sycl::event asum( + cl::sycl::queue &queue, std::int64_t n, const double *x, std::int64_t incx, double *result, + const cl::sycl::vector_class &dependencies) { + asum_precondition(queue, n, x, incx, result, dependencies); + auto done = onemkl::mklcpu::asum(queue, n, x, incx, result, dependencies); + asum_postcondition(queue, n, x, incx, result, dependencies); + return done; +} + +template <> +cl::sycl::event tbsv( + cl::sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag, std::int64_t n, + std::int64_t k, const float *a, std::int64_t lda, float *x, std::int64_t incx, + const cl::sycl::vector_class &dependencies) { + tbsv_precondition(queue, upper_lower, trans, unit_diag, n, k, a, lda, x, incx, dependencies); + auto done = onemkl::mklcpu::tbsv(queue, upper_lower, trans, unit_diag, n, k, a, lda, x, incx, + dependencies); + tbsv_postcondition(queue, upper_lower, trans, unit_diag, n, k, a, lda, x, incx, dependencies); + return 
done; +} + +template <> +cl::sycl::event tbsv( + cl::sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag, std::int64_t n, + std::int64_t k, const double *a, std::int64_t lda, double *x, std::int64_t incx, + const cl::sycl::vector_class &dependencies) { + tbsv_precondition(queue, upper_lower, trans, unit_diag, n, k, a, lda, x, incx, dependencies); + auto done = onemkl::mklcpu::tbsv(queue, upper_lower, trans, unit_diag, n, k, a, lda, x, incx, + dependencies); + tbsv_postcondition(queue, upper_lower, trans, unit_diag, n, k, a, lda, x, incx, dependencies); + return done; +} + +template <> +cl::sycl::event tbsv( + cl::sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag, std::int64_t n, + std::int64_t k, const std::complex *a, std::int64_t lda, std::complex *x, + std::int64_t incx, const cl::sycl::vector_class &dependencies) { + tbsv_precondition(queue, upper_lower, trans, unit_diag, n, k, a, lda, x, incx, dependencies); + auto done = onemkl::mklcpu::tbsv(queue, upper_lower, trans, unit_diag, n, k, a, lda, x, incx, + dependencies); + tbsv_postcondition(queue, upper_lower, trans, unit_diag, n, k, a, lda, x, incx, dependencies); + return done; +} + +template <> +cl::sycl::event tbsv( + cl::sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag, std::int64_t n, + std::int64_t k, const std::complex *a, std::int64_t lda, std::complex *x, + std::int64_t incx, const cl::sycl::vector_class &dependencies) { + tbsv_precondition(queue, upper_lower, trans, unit_diag, n, k, a, lda, x, incx, dependencies); + auto done = onemkl::mklcpu::tbsv(queue, upper_lower, trans, unit_diag, n, k, a, lda, x, incx, + dependencies); + tbsv_postcondition(queue, upper_lower, trans, unit_diag, n, k, a, lda, x, incx, dependencies); + return done; +} + +template <> +cl::sycl::event spr2( + cl::sycl::queue &queue, uplo upper_lower, std::int64_t n, float alpha, const float *x, + std::int64_t incx, const float *y, std::int64_t incy, float *a, + const 
cl::sycl::vector_class &dependencies) { + spr2_precondition(queue, upper_lower, n, alpha, x, incx, y, incy, a, dependencies); + auto done = + onemkl::mklcpu::spr2(queue, upper_lower, n, alpha, x, incx, y, incy, a, dependencies); + spr2_postcondition(queue, upper_lower, n, alpha, x, incx, y, incy, a, dependencies); + return done; +} + +template <> +cl::sycl::event spr2( + cl::sycl::queue &queue, uplo upper_lower, std::int64_t n, double alpha, const double *x, + std::int64_t incx, const double *y, std::int64_t incy, double *a, + const cl::sycl::vector_class &dependencies) { + spr2_precondition(queue, upper_lower, n, alpha, x, incx, y, incy, a, dependencies); + auto done = + onemkl::mklcpu::spr2(queue, upper_lower, n, alpha, x, incx, y, incy, a, dependencies); + spr2_postcondition(queue, upper_lower, n, alpha, x, incx, y, incy, a, dependencies); + return done; +} + +template <> +cl::sycl::event iamax( + cl::sycl::queue &queue, std::int64_t n, const float *x, std::int64_t incx, std::int64_t *result, + const cl::sycl::vector_class &dependencies) { + iamax_precondition(queue, n, x, incx, result, dependencies); + auto done = onemkl::mklcpu::iamax(queue, n, x, incx, result, dependencies); + iamax_postcondition(queue, n, x, incx, result, dependencies); + return done; +} + +template <> +cl::sycl::event iamax( + cl::sycl::queue &queue, std::int64_t n, const double *x, std::int64_t incx, + std::int64_t *result, const cl::sycl::vector_class &dependencies) { + iamax_precondition(queue, n, x, incx, result, dependencies); + auto done = onemkl::mklcpu::iamax(queue, n, x, incx, result, dependencies); + iamax_postcondition(queue, n, x, incx, result, dependencies); + return done; +} + +template <> +cl::sycl::event iamax( + cl::sycl::queue &queue, std::int64_t n, const std::complex *x, std::int64_t incx, + std::int64_t *result, const cl::sycl::vector_class &dependencies) { + iamax_precondition(queue, n, x, incx, result, dependencies); + auto done = onemkl::mklcpu::iamax(queue, n, x, 
incx, result, dependencies); + iamax_postcondition(queue, n, x, incx, result, dependencies); + return done; +} + +template <> +cl::sycl::event iamax( + cl::sycl::queue &queue, std::int64_t n, const std::complex *x, std::int64_t incx, + std::int64_t *result, const cl::sycl::vector_class &dependencies) { + iamax_precondition(queue, n, x, incx, result, dependencies); + auto done = onemkl::mklcpu::iamax(queue, n, x, incx, result, dependencies); + iamax_postcondition(queue, n, x, incx, result, dependencies); + return done; +} + +template <> +cl::sycl::event rotm( + cl::sycl::queue &queue, std::int64_t n, float *x, std::int64_t incx, float *y, + std::int64_t incy, float *param, const cl::sycl::vector_class &dependencies) { + rotm_precondition(queue, n, x, incx, y, incy, param, dependencies); + auto done = onemkl::mklcpu::rotm(queue, n, x, incx, y, incy, param, dependencies); + rotm_postcondition(queue, n, x, incx, y, incy, param, dependencies); + return done; +} + +template <> +cl::sycl::event rotm( + cl::sycl::queue &queue, std::int64_t n, double *x, std::int64_t incx, double *y, + std::int64_t incy, double *param, const cl::sycl::vector_class &dependencies) { + rotm_precondition(queue, n, x, incx, y, incy, param, dependencies); + auto done = onemkl::mklcpu::rotm(queue, n, x, incx, y, incy, param, dependencies); + rotm_postcondition(queue, n, x, incx, y, incy, param, dependencies); + return done; +} + +template <> +cl::sycl::event rotg( + cl::sycl::queue &queue, float *a, float *b, float *c, float *s, + const cl::sycl::vector_class &dependencies) { + rotg_precondition(queue, a, b, c, s, dependencies); + auto done = onemkl::mklcpu::rotg(queue, a, b, c, s, dependencies); + rotg_postcondition(queue, a, b, c, s, dependencies); + return done; +} + +template <> +cl::sycl::event rotg( + cl::sycl::queue &queue, double *a, double *b, double *c, double *s, + const cl::sycl::vector_class &dependencies) { + rotg_precondition(queue, a, b, c, s, dependencies); + auto done = 
onemkl::mklcpu::rotg(queue, a, b, c, s, dependencies); + rotg_postcondition(queue, a, b, c, s, dependencies); + return done; +} + +template <> +cl::sycl::event rotg( + cl::sycl::queue &queue, std::complex *a, std::complex *b, float *c, + std::complex *s, const cl::sycl::vector_class &dependencies) { + rotg_precondition(queue, a, b, c, s, dependencies); + auto done = onemkl::mklcpu::rotg(queue, a, b, c, s, dependencies); + rotg_postcondition(queue, a, b, c, s, dependencies); + return done; +} + +template <> +cl::sycl::event rotg( + cl::sycl::queue &queue, std::complex *a, std::complex *b, double *c, + std::complex *s, const cl::sycl::vector_class &dependencies) { + rotg_precondition(queue, a, b, c, s, dependencies); + auto done = onemkl::mklcpu::rotg(queue, a, b, c, s, dependencies); + rotg_postcondition(queue, a, b, c, s, dependencies); + return done; +} + +template <> +cl::sycl::event sdsdot( + cl::sycl::queue &queue, std::int64_t n, float sb, const float *x, std::int64_t incx, + const float *y, std::int64_t incy, float *result, + const cl::sycl::vector_class &dependencies) { + sdsdot_precondition(queue, n, sb, x, incx, y, incy, result, dependencies); + auto done = onemkl::mklcpu::sdsdot(queue, n, sb, x, incx, y, incy, result, dependencies); + sdsdot_postcondition(queue, n, sb, x, incx, y, incy, result, dependencies); + return done; +} + +template <> +cl::sycl::event her2k( + cl::sycl::queue &queue, uplo upper_lower, transpose trans, std::int64_t n, std::int64_t k, + std::complex alpha, const std::complex *a, std::int64_t lda, + const std::complex *b, std::int64_t ldb, float beta, std::complex *c, + std::int64_t ldc, const cl::sycl::vector_class &dependencies) { + her2k_precondition(queue, upper_lower, trans, n, k, alpha, a, lda, b, ldb, beta, c, ldc, + dependencies); + auto done = onemkl::mklcpu::her2k(queue, upper_lower, trans, n, k, alpha, a, lda, b, ldb, beta, + c, ldc, dependencies); + her2k_postcondition(queue, upper_lower, trans, n, k, alpha, a, lda, b, 
ldb, beta, c, ldc, + dependencies); + return done; +} + +template <> +cl::sycl::event her2k( + cl::sycl::queue &queue, uplo upper_lower, transpose trans, std::int64_t n, std::int64_t k, + std::complex alpha, const std::complex *a, std::int64_t lda, + const std::complex *b, std::int64_t ldb, double beta, std::complex *c, + std::int64_t ldc, const cl::sycl::vector_class &dependencies) { + her2k_precondition(queue, upper_lower, trans, n, k, alpha, a, lda, b, ldb, beta, c, ldc, + dependencies); + auto done = onemkl::mklcpu::her2k(queue, upper_lower, trans, n, k, alpha, a, lda, b, ldb, beta, + c, ldc, dependencies); + her2k_postcondition(queue, upper_lower, trans, n, k, alpha, a, lda, b, ldb, beta, c, ldc, + dependencies); + return done; +} + +template <> +cl::sycl::event dot( + cl::sycl::queue &queue, std::int64_t n, const float *x, std::int64_t incx, const float *y, + std::int64_t incy, float *result, const cl::sycl::vector_class &dependencies) { + dot_precondition(queue, n, x, incx, y, incy, result, dependencies); + auto done = onemkl::mklcpu::dot(queue, n, x, incx, y, incy, result, dependencies); + dot_postcondition(queue, n, x, incx, y, incy, result, dependencies); + return done; +} + +template <> +cl::sycl::event dot( + cl::sycl::queue &queue, std::int64_t n, const double *x, std::int64_t incx, const double *y, + std::int64_t incy, double *result, + const cl::sycl::vector_class &dependencies) { + dot_precondition(queue, n, x, incx, y, incy, result, dependencies); + auto done = onemkl::mklcpu::dot(queue, n, x, incx, y, incy, result, dependencies); + dot_postcondition(queue, n, x, incx, y, incy, result, dependencies); + return done; +} + +template <> +cl::sycl::event dot( + cl::sycl::queue &queue, std::int64_t n, const float *x, std::int64_t incx, const float *y, + std::int64_t incy, double *result, + const cl::sycl::vector_class &dependencies) { + dot_precondition(queue, n, x, incx, y, incy, result, dependencies); + auto done = onemkl::mklcpu::dot(queue, n, x, 
incx, y, incy, result, dependencies); + dot_postcondition(queue, n, x, incx, y, incy, result, dependencies); + return done; +} + +template <> +cl::sycl::event symv( + cl::sycl::queue &queue, uplo upper_lower, std::int64_t n, float alpha, const float *a, + std::int64_t lda, const float *x, std::int64_t incx, float beta, float *y, std::int64_t incy, + const cl::sycl::vector_class &dependencies) { + symv_precondition(queue, upper_lower, n, alpha, a, lda, x, incx, beta, y, incy, dependencies); + auto done = onemkl::mklcpu::symv(queue, upper_lower, n, alpha, a, lda, x, incx, beta, y, incy, + dependencies); + symv_postcondition(queue, upper_lower, n, alpha, a, lda, x, incx, beta, y, incy, dependencies); + return done; +} + +template <> +cl::sycl::event symv( + cl::sycl::queue &queue, uplo upper_lower, std::int64_t n, double alpha, const double *a, + std::int64_t lda, const double *x, std::int64_t incx, double beta, double *y, std::int64_t incy, + const cl::sycl::vector_class &dependencies) { + symv_precondition(queue, upper_lower, n, alpha, a, lda, x, incx, beta, y, incy, dependencies); + auto done = onemkl::mklcpu::symv(queue, upper_lower, n, alpha, a, lda, x, incx, beta, y, incy, + dependencies); + symv_postcondition(queue, upper_lower, n, alpha, a, lda, x, incx, beta, y, incy, dependencies); + return done; +} + } //namespace blas } //namespace onemkl diff --git a/include/onemkl/blas/detail/mklcpu/onemkl_blas_mklcpu.hpp b/include/onemkl/blas/detail/mklcpu/onemkl_blas_mklcpu.hpp index a64df93f4..b163cb144 100644 --- a/include/onemkl/blas/detail/mklcpu/onemkl_blas_mklcpu.hpp +++ b/include/onemkl/blas/detail/mklcpu/onemkl_blas_mklcpu.hpp @@ -38,7 +38,7 @@ using onemkl::offset; namespace mklcpu { -// Level 1 +// Buffer APIs ONEMKL_EXPORT void asum(cl::sycl::queue &queue, std::int64_t n, cl::sycl::buffer, 1> &x, std::int64_t incx, @@ -246,8 +246,6 @@ ONEMKL_EXPORT void swap(cl::sycl::queue &queue, std::int64_t n, cl::sycl::buffer, 1> &x, std::int64_t incx, cl::sycl::buffer, 
1> &y, std::int64_t incy); -// Level 2 - ONEMKL_EXPORT void gbmv(cl::sycl::queue &queue, transpose trans, std::int64_t m, std::int64_t n, std::int64_t kl, std::int64_t ku, float alpha, cl::sycl::buffer &a, std::int64_t lda, @@ -574,8 +572,6 @@ ONEMKL_EXPORT void trsv(cl::sycl::queue &queue, uplo upper_lower, transpose tran std::int64_t lda, cl::sycl::buffer, 1> &x, std::int64_t incx); -// Level 3 - ONEMKL_EXPORT void gemm(cl::sycl::queue &queue, transpose transa, transpose transb, std::int64_t m, std::int64_t n, std::int64_t k, float alpha, cl::sycl::buffer &a, std::int64_t lda, cl::sycl::buffer &b, std::int64_t ldb, @@ -753,50 +749,6 @@ ONEMKL_EXPORT void trsm(cl::sycl::queue &queue, side left_right, uplo upper_lowe cl::sycl::buffer, 1> &a, std::int64_t lda, cl::sycl::buffer, 1> &b, std::int64_t ldb); -// Batch API - -ONEMKL_EXPORT void gemm_batch(cl::sycl::queue &queue, cl::sycl::buffer &transa, - cl::sycl::buffer &transb, - cl::sycl::buffer &m, - cl::sycl::buffer &n, - cl::sycl::buffer &k, - cl::sycl::buffer &alpha, cl::sycl::buffer &a, - cl::sycl::buffer &lda, cl::sycl::buffer &b, - cl::sycl::buffer &ldb, - cl::sycl::buffer &beta, cl::sycl::buffer &c, - cl::sycl::buffer &ldc, std::int64_t group_count, - cl::sycl::buffer &group_size); - -ONEMKL_EXPORT void gemm_batch( - cl::sycl::queue &queue, cl::sycl::buffer &transa, - cl::sycl::buffer &transb, cl::sycl::buffer &m, - cl::sycl::buffer &n, cl::sycl::buffer &k, - cl::sycl::buffer &alpha, cl::sycl::buffer &a, - cl::sycl::buffer &lda, cl::sycl::buffer &b, - cl::sycl::buffer &ldb, cl::sycl::buffer &beta, - cl::sycl::buffer &c, cl::sycl::buffer &ldc, - std::int64_t group_count, cl::sycl::buffer &group_size); - -ONEMKL_EXPORT void gemm_batch( - cl::sycl::queue &queue, cl::sycl::buffer &transa, - cl::sycl::buffer &transb, cl::sycl::buffer &m, - cl::sycl::buffer &n, cl::sycl::buffer &k, - cl::sycl::buffer, 1> &alpha, cl::sycl::buffer, 1> &a, - cl::sycl::buffer &lda, cl::sycl::buffer, 1> &b, - cl::sycl::buffer &ldb, 
cl::sycl::buffer, 1> &beta, - cl::sycl::buffer, 1> &c, cl::sycl::buffer &ldc, - std::int64_t group_count, cl::sycl::buffer &group_size); - -ONEMKL_EXPORT void gemm_batch( - cl::sycl::queue &queue, cl::sycl::buffer &transa, - cl::sycl::buffer &transb, cl::sycl::buffer &m, - cl::sycl::buffer &n, cl::sycl::buffer &k, - cl::sycl::buffer, 1> &alpha, cl::sycl::buffer, 1> &a, - cl::sycl::buffer &lda, cl::sycl::buffer, 1> &b, - cl::sycl::buffer &ldb, cl::sycl::buffer, 1> &beta, - cl::sycl::buffer, 1> &c, cl::sycl::buffer &ldc, - std::int64_t group_count, cl::sycl::buffer &group_size); - ONEMKL_EXPORT void gemm_batch(cl::sycl::queue &queue, transpose transa, transpose transb, std::int64_t m, std::int64_t n, std::int64_t k, float alpha, cl::sycl::buffer &a, std::int64_t lda, @@ -831,44 +783,6 @@ ONEMKL_EXPORT void gemm_batch(cl::sycl::queue &queue, transpose transa, transpos cl::sycl::buffer, 1> &c, std::int64_t ldc, std::int64_t stride_c, std::int64_t batch_size); -ONEMKL_EXPORT void trsm_batch(cl::sycl::queue &queue, cl::sycl::buffer &left_right, - cl::sycl::buffer &upper_lower, - cl::sycl::buffer &trans, - cl::sycl::buffer &unit_diag, - cl::sycl::buffer &m, - cl::sycl::buffer &n, - cl::sycl::buffer &alpha, cl::sycl::buffer &a, - cl::sycl::buffer &lda, cl::sycl::buffer &b, - cl::sycl::buffer &ldb, std::int64_t group_count, - cl::sycl::buffer &group_size); - -ONEMKL_EXPORT void trsm_batch( - cl::sycl::queue &queue, cl::sycl::buffer &left_right, - cl::sycl::buffer &upper_lower, cl::sycl::buffer &trans, - cl::sycl::buffer &unit_diag, cl::sycl::buffer &m, - cl::sycl::buffer &n, cl::sycl::buffer &alpha, - cl::sycl::buffer &a, cl::sycl::buffer &lda, - cl::sycl::buffer &b, cl::sycl::buffer &ldb, - std::int64_t group_count, cl::sycl::buffer &group_size); - -ONEMKL_EXPORT void trsm_batch( - cl::sycl::queue &queue, cl::sycl::buffer &left_right, - cl::sycl::buffer &upper_lower, cl::sycl::buffer &trans, - cl::sycl::buffer &unit_diag, cl::sycl::buffer &m, - cl::sycl::buffer &n, 
cl::sycl::buffer, 1> &alpha, - cl::sycl::buffer, 1> &a, cl::sycl::buffer &lda, - cl::sycl::buffer, 1> &b, cl::sycl::buffer &ldb, - std::int64_t group_count, cl::sycl::buffer &group_size); - -ONEMKL_EXPORT void trsm_batch( - cl::sycl::queue &queue, cl::sycl::buffer &left_right, - cl::sycl::buffer &upper_lower, cl::sycl::buffer &trans, - cl::sycl::buffer &unit_diag, cl::sycl::buffer &m, - cl::sycl::buffer &n, cl::sycl::buffer, 1> &alpha, - cl::sycl::buffer, 1> &a, cl::sycl::buffer &lda, - cl::sycl::buffer, 1> &b, cl::sycl::buffer &ldb, - std::int64_t group_count, cl::sycl::buffer &group_size); - ONEMKL_EXPORT void trsm_batch(cl::sycl::queue &queue, side left_right, uplo upper_lower, transpose trans, diag unit_diag, std::int64_t m, std::int64_t n, float alpha, cl::sycl::buffer &a, std::int64_t lda, @@ -895,8 +809,6 @@ ONEMKL_EXPORT void trsm_batch(cl::sycl::queue &queue, side left_right, uplo uppe std::int64_t stride_a, cl::sycl::buffer, 1> &b, std::int64_t ldb, std::int64_t stride_b, std::int64_t batch_size); -// BLAS-like extensions - ONEMKL_EXPORT void gemmt(cl::sycl::queue &queue, uplo upper_lower, transpose transa, transpose transb, std::int64_t n, std::int64_t k, float alpha, cl::sycl::buffer &a, std::int64_t lda, @@ -969,6 +881,876 @@ ONEMKL_EXPORT void gemm_ext(cl::sycl::queue &queue, transpose transa, transpose cl::sycl::buffer &b, std::int64_t ldb, half beta, cl::sycl::buffer &c, std::int64_t ldc); +// USM APIs + +ONEMKL_EXPORT cl::sycl::event asum( + cl::sycl::queue &queue, std::int64_t n, const std::complex *x, std::int64_t incx, + float *result, const cl::sycl::vector_class &dependencies = {}); + +ONEMKL_EXPORT cl::sycl::event asum( + cl::sycl::queue &queue, std::int64_t n, const std::complex *x, std::int64_t incx, + double *result, const cl::sycl::vector_class &dependencies = {}); + +ONEMKL_EXPORT cl::sycl::event asum( + cl::sycl::queue &queue, std::int64_t n, const float *x, std::int64_t incx, float *result, + const cl::sycl::vector_class &dependencies 
= {}); + +ONEMKL_EXPORT cl::sycl::event asum( + cl::sycl::queue &queue, std::int64_t n, const double *x, std::int64_t incx, double *result, + const cl::sycl::vector_class &dependencies = {}); + +ONEMKL_EXPORT cl::sycl::event axpy( + cl::sycl::queue &queue, std::int64_t n, float alpha, const float *x, std::int64_t incx, + float *y, std::int64_t incy, const cl::sycl::vector_class &dependencies = {}); + +ONEMKL_EXPORT cl::sycl::event axpy( + cl::sycl::queue &queue, std::int64_t n, double alpha, const double *x, std::int64_t incx, + double *y, std::int64_t incy, const cl::sycl::vector_class &dependencies = {}); + +ONEMKL_EXPORT cl::sycl::event axpy( + cl::sycl::queue &queue, std::int64_t n, std::complex alpha, const std::complex *x, + std::int64_t incx, std::complex *y, std::int64_t incy, + const cl::sycl::vector_class &dependencies = {}); + +ONEMKL_EXPORT cl::sycl::event axpy( + cl::sycl::queue &queue, std::int64_t n, std::complex alpha, + const std::complex *x, std::int64_t incx, std::complex *y, std::int64_t incy, + const cl::sycl::vector_class &dependencies = {}); + +ONEMKL_EXPORT cl::sycl::event axpy_batch( + cl::sycl::queue &queue, std::int64_t *n, float *alpha, const float **x, std::int64_t *incx, + float **y, std::int64_t *incy, std::int64_t group_count, std::int64_t *group_size, + const cl::sycl::vector_class &dependencies = {}); + +ONEMKL_EXPORT cl::sycl::event axpy_batch( + cl::sycl::queue &queue, std::int64_t *n, double *alpha, const double **x, std::int64_t *incx, + double **y, std::int64_t *incy, std::int64_t group_count, std::int64_t *group_size, + const cl::sycl::vector_class &dependencies = {}); + +ONEMKL_EXPORT cl::sycl::event axpy_batch( + cl::sycl::queue &queue, std::int64_t *n, std::complex *alpha, + const std::complex **x, std::int64_t *incx, std::complex **y, std::int64_t *incy, + std::int64_t group_count, std::int64_t *group_size, + const cl::sycl::vector_class &dependencies = {}); + +ONEMKL_EXPORT cl::sycl::event axpy_batch( + cl::sycl::queue 
&queue, std::int64_t *n, std::complex *alpha, + const std::complex **x, std::int64_t *incx, std::complex **y, + std::int64_t *incy, std::int64_t group_count, std::int64_t *group_size, + const cl::sycl::vector_class &dependencies = {}); + +ONEMKL_EXPORT cl::sycl::event copy( + cl::sycl::queue &queue, std::int64_t n, const float *x, std::int64_t incx, float *y, + std::int64_t incy, const cl::sycl::vector_class &dependencies = {}); + +ONEMKL_EXPORT cl::sycl::event copy( + cl::sycl::queue &queue, std::int64_t n, const double *x, std::int64_t incx, double *y, + std::int64_t incy, const cl::sycl::vector_class &dependencies = {}); + +ONEMKL_EXPORT cl::sycl::event copy( + cl::sycl::queue &queue, std::int64_t n, const std::complex *x, std::int64_t incx, + std::complex *y, std::int64_t incy, + const cl::sycl::vector_class &dependencies = {}); + +ONEMKL_EXPORT cl::sycl::event copy( + cl::sycl::queue &queue, std::int64_t n, const std::complex *x, std::int64_t incx, + std::complex *y, std::int64_t incy, + const cl::sycl::vector_class &dependencies = {}); + +ONEMKL_EXPORT cl::sycl::event dot(cl::sycl::queue &queue, std::int64_t n, const float *x, + std::int64_t incx, const float *y, std::int64_t incy, + float *result, + const cl::sycl::vector_class &dependencies = {}); + +ONEMKL_EXPORT cl::sycl::event dot(cl::sycl::queue &queue, std::int64_t n, const double *x, + std::int64_t incx, const double *y, std::int64_t incy, + double *result, + const cl::sycl::vector_class &dependencies = {}); + +ONEMKL_EXPORT cl::sycl::event dot(cl::sycl::queue &queue, std::int64_t n, const float *x, + std::int64_t incx, const float *y, std::int64_t incy, + double *result, + const cl::sycl::vector_class &dependencies = {}); + +ONEMKL_EXPORT cl::sycl::event dotc( + cl::sycl::queue &queue, std::int64_t n, const std::complex *x, std::int64_t incx, + const std::complex *y, std::int64_t incy, std::complex *result, + const cl::sycl::vector_class &dependencies = {}); + +ONEMKL_EXPORT cl::sycl::event dotc( + 
cl::sycl::queue &queue, std::int64_t n, const std::complex *x, std::int64_t incx, + const std::complex *y, std::int64_t incy, std::complex *result, + const cl::sycl::vector_class &dependencies = {}); + +ONEMKL_EXPORT cl::sycl::event dotu( + cl::sycl::queue &queue, std::int64_t n, const std::complex *x, std::int64_t incx, + const std::complex *y, std::int64_t incy, std::complex *result, + const cl::sycl::vector_class &dependencies = {}); + +ONEMKL_EXPORT cl::sycl::event dotu( + cl::sycl::queue &queue, std::int64_t n, const std::complex *x, std::int64_t incx, + const std::complex *y, std::int64_t incy, std::complex *result, + const cl::sycl::vector_class &dependencies = {}); + +ONEMKL_EXPORT cl::sycl::event iamin( + cl::sycl::queue &queue, std::int64_t n, const float *x, std::int64_t incx, std::int64_t *result, + const cl::sycl::vector_class &dependencies = {}); + +ONEMKL_EXPORT cl::sycl::event iamin( + cl::sycl::queue &queue, std::int64_t n, const double *x, std::int64_t incx, + std::int64_t *result, const cl::sycl::vector_class &dependencies = {}); + +ONEMKL_EXPORT cl::sycl::event iamin( + cl::sycl::queue &queue, std::int64_t n, const std::complex *x, std::int64_t incx, + std::int64_t *result, const cl::sycl::vector_class &dependencies = {}); + +ONEMKL_EXPORT cl::sycl::event iamin( + cl::sycl::queue &queue, std::int64_t n, const std::complex *x, std::int64_t incx, + std::int64_t *result, const cl::sycl::vector_class &dependencies = {}); + +ONEMKL_EXPORT cl::sycl::event iamax( + cl::sycl::queue &queue, std::int64_t n, const float *x, std::int64_t incx, std::int64_t *result, + const cl::sycl::vector_class &dependencies = {}); + +ONEMKL_EXPORT cl::sycl::event iamax( + cl::sycl::queue &queue, std::int64_t n, const double *x, std::int64_t incx, + std::int64_t *result, const cl::sycl::vector_class &dependencies = {}); + +ONEMKL_EXPORT cl::sycl::event iamax( + cl::sycl::queue &queue, std::int64_t n, const std::complex *x, std::int64_t incx, + std::int64_t *result, const 
cl::sycl::vector_class &dependencies = {}); + +ONEMKL_EXPORT cl::sycl::event iamax( + cl::sycl::queue &queue, std::int64_t n, const std::complex *x, std::int64_t incx, + std::int64_t *result, const cl::sycl::vector_class &dependencies = {}); + +ONEMKL_EXPORT cl::sycl::event nrm2( + cl::sycl::queue &queue, std::int64_t n, const std::complex *x, std::int64_t incx, + float *result, const cl::sycl::vector_class &dependencies = {}); + +ONEMKL_EXPORT cl::sycl::event nrm2( + cl::sycl::queue &queue, std::int64_t n, const std::complex *x, std::int64_t incx, + double *result, const cl::sycl::vector_class &dependencies = {}); + +ONEMKL_EXPORT cl::sycl::event nrm2( + cl::sycl::queue &queue, std::int64_t n, const float *x, std::int64_t incx, float *result, + const cl::sycl::vector_class &dependencies = {}); + +ONEMKL_EXPORT cl::sycl::event nrm2( + cl::sycl::queue &queue, std::int64_t n, const double *x, std::int64_t incx, double *result, + const cl::sycl::vector_class &dependencies = {}); + +ONEMKL_EXPORT cl::sycl::event rot(cl::sycl::queue &queue, std::int64_t n, std::complex *x, + std::int64_t incx, std::complex *y, std::int64_t incy, + float c, float s, + const cl::sycl::vector_class &dependencies = {}); + +ONEMKL_EXPORT cl::sycl::event rot(cl::sycl::queue &queue, std::int64_t n, std::complex *x, + std::int64_t incx, std::complex *y, std::int64_t incy, + double c, double s, + const cl::sycl::vector_class &dependencies = {}); + +ONEMKL_EXPORT cl::sycl::event rot(cl::sycl::queue &queue, std::int64_t n, float *x, + std::int64_t incx, float *y, std::int64_t incy, float c, float s, + const cl::sycl::vector_class &dependencies = {}); + +ONEMKL_EXPORT cl::sycl::event rot(cl::sycl::queue &queue, std::int64_t n, double *x, + std::int64_t incx, double *y, std::int64_t incy, double c, + double s, + const cl::sycl::vector_class &dependencies = {}); + +ONEMKL_EXPORT cl::sycl::event rotg( + cl::sycl::queue &queue, float *a, float *b, float *c, float *s, + const cl::sycl::vector_class 
&dependencies = {}); + +ONEMKL_EXPORT cl::sycl::event rotg( + cl::sycl::queue &queue, double *a, double *b, double *c, double *s, + const cl::sycl::vector_class &dependencies = {}); + +ONEMKL_EXPORT cl::sycl::event rotg( + cl::sycl::queue &queue, std::complex *a, std::complex *b, float *c, + std::complex *s, const cl::sycl::vector_class &dependencies = {}); + +ONEMKL_EXPORT cl::sycl::event rotg( + cl::sycl::queue &queue, std::complex *a, std::complex *b, double *c, + std::complex *s, const cl::sycl::vector_class &dependencies = {}); + +ONEMKL_EXPORT cl::sycl::event rotm( + cl::sycl::queue &queue, std::int64_t n, float *x, std::int64_t incx, float *y, + std::int64_t incy, float *param, + const cl::sycl::vector_class &dependencies = {}); + +ONEMKL_EXPORT cl::sycl::event rotm( + cl::sycl::queue &queue, std::int64_t n, double *x, std::int64_t incx, double *y, + std::int64_t incy, double *param, + const cl::sycl::vector_class &dependencies = {}); + +ONEMKL_EXPORT cl::sycl::event rotmg( + cl::sycl::queue &queue, float *d1, float *d2, float *x1, float y1, float *param, + const cl::sycl::vector_class &dependencies = {}); + +ONEMKL_EXPORT cl::sycl::event rotmg( + cl::sycl::queue &queue, double *d1, double *d2, double *x1, double y1, double *param, + const cl::sycl::vector_class &dependencies = {}); + +ONEMKL_EXPORT cl::sycl::event scal( + cl::sycl::queue &queue, std::int64_t n, float alpha, float *x, std::int64_t incx, + const cl::sycl::vector_class &dependencies = {}); + +ONEMKL_EXPORT cl::sycl::event scal( + cl::sycl::queue &queue, std::int64_t n, double alpha, double *x, std::int64_t incx, + const cl::sycl::vector_class &dependencies = {}); + +ONEMKL_EXPORT cl::sycl::event scal( + cl::sycl::queue &queue, std::int64_t n, std::complex alpha, std::complex *x, + std::int64_t incx, const cl::sycl::vector_class &dependencies = {}); + +ONEMKL_EXPORT cl::sycl::event scal( + cl::sycl::queue &queue, std::int64_t n, std::complex alpha, std::complex *x, + std::int64_t incx, const 
cl::sycl::vector_class &dependencies = {}); + +ONEMKL_EXPORT cl::sycl::event scal( + cl::sycl::queue &queue, std::int64_t n, float alpha, std::complex *x, std::int64_t incx, + const cl::sycl::vector_class &dependencies = {}); + +ONEMKL_EXPORT cl::sycl::event scal( + cl::sycl::queue &queue, std::int64_t n, double alpha, std::complex *x, + std::int64_t incx, const cl::sycl::vector_class &dependencies = {}); + +ONEMKL_EXPORT cl::sycl::event sdsdot( + cl::sycl::queue &queue, std::int64_t n, float sb, const float *x, std::int64_t incx, + const float *y, std::int64_t incy, float *result, + const cl::sycl::vector_class &dependencies = {}); + +ONEMKL_EXPORT cl::sycl::event swap( + cl::sycl::queue &queue, std::int64_t n, float *x, std::int64_t incx, float *y, + std::int64_t incy, const cl::sycl::vector_class &dependencies = {}); + +ONEMKL_EXPORT cl::sycl::event swap( + cl::sycl::queue &queue, std::int64_t n, double *x, std::int64_t incx, double *y, + std::int64_t incy, const cl::sycl::vector_class &dependencies = {}); + +ONEMKL_EXPORT cl::sycl::event swap( + cl::sycl::queue &queue, std::int64_t n, std::complex *x, std::int64_t incx, + std::complex *y, std::int64_t incy, + const cl::sycl::vector_class &dependencies = {}); + +ONEMKL_EXPORT cl::sycl::event swap( + cl::sycl::queue &queue, std::int64_t n, std::complex *x, std::int64_t incx, + std::complex *y, std::int64_t incy, + const cl::sycl::vector_class &dependencies = {}); + +ONEMKL_EXPORT cl::sycl::event gbmv( + cl::sycl::queue &queue, transpose trans, std::int64_t m, std::int64_t n, std::int64_t kl, + std::int64_t ku, float alpha, const float *a, std::int64_t lda, const float *x, + std::int64_t incx, float beta, float *y, std::int64_t incy, + const cl::sycl::vector_class &dependencies = {}); + +ONEMKL_EXPORT cl::sycl::event gbmv( + cl::sycl::queue &queue, transpose trans, std::int64_t m, std::int64_t n, std::int64_t kl, + std::int64_t ku, double alpha, const double *a, std::int64_t lda, const double *x, + std::int64_t 
incx, double beta, double *y, std::int64_t incy, + const cl::sycl::vector_class &dependencies = {}); + +ONEMKL_EXPORT cl::sycl::event gbmv( + cl::sycl::queue &queue, transpose trans, std::int64_t m, std::int64_t n, std::int64_t kl, + std::int64_t ku, std::complex alpha, const std::complex *a, std::int64_t lda, + const std::complex *x, std::int64_t incx, std::complex beta, + std::complex *y, std::int64_t incy, + const cl::sycl::vector_class &dependencies = {}); + +ONEMKL_EXPORT cl::sycl::event gbmv( + cl::sycl::queue &queue, transpose trans, std::int64_t m, std::int64_t n, std::int64_t kl, + std::int64_t ku, std::complex alpha, const std::complex *a, std::int64_t lda, + const std::complex *x, std::int64_t incx, std::complex beta, + std::complex *y, std::int64_t incy, + const cl::sycl::vector_class &dependencies = {}); + +ONEMKL_EXPORT cl::sycl::event gemv( + cl::sycl::queue &queue, transpose trans, std::int64_t m, std::int64_t n, float alpha, + const float *a, std::int64_t lda, const float *x, std::int64_t incx, float beta, float *y, + std::int64_t incy, const cl::sycl::vector_class &dependencies = {}); + +ONEMKL_EXPORT cl::sycl::event gemv( + cl::sycl::queue &queue, transpose trans, std::int64_t m, std::int64_t n, double alpha, + const double *a, std::int64_t lda, const double *x, std::int64_t incx, double beta, double *y, + std::int64_t incy, const cl::sycl::vector_class &dependencies = {}); + +ONEMKL_EXPORT cl::sycl::event gemv( + cl::sycl::queue &queue, transpose trans, std::int64_t m, std::int64_t n, + std::complex alpha, const std::complex *a, std::int64_t lda, + const std::complex *x, std::int64_t incx, std::complex beta, + std::complex *y, std::int64_t incy, + const cl::sycl::vector_class &dependencies = {}); + +ONEMKL_EXPORT cl::sycl::event gemv( + cl::sycl::queue &queue, transpose trans, std::int64_t m, std::int64_t n, + std::complex alpha, const std::complex *a, std::int64_t lda, + const std::complex *x, std::int64_t incx, std::complex beta, + 
std::complex *y, std::int64_t incy, + const cl::sycl::vector_class &dependencies = {}); + +ONEMKL_EXPORT cl::sycl::event ger(cl::sycl::queue &queue, std::int64_t m, std::int64_t n, + float alpha, const float *x, std::int64_t incx, const float *y, + std::int64_t incy, float *a, std::int64_t lda, + const cl::sycl::vector_class &dependencies = {}); + +ONEMKL_EXPORT cl::sycl::event ger(cl::sycl::queue &queue, std::int64_t m, std::int64_t n, + double alpha, const double *x, std::int64_t incx, const double *y, + std::int64_t incy, double *a, std::int64_t lda, + const cl::sycl::vector_class &dependencies = {}); + +ONEMKL_EXPORT cl::sycl::event gerc( + cl::sycl::queue &queue, std::int64_t m, std::int64_t n, std::complex alpha, + const std::complex *x, std::int64_t incx, const std::complex *y, + std::int64_t incy, std::complex *a, std::int64_t lda, + const cl::sycl::vector_class &dependencies = {}); + +ONEMKL_EXPORT cl::sycl::event gerc( + cl::sycl::queue &queue, std::int64_t m, std::int64_t n, std::complex alpha, + const std::complex *x, std::int64_t incx, const std::complex *y, + std::int64_t incy, std::complex *a, std::int64_t lda, + const cl::sycl::vector_class &dependencies = {}); + +ONEMKL_EXPORT cl::sycl::event geru( + cl::sycl::queue &queue, std::int64_t m, std::int64_t n, std::complex alpha, + const std::complex *x, std::int64_t incx, const std::complex *y, + std::int64_t incy, std::complex *a, std::int64_t lda, + const cl::sycl::vector_class &dependencies = {}); + +ONEMKL_EXPORT cl::sycl::event geru( + cl::sycl::queue &queue, std::int64_t m, std::int64_t n, std::complex alpha, + const std::complex *x, std::int64_t incx, const std::complex *y, + std::int64_t incy, std::complex *a, std::int64_t lda, + const cl::sycl::vector_class &dependencies = {}); + +ONEMKL_EXPORT cl::sycl::event hbmv( + cl::sycl::queue &queue, uplo upper_lower, std::int64_t n, std::int64_t k, + std::complex alpha, const std::complex *a, std::int64_t lda, + const std::complex *x, std::int64_t 
incx, std::complex beta, + std::complex *y, std::int64_t incy, + const cl::sycl::vector_class &dependencies = {}); + +ONEMKL_EXPORT cl::sycl::event hbmv( + cl::sycl::queue &queue, uplo upper_lower, std::int64_t n, std::int64_t k, + std::complex alpha, const std::complex *a, std::int64_t lda, + const std::complex *x, std::int64_t incx, std::complex beta, + std::complex *y, std::int64_t incy, + const cl::sycl::vector_class &dependencies = {}); + +ONEMKL_EXPORT cl::sycl::event hemv( + cl::sycl::queue &queue, uplo upper_lower, std::int64_t n, std::complex alpha, + const std::complex *a, std::int64_t lda, const std::complex *x, std::int64_t incx, + std::complex beta, std::complex *y, std::int64_t incy, + const cl::sycl::vector_class &dependencies = {}); + +ONEMKL_EXPORT cl::sycl::event hemv( + cl::sycl::queue &queue, uplo upper_lower, std::int64_t n, std::complex alpha, + const std::complex *a, std::int64_t lda, const std::complex *x, + std::int64_t incx, std::complex beta, std::complex *y, std::int64_t incy, + const cl::sycl::vector_class &dependencies = {}); + +ONEMKL_EXPORT cl::sycl::event her(cl::sycl::queue &queue, uplo upper_lower, std::int64_t n, + float alpha, const std::complex *x, std::int64_t incx, + std::complex *a, std::int64_t lda, + const cl::sycl::vector_class &dependencies = {}); + +ONEMKL_EXPORT cl::sycl::event her(cl::sycl::queue &queue, uplo upper_lower, std::int64_t n, + double alpha, const std::complex *x, std::int64_t incx, + std::complex *a, std::int64_t lda, + const cl::sycl::vector_class &dependencies = {}); + +ONEMKL_EXPORT cl::sycl::event her2( + cl::sycl::queue &queue, uplo upper_lower, std::int64_t n, std::complex alpha, + const std::complex *x, std::int64_t incx, const std::complex *y, + std::int64_t incy, std::complex *a, std::int64_t lda, + const cl::sycl::vector_class &dependencies = {}); + +ONEMKL_EXPORT cl::sycl::event her2( + cl::sycl::queue &queue, uplo upper_lower, std::int64_t n, std::complex alpha, + const std::complex *x, 
std::int64_t incx, const std::complex *y, + std::int64_t incy, std::complex *a, std::int64_t lda, + const cl::sycl::vector_class &dependencies = {}); + +ONEMKL_EXPORT cl::sycl::event hpmv( + cl::sycl::queue &queue, uplo upper_lower, std::int64_t n, std::complex alpha, + const std::complex *a, const std::complex *x, std::int64_t incx, + std::complex beta, std::complex *y, std::int64_t incy, + const cl::sycl::vector_class &dependencies = {}); + +ONEMKL_EXPORT cl::sycl::event hpmv( + cl::sycl::queue &queue, uplo upper_lower, std::int64_t n, std::complex alpha, + const std::complex *a, const std::complex *x, std::int64_t incx, + std::complex beta, std::complex *y, std::int64_t incy, + const cl::sycl::vector_class &dependencies = {}); + +ONEMKL_EXPORT cl::sycl::event hpr(cl::sycl::queue &queue, uplo upper_lower, std::int64_t n, + float alpha, const std::complex *x, std::int64_t incx, + std::complex *a, + const cl::sycl::vector_class &dependencies = {}); + +ONEMKL_EXPORT cl::sycl::event hpr(cl::sycl::queue &queue, uplo upper_lower, std::int64_t n, + double alpha, const std::complex *x, std::int64_t incx, + std::complex *a, + const cl::sycl::vector_class &dependencies = {}); + +ONEMKL_EXPORT cl::sycl::event hpr2( + cl::sycl::queue &queue, uplo upper_lower, std::int64_t n, std::complex alpha, + const std::complex *x, std::int64_t incx, const std::complex *y, + std::int64_t incy, std::complex *a, + const cl::sycl::vector_class &dependencies = {}); + +ONEMKL_EXPORT cl::sycl::event hpr2( + cl::sycl::queue &queue, uplo upper_lower, std::int64_t n, std::complex alpha, + const std::complex *x, std::int64_t incx, const std::complex *y, + std::int64_t incy, std::complex *a, + const cl::sycl::vector_class &dependencies = {}); + +ONEMKL_EXPORT cl::sycl::event sbmv( + cl::sycl::queue &queue, uplo upper_lower, std::int64_t n, std::int64_t k, float alpha, + const float *a, std::int64_t lda, const float *x, std::int64_t incx, float beta, float *y, + std::int64_t incy, const 
cl::sycl::vector_class &dependencies = {}); + +ONEMKL_EXPORT cl::sycl::event sbmv( + cl::sycl::queue &queue, uplo upper_lower, std::int64_t n, std::int64_t k, double alpha, + const double *a, std::int64_t lda, const double *x, std::int64_t incx, double beta, double *y, + std::int64_t incy, const cl::sycl::vector_class &dependencies = {}); + +ONEMKL_EXPORT cl::sycl::event spmv( + cl::sycl::queue &queue, uplo upper_lower, std::int64_t n, float alpha, const float *a, + const float *x, std::int64_t incx, float beta, float *y, std::int64_t incy, + const cl::sycl::vector_class &dependencies = {}); + +ONEMKL_EXPORT cl::sycl::event spmv( + cl::sycl::queue &queue, uplo upper_lower, std::int64_t n, double alpha, const double *a, + const double *x, std::int64_t incx, double beta, double *y, std::int64_t incy, + const cl::sycl::vector_class &dependencies = {}); + +ONEMKL_EXPORT cl::sycl::event spr(cl::sycl::queue &queue, uplo upper_lower, std::int64_t n, + float alpha, const float *x, std::int64_t incx, float *a, + const cl::sycl::vector_class &dependencies = {}); + +ONEMKL_EXPORT cl::sycl::event spr(cl::sycl::queue &queue, uplo upper_lower, std::int64_t n, + double alpha, const double *x, std::int64_t incx, double *a, + const cl::sycl::vector_class &dependencies = {}); + +ONEMKL_EXPORT cl::sycl::event spr2( + cl::sycl::queue &queue, uplo upper_lower, std::int64_t n, float alpha, const float *x, + std::int64_t incx, const float *y, std::int64_t incy, float *a, + const cl::sycl::vector_class &dependencies = {}); + +ONEMKL_EXPORT cl::sycl::event spr2( + cl::sycl::queue &queue, uplo upper_lower, std::int64_t n, double alpha, const double *x, + std::int64_t incx, const double *y, std::int64_t incy, double *a, + const cl::sycl::vector_class &dependencies = {}); + +ONEMKL_EXPORT cl::sycl::event symv( + cl::sycl::queue &queue, uplo upper_lower, std::int64_t n, float alpha, const float *a, + std::int64_t lda, const float *x, std::int64_t incx, float beta, float *y, std::int64_t incy, 
+ const cl::sycl::vector_class &dependencies = {}); + +ONEMKL_EXPORT cl::sycl::event symv( + cl::sycl::queue &queue, uplo upper_lower, std::int64_t n, double alpha, const double *a, + std::int64_t lda, const double *x, std::int64_t incx, double beta, double *y, std::int64_t incy, + const cl::sycl::vector_class &dependencies = {}); + +ONEMKL_EXPORT cl::sycl::event syr(cl::sycl::queue &queue, uplo upper_lower, std::int64_t n, + float alpha, const float *x, std::int64_t incx, float *a, + std::int64_t lda, + const cl::sycl::vector_class &dependencies = {}); + +ONEMKL_EXPORT cl::sycl::event syr(cl::sycl::queue &queue, uplo upper_lower, std::int64_t n, + double alpha, const double *x, std::int64_t incx, double *a, + std::int64_t lda, + const cl::sycl::vector_class &dependencies = {}); + +ONEMKL_EXPORT cl::sycl::event syr2( + cl::sycl::queue &queue, uplo upper_lower, std::int64_t n, float alpha, const float *x, + std::int64_t incx, const float *y, std::int64_t incy, float *a, std::int64_t lda, + const cl::sycl::vector_class &dependencies = {}); + +ONEMKL_EXPORT cl::sycl::event syr2( + cl::sycl::queue &queue, uplo upper_lower, std::int64_t n, double alpha, const double *x, + std::int64_t incx, const double *y, std::int64_t incy, double *a, std::int64_t lda, + const cl::sycl::vector_class &dependencies = {}); + +ONEMKL_EXPORT cl::sycl::event tbmv( + cl::sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag, std::int64_t n, + std::int64_t k, const float *a, std::int64_t lda, float *x, std::int64_t incx, + const cl::sycl::vector_class &dependencies = {}); + +ONEMKL_EXPORT cl::sycl::event tbmv( + cl::sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag, std::int64_t n, + std::int64_t k, const double *a, std::int64_t lda, double *x, std::int64_t incx, + const cl::sycl::vector_class &dependencies = {}); + +ONEMKL_EXPORT cl::sycl::event tbmv( + cl::sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag, std::int64_t n, + 
std::int64_t k, const std::complex *a, std::int64_t lda, std::complex *x, + std::int64_t incx, const cl::sycl::vector_class &dependencies = {}); + +ONEMKL_EXPORT cl::sycl::event tbmv( + cl::sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag, std::int64_t n, + std::int64_t k, const std::complex *a, std::int64_t lda, std::complex *x, + std::int64_t incx, const cl::sycl::vector_class &dependencies = {}); + +ONEMKL_EXPORT cl::sycl::event tbsv( + cl::sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag, std::int64_t n, + std::int64_t k, const float *a, std::int64_t lda, float *x, std::int64_t incx, + const cl::sycl::vector_class &dependencies = {}); + +ONEMKL_EXPORT cl::sycl::event tbsv( + cl::sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag, std::int64_t n, + std::int64_t k, const double *a, std::int64_t lda, double *x, std::int64_t incx, + const cl::sycl::vector_class &dependencies = {}); + +ONEMKL_EXPORT cl::sycl::event tbsv( + cl::sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag, std::int64_t n, + std::int64_t k, const std::complex *a, std::int64_t lda, std::complex *x, + std::int64_t incx, const cl::sycl::vector_class &dependencies = {}); + +ONEMKL_EXPORT cl::sycl::event tbsv( + cl::sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag, std::int64_t n, + std::int64_t k, const std::complex *a, std::int64_t lda, std::complex *x, + std::int64_t incx, const cl::sycl::vector_class &dependencies = {}); + +ONEMKL_EXPORT cl::sycl::event tpmv( + cl::sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag, std::int64_t n, + const float *a, float *x, std::int64_t incx, + const cl::sycl::vector_class &dependencies = {}); + +ONEMKL_EXPORT cl::sycl::event tpmv( + cl::sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag, std::int64_t n, + const double *a, double *x, std::int64_t incx, + const cl::sycl::vector_class &dependencies = {}); + +ONEMKL_EXPORT 
cl::sycl::event tpmv( + cl::sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag, std::int64_t n, + const std::complex *a, std::complex *x, std::int64_t incx, + const cl::sycl::vector_class &dependencies = {}); + +ONEMKL_EXPORT cl::sycl::event tpmv( + cl::sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag, std::int64_t n, + const std::complex *a, std::complex *x, std::int64_t incx, + const cl::sycl::vector_class &dependencies = {}); + +ONEMKL_EXPORT cl::sycl::event tpsv( + cl::sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag, std::int64_t n, + const float *a, float *x, std::int64_t incx, + const cl::sycl::vector_class &dependencies = {}); + +ONEMKL_EXPORT cl::sycl::event tpsv( + cl::sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag, std::int64_t n, + const double *a, double *x, std::int64_t incx, + const cl::sycl::vector_class &dependencies = {}); + +ONEMKL_EXPORT cl::sycl::event tpsv( + cl::sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag, std::int64_t n, + const std::complex *a, std::complex *x, std::int64_t incx, + const cl::sycl::vector_class &dependencies = {}); + +ONEMKL_EXPORT cl::sycl::event tpsv( + cl::sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag, std::int64_t n, + const std::complex *a, std::complex *x, std::int64_t incx, + const cl::sycl::vector_class &dependencies = {}); + +ONEMKL_EXPORT cl::sycl::event trmv( + cl::sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag, std::int64_t n, + const float *a, std::int64_t lda, float *x, std::int64_t incx, + const cl::sycl::vector_class &dependencies = {}); + +ONEMKL_EXPORT cl::sycl::event trmv( + cl::sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag, std::int64_t n, + const double *a, std::int64_t lda, double *x, std::int64_t incx, + const cl::sycl::vector_class &dependencies = {}); + +ONEMKL_EXPORT cl::sycl::event trmv( + cl::sycl::queue &queue, uplo 
upper_lower, transpose trans, diag unit_diag, std::int64_t n, + const std::complex *a, std::int64_t lda, std::complex *x, std::int64_t incx, + const cl::sycl::vector_class &dependencies = {}); + +ONEMKL_EXPORT cl::sycl::event trmv( + cl::sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag, std::int64_t n, + const std::complex *a, std::int64_t lda, std::complex *x, std::int64_t incx, + const cl::sycl::vector_class &dependencies = {}); + +ONEMKL_EXPORT cl::sycl::event trsv( + cl::sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag, std::int64_t n, + const float *a, std::int64_t lda, float *x, std::int64_t incx, + const cl::sycl::vector_class &dependencies = {}); + +ONEMKL_EXPORT cl::sycl::event trsv( + cl::sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag, std::int64_t n, + const double *a, std::int64_t lda, double *x, std::int64_t incx, + const cl::sycl::vector_class &dependencies = {}); + +ONEMKL_EXPORT cl::sycl::event trsv( + cl::sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag, std::int64_t n, + const std::complex *a, std::int64_t lda, std::complex *x, std::int64_t incx, + const cl::sycl::vector_class &dependencies = {}); + +ONEMKL_EXPORT cl::sycl::event trsv( + cl::sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag, std::int64_t n, + const std::complex *a, std::int64_t lda, std::complex *x, std::int64_t incx, + const cl::sycl::vector_class &dependencies = {}); + +ONEMKL_EXPORT cl::sycl::event gemm( + cl::sycl::queue &queue, transpose transa, transpose transb, std::int64_t m, std::int64_t n, + std::int64_t k, float alpha, const float *a, std::int64_t lda, const float *b, std::int64_t ldb, + float beta, float *c, std::int64_t ldc, + const cl::sycl::vector_class &dependencies = {}); + +ONEMKL_EXPORT cl::sycl::event gemm( + cl::sycl::queue &queue, transpose transa, transpose transb, std::int64_t m, std::int64_t n, + std::int64_t k, double alpha, const double *a, 
std::int64_t lda, const double *b, + std::int64_t ldb, double beta, double *c, std::int64_t ldc, + const cl::sycl::vector_class &dependencies = {}); + +ONEMKL_EXPORT cl::sycl::event gemm( + cl::sycl::queue &queue, transpose transa, transpose transb, std::int64_t m, std::int64_t n, + std::int64_t k, std::complex alpha, const std::complex *a, std::int64_t lda, + const std::complex *b, std::int64_t ldb, std::complex beta, + std::complex *c, std::int64_t ldc, + const cl::sycl::vector_class &dependencies = {}); + +ONEMKL_EXPORT cl::sycl::event gemm( + cl::sycl::queue &queue, transpose transa, transpose transb, std::int64_t m, std::int64_t n, + std::int64_t k, std::complex alpha, const std::complex *a, std::int64_t lda, + const std::complex *b, std::int64_t ldb, std::complex beta, + std::complex *c, std::int64_t ldc, + const cl::sycl::vector_class &dependencies = {}); + +ONEMKL_EXPORT cl::sycl::event hemm( + cl::sycl::queue &queue, side left_right, uplo upper_lower, std::int64_t m, std::int64_t n, + std::complex alpha, const std::complex *a, std::int64_t lda, + const std::complex *b, std::int64_t ldb, std::complex beta, + std::complex *c, std::int64_t ldc, + const cl::sycl::vector_class &dependencies = {}); + +ONEMKL_EXPORT cl::sycl::event hemm( + cl::sycl::queue &queue, side left_right, uplo upper_lower, std::int64_t m, std::int64_t n, + std::complex alpha, const std::complex *a, std::int64_t lda, + const std::complex *b, std::int64_t ldb, std::complex beta, + std::complex *c, std::int64_t ldc, + const cl::sycl::vector_class &dependencies = {}); + +ONEMKL_EXPORT cl::sycl::event herk( + cl::sycl::queue &queue, uplo upper_lower, transpose trans, std::int64_t n, std::int64_t k, + float alpha, const std::complex *a, std::int64_t lda, float beta, std::complex *c, + std::int64_t ldc, const cl::sycl::vector_class &dependencies = {}); + +ONEMKL_EXPORT cl::sycl::event herk( + cl::sycl::queue &queue, uplo upper_lower, transpose trans, std::int64_t n, std::int64_t k, + double 
alpha, const std::complex *a, std::int64_t lda, double beta, + std::complex *c, std::int64_t ldc, + const cl::sycl::vector_class &dependencies = {}); + +ONEMKL_EXPORT cl::sycl::event her2k( + cl::sycl::queue &queue, uplo upper_lower, transpose trans, std::int64_t n, std::int64_t k, + std::complex alpha, const std::complex *a, std::int64_t lda, + const std::complex *b, std::int64_t ldb, float beta, std::complex *c, + std::int64_t ldc, const cl::sycl::vector_class &dependencies = {}); + +ONEMKL_EXPORT cl::sycl::event her2k( + cl::sycl::queue &queue, uplo upper_lower, transpose trans, std::int64_t n, std::int64_t k, + std::complex alpha, const std::complex *a, std::int64_t lda, + const std::complex *b, std::int64_t ldb, double beta, std::complex *c, + std::int64_t ldc, const cl::sycl::vector_class &dependencies = {}); + +ONEMKL_EXPORT cl::sycl::event symm( + cl::sycl::queue &queue, side left_right, uplo upper_lower, std::int64_t m, std::int64_t n, + float alpha, const float *a, std::int64_t lda, const float *b, std::int64_t ldb, float beta, + float *c, std::int64_t ldc, const cl::sycl::vector_class &dependencies = {}); + +ONEMKL_EXPORT cl::sycl::event symm( + cl::sycl::queue &queue, side left_right, uplo upper_lower, std::int64_t m, std::int64_t n, + double alpha, const double *a, std::int64_t lda, const double *b, std::int64_t ldb, double beta, + double *c, std::int64_t ldc, const cl::sycl::vector_class &dependencies = {}); + +ONEMKL_EXPORT cl::sycl::event symm( + cl::sycl::queue &queue, side left_right, uplo upper_lower, std::int64_t m, std::int64_t n, + std::complex alpha, const std::complex *a, std::int64_t lda, + const std::complex *b, std::int64_t ldb, std::complex beta, + std::complex *c, std::int64_t ldc, + const cl::sycl::vector_class &dependencies = {}); + +ONEMKL_EXPORT cl::sycl::event symm( + cl::sycl::queue &queue, side left_right, uplo upper_lower, std::int64_t m, std::int64_t n, + std::complex alpha, const std::complex *a, std::int64_t lda, + const 
std::complex *b, std::int64_t ldb, std::complex beta, + std::complex *c, std::int64_t ldc, + const cl::sycl::vector_class &dependencies = {}); + +ONEMKL_EXPORT cl::sycl::event syrk( + cl::sycl::queue &queue, uplo upper_lower, transpose trans, std::int64_t n, std::int64_t k, + float alpha, const float *a, std::int64_t lda, float beta, float *c, std::int64_t ldc, + const cl::sycl::vector_class &dependencies = {}); + +ONEMKL_EXPORT cl::sycl::event syrk( + cl::sycl::queue &queue, uplo upper_lower, transpose trans, std::int64_t n, std::int64_t k, + double alpha, const double *a, std::int64_t lda, double beta, double *c, std::int64_t ldc, + const cl::sycl::vector_class &dependencies = {}); + +ONEMKL_EXPORT cl::sycl::event syrk( + cl::sycl::queue &queue, uplo upper_lower, transpose trans, std::int64_t n, std::int64_t k, + std::complex alpha, const std::complex *a, std::int64_t lda, + std::complex beta, std::complex *c, std::int64_t ldc, + const cl::sycl::vector_class &dependencies = {}); + +ONEMKL_EXPORT cl::sycl::event syrk( + cl::sycl::queue &queue, uplo upper_lower, transpose trans, std::int64_t n, std::int64_t k, + std::complex alpha, const std::complex *a, std::int64_t lda, + std::complex beta, std::complex *c, std::int64_t ldc, + const cl::sycl::vector_class &dependencies = {}); + +ONEMKL_EXPORT cl::sycl::event syr2k( + cl::sycl::queue &queue, uplo upper_lower, transpose trans, std::int64_t n, std::int64_t k, + float alpha, const float *a, std::int64_t lda, const float *b, std::int64_t ldb, float beta, + float *c, std::int64_t ldc, const cl::sycl::vector_class &dependencies = {}); + +ONEMKL_EXPORT cl::sycl::event syr2k( + cl::sycl::queue &queue, uplo upper_lower, transpose trans, std::int64_t n, std::int64_t k, + double alpha, const double *a, std::int64_t lda, const double *b, std::int64_t ldb, double beta, + double *c, std::int64_t ldc, const cl::sycl::vector_class &dependencies = {}); + +ONEMKL_EXPORT cl::sycl::event syr2k( + cl::sycl::queue &queue, uplo 
upper_lower, transpose trans, std::int64_t n, std::int64_t k, + std::complex alpha, const std::complex *a, std::int64_t lda, + const std::complex *b, std::int64_t ldb, std::complex beta, + std::complex *c, std::int64_t ldc, + const cl::sycl::vector_class &dependencies = {}); + +ONEMKL_EXPORT cl::sycl::event syr2k( + cl::sycl::queue &queue, uplo upper_lower, transpose trans, std::int64_t n, std::int64_t k, + std::complex alpha, const std::complex *a, std::int64_t lda, + const std::complex *b, std::int64_t ldb, std::complex beta, + std::complex *c, std::int64_t ldc, + const cl::sycl::vector_class &dependencies = {}); + +ONEMKL_EXPORT cl::sycl::event trmm( + cl::sycl::queue &queue, side left_right, uplo upper_lower, transpose trans, diag unit_diag, + std::int64_t m, std::int64_t n, float alpha, const float *a, std::int64_t lda, float *b, + std::int64_t ldb, const cl::sycl::vector_class &dependencies = {}); + +ONEMKL_EXPORT cl::sycl::event trmm( + cl::sycl::queue &queue, side left_right, uplo upper_lower, transpose trans, diag unit_diag, + std::int64_t m, std::int64_t n, double alpha, const double *a, std::int64_t lda, double *b, + std::int64_t ldb, const cl::sycl::vector_class &dependencies = {}); + +ONEMKL_EXPORT cl::sycl::event trmm( + cl::sycl::queue &queue, side left_right, uplo upper_lower, transpose trans, diag unit_diag, + std::int64_t m, std::int64_t n, std::complex alpha, const std::complex *a, + std::int64_t lda, std::complex *b, std::int64_t ldb, + const cl::sycl::vector_class &dependencies = {}); + +ONEMKL_EXPORT cl::sycl::event trmm( + cl::sycl::queue &queue, side left_right, uplo upper_lower, transpose trans, diag unit_diag, + std::int64_t m, std::int64_t n, std::complex alpha, const std::complex *a, + std::int64_t lda, std::complex *b, std::int64_t ldb, + const cl::sycl::vector_class &dependencies = {}); + +ONEMKL_EXPORT cl::sycl::event trsm( + cl::sycl::queue &queue, side left_right, uplo upper_lower, transpose trans, diag unit_diag, + std::int64_t m, 
std::int64_t n, float alpha, const float *a, std::int64_t lda, float *b, + std::int64_t ldb, const cl::sycl::vector_class &dependencies = {}); + +ONEMKL_EXPORT cl::sycl::event trsm( + cl::sycl::queue &queue, side left_right, uplo upper_lower, transpose trans, diag unit_diag, + std::int64_t m, std::int64_t n, double alpha, const double *a, std::int64_t lda, double *b, + std::int64_t ldb, const cl::sycl::vector_class &dependencies = {}); + +ONEMKL_EXPORT cl::sycl::event trsm( + cl::sycl::queue &queue, side left_right, uplo upper_lower, transpose trans, diag unit_diag, + std::int64_t m, std::int64_t n, std::complex alpha, const std::complex *a, + std::int64_t lda, std::complex *b, std::int64_t ldb, + const cl::sycl::vector_class &dependencies = {}); + +ONEMKL_EXPORT cl::sycl::event trsm( + cl::sycl::queue &queue, side left_right, uplo upper_lower, transpose trans, diag unit_diag, + std::int64_t m, std::int64_t n, std::complex alpha, const std::complex *a, + std::int64_t lda, std::complex *b, std::int64_t ldb, + const cl::sycl::vector_class &dependencies = {}); + +ONEMKL_EXPORT cl::sycl::event gemm_batch( + cl::sycl::queue &queue, transpose *transa, transpose *transb, std::int64_t *m, std::int64_t *n, + std::int64_t *k, float *alpha, const float **a, std::int64_t *lda, const float **b, + std::int64_t *ldb, float *beta, float **c, std::int64_t *ldc, std::int64_t group_count, + std::int64_t *group_size, const cl::sycl::vector_class &dependencies = {}); + +ONEMKL_EXPORT cl::sycl::event gemm_batch( + cl::sycl::queue &queue, transpose *transa, transpose *transb, std::int64_t *m, std::int64_t *n, + std::int64_t *k, double *alpha, const double **a, std::int64_t *lda, const double **b, + std::int64_t *ldb, double *beta, double **c, std::int64_t *ldc, std::int64_t group_count, + std::int64_t *group_size, const cl::sycl::vector_class &dependencies = {}); + +ONEMKL_EXPORT cl::sycl::event gemm_batch( + cl::sycl::queue &queue, transpose *transa, transpose *transb, std::int64_t *m, 
std::int64_t *n, + std::int64_t *k, std::complex *alpha, const std::complex **a, std::int64_t *lda, + const std::complex **b, std::int64_t *ldb, std::complex *beta, + std::complex **c, std::int64_t *ldc, std::int64_t group_count, std::int64_t *group_size, + const cl::sycl::vector_class &dependencies = {}); + +ONEMKL_EXPORT cl::sycl::event gemm_batch( + cl::sycl::queue &queue, transpose *transa, transpose *transb, std::int64_t *m, std::int64_t *n, + std::int64_t *k, std::complex *alpha, const std::complex **a, std::int64_t *lda, + const std::complex **b, std::int64_t *ldb, std::complex *beta, + std::complex **c, std::int64_t *ldc, std::int64_t group_count, std::int64_t *group_size, + const cl::sycl::vector_class &dependencies = {}); + +ONEMKL_EXPORT cl::sycl::event gemm_batch( + cl::sycl::queue &queue, transpose transa, transpose transb, std::int64_t m, std::int64_t n, + std::int64_t k, float alpha, const float *a, std::int64_t lda, std::int64_t stride_a, + const float *b, std::int64_t ldb, std::int64_t stride_b, float beta, float *c, std::int64_t ldc, + std::int64_t stride_c, std::int64_t batch_size, + const cl::sycl::vector_class &dependencies = {}); + +ONEMKL_EXPORT cl::sycl::event gemm_batch( + cl::sycl::queue &queue, transpose transa, transpose transb, std::int64_t m, std::int64_t n, + std::int64_t k, double alpha, const double *a, std::int64_t lda, std::int64_t stride_a, + const double *b, std::int64_t ldb, std::int64_t stride_b, double beta, double *c, + std::int64_t ldc, std::int64_t stride_c, std::int64_t batch_size, + const cl::sycl::vector_class &dependencies = {}); + +ONEMKL_EXPORT cl::sycl::event gemm_batch( + cl::sycl::queue &queue, transpose transa, transpose transb, std::int64_t m, std::int64_t n, + std::int64_t k, std::complex alpha, const std::complex *a, std::int64_t lda, + std::int64_t stride_a, const std::complex *b, std::int64_t ldb, std::int64_t stride_b, + std::complex beta, std::complex *c, std::int64_t ldc, std::int64_t stride_c, + 
std::int64_t batch_size, const cl::sycl::vector_class &dependencies = {}); + +ONEMKL_EXPORT cl::sycl::event gemm_batch( + cl::sycl::queue &queue, transpose transa, transpose transb, std::int64_t m, std::int64_t n, + std::int64_t k, std::complex alpha, const std::complex *a, std::int64_t lda, + std::int64_t stride_a, const std::complex *b, std::int64_t ldb, std::int64_t stride_b, + std::complex beta, std::complex *c, std::int64_t ldc, std::int64_t stride_c, + std::int64_t batch_size, const cl::sycl::vector_class &dependencies = {}); + +ONEMKL_EXPORT cl::sycl::event gemmt( + cl::sycl::queue &queue, uplo upper_lower, transpose transa, transpose transb, std::int64_t n, + std::int64_t k, float alpha, const float *a, std::int64_t lda, const float *b, std::int64_t ldb, + float beta, float *c, std::int64_t ldc, + const cl::sycl::vector_class &dependencies = {}); + +ONEMKL_EXPORT cl::sycl::event gemmt( + cl::sycl::queue &queue, uplo upper_lower, transpose transa, transpose transb, std::int64_t n, + std::int64_t k, double alpha, const double *a, std::int64_t lda, const double *b, + std::int64_t ldb, double beta, double *c, std::int64_t ldc, + const cl::sycl::vector_class &dependencies = {}); + +ONEMKL_EXPORT cl::sycl::event gemmt( + cl::sycl::queue &queue, uplo upper_lower, transpose transa, transpose transb, std::int64_t n, + std::int64_t k, std::complex alpha, const std::complex *a, std::int64_t lda, + const std::complex *b, std::int64_t ldb, std::complex beta, + std::complex *c, std::int64_t ldc, + const cl::sycl::vector_class &dependencies = {}); + +ONEMKL_EXPORT cl::sycl::event gemmt( + cl::sycl::queue &queue, uplo upper_lower, transpose transa, transpose transb, std::int64_t n, + std::int64_t k, std::complex alpha, const std::complex *a, std::int64_t lda, + const std::complex *b, std::int64_t ldb, std::complex beta, + std::complex *c, std::int64_t ldc, + const cl::sycl::vector_class &dependencies = {}); + } //namespace mklcpu } //namespace onemkl diff --git 
a/include/onemkl/blas/detail/mklgpu/blas_ct.hpp b/include/onemkl/blas/detail/mklgpu/blas_ct.hpp index fdbf50223..32fde5f67 100644 --- a/include/onemkl/blas/detail/mklgpu/blas_ct.hpp +++ b/include/onemkl/blas/detail/mklgpu/blas_ct.hpp @@ -33,14 +33,13 @@ #include "onemkl_blas_mklgpu.hpp" +#include "onemkl/blas/detail/blas_ct_templates.hpp" + namespace onemkl { namespace blas { -template -static inline void syr2(cl::sycl::queue &queue, uplo upper_lower, std::int64_t n, float alpha, - cl::sycl::buffer &x, std::int64_t incx, - cl::sycl::buffer &y, std::int64_t incy, - cl::sycl::buffer &a, std::int64_t lda); +// Buffer APIs + template <> void syr2(cl::sycl::queue &queue, uplo upper_lower, std::int64_t n, float alpha, @@ -52,11 +51,6 @@ void syr2(cl::sycl::queue &queue, uplo upp syr2_postcondition(queue, upper_lower, n, alpha, x, incx, y, incy, a, lda); } -template -static inline void syr2(cl::sycl::queue &queue, uplo upper_lower, std::int64_t n, double alpha, - cl::sycl::buffer &x, std::int64_t incx, - cl::sycl::buffer &y, std::int64_t incy, - cl::sycl::buffer &a, std::int64_t lda); template <> void syr2(cl::sycl::queue &queue, uplo upper_lower, std::int64_t n, double alpha, @@ -68,9 +62,6 @@ void syr2(cl::sycl::queue &queue, uplo upp syr2_postcondition(queue, upper_lower, n, alpha, x, incx, y, incy, a, lda); } -template -static inline void scal(cl::sycl::queue &queue, std::int64_t n, float alpha, - cl::sycl::buffer &x, std::int64_t incx); template <> void scal(cl::sycl::queue &queue, std::int64_t n, float alpha, cl::sycl::buffer &x, std::int64_t incx) { @@ -79,9 +70,6 @@ void scal(cl::sycl::queue &queue, std::int scal_postcondition(queue, n, alpha, x, incx); } -template -static inline void scal(cl::sycl::queue &queue, std::int64_t n, double alpha, - cl::sycl::buffer &x, std::int64_t incx); template <> void scal(cl::sycl::queue &queue, std::int64_t n, double alpha, cl::sycl::buffer &x, @@ -91,9 +79,6 @@ void scal(cl::sycl::queue &queue, std::int 
scal_postcondition(queue, n, alpha, x, incx); } -template -static inline void scal(cl::sycl::queue &queue, std::int64_t n, std::complex alpha, - cl::sycl::buffer, 1> &x, std::int64_t incx); template <> void scal(cl::sycl::queue &queue, std::int64_t n, std::complex alpha, @@ -104,9 +89,6 @@ void scal(cl::sycl::queue &queue, std::int scal_postcondition(queue, n, alpha, x, incx); } -template -static inline void scal(cl::sycl::queue &queue, std::int64_t n, std::complex alpha, - cl::sycl::buffer, 1> &x, std::int64_t incx); template <> void scal(cl::sycl::queue &queue, std::int64_t n, std::complex alpha, @@ -117,9 +99,6 @@ void scal(cl::sycl::queue &queue, std::int scal_postcondition(queue, n, alpha, x, incx); } -template -static inline void scal(cl::sycl::queue &queue, std::int64_t n, float alpha, - cl::sycl::buffer, 1> &x, std::int64_t incx); template <> void scal(cl::sycl::queue &queue, std::int64_t n, float alpha, cl::sycl::buffer, 1> &x, @@ -129,9 +108,6 @@ void scal(cl::sycl::queue &queue, std::int scal_postcondition(queue, n, alpha, x, incx); } -template -static inline void scal(cl::sycl::queue &queue, std::int64_t n, double alpha, - cl::sycl::buffer, 1> &x, std::int64_t incx); template <> void scal(cl::sycl::queue &queue, std::int64_t n, double alpha, @@ -142,10 +118,6 @@ void scal(cl::sycl::queue &queue, std::int scal_postcondition(queue, n, alpha, x, incx); } -template -static inline void trmv(cl::sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag, - std::int64_t n, cl::sycl::buffer &a, std::int64_t lda, - cl::sycl::buffer &x, std::int64_t incx); template <> void trmv(cl::sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag, std::int64_t n, @@ -156,10 +128,6 @@ void trmv(cl::sycl::queue &queue, uplo upp trmv_postcondition(queue, upper_lower, trans, unit_diag, n, a, lda, x, incx); } -template -static inline void trmv(cl::sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag, - std::int64_t n, cl::sycl::buffer 
&a, std::int64_t lda, - cl::sycl::buffer &x, std::int64_t incx); template <> void trmv(cl::sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag, std::int64_t n, @@ -170,11 +138,6 @@ void trmv(cl::sycl::queue &queue, uplo upp trmv_postcondition(queue, upper_lower, trans, unit_diag, n, a, lda, x, incx); } -template -static inline void trmv(cl::sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag, - std::int64_t n, cl::sycl::buffer, 1> &a, - std::int64_t lda, cl::sycl::buffer, 1> &x, - std::int64_t incx); template <> void trmv(cl::sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag, std::int64_t n, @@ -187,11 +150,6 @@ void trmv(cl::sycl::queue &queue, uplo upp trmv_postcondition(queue, upper_lower, trans, unit_diag, n, a, lda, x, incx); } -template -static inline void trmv(cl::sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag, - std::int64_t n, cl::sycl::buffer, 1> &a, - std::int64_t lda, cl::sycl::buffer, 1> &x, - std::int64_t incx); template <> void trmv(cl::sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag, std::int64_t n, @@ -204,10 +162,6 @@ void trmv(cl::sycl::queue &queue, uplo upp trmv_postcondition(queue, upper_lower, trans, unit_diag, n, a, lda, x, incx); } -template -static inline void tpmv(cl::sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag, - std::int64_t n, cl::sycl::buffer &a, - cl::sycl::buffer &x, std::int64_t incx); template <> void tpmv(cl::sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag, std::int64_t n, @@ -218,10 +172,6 @@ void tpmv(cl::sycl::queue &queue, uplo upp tpmv_postcondition(queue, upper_lower, trans, unit_diag, n, a, x, incx); } -template -static inline void tpmv(cl::sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag, - std::int64_t n, cl::sycl::buffer &a, - cl::sycl::buffer &x, std::int64_t incx); template <> void tpmv(cl::sycl::queue &queue, uplo upper_lower, transpose trans, diag 
unit_diag, std::int64_t n, @@ -232,10 +182,6 @@ void tpmv(cl::sycl::queue &queue, uplo upp tpmv_postcondition(queue, upper_lower, trans, unit_diag, n, a, x, incx); } -template -static inline void tpmv(cl::sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag, - std::int64_t n, cl::sycl::buffer, 1> &a, - cl::sycl::buffer, 1> &x, std::int64_t incx); template <> void tpmv(cl::sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag, std::int64_t n, @@ -247,10 +193,6 @@ void tpmv(cl::sycl::queue &queue, uplo upp tpmv_postcondition(queue, upper_lower, trans, unit_diag, n, a, x, incx); } -template -static inline void tpmv(cl::sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag, - std::int64_t n, cl::sycl::buffer, 1> &a, - cl::sycl::buffer, 1> &x, std::int64_t incx); template <> void tpmv(cl::sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag, std::int64_t n, @@ -262,10 +204,6 @@ void tpmv(cl::sycl::queue &queue, uplo upp tpmv_postcondition(queue, upper_lower, trans, unit_diag, n, a, x, incx); } -template -static inline void spr(cl::sycl::queue &queue, uplo upper_lower, std::int64_t n, float alpha, - cl::sycl::buffer &x, std::int64_t incx, - cl::sycl::buffer &a); template <> void spr(cl::sycl::queue &queue, uplo upper_lower, std::int64_t n, float alpha, @@ -276,10 +214,6 @@ void spr(cl::sycl::queue &queue, uplo uppe spr_postcondition(queue, upper_lower, n, alpha, x, incx, a); } -template -static inline void spr(cl::sycl::queue &queue, uplo upper_lower, std::int64_t n, double alpha, - cl::sycl::buffer &x, std::int64_t incx, - cl::sycl::buffer &a); template <> void spr(cl::sycl::queue &queue, uplo upper_lower, std::int64_t n, double alpha, @@ -290,12 +224,6 @@ void spr(cl::sycl::queue &queue, uplo uppe spr_postcondition(queue, upper_lower, n, alpha, x, incx, a); } -template -static inline void hpmv(cl::sycl::queue &queue, uplo upper_lower, std::int64_t n, - std::complex alpha, cl::sycl::buffer, 1> &a, - 
cl::sycl::buffer, 1> &x, std::int64_t incx, - std::complex beta, cl::sycl::buffer, 1> &y, - std::int64_t incy); template <> void hpmv(cl::sycl::queue &queue, uplo upper_lower, std::int64_t n, std::complex alpha, @@ -309,12 +237,6 @@ void hpmv(cl::sycl::queue &queue, uplo upp hpmv_postcondition(queue, upper_lower, n, alpha, a, x, incx, beta, y, incy); } -template -static inline void hpmv(cl::sycl::queue &queue, uplo upper_lower, std::int64_t n, - std::complex alpha, cl::sycl::buffer, 1> &a, - cl::sycl::buffer, 1> &x, std::int64_t incx, - std::complex beta, cl::sycl::buffer, 1> &y, - std::int64_t incy); template <> void hpmv(cl::sycl::queue &queue, uplo upper_lower, std::int64_t n, std::complex alpha, @@ -328,11 +250,6 @@ void hpmv(cl::sycl::queue &queue, uplo upp hpmv_postcondition(queue, upper_lower, n, alpha, a, x, incx, beta, y, incy); } -template -static inline void syrk(cl::sycl::queue &queue, uplo upper_lower, transpose trans, std::int64_t n, - std::int64_t k, float alpha, cl::sycl::buffer &a, - std::int64_t lda, float beta, cl::sycl::buffer &c, - std::int64_t ldc); template <> void syrk(cl::sycl::queue &queue, uplo upper_lower, transpose trans, std::int64_t n, std::int64_t k, @@ -344,11 +261,6 @@ void syrk(cl::sycl::queue &queue, uplo upp syrk_postcondition(queue, upper_lower, trans, n, k, alpha, a, lda, beta, c, ldc); } -template -static inline void syrk(cl::sycl::queue &queue, uplo upper_lower, transpose trans, std::int64_t n, - std::int64_t k, double alpha, cl::sycl::buffer &a, - std::int64_t lda, double beta, cl::sycl::buffer &c, - std::int64_t ldc); template <> void syrk(cl::sycl::queue &queue, uplo upper_lower, transpose trans, std::int64_t n, std::int64_t k, @@ -360,12 +272,6 @@ void syrk(cl::sycl::queue &queue, uplo upp syrk_postcondition(queue, upper_lower, trans, n, k, alpha, a, lda, beta, c, ldc); } -template -static inline void syrk(cl::sycl::queue &queue, uplo upper_lower, transpose trans, std::int64_t n, - std::int64_t k, std::complex alpha, - 
cl::sycl::buffer, 1> &a, std::int64_t lda, - std::complex beta, cl::sycl::buffer, 1> &c, - std::int64_t ldc); template <> void syrk( cl::sycl::queue &queue, uplo upper_lower, transpose trans, std::int64_t n, std::int64_t k, @@ -376,12 +282,6 @@ void syrk( syrk_postcondition(queue, upper_lower, trans, n, k, alpha, a, lda, beta, c, ldc); } -template -static inline void syrk(cl::sycl::queue &queue, uplo upper_lower, transpose trans, std::int64_t n, - std::int64_t k, std::complex alpha, - cl::sycl::buffer, 1> &a, std::int64_t lda, - std::complex beta, cl::sycl::buffer, 1> &c, - std::int64_t ldc); template <> void syrk( cl::sycl::queue &queue, uplo upper_lower, transpose trans, std::int64_t n, std::int64_t k, @@ -392,12 +292,6 @@ void syrk( syrk_postcondition(queue, upper_lower, trans, n, k, alpha, a, lda, beta, c, ldc); } -template -static inline void her2(cl::sycl::queue &queue, uplo upper_lower, std::int64_t n, - std::complex alpha, cl::sycl::buffer, 1> &x, - std::int64_t incx, cl::sycl::buffer, 1> &y, - std::int64_t incy, cl::sycl::buffer, 1> &a, - std::int64_t lda); template <> void her2( cl::sycl::queue &queue, uplo upper_lower, std::int64_t n, std::complex alpha, @@ -409,12 +303,6 @@ void her2( her2_postcondition(queue, upper_lower, n, alpha, x, incx, y, incy, a, lda); } -template -static inline void her2(cl::sycl::queue &queue, uplo upper_lower, std::int64_t n, - std::complex alpha, cl::sycl::buffer, 1> &x, - std::int64_t incx, cl::sycl::buffer, 1> &y, - std::int64_t incy, cl::sycl::buffer, 1> &a, - std::int64_t lda); template <> void her2( cl::sycl::queue &queue, uplo upper_lower, std::int64_t n, std::complex alpha, @@ -426,12 +314,6 @@ void her2( her2_postcondition(queue, upper_lower, n, alpha, x, incx, y, incy, a, lda); } -template -static inline void hbmv(cl::sycl::queue &queue, uplo upper_lower, std::int64_t n, std::int64_t k, - std::complex alpha, cl::sycl::buffer, 1> &a, - std::int64_t lda, cl::sycl::buffer, 1> &x, - std::int64_t incx, std::complex beta, 
- cl::sycl::buffer, 1> &y, std::int64_t incy); template <> void hbmv( cl::sycl::queue &queue, uplo upper_lower, std::int64_t n, std::int64_t k, @@ -443,12 +325,6 @@ void hbmv( hbmv_postcondition(queue, upper_lower, n, k, alpha, a, lda, x, incx, beta, y, incy); } -template -static inline void hbmv(cl::sycl::queue &queue, uplo upper_lower, std::int64_t n, std::int64_t k, - std::complex alpha, cl::sycl::buffer, 1> &a, - std::int64_t lda, cl::sycl::buffer, 1> &x, - std::int64_t incx, std::complex beta, - cl::sycl::buffer, 1> &y, std::int64_t incy); template <> void hbmv( cl::sycl::queue &queue, uplo upper_lower, std::int64_t n, std::int64_t k, @@ -460,11 +336,6 @@ void hbmv( hbmv_postcondition(queue, upper_lower, n, k, alpha, a, lda, x, incx, beta, y, incy); } -template -static inline void rot(cl::sycl::queue &queue, std::int64_t n, - cl::sycl::buffer, 1> &x, std::int64_t incx, - cl::sycl::buffer, 1> &y, std::int64_t incy, float c, - float s); template <> void rot(cl::sycl::queue &queue, std::int64_t n, cl::sycl::buffer, 1> &x, @@ -476,11 +347,6 @@ void rot(cl::sycl::queue &queue, std::int6 rot_postcondition(queue, n, x, incx, y, incy, c, s); } -template -static inline void rot(cl::sycl::queue &queue, std::int64_t n, - cl::sycl::buffer, 1> &x, std::int64_t incx, - cl::sycl::buffer, 1> &y, std::int64_t incy, double c, - double s); template <> void rot(cl::sycl::queue &queue, std::int64_t n, cl::sycl::buffer, 1> &x, @@ -492,10 +358,6 @@ void rot(cl::sycl::queue &queue, std::int6 rot_postcondition(queue, n, x, incx, y, incy, c, s); } -template -static inline void rot(cl::sycl::queue &queue, std::int64_t n, cl::sycl::buffer &x, - std::int64_t incx, cl::sycl::buffer &y, std::int64_t incy, float c, - float s); template <> void rot(cl::sycl::queue &queue, std::int64_t n, cl::sycl::buffer &x, std::int64_t incx, @@ -506,10 +368,6 @@ void rot(cl::sycl::queue &queue, std::int6 rot_postcondition(queue, n, x, incx, y, incy, c, s); } -template -static inline void rot(cl::sycl::queue 
&queue, std::int64_t n, cl::sycl::buffer &x, - std::int64_t incx, cl::sycl::buffer &y, std::int64_t incy, - double c, double s); template <> void rot(cl::sycl::queue &queue, std::int64_t n, cl::sycl::buffer &x, std::int64_t incx, @@ -520,10 +378,6 @@ void rot(cl::sycl::queue &queue, std::int6 rot_postcondition(queue, n, x, incx, y, incy, c, s); } -template -static inline void axpy(cl::sycl::queue &queue, std::int64_t n, float alpha, - cl::sycl::buffer &x, std::int64_t incx, - cl::sycl::buffer &y, std::int64_t incy); template <> void axpy(cl::sycl::queue &queue, std::int64_t n, float alpha, cl::sycl::buffer &x, std::int64_t incx, @@ -533,10 +387,6 @@ void axpy(cl::sycl::queue &queue, std::int axpy_postcondition(queue, n, alpha, x, incx, y, incy); } -template -static inline void axpy(cl::sycl::queue &queue, std::int64_t n, double alpha, - cl::sycl::buffer &x, std::int64_t incx, - cl::sycl::buffer &y, std::int64_t incy); template <> void axpy(cl::sycl::queue &queue, std::int64_t n, double alpha, cl::sycl::buffer &x, @@ -547,10 +397,6 @@ void axpy(cl::sycl::queue &queue, std::int axpy_postcondition(queue, n, alpha, x, incx, y, incy); } -template -static inline void axpy(cl::sycl::queue &queue, std::int64_t n, std::complex alpha, - cl::sycl::buffer, 1> &x, std::int64_t incx, - cl::sycl::buffer, 1> &y, std::int64_t incy); template <> void axpy(cl::sycl::queue &queue, std::int64_t n, std::complex alpha, @@ -563,10 +409,6 @@ void axpy(cl::sycl::queue &queue, std::int axpy_postcondition(queue, n, alpha, x, incx, y, incy); } -template -static inline void axpy(cl::sycl::queue &queue, std::int64_t n, std::complex alpha, - cl::sycl::buffer, 1> &x, std::int64_t incx, - cl::sycl::buffer, 1> &y, std::int64_t incy); template <> void axpy(cl::sycl::queue &queue, std::int64_t n, std::complex alpha, @@ -579,12 +421,6 @@ void axpy(cl::sycl::queue &queue, std::int axpy_postcondition(queue, n, alpha, x, incx, y, incy); } -template -static inline void gerc(cl::sycl::queue &queue, 
std::int64_t m, std::int64_t n, - std::complex alpha, cl::sycl::buffer, 1> &x, - std::int64_t incx, cl::sycl::buffer, 1> &y, - std::int64_t incy, cl::sycl::buffer, 1> &a, - std::int64_t lda); template <> void gerc( cl::sycl::queue &queue, std::int64_t m, std::int64_t n, std::complex alpha, @@ -596,12 +432,6 @@ void gerc( gerc_postcondition(queue, m, n, alpha, x, incx, y, incy, a, lda); } -template -static inline void gerc(cl::sycl::queue &queue, std::int64_t m, std::int64_t n, - std::complex alpha, cl::sycl::buffer, 1> &x, - std::int64_t incx, cl::sycl::buffer, 1> &y, - std::int64_t incy, cl::sycl::buffer, 1> &a, - std::int64_t lda); template <> void gerc( cl::sycl::queue &queue, std::int64_t m, std::int64_t n, std::complex alpha, @@ -613,11 +443,6 @@ void gerc( gerc_postcondition(queue, m, n, alpha, x, incx, y, incy, a, lda); } -template -static inline void syr2k(cl::sycl::queue &queue, uplo upper_lower, transpose trans, std::int64_t n, - std::int64_t k, float alpha, cl::sycl::buffer &a, - std::int64_t lda, cl::sycl::buffer &b, std::int64_t ldb, - float beta, cl::sycl::buffer &c, std::int64_t ldc); template <> void syr2k(cl::sycl::queue &queue, uplo upper_lower, transpose trans, std::int64_t n, std::int64_t k, @@ -630,11 +455,6 @@ void syr2k(cl::sycl::queue &queue, uplo up syr2k_postcondition(queue, upper_lower, trans, n, k, alpha, a, lda, b, ldb, beta, c, ldc); } -template -static inline void syr2k(cl::sycl::queue &queue, uplo upper_lower, transpose trans, std::int64_t n, - std::int64_t k, double alpha, cl::sycl::buffer &a, - std::int64_t lda, cl::sycl::buffer &b, std::int64_t ldb, - double beta, cl::sycl::buffer &c, std::int64_t ldc); template <> void syr2k(cl::sycl::queue &queue, uplo upper_lower, transpose trans, std::int64_t n, std::int64_t k, @@ -647,13 +467,6 @@ void syr2k(cl::sycl::queue &queue, uplo up syr2k_postcondition(queue, upper_lower, trans, n, k, alpha, a, lda, b, ldb, beta, c, ldc); } -template -static inline void syr2k(cl::sycl::queue &queue, 
uplo upper_lower, transpose trans, std::int64_t n, - std::int64_t k, std::complex alpha, - cl::sycl::buffer, 1> &a, std::int64_t lda, - cl::sycl::buffer, 1> &b, std::int64_t ldb, - std::complex beta, cl::sycl::buffer, 1> &c, - std::int64_t ldc); template <> void syr2k( cl::sycl::queue &queue, uplo upper_lower, transpose trans, std::int64_t n, std::int64_t k, @@ -665,13 +478,6 @@ void syr2k( syr2k_postcondition(queue, upper_lower, trans, n, k, alpha, a, lda, b, ldb, beta, c, ldc); } -template -static inline void syr2k(cl::sycl::queue &queue, uplo upper_lower, transpose trans, std::int64_t n, - std::int64_t k, std::complex alpha, - cl::sycl::buffer, 1> &a, std::int64_t lda, - cl::sycl::buffer, 1> &b, std::int64_t ldb, - std::complex beta, cl::sycl::buffer, 1> &c, - std::int64_t ldc); template <> void syr2k( cl::sycl::queue &queue, uplo upper_lower, transpose trans, std::int64_t n, std::int64_t k, @@ -683,11 +489,6 @@ void syr2k( syr2k_postcondition(queue, upper_lower, trans, n, k, alpha, a, lda, b, ldb, beta, c, ldc); } -template -static inline void gemv(cl::sycl::queue &queue, transpose trans, std::int64_t m, std::int64_t n, - float alpha, cl::sycl::buffer &a, std::int64_t lda, - cl::sycl::buffer &x, std::int64_t incx, float beta, - cl::sycl::buffer &y, std::int64_t incy); template <> void gemv(cl::sycl::queue &queue, transpose trans, std::int64_t m, std::int64_t n, float alpha, @@ -700,11 +501,6 @@ void gemv(cl::sycl::queue &queue, transpos gemv_postcondition(queue, trans, m, n, alpha, a, lda, x, incx, beta, y, incy); } -template -static inline void gemv(cl::sycl::queue &queue, transpose trans, std::int64_t m, std::int64_t n, - double alpha, cl::sycl::buffer &a, std::int64_t lda, - cl::sycl::buffer &x, std::int64_t incx, double beta, - cl::sycl::buffer &y, std::int64_t incy); template <> void gemv(cl::sycl::queue &queue, transpose trans, std::int64_t m, std::int64_t n, double alpha, @@ -717,12 +513,6 @@ void gemv(cl::sycl::queue &queue, transpos 
gemv_postcondition(queue, trans, m, n, alpha, a, lda, x, incx, beta, y, incy); } -template -static inline void gemv(cl::sycl::queue &queue, transpose trans, std::int64_t m, std::int64_t n, - std::complex alpha, cl::sycl::buffer, 1> &a, - std::int64_t lda, cl::sycl::buffer, 1> &x, - std::int64_t incx, std::complex beta, - cl::sycl::buffer, 1> &y, std::int64_t incy); template <> void gemv( cl::sycl::queue &queue, transpose trans, std::int64_t m, std::int64_t n, @@ -734,12 +524,6 @@ void gemv( gemv_postcondition(queue, trans, m, n, alpha, a, lda, x, incx, beta, y, incy); } -template -static inline void gemv(cl::sycl::queue &queue, transpose trans, std::int64_t m, std::int64_t n, - std::complex alpha, cl::sycl::buffer, 1> &a, - std::int64_t lda, cl::sycl::buffer, 1> &x, - std::int64_t incx, std::complex beta, - cl::sycl::buffer, 1> &y, std::int64_t incy); template <> void gemv( cl::sycl::queue &queue, transpose trans, std::int64_t m, std::int64_t n, @@ -751,10 +535,6 @@ void gemv( gemv_postcondition(queue, trans, m, n, alpha, a, lda, x, incx, beta, y, incy); } -template -static inline void her(cl::sycl::queue &queue, uplo upper_lower, std::int64_t n, float alpha, - cl::sycl::buffer, 1> &x, std::int64_t incx, - cl::sycl::buffer, 1> &a, std::int64_t lda); template <> void her(cl::sycl::queue &queue, uplo upper_lower, std::int64_t n, float alpha, @@ -767,10 +547,6 @@ void her(cl::sycl::queue &queue, uplo uppe her_postcondition(queue, upper_lower, n, alpha, x, incx, a, lda); } -template -static inline void her(cl::sycl::queue &queue, uplo upper_lower, std::int64_t n, double alpha, - cl::sycl::buffer, 1> &x, std::int64_t incx, - cl::sycl::buffer, 1> &a, std::int64_t lda); template <> void her(cl::sycl::queue &queue, uplo upper_lower, std::int64_t n, double alpha, @@ -783,10 +559,6 @@ void her(cl::sycl::queue &queue, uplo uppe her_postcondition(queue, upper_lower, n, alpha, x, incx, a, lda); } -template -static inline void hpr(cl::sycl::queue &queue, uplo upper_lower, 
std::int64_t n, float alpha, - cl::sycl::buffer, 1> &x, std::int64_t incx, - cl::sycl::buffer, 1> &a); template <> void hpr(cl::sycl::queue &queue, uplo upper_lower, std::int64_t n, float alpha, @@ -798,10 +570,6 @@ void hpr(cl::sycl::queue &queue, uplo uppe hpr_postcondition(queue, upper_lower, n, alpha, x, incx, a); } -template -static inline void hpr(cl::sycl::queue &queue, uplo upper_lower, std::int64_t n, double alpha, - cl::sycl::buffer, 1> &x, std::int64_t incx, - cl::sycl::buffer, 1> &a); template <> void hpr(cl::sycl::queue &queue, uplo upper_lower, std::int64_t n, double alpha, @@ -813,9 +581,6 @@ void hpr(cl::sycl::queue &queue, uplo uppe hpr_postcondition(queue, upper_lower, n, alpha, x, incx, a); } -template -static inline void iamin(cl::sycl::queue &queue, std::int64_t n, cl::sycl::buffer &x, - std::int64_t incx, cl::sycl::buffer &result); template <> void iamin(cl::sycl::queue &queue, std::int64_t n, cl::sycl::buffer &x, std::int64_t incx, @@ -825,9 +590,6 @@ void iamin(cl::sycl::queue &queue, std::in iamin_postcondition(queue, n, x, incx, result); } -template -static inline void iamin(cl::sycl::queue &queue, std::int64_t n, cl::sycl::buffer &x, - std::int64_t incx, cl::sycl::buffer &result); template <> void iamin(cl::sycl::queue &queue, std::int64_t n, cl::sycl::buffer &x, std::int64_t incx, @@ -837,10 +599,6 @@ void iamin(cl::sycl::queue &queue, std::in iamin_postcondition(queue, n, x, incx, result); } -template -static inline void iamin(cl::sycl::queue &queue, std::int64_t n, - cl::sycl::buffer, 1> &x, std::int64_t incx, - cl::sycl::buffer &result); template <> void iamin(cl::sycl::queue &queue, std::int64_t n, cl::sycl::buffer, 1> &x, @@ -851,10 +609,6 @@ void iamin(cl::sycl::queue &queue, std::in iamin_postcondition(queue, n, x, incx, result); } -template -static inline void iamin(cl::sycl::queue &queue, std::int64_t n, - cl::sycl::buffer, 1> &x, std::int64_t incx, - cl::sycl::buffer &result); template <> void iamin(cl::sycl::queue &queue, 
std::int64_t n, cl::sycl::buffer, 1> &x, @@ -864,128 +618,7 @@ void iamin(cl::sycl::queue &queue, std::in onemkl::mklgpu::iamin(queue, n, x, incx, result); iamin_postcondition(queue, n, x, incx, result); } -template -static inline void gemm_batch(cl::sycl::queue &queue, cl::sycl::buffer &transa, - cl::sycl::buffer &transb, - cl::sycl::buffer &m, - cl::sycl::buffer &n, - cl::sycl::buffer &k, - cl::sycl::buffer &alpha, cl::sycl::buffer &a, - cl::sycl::buffer &lda, cl::sycl::buffer &b, - cl::sycl::buffer &ldb, - cl::sycl::buffer &beta, cl::sycl::buffer &c, - cl::sycl::buffer &ldc, std::int64_t group_count, - cl::sycl::buffer &group_size); -template <> -void gemm_batch( - cl::sycl::queue &queue, cl::sycl::buffer &transa, - cl::sycl::buffer &transb, cl::sycl::buffer &m, - cl::sycl::buffer &n, cl::sycl::buffer &k, - cl::sycl::buffer &alpha, cl::sycl::buffer &a, - cl::sycl::buffer &lda, cl::sycl::buffer &b, - cl::sycl::buffer &ldb, cl::sycl::buffer &beta, - cl::sycl::buffer &c, cl::sycl::buffer &ldc, std::int64_t group_count, - cl::sycl::buffer &group_size) { - gemm_batch_precondition(queue, transa, transb, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc, - group_count, group_size); - onemkl::mklgpu::gemm_batch(queue, transa, transb, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc, - group_count, group_size); - gemm_batch_postcondition(queue, transa, transb, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc, - group_count, group_size); -} - -template -static inline void gemm_batch( - cl::sycl::queue &queue, cl::sycl::buffer &transa, - cl::sycl::buffer &transb, cl::sycl::buffer &m, - cl::sycl::buffer &n, cl::sycl::buffer &k, - cl::sycl::buffer &alpha, cl::sycl::buffer &a, - cl::sycl::buffer &lda, cl::sycl::buffer &b, - cl::sycl::buffer &ldb, cl::sycl::buffer &beta, - cl::sycl::buffer &c, cl::sycl::buffer &ldc, - std::int64_t group_count, cl::sycl::buffer &group_size); -template <> -void gemm_batch( - cl::sycl::queue &queue, cl::sycl::buffer &transa, - cl::sycl::buffer &transb, 
cl::sycl::buffer &m, - cl::sycl::buffer &n, cl::sycl::buffer &k, - cl::sycl::buffer &alpha, cl::sycl::buffer &a, - cl::sycl::buffer &lda, cl::sycl::buffer &b, - cl::sycl::buffer &ldb, cl::sycl::buffer &beta, - cl::sycl::buffer &c, cl::sycl::buffer &ldc, - std::int64_t group_count, cl::sycl::buffer &group_size) { - gemm_batch_precondition(queue, transa, transb, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc, - group_count, group_size); - onemkl::mklgpu::gemm_batch(queue, transa, transb, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc, - group_count, group_size); - gemm_batch_postcondition(queue, transa, transb, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc, - group_count, group_size); -} - -template -static inline void gemm_batch( - cl::sycl::queue &queue, cl::sycl::buffer &transa, - cl::sycl::buffer &transb, cl::sycl::buffer &m, - cl::sycl::buffer &n, cl::sycl::buffer &k, - cl::sycl::buffer, 1> &alpha, cl::sycl::buffer, 1> &a, - cl::sycl::buffer &lda, cl::sycl::buffer, 1> &b, - cl::sycl::buffer &ldb, cl::sycl::buffer, 1> &beta, - cl::sycl::buffer, 1> &c, cl::sycl::buffer &ldc, - std::int64_t group_count, cl::sycl::buffer &group_size); -template <> -void gemm_batch( - cl::sycl::queue &queue, cl::sycl::buffer &transa, - cl::sycl::buffer &transb, cl::sycl::buffer &m, - cl::sycl::buffer &n, cl::sycl::buffer &k, - cl::sycl::buffer, 1> &alpha, cl::sycl::buffer, 1> &a, - cl::sycl::buffer &lda, cl::sycl::buffer, 1> &b, - cl::sycl::buffer &ldb, cl::sycl::buffer, 1> &beta, - cl::sycl::buffer, 1> &c, cl::sycl::buffer &ldc, - std::int64_t group_count, cl::sycl::buffer &group_size) { - gemm_batch_precondition(queue, transa, transb, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc, - group_count, group_size); - onemkl::mklgpu::gemm_batch(queue, transa, transb, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc, - group_count, group_size); - gemm_batch_postcondition(queue, transa, transb, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc, - group_count, group_size); -} - -template -static inline void 
gemm_batch( - cl::sycl::queue &queue, cl::sycl::buffer &transa, - cl::sycl::buffer &transb, cl::sycl::buffer &m, - cl::sycl::buffer &n, cl::sycl::buffer &k, - cl::sycl::buffer, 1> &alpha, cl::sycl::buffer, 1> &a, - cl::sycl::buffer &lda, cl::sycl::buffer, 1> &b, - cl::sycl::buffer &ldb, cl::sycl::buffer, 1> &beta, - cl::sycl::buffer, 1> &c, cl::sycl::buffer &ldc, - std::int64_t group_count, cl::sycl::buffer &group_size); -template <> -void gemm_batch( - cl::sycl::queue &queue, cl::sycl::buffer &transa, - cl::sycl::buffer &transb, cl::sycl::buffer &m, - cl::sycl::buffer &n, cl::sycl::buffer &k, - cl::sycl::buffer, 1> &alpha, cl::sycl::buffer, 1> &a, - cl::sycl::buffer &lda, cl::sycl::buffer, 1> &b, - cl::sycl::buffer &ldb, cl::sycl::buffer, 1> &beta, - cl::sycl::buffer, 1> &c, cl::sycl::buffer &ldc, - std::int64_t group_count, cl::sycl::buffer &group_size) { - gemm_batch_precondition(queue, transa, transb, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc, - group_count, group_size); - onemkl::mklgpu::gemm_batch(queue, transa, transb, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc, - group_count, group_size); - gemm_batch_postcondition(queue, transa, transb, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc, - group_count, group_size); -} -template -static inline void gemm_batch(cl::sycl::queue &queue, transpose transa, transpose transb, - std::int64_t m, std::int64_t n, std::int64_t k, float alpha, - cl::sycl::buffer &a, std::int64_t lda, - std::int64_t stride_a, cl::sycl::buffer &b, - std::int64_t ldb, std::int64_t stride_b, float beta, - cl::sycl::buffer &c, std::int64_t ldc, - std::int64_t stride_c, std::int64_t batch_size); template <> void gemm_batch( cl::sycl::queue &queue, transpose transa, transpose transb, std::int64_t m, std::int64_t n, @@ -1001,14 +634,6 @@ void gemm_batch( stride_b, beta, c, ldc, stride_c, batch_size); } -template -static inline void gemm_batch(cl::sycl::queue &queue, transpose transa, transpose transb, - std::int64_t m, std::int64_t n, 
std::int64_t k, double alpha, - cl::sycl::buffer &a, std::int64_t lda, - std::int64_t stride_a, cl::sycl::buffer &b, - std::int64_t ldb, std::int64_t stride_b, double beta, - cl::sycl::buffer &c, std::int64_t ldc, - std::int64_t stride_c, std::int64_t batch_size); template <> void gemm_batch( cl::sycl::queue &queue, transpose transa, transpose transb, std::int64_t m, std::int64_t n, @@ -1024,15 +649,6 @@ void gemm_batch( stride_b, beta, c, ldc, stride_c, batch_size); } -template -static inline void gemm_batch(cl::sycl::queue &queue, transpose transa, transpose transb, - std::int64_t m, std::int64_t n, std::int64_t k, - std::complex alpha, - cl::sycl::buffer, 1> &a, std::int64_t lda, - std::int64_t stride_a, cl::sycl::buffer, 1> &b, - std::int64_t ldb, std::int64_t stride_b, std::complex beta, - cl::sycl::buffer, 1> &c, std::int64_t ldc, - std::int64_t stride_c, std::int64_t batch_size); template <> void gemm_batch( cl::sycl::queue &queue, transpose transa, transpose transb, std::int64_t m, std::int64_t n, @@ -1049,15 +665,6 @@ void gemm_batch( stride_b, beta, c, ldc, stride_c, batch_size); } -template -static inline void gemm_batch(cl::sycl::queue &queue, transpose transa, transpose transb, - std::int64_t m, std::int64_t n, std::int64_t k, - std::complex alpha, - cl::sycl::buffer, 1> &a, std::int64_t lda, - std::int64_t stride_a, cl::sycl::buffer, 1> &b, - std::int64_t ldb, std::int64_t stride_b, std::complex beta, - cl::sycl::buffer, 1> &c, std::int64_t ldc, - std::int64_t stride_c, std::int64_t batch_size); template <> void gemm_batch( cl::sycl::queue &queue, transpose transa, transpose transb, std::int64_t m, std::int64_t n, @@ -1074,11 +681,6 @@ void gemm_batch( stride_b, beta, c, ldc, stride_c, batch_size); } -template -static inline void spmv(cl::sycl::queue &queue, uplo upper_lower, std::int64_t n, float alpha, - cl::sycl::buffer &a, cl::sycl::buffer &x, - std::int64_t incx, float beta, cl::sycl::buffer &y, - std::int64_t incy); template <> void 
spmv(cl::sycl::queue &queue, uplo upper_lower, std::int64_t n, float alpha, @@ -1091,11 +693,6 @@ void spmv(cl::sycl::queue &queue, uplo upp spmv_postcondition(queue, upper_lower, n, alpha, a, x, incx, beta, y, incy); } -template -static inline void spmv(cl::sycl::queue &queue, uplo upper_lower, std::int64_t n, double alpha, - cl::sycl::buffer &a, cl::sycl::buffer &x, - std::int64_t incx, double beta, cl::sycl::buffer &y, - std::int64_t incy); template <> void spmv(cl::sycl::queue &queue, uplo upper_lower, std::int64_t n, double alpha, @@ -1108,12 +705,6 @@ void spmv(cl::sycl::queue &queue, uplo upp spmv_postcondition(queue, upper_lower, n, alpha, a, x, incx, beta, y, incy); } -template -static inline void gemm_ext(cl::sycl::queue &queue, transpose transa, transpose transb, - std::int64_t m, std::int64_t n, std::int64_t k, float alpha, - cl::sycl::buffer &a, std::int64_t lda, - cl::sycl::buffer &b, std::int64_t ldb, float beta, - cl::sycl::buffer &c, std::int64_t ldc); template <> void gemm_ext(cl::sycl::queue &queue, transpose transa, transpose transb, std::int64_t m, @@ -1127,13 +718,6 @@ void gemm_ext(cl::sycl::queue &queue, tran gemm_ext_postcondition(queue, transa, transb, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc); } -template -static inline void gemm_ext(cl::sycl::queue &queue, transpose transa, transpose transb, - offset offsetc, std::int64_t m, std::int64_t n, std::int64_t k, - float alpha, cl::sycl::buffer &a, std::int64_t lda, - int8_t ao, cl::sycl::buffer &b, std::int64_t ldb, - uint8_t bo, float beta, cl::sycl::buffer &c, - std::int64_t ldc, cl::sycl::buffer &co); template <> void gemm_ext( cl::sycl::queue &queue, transpose transa, transpose transb, offset offsetc, std::int64_t m, @@ -1148,12 +732,6 @@ void gemm_ext( beta, c, ldc, co); } -template -static inline void gemm_ext(cl::sycl::queue &queue, transpose transa, transpose transb, - std::int64_t m, std::int64_t n, std::int64_t k, float alpha, - cl::sycl::buffer &a, std::int64_t lda, - 
cl::sycl::buffer &b, std::int64_t ldb, float beta, - cl::sycl::buffer &c, std::int64_t ldc); template <> void gemm_ext(cl::sycl::queue &queue, transpose transa, transpose transb, std::int64_t m, @@ -1167,12 +745,6 @@ void gemm_ext(cl::sycl::queue &queue, tran gemm_ext_postcondition(queue, transa, transb, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc); } -template -static inline void gemm_ext(cl::sycl::queue &queue, transpose transa, transpose transb, - std::int64_t m, std::int64_t n, std::int64_t k, double alpha, - cl::sycl::buffer &a, std::int64_t lda, - cl::sycl::buffer &b, std::int64_t ldb, double beta, - cl::sycl::buffer &c, std::int64_t ldc); template <> void gemm_ext( cl::sycl::queue &queue, transpose transa, transpose transb, std::int64_t m, std::int64_t n, @@ -1184,13 +756,6 @@ void gemm_ext( gemm_ext_postcondition(queue, transa, transb, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc); } -template -static inline void gemm_ext(cl::sycl::queue &queue, transpose transa, transpose transb, - std::int64_t m, std::int64_t n, std::int64_t k, - std::complex alpha, cl::sycl::buffer, 1> &a, - std::int64_t lda, cl::sycl::buffer, 1> &b, - std::int64_t ldb, std::complex beta, - cl::sycl::buffer, 1> &c, std::int64_t ldc); template <> void gemm_ext( cl::sycl::queue &queue, transpose transa, transpose transb, std::int64_t m, std::int64_t n, @@ -1202,14 +767,6 @@ void gemm_ext( gemm_ext_postcondition(queue, transa, transb, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc); } -template -static inline void gemm_ext(cl::sycl::queue &queue, transpose transa, transpose transb, - std::int64_t m, std::int64_t n, std::int64_t k, - std::complex alpha, - cl::sycl::buffer, 1> &a, std::int64_t lda, - cl::sycl::buffer, 1> &b, std::int64_t ldb, - std::complex beta, cl::sycl::buffer, 1> &c, - std::int64_t ldc); template <> void gemm_ext( cl::sycl::queue &queue, transpose transa, transpose transb, std::int64_t m, std::int64_t n, @@ -1221,12 +778,6 @@ void gemm_ext( gemm_ext_postcondition(queue, 
transa, transb, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc); } -template -static inline void gemm_ext(cl::sycl::queue &queue, transpose transa, transpose transb, - std::int64_t m, std::int64_t n, std::int64_t k, half alpha, - cl::sycl::buffer &a, std::int64_t lda, - cl::sycl::buffer &b, std::int64_t ldb, half beta, - cl::sycl::buffer &c, std::int64_t ldc); template <> void gemm_ext(cl::sycl::queue &queue, transpose transa, transpose transb, std::int64_t m, @@ -1240,9 +791,6 @@ void gemm_ext(cl::sycl::queue &queue, tran gemm_ext_postcondition(queue, transa, transb, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc); } -template -static inline void swap(cl::sycl::queue &queue, std::int64_t n, cl::sycl::buffer &x, - std::int64_t incx, cl::sycl::buffer &y, std::int64_t incy); template <> void swap(cl::sycl::queue &queue, std::int64_t n, cl::sycl::buffer &x, std::int64_t incx, @@ -1252,9 +800,6 @@ void swap(cl::sycl::queue &queue, std::int swap_postcondition(queue, n, x, incx, y, incy); } -template -static inline void swap(cl::sycl::queue &queue, std::int64_t n, cl::sycl::buffer &x, - std::int64_t incx, cl::sycl::buffer &y, std::int64_t incy); template <> void swap(cl::sycl::queue &queue, std::int64_t n, cl::sycl::buffer &x, std::int64_t incx, @@ -1264,10 +809,6 @@ void swap(cl::sycl::queue &queue, std::int swap_postcondition(queue, n, x, incx, y, incy); } -template -static inline void swap(cl::sycl::queue &queue, std::int64_t n, - cl::sycl::buffer, 1> &x, std::int64_t incx, - cl::sycl::buffer, 1> &y, std::int64_t incy); template <> void swap(cl::sycl::queue &queue, std::int64_t n, cl::sycl::buffer, 1> &x, @@ -1279,10 +820,6 @@ void swap(cl::sycl::queue &queue, std::int swap_postcondition(queue, n, x, incx, y, incy); } -template -static inline void swap(cl::sycl::queue &queue, std::int64_t n, - cl::sycl::buffer, 1> &x, std::int64_t incx, - cl::sycl::buffer, 1> &y, std::int64_t incy); template <> void swap(cl::sycl::queue &queue, std::int64_t n, cl::sycl::buffer, 1> &x, @@ 
-1294,12 +831,6 @@ void swap(cl::sycl::queue &queue, std::int swap_postcondition(queue, n, x, incx, y, incy); } -template -static inline void geru(cl::sycl::queue &queue, std::int64_t m, std::int64_t n, - std::complex alpha, cl::sycl::buffer, 1> &x, - std::int64_t incx, cl::sycl::buffer, 1> &y, - std::int64_t incy, cl::sycl::buffer, 1> &a, - std::int64_t lda); template <> void geru( cl::sycl::queue &queue, std::int64_t m, std::int64_t n, std::complex alpha, @@ -1311,12 +842,6 @@ void geru( geru_postcondition(queue, m, n, alpha, x, incx, y, incy, a, lda); } -template -static inline void geru(cl::sycl::queue &queue, std::int64_t m, std::int64_t n, - std::complex alpha, cl::sycl::buffer, 1> &x, - std::int64_t incx, cl::sycl::buffer, 1> &y, - std::int64_t incy, cl::sycl::buffer, 1> &a, - std::int64_t lda); template <> void geru( cl::sycl::queue &queue, std::int64_t m, std::int64_t n, std::complex alpha, @@ -1328,10 +853,6 @@ void geru( geru_postcondition(queue, m, n, alpha, x, incx, y, incy, a, lda); } -template -static inline void nrm2(cl::sycl::queue &queue, std::int64_t n, - cl::sycl::buffer, 1> &x, std::int64_t incx, - cl::sycl::buffer &result); template <> void nrm2(cl::sycl::queue &queue, std::int64_t n, cl::sycl::buffer, 1> &x, @@ -1342,10 +863,6 @@ void nrm2(cl::sycl::queue &queue, std::int nrm2_postcondition(queue, n, x, incx, result); } -template -static inline void nrm2(cl::sycl::queue &queue, std::int64_t n, - cl::sycl::buffer, 1> &x, std::int64_t incx, - cl::sycl::buffer &result); template <> void nrm2(cl::sycl::queue &queue, std::int64_t n, cl::sycl::buffer, 1> &x, @@ -1356,9 +873,6 @@ void nrm2(cl::sycl::queue &queue, std::int nrm2_postcondition(queue, n, x, incx, result); } -template -static inline void nrm2(cl::sycl::queue &queue, std::int64_t n, cl::sycl::buffer &x, - std::int64_t incx, cl::sycl::buffer &result); template <> void nrm2(cl::sycl::queue &queue, std::int64_t n, cl::sycl::buffer &x, std::int64_t incx, @@ -1368,9 +882,6 @@ void 
nrm2(cl::sycl::queue &queue, std::int nrm2_postcondition(queue, n, x, incx, result); } -template -static inline void nrm2(cl::sycl::queue &queue, std::int64_t n, cl::sycl::buffer &x, - std::int64_t incx, cl::sycl::buffer &result); template <> void nrm2(cl::sycl::queue &queue, std::int64_t n, cl::sycl::buffer &x, std::int64_t incx, @@ -1380,11 +891,6 @@ void nrm2(cl::sycl::queue &queue, std::int nrm2_postcondition(queue, n, x, incx, result); } -template -static inline void gemm(cl::sycl::queue &queue, transpose transa, transpose transb, std::int64_t m, - std::int64_t n, std::int64_t k, float alpha, cl::sycl::buffer &a, - std::int64_t lda, cl::sycl::buffer &b, std::int64_t ldb, - float beta, cl::sycl::buffer &c, std::int64_t ldc); template <> void gemm(cl::sycl::queue &queue, transpose transa, transpose transb, std::int64_t m, std::int64_t n, @@ -1398,12 +904,6 @@ void gemm(cl::sycl::queue &queue, transpos gemm_postcondition(queue, transa, transb, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc); } -template -static inline void gemm(cl::sycl::queue &queue, transpose transa, transpose transb, std::int64_t m, - std::int64_t n, std::int64_t k, double alpha, - cl::sycl::buffer &a, std::int64_t lda, - cl::sycl::buffer &b, std::int64_t ldb, double beta, - cl::sycl::buffer &c, std::int64_t ldc); template <> void gemm(cl::sycl::queue &queue, transpose transa, transpose transb, std::int64_t m, std::int64_t n, @@ -1417,13 +917,6 @@ void gemm(cl::sycl::queue &queue, transpos gemm_postcondition(queue, transa, transb, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc); } -template -static inline void gemm(cl::sycl::queue &queue, transpose transa, transpose transb, std::int64_t m, - std::int64_t n, std::int64_t k, std::complex alpha, - cl::sycl::buffer, 1> &a, std::int64_t lda, - cl::sycl::buffer, 1> &b, std::int64_t ldb, - std::complex beta, cl::sycl::buffer, 1> &c, - std::int64_t ldc); template <> void gemm( cl::sycl::queue &queue, transpose transa, transpose transb, std::int64_t m, 
std::int64_t n, @@ -1435,13 +928,6 @@ void gemm( gemm_postcondition(queue, transa, transb, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc); } -template -static inline void gemm(cl::sycl::queue &queue, transpose transa, transpose transb, std::int64_t m, - std::int64_t n, std::int64_t k, std::complex alpha, - cl::sycl::buffer, 1> &a, std::int64_t lda, - cl::sycl::buffer, 1> &b, std::int64_t ldb, - std::complex beta, cl::sycl::buffer, 1> &c, - std::int64_t ldc); template <> void gemm( cl::sycl::queue &queue, transpose transa, transpose transb, std::int64_t m, std::int64_t n, @@ -1453,11 +939,6 @@ void gemm( gemm_postcondition(queue, transa, transb, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc); } -template -static inline void gemm(cl::sycl::queue &queue, transpose transa, transpose transb, std::int64_t m, - std::int64_t n, std::int64_t k, half alpha, cl::sycl::buffer &a, - std::int64_t lda, cl::sycl::buffer &b, std::int64_t ldb, half beta, - cl::sycl::buffer &c, std::int64_t ldc); template <> void gemm(cl::sycl::queue &queue, transpose transa, transpose transb, std::int64_t m, std::int64_t n, @@ -1471,11 +952,6 @@ void gemm(cl::sycl::queue &queue, transpos gemm_postcondition(queue, transa, transb, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc); } -template -static inline void herk(cl::sycl::queue &queue, uplo upper_lower, transpose trans, std::int64_t n, - std::int64_t k, float alpha, cl::sycl::buffer, 1> &a, - std::int64_t lda, float beta, cl::sycl::buffer, 1> &c, - std::int64_t ldc); template <> void herk( cl::sycl::queue &queue, uplo upper_lower, transpose trans, std::int64_t n, std::int64_t k, @@ -1486,11 +962,6 @@ void herk( herk_postcondition(queue, upper_lower, trans, n, k, alpha, a, lda, beta, c, ldc); } -template -static inline void herk(cl::sycl::queue &queue, uplo upper_lower, transpose trans, std::int64_t n, - std::int64_t k, double alpha, cl::sycl::buffer, 1> &a, - std::int64_t lda, double beta, cl::sycl::buffer, 1> &c, - std::int64_t ldc); template <> void 
herk( cl::sycl::queue &queue, uplo upper_lower, transpose trans, std::int64_t n, std::int64_t k, @@ -1501,11 +972,6 @@ void herk( herk_postcondition(queue, upper_lower, trans, n, k, alpha, a, lda, beta, c, ldc); } -template -static inline void ger(cl::sycl::queue &queue, std::int64_t m, std::int64_t n, float alpha, - cl::sycl::buffer &x, std::int64_t incx, - cl::sycl::buffer &y, std::int64_t incy, - cl::sycl::buffer &a, std::int64_t lda); template <> void ger(cl::sycl::queue &queue, std::int64_t m, std::int64_t n, float alpha, @@ -1517,11 +983,6 @@ void ger(cl::sycl::queue &queue, std::int6 ger_postcondition(queue, m, n, alpha, x, incx, y, incy, a, lda); } -template -static inline void ger(cl::sycl::queue &queue, std::int64_t m, std::int64_t n, double alpha, - cl::sycl::buffer &x, std::int64_t incx, - cl::sycl::buffer &y, std::int64_t incy, - cl::sycl::buffer &a, std::int64_t lda); template <> void ger(cl::sycl::queue &queue, std::int64_t m, std::int64_t n, double alpha, @@ -1533,11 +994,6 @@ void ger(cl::sycl::queue &queue, std::int6 ger_postcondition(queue, m, n, alpha, x, incx, y, incy, a, lda); } -template -static inline void trsm(cl::sycl::queue &queue, side left_right, uplo upper_lower, transpose trans, - diag unit_diag, std::int64_t m, std::int64_t n, float alpha, - cl::sycl::buffer &a, std::int64_t lda, - cl::sycl::buffer &b, std::int64_t ldb); template <> void trsm(cl::sycl::queue &queue, side left_right, uplo upper_lower, transpose trans, diag unit_diag, @@ -1552,11 +1008,6 @@ void trsm(cl::sycl::queue &queue, side lef ldb); } -template -static inline void trsm(cl::sycl::queue &queue, side left_right, uplo upper_lower, transpose trans, - diag unit_diag, std::int64_t m, std::int64_t n, double alpha, - cl::sycl::buffer &a, std::int64_t lda, - cl::sycl::buffer &b, std::int64_t ldb); template <> void trsm(cl::sycl::queue &queue, side left_right, uplo upper_lower, transpose trans, diag unit_diag, @@ -1571,11 +1022,6 @@ void trsm(cl::sycl::queue &queue, side 
lef ldb); } -template -static inline void trsm(cl::sycl::queue &queue, side left_right, uplo upper_lower, transpose trans, - diag unit_diag, std::int64_t m, std::int64_t n, std::complex alpha, - cl::sycl::buffer, 1> &a, std::int64_t lda, - cl::sycl::buffer, 1> &b, std::int64_t ldb); template <> void trsm( cl::sycl::queue &queue, side left_right, uplo upper_lower, transpose trans, diag unit_diag, @@ -1590,11 +1036,6 @@ void trsm( ldb); } -template -static inline void trsm(cl::sycl::queue &queue, side left_right, uplo upper_lower, transpose trans, - diag unit_diag, std::int64_t m, std::int64_t n, std::complex alpha, - cl::sycl::buffer, 1> &a, std::int64_t lda, - cl::sycl::buffer, 1> &b, std::int64_t ldb); template <> void trsm( cl::sycl::queue &queue, side left_right, uplo upper_lower, transpose trans, diag unit_diag, @@ -1609,11 +1050,6 @@ void trsm( ldb); } -template -static inline void dotu(cl::sycl::queue &queue, std::int64_t n, - cl::sycl::buffer, 1> &x, std::int64_t incx, - cl::sycl::buffer, 1> &y, std::int64_t incy, - cl::sycl::buffer, 1> &result); template <> void dotu(cl::sycl::queue &queue, std::int64_t n, cl::sycl::buffer, 1> &x, @@ -1626,11 +1062,6 @@ void dotu(cl::sycl::queue &queue, std::int dotu_postcondition(queue, n, x, incx, y, incy, result); } -template -static inline void dotu(cl::sycl::queue &queue, std::int64_t n, - cl::sycl::buffer, 1> &x, std::int64_t incx, - cl::sycl::buffer, 1> &y, std::int64_t incy, - cl::sycl::buffer, 1> &result); template <> void dotu(cl::sycl::queue &queue, std::int64_t n, cl::sycl::buffer, 1> &x, @@ -1643,13 +1074,6 @@ void dotu(cl::sycl::queue &queue, std::int dotu_postcondition(queue, n, x, incx, y, incy, result); } -template -static inline void hemm(cl::sycl::queue &queue, side left_right, uplo upper_lower, std::int64_t m, - std::int64_t n, std::complex alpha, - cl::sycl::buffer, 1> &a, std::int64_t lda, - cl::sycl::buffer, 1> &b, std::int64_t ldb, - std::complex beta, cl::sycl::buffer, 1> &c, - std::int64_t ldc); 
template <> void hemm( cl::sycl::queue &queue, side left_right, uplo upper_lower, std::int64_t m, std::int64_t n, @@ -1661,13 +1085,6 @@ void hemm( hemm_postcondition(queue, left_right, upper_lower, m, n, alpha, a, lda, b, ldb, beta, c, ldc); } -template -static inline void hemm(cl::sycl::queue &queue, side left_right, uplo upper_lower, std::int64_t m, - std::int64_t n, std::complex alpha, - cl::sycl::buffer, 1> &a, std::int64_t lda, - cl::sycl::buffer, 1> &b, std::int64_t ldb, - std::complex beta, cl::sycl::buffer, 1> &c, - std::int64_t ldc); template <> void hemm( cl::sycl::queue &queue, side left_right, uplo upper_lower, std::int64_t m, std::int64_t n, @@ -1679,11 +1096,6 @@ void hemm( hemm_postcondition(queue, left_right, upper_lower, m, n, alpha, a, lda, b, ldb, beta, c, ldc); } -template -static inline void hpr2(cl::sycl::queue &queue, uplo upper_lower, std::int64_t n, - std::complex alpha, cl::sycl::buffer, 1> &x, - std::int64_t incx, cl::sycl::buffer, 1> &y, - std::int64_t incy, cl::sycl::buffer, 1> &a); template <> void hpr2(cl::sycl::queue &queue, uplo upper_lower, std::int64_t n, std::complex alpha, @@ -1697,11 +1109,6 @@ void hpr2(cl::sycl::queue &queue, uplo upp hpr2_postcondition(queue, upper_lower, n, alpha, x, incx, y, incy, a); } -template -static inline void hpr2(cl::sycl::queue &queue, uplo upper_lower, std::int64_t n, - std::complex alpha, cl::sycl::buffer, 1> &x, - std::int64_t incx, cl::sycl::buffer, 1> &y, - std::int64_t incy, cl::sycl::buffer, 1> &a); template <> void hpr2(cl::sycl::queue &queue, uplo upper_lower, std::int64_t n, std::complex alpha, @@ -1715,12 +1122,6 @@ void hpr2(cl::sycl::queue &queue, uplo upp hpr2_postcondition(queue, upper_lower, n, alpha, x, incx, y, incy, a); } -template -static inline void gbmv(cl::sycl::queue &queue, transpose trans, std::int64_t m, std::int64_t n, - std::int64_t kl, std::int64_t ku, float alpha, - cl::sycl::buffer &a, std::int64_t lda, - cl::sycl::buffer &x, std::int64_t incx, float beta, - 
cl::sycl::buffer &y, std::int64_t incy); template <> void gbmv(cl::sycl::queue &queue, transpose trans, std::int64_t m, std::int64_t n, std::int64_t kl, @@ -1734,12 +1135,6 @@ void gbmv(cl::sycl::queue &queue, transpos gbmv_postcondition(queue, trans, m, n, kl, ku, alpha, a, lda, x, incx, beta, y, incy); } -template -static inline void gbmv(cl::sycl::queue &queue, transpose trans, std::int64_t m, std::int64_t n, - std::int64_t kl, std::int64_t ku, double alpha, - cl::sycl::buffer &a, std::int64_t lda, - cl::sycl::buffer &x, std::int64_t incx, double beta, - cl::sycl::buffer &y, std::int64_t incy); template <> void gbmv(cl::sycl::queue &queue, transpose trans, std::int64_t m, std::int64_t n, std::int64_t kl, @@ -1753,13 +1148,6 @@ void gbmv(cl::sycl::queue &queue, transpos gbmv_postcondition(queue, trans, m, n, kl, ku, alpha, a, lda, x, incx, beta, y, incy); } -template -static inline void gbmv(cl::sycl::queue &queue, transpose trans, std::int64_t m, std::int64_t n, - std::int64_t kl, std::int64_t ku, std::complex alpha, - cl::sycl::buffer, 1> &a, std::int64_t lda, - cl::sycl::buffer, 1> &x, std::int64_t incx, - std::complex beta, cl::sycl::buffer, 1> &y, - std::int64_t incy); template <> void gbmv( cl::sycl::queue &queue, transpose trans, std::int64_t m, std::int64_t n, std::int64_t kl, @@ -1771,13 +1159,6 @@ void gbmv( gbmv_postcondition(queue, trans, m, n, kl, ku, alpha, a, lda, x, incx, beta, y, incy); } -template -static inline void gbmv(cl::sycl::queue &queue, transpose trans, std::int64_t m, std::int64_t n, - std::int64_t kl, std::int64_t ku, std::complex alpha, - cl::sycl::buffer, 1> &a, std::int64_t lda, - cl::sycl::buffer, 1> &x, std::int64_t incx, - std::complex beta, cl::sycl::buffer, 1> &y, - std::int64_t incy); template <> void gbmv( cl::sycl::queue &queue, transpose trans, std::int64_t m, std::int64_t n, std::int64_t kl, @@ -1789,10 +1170,6 @@ void gbmv( gbmv_postcondition(queue, trans, m, n, kl, ku, alpha, a, lda, x, incx, beta, y, incy); } -template 
-static inline void tbmv(cl::sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag, - std::int64_t n, std::int64_t k, cl::sycl::buffer &a, - std::int64_t lda, cl::sycl::buffer &x, std::int64_t incx); template <> void tbmv(cl::sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag, std::int64_t n, @@ -1804,10 +1181,6 @@ void tbmv(cl::sycl::queue &queue, uplo upp tbmv_postcondition(queue, upper_lower, trans, unit_diag, n, k, a, lda, x, incx); } -template -static inline void tbmv(cl::sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag, - std::int64_t n, std::int64_t k, cl::sycl::buffer &a, - std::int64_t lda, cl::sycl::buffer &x, std::int64_t incx); template <> void tbmv(cl::sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag, std::int64_t n, @@ -1819,11 +1192,6 @@ void tbmv(cl::sycl::queue &queue, uplo upp tbmv_postcondition(queue, upper_lower, trans, unit_diag, n, k, a, lda, x, incx); } -template -static inline void tbmv(cl::sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag, - std::int64_t n, std::int64_t k, cl::sycl::buffer, 1> &a, - std::int64_t lda, cl::sycl::buffer, 1> &x, - std::int64_t incx); template <> void tbmv( cl::sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag, std::int64_t n, @@ -1834,11 +1202,6 @@ void tbmv( tbmv_postcondition(queue, upper_lower, trans, unit_diag, n, k, a, lda, x, incx); } -template -static inline void tbmv(cl::sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag, - std::int64_t n, std::int64_t k, - cl::sycl::buffer, 1> &a, std::int64_t lda, - cl::sycl::buffer, 1> &x, std::int64_t incx); template <> void tbmv( cl::sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag, std::int64_t n, @@ -1849,11 +1212,6 @@ void tbmv( tbmv_postcondition(queue, upper_lower, trans, unit_diag, n, k, a, lda, x, incx); } -template -static inline void symm(cl::sycl::queue &queue, side left_right, uplo upper_lower, 
std::int64_t m, - std::int64_t n, float alpha, cl::sycl::buffer &a, - std::int64_t lda, cl::sycl::buffer &b, std::int64_t ldb, - float beta, cl::sycl::buffer &c, std::int64_t ldc); template <> void symm(cl::sycl::queue &queue, side left_right, uplo upper_lower, std::int64_t m, std::int64_t n, @@ -1866,11 +1224,6 @@ void symm(cl::sycl::queue &queue, side lef symm_postcondition(queue, left_right, upper_lower, m, n, alpha, a, lda, b, ldb, beta, c, ldc); } -template -static inline void symm(cl::sycl::queue &queue, side left_right, uplo upper_lower, std::int64_t m, - std::int64_t n, double alpha, cl::sycl::buffer &a, - std::int64_t lda, cl::sycl::buffer &b, std::int64_t ldb, - double beta, cl::sycl::buffer &c, std::int64_t ldc); template <> void symm(cl::sycl::queue &queue, side left_right, uplo upper_lower, std::int64_t m, std::int64_t n, @@ -1883,13 +1236,6 @@ void symm(cl::sycl::queue &queue, side lef symm_postcondition(queue, left_right, upper_lower, m, n, alpha, a, lda, b, ldb, beta, c, ldc); } -template -static inline void symm(cl::sycl::queue &queue, side left_right, uplo upper_lower, std::int64_t m, - std::int64_t n, std::complex alpha, - cl::sycl::buffer, 1> &a, std::int64_t lda, - cl::sycl::buffer, 1> &b, std::int64_t ldb, - std::complex beta, cl::sycl::buffer, 1> &c, - std::int64_t ldc); template <> void symm( cl::sycl::queue &queue, side left_right, uplo upper_lower, std::int64_t m, std::int64_t n, @@ -1901,13 +1247,6 @@ void symm( symm_postcondition(queue, left_right, upper_lower, m, n, alpha, a, lda, b, ldb, beta, c, ldc); } -template -static inline void symm(cl::sycl::queue &queue, side left_right, uplo upper_lower, std::int64_t m, - std::int64_t n, std::complex alpha, - cl::sycl::buffer, 1> &a, std::int64_t lda, - cl::sycl::buffer, 1> &b, std::int64_t ldb, - std::complex beta, cl::sycl::buffer, 1> &c, - std::int64_t ldc); template <> void symm( cl::sycl::queue &queue, side left_right, uplo upper_lower, std::int64_t m, std::int64_t n, @@ -1919,11 +1258,6 
@@ void symm( symm_postcondition(queue, left_right, upper_lower, m, n, alpha, a, lda, b, ldb, beta, c, ldc); } -template -static inline void dotc(cl::sycl::queue &queue, std::int64_t n, - cl::sycl::buffer, 1> &x, std::int64_t incx, - cl::sycl::buffer, 1> &y, std::int64_t incy, - cl::sycl::buffer, 1> &result); template <> void dotc(cl::sycl::queue &queue, std::int64_t n, cl::sycl::buffer, 1> &x, @@ -1936,11 +1270,6 @@ void dotc(cl::sycl::queue &queue, std::int dotc_postcondition(queue, n, x, incx, y, incy, result); } -template -static inline void dotc(cl::sycl::queue &queue, std::int64_t n, - cl::sycl::buffer, 1> &x, std::int64_t incx, - cl::sycl::buffer, 1> &y, std::int64_t incy, - cl::sycl::buffer, 1> &result); template <> void dotc(cl::sycl::queue &queue, std::int64_t n, cl::sycl::buffer, 1> &x, @@ -1953,10 +1282,6 @@ void dotc(cl::sycl::queue &queue, std::int dotc_postcondition(queue, n, x, incx, y, incy, result); } -template -static inline void syr(cl::sycl::queue &queue, uplo upper_lower, std::int64_t n, float alpha, - cl::sycl::buffer &x, std::int64_t incx, - cl::sycl::buffer &a, std::int64_t lda); template <> void syr(cl::sycl::queue &queue, uplo upper_lower, std::int64_t n, float alpha, @@ -1967,10 +1292,6 @@ void syr(cl::sycl::queue &queue, uplo uppe syr_postcondition(queue, upper_lower, n, alpha, x, incx, a, lda); } -template -static inline void syr(cl::sycl::queue &queue, uplo upper_lower, std::int64_t n, double alpha, - cl::sycl::buffer &x, std::int64_t incx, - cl::sycl::buffer &a, std::int64_t lda); template <> void syr(cl::sycl::queue &queue, uplo upper_lower, std::int64_t n, double alpha, @@ -1981,11 +1302,6 @@ void syr(cl::sycl::queue &queue, uplo uppe syr_postcondition(queue, upper_lower, n, alpha, x, incx, a, lda); } -template -static inline void trmm(cl::sycl::queue &queue, side left_right, uplo upper_lower, transpose trans, - diag unit_diag, std::int64_t m, std::int64_t n, float alpha, - cl::sycl::buffer &a, std::int64_t lda, - cl::sycl::buffer 
&b, std::int64_t ldb); template <> void trmm(cl::sycl::queue &queue, side left_right, uplo upper_lower, transpose trans, diag unit_diag, @@ -2000,11 +1316,6 @@ void trmm(cl::sycl::queue &queue, side lef ldb); } -template -static inline void trmm(cl::sycl::queue &queue, side left_right, uplo upper_lower, transpose trans, - diag unit_diag, std::int64_t m, std::int64_t n, double alpha, - cl::sycl::buffer &a, std::int64_t lda, - cl::sycl::buffer &b, std::int64_t ldb); template <> void trmm(cl::sycl::queue &queue, side left_right, uplo upper_lower, transpose trans, diag unit_diag, @@ -2019,11 +1330,6 @@ void trmm(cl::sycl::queue &queue, side lef ldb); } -template -static inline void trmm(cl::sycl::queue &queue, side left_right, uplo upper_lower, transpose trans, - diag unit_diag, std::int64_t m, std::int64_t n, std::complex alpha, - cl::sycl::buffer, 1> &a, std::int64_t lda, - cl::sycl::buffer, 1> &b, std::int64_t ldb); template <> void trmm( cl::sycl::queue &queue, side left_right, uplo upper_lower, transpose trans, diag unit_diag, @@ -2038,11 +1344,6 @@ void trmm( ldb); } -template -static inline void trmm(cl::sycl::queue &queue, side left_right, uplo upper_lower, transpose trans, - diag unit_diag, std::int64_t m, std::int64_t n, std::complex alpha, - cl::sycl::buffer, 1> &a, std::int64_t lda, - cl::sycl::buffer, 1> &b, std::int64_t ldb); template <> void trmm( cl::sycl::queue &queue, side left_right, uplo upper_lower, transpose trans, diag unit_diag, @@ -2057,10 +1358,6 @@ void trmm( ldb); } -template -static inline void rotmg(cl::sycl::queue &queue, cl::sycl::buffer &d1, - cl::sycl::buffer &d2, cl::sycl::buffer &x1, float y1, - cl::sycl::buffer ¶m); template <> void rotmg(cl::sycl::queue &queue, cl::sycl::buffer &d1, @@ -2072,10 +1369,6 @@ void rotmg(cl::sycl::queue &queue, rotmg_postcondition(queue, d1, d2, x1, y1, param); } -template -static inline void rotmg(cl::sycl::queue &queue, cl::sycl::buffer &d1, - cl::sycl::buffer &d2, cl::sycl::buffer &x1, - double y1, 
cl::sycl::buffer ¶m); template <> void rotmg(cl::sycl::queue &queue, cl::sycl::buffer &d1, @@ -2087,10 +1380,6 @@ void rotmg(cl::sycl::queue &queue, rotmg_postcondition(queue, d1, d2, x1, y1, param); } -template -static inline void tpsv(cl::sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag, - std::int64_t n, cl::sycl::buffer &a, - cl::sycl::buffer &x, std::int64_t incx); template <> void tpsv(cl::sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag, std::int64_t n, @@ -2101,10 +1390,6 @@ void tpsv(cl::sycl::queue &queue, uplo upp tpsv_postcondition(queue, upper_lower, trans, unit_diag, n, a, x, incx); } -template -static inline void tpsv(cl::sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag, - std::int64_t n, cl::sycl::buffer &a, - cl::sycl::buffer &x, std::int64_t incx); template <> void tpsv(cl::sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag, std::int64_t n, @@ -2115,10 +1400,6 @@ void tpsv(cl::sycl::queue &queue, uplo upp tpsv_postcondition(queue, upper_lower, trans, unit_diag, n, a, x, incx); } -template -static inline void tpsv(cl::sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag, - std::int64_t n, cl::sycl::buffer, 1> &a, - cl::sycl::buffer, 1> &x, std::int64_t incx); template <> void tpsv(cl::sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag, std::int64_t n, @@ -2130,10 +1411,6 @@ void tpsv(cl::sycl::queue &queue, uplo upp tpsv_postcondition(queue, upper_lower, trans, unit_diag, n, a, x, incx); } -template -static inline void tpsv(cl::sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag, - std::int64_t n, cl::sycl::buffer, 1> &a, - cl::sycl::buffer, 1> &x, std::int64_t incx); template <> void tpsv(cl::sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag, std::int64_t n, @@ -2145,10 +1422,6 @@ void tpsv(cl::sycl::queue &queue, uplo upp tpsv_postcondition(queue, upper_lower, trans, unit_diag, n, a, x, incx); 
} -template -static inline void trsv(cl::sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag, - std::int64_t n, cl::sycl::buffer &a, std::int64_t lda, - cl::sycl::buffer &x, std::int64_t incx); template <> void trsv(cl::sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag, std::int64_t n, @@ -2159,10 +1432,6 @@ void trsv(cl::sycl::queue &queue, uplo upp trsv_postcondition(queue, upper_lower, trans, unit_diag, n, a, lda, x, incx); } -template -static inline void trsv(cl::sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag, - std::int64_t n, cl::sycl::buffer &a, std::int64_t lda, - cl::sycl::buffer &x, std::int64_t incx); template <> void trsv(cl::sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag, std::int64_t n, @@ -2173,11 +1442,6 @@ void trsv(cl::sycl::queue &queue, uplo upp trsv_postcondition(queue, upper_lower, trans, unit_diag, n, a, lda, x, incx); } -template -static inline void trsv(cl::sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag, - std::int64_t n, cl::sycl::buffer, 1> &a, - std::int64_t lda, cl::sycl::buffer, 1> &x, - std::int64_t incx); template <> void trsv(cl::sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag, std::int64_t n, @@ -2190,11 +1454,6 @@ void trsv(cl::sycl::queue &queue, uplo upp trsv_postcondition(queue, upper_lower, trans, unit_diag, n, a, lda, x, incx); } -template -static inline void trsv(cl::sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag, - std::int64_t n, cl::sycl::buffer, 1> &a, - std::int64_t lda, cl::sycl::buffer, 1> &x, - std::int64_t incx); template <> void trsv(cl::sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag, std::int64_t n, @@ -2207,9 +1466,6 @@ void trsv(cl::sycl::queue &queue, uplo upp trsv_postcondition(queue, upper_lower, trans, unit_diag, n, a, lda, x, incx); } -template -static inline void copy(cl::sycl::queue &queue, std::int64_t n, cl::sycl::buffer &x, - 
std::int64_t incx, cl::sycl::buffer &y, std::int64_t incy); template <> void copy(cl::sycl::queue &queue, std::int64_t n, cl::sycl::buffer &x, std::int64_t incx, @@ -2219,9 +1475,6 @@ void copy(cl::sycl::queue &queue, std::int copy_postcondition(queue, n, x, incx, y, incy); } -template -static inline void copy(cl::sycl::queue &queue, std::int64_t n, cl::sycl::buffer &x, - std::int64_t incx, cl::sycl::buffer &y, std::int64_t incy); template <> void copy(cl::sycl::queue &queue, std::int64_t n, cl::sycl::buffer &x, std::int64_t incx, @@ -2231,10 +1484,6 @@ void copy(cl::sycl::queue &queue, std::int copy_postcondition(queue, n, x, incx, y, incy); } -template -static inline void copy(cl::sycl::queue &queue, std::int64_t n, - cl::sycl::buffer, 1> &x, std::int64_t incx, - cl::sycl::buffer, 1> &y, std::int64_t incy); template <> void copy(cl::sycl::queue &queue, std::int64_t n, cl::sycl::buffer, 1> &x, @@ -2246,10 +1495,6 @@ void copy(cl::sycl::queue &queue, std::int copy_postcondition(queue, n, x, incx, y, incy); } -template -static inline void copy(cl::sycl::queue &queue, std::int64_t n, - cl::sycl::buffer, 1> &x, std::int64_t incx, - cl::sycl::buffer, 1> &y, std::int64_t incy); template <> void copy(cl::sycl::queue &queue, std::int64_t n, cl::sycl::buffer, 1> &x, @@ -2261,12 +1506,6 @@ void copy(cl::sycl::queue &queue, std::int copy_postcondition(queue, n, x, incx, y, incy); } -template -static inline void hemv(cl::sycl::queue &queue, uplo upper_lower, std::int64_t n, - std::complex alpha, cl::sycl::buffer, 1> &a, - std::int64_t lda, cl::sycl::buffer, 1> &x, - std::int64_t incx, std::complex beta, - cl::sycl::buffer, 1> &y, std::int64_t incy); template <> void hemv( cl::sycl::queue &queue, uplo upper_lower, std::int64_t n, std::complex alpha, @@ -2278,12 +1517,6 @@ void hemv( hemv_postcondition(queue, upper_lower, n, alpha, a, lda, x, incx, beta, y, incy); } -template -static inline void hemv(cl::sycl::queue &queue, uplo upper_lower, std::int64_t n, - std::complex 
alpha, cl::sycl::buffer, 1> &a, - std::int64_t lda, cl::sycl::buffer, 1> &x, - std::int64_t incx, std::complex beta, - cl::sycl::buffer, 1> &y, std::int64_t incy); template <> void hemv( cl::sycl::queue &queue, uplo upper_lower, std::int64_t n, std::complex alpha, @@ -2295,12 +1528,6 @@ void hemv( hemv_postcondition(queue, upper_lower, n, alpha, a, lda, x, incx, beta, y, incy); } -template -static inline void gemmt(cl::sycl::queue &queue, uplo upper_lower, transpose transa, - transpose transb, std::int64_t n, std::int64_t k, float alpha, - cl::sycl::buffer &a, std::int64_t lda, - cl::sycl::buffer &b, std::int64_t ldb, float beta, - cl::sycl::buffer &c, std::int64_t ldc); template <> void gemmt(cl::sycl::queue &queue, uplo upper_lower, transpose transa, transpose transb, std::int64_t n, @@ -2317,12 +1544,6 @@ void gemmt(cl::sycl::queue &queue, uplo up ldc); } -template -static inline void gemmt(cl::sycl::queue &queue, uplo upper_lower, transpose transa, - transpose transb, std::int64_t n, std::int64_t k, double alpha, - cl::sycl::buffer &a, std::int64_t lda, - cl::sycl::buffer &b, std::int64_t ldb, double beta, - cl::sycl::buffer &c, std::int64_t ldc); template <> void gemmt(cl::sycl::queue &queue, uplo upper_lower, transpose transa, transpose transb, std::int64_t n, @@ -2339,13 +1560,6 @@ void gemmt(cl::sycl::queue &queue, uplo up ldc); } -template -static inline void gemmt(cl::sycl::queue &queue, uplo upper_lower, transpose transa, - transpose transb, std::int64_t n, std::int64_t k, - std::complex alpha, cl::sycl::buffer, 1> &a, - std::int64_t lda, cl::sycl::buffer, 1> &b, - std::int64_t ldb, std::complex beta, - cl::sycl::buffer, 1> &c, std::int64_t ldc); template <> void gemmt( cl::sycl::queue &queue, uplo upper_lower, transpose transa, transpose transb, std::int64_t n, @@ -2360,13 +1574,6 @@ void gemmt( ldc); } -template -static inline void gemmt(cl::sycl::queue &queue, uplo upper_lower, transpose transa, - transpose transb, std::int64_t n, std::int64_t k, - 
std::complex alpha, cl::sycl::buffer, 1> &a, - std::int64_t lda, cl::sycl::buffer, 1> &b, - std::int64_t ldb, std::complex beta, - cl::sycl::buffer, 1> &c, std::int64_t ldc); template <> void gemmt( cl::sycl::queue &queue, uplo upper_lower, transpose transa, transpose transb, std::int64_t n, @@ -2381,11 +1588,6 @@ void gemmt( ldc); } -template -static inline void sbmv(cl::sycl::queue &queue, uplo upper_lower, std::int64_t n, std::int64_t k, - float alpha, cl::sycl::buffer &a, std::int64_t lda, - cl::sycl::buffer &x, std::int64_t incx, float beta, - cl::sycl::buffer &y, std::int64_t incy); template <> void sbmv(cl::sycl::queue &queue, uplo upper_lower, std::int64_t n, std::int64_t k, float alpha, @@ -2398,11 +1600,6 @@ void sbmv(cl::sycl::queue &queue, uplo upp sbmv_postcondition(queue, upper_lower, n, k, alpha, a, lda, x, incx, beta, y, incy); } -template -static inline void sbmv(cl::sycl::queue &queue, uplo upper_lower, std::int64_t n, std::int64_t k, - double alpha, cl::sycl::buffer &a, std::int64_t lda, - cl::sycl::buffer &x, std::int64_t incx, double beta, - cl::sycl::buffer &y, std::int64_t incy); template <> void sbmv(cl::sycl::queue &queue, uplo upper_lower, std::int64_t n, std::int64_t k, double alpha, @@ -2415,10 +1612,6 @@ void sbmv(cl::sycl::queue &queue, uplo upp sbmv_postcondition(queue, upper_lower, n, k, alpha, a, lda, x, incx, beta, y, incy); } -template -static inline void asum(cl::sycl::queue &queue, std::int64_t n, - cl::sycl::buffer, 1> &x, std::int64_t incx, - cl::sycl::buffer &result); template <> void asum(cl::sycl::queue &queue, std::int64_t n, cl::sycl::buffer, 1> &x, @@ -2429,10 +1622,6 @@ void asum(cl::sycl::queue &queue, std::int asum_postcondition(queue, n, x, incx, result); } -template -static inline void asum(cl::sycl::queue &queue, std::int64_t n, - cl::sycl::buffer, 1> &x, std::int64_t incx, - cl::sycl::buffer &result); template <> void asum(cl::sycl::queue &queue, std::int64_t n, cl::sycl::buffer, 1> &x, @@ -2443,9 +1632,6 @@ void 
asum(cl::sycl::queue &queue, std::int asum_postcondition(queue, n, x, incx, result); } -template -static inline void asum(cl::sycl::queue &queue, std::int64_t n, cl::sycl::buffer &x, - std::int64_t incx, cl::sycl::buffer &result); template <> void asum(cl::sycl::queue &queue, std::int64_t n, cl::sycl::buffer &x, std::int64_t incx, @@ -2455,9 +1641,6 @@ void asum(cl::sycl::queue &queue, std::int asum_postcondition(queue, n, x, incx, result); } -template -static inline void asum(cl::sycl::queue &queue, std::int64_t n, cl::sycl::buffer &x, - std::int64_t incx, cl::sycl::buffer &result); template <> void asum(cl::sycl::queue &queue, std::int64_t n, cl::sycl::buffer &x, std::int64_t incx, @@ -2467,10 +1650,6 @@ void asum(cl::sycl::queue &queue, std::int asum_postcondition(queue, n, x, incx, result); } -template -static inline void tbsv(cl::sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag, - std::int64_t n, std::int64_t k, cl::sycl::buffer &a, - std::int64_t lda, cl::sycl::buffer &x, std::int64_t incx); template <> void tbsv(cl::sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag, std::int64_t n, @@ -2482,10 +1661,6 @@ void tbsv(cl::sycl::queue &queue, uplo upp tbsv_postcondition(queue, upper_lower, trans, unit_diag, n, k, a, lda, x, incx); } -template -static inline void tbsv(cl::sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag, - std::int64_t n, std::int64_t k, cl::sycl::buffer &a, - std::int64_t lda, cl::sycl::buffer &x, std::int64_t incx); template <> void tbsv(cl::sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag, std::int64_t n, @@ -2497,11 +1672,6 @@ void tbsv(cl::sycl::queue &queue, uplo upp tbsv_postcondition(queue, upper_lower, trans, unit_diag, n, k, a, lda, x, incx); } -template -static inline void tbsv(cl::sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag, - std::int64_t n, std::int64_t k, cl::sycl::buffer, 1> &a, - std::int64_t lda, cl::sycl::buffer, 1> &x, - 
std::int64_t incx); template <> void tbsv( cl::sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag, std::int64_t n, @@ -2512,11 +1682,6 @@ void tbsv( tbsv_postcondition(queue, upper_lower, trans, unit_diag, n, k, a, lda, x, incx); } -template -static inline void tbsv(cl::sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag, - std::int64_t n, std::int64_t k, - cl::sycl::buffer, 1> &a, std::int64_t lda, - cl::sycl::buffer, 1> &x, std::int64_t incx); template <> void tbsv( cl::sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag, std::int64_t n, @@ -2527,11 +1692,6 @@ void tbsv( tbsv_postcondition(queue, upper_lower, trans, unit_diag, n, k, a, lda, x, incx); } -template -static inline void spr2(cl::sycl::queue &queue, uplo upper_lower, std::int64_t n, float alpha, - cl::sycl::buffer &x, std::int64_t incx, - cl::sycl::buffer &y, std::int64_t incy, - cl::sycl::buffer &a); template <> void spr2(cl::sycl::queue &queue, uplo upper_lower, std::int64_t n, float alpha, @@ -2543,11 +1703,6 @@ void spr2(cl::sycl::queue &queue, uplo upp spr2_postcondition(queue, upper_lower, n, alpha, x, incx, y, incy, a); } -template -static inline void spr2(cl::sycl::queue &queue, uplo upper_lower, std::int64_t n, double alpha, - cl::sycl::buffer &x, std::int64_t incx, - cl::sycl::buffer &y, std::int64_t incy, - cl::sycl::buffer &a); template <> void spr2(cl::sycl::queue &queue, uplo upper_lower, std::int64_t n, double alpha, @@ -2559,9 +1714,6 @@ void spr2(cl::sycl::queue &queue, uplo upp spr2_postcondition(queue, upper_lower, n, alpha, x, incx, y, incy, a); } -template -static inline void iamax(cl::sycl::queue &queue, std::int64_t n, cl::sycl::buffer &x, - std::int64_t incx, cl::sycl::buffer &result); template <> void iamax(cl::sycl::queue &queue, std::int64_t n, cl::sycl::buffer &x, std::int64_t incx, @@ -2571,9 +1723,6 @@ void iamax(cl::sycl::queue &queue, std::in iamax_postcondition(queue, n, x, incx, result); } -template -static inline 
void iamax(cl::sycl::queue &queue, std::int64_t n, cl::sycl::buffer &x, - std::int64_t incx, cl::sycl::buffer &result); template <> void iamax(cl::sycl::queue &queue, std::int64_t n, cl::sycl::buffer &x, std::int64_t incx, @@ -2583,10 +1732,6 @@ void iamax(cl::sycl::queue &queue, std::in iamax_postcondition(queue, n, x, incx, result); } -template -static inline void iamax(cl::sycl::queue &queue, std::int64_t n, - cl::sycl::buffer, 1> &x, std::int64_t incx, - cl::sycl::buffer &result); template <> void iamax(cl::sycl::queue &queue, std::int64_t n, cl::sycl::buffer, 1> &x, @@ -2597,10 +1742,6 @@ void iamax(cl::sycl::queue &queue, std::in iamax_postcondition(queue, n, x, incx, result); } -template -static inline void iamax(cl::sycl::queue &queue, std::int64_t n, - cl::sycl::buffer, 1> &x, std::int64_t incx, - cl::sycl::buffer &result); template <> void iamax(cl::sycl::queue &queue, std::int64_t n, cl::sycl::buffer, 1> &x, @@ -2611,118 +1752,6 @@ void iamax(cl::sycl::queue &queue, std::in iamax_postcondition(queue, n, x, incx, result); } -template -static inline void trsm_batch(cl::sycl::queue &queue, cl::sycl::buffer &left_right, - cl::sycl::buffer &upper_lower, - cl::sycl::buffer &trans, - cl::sycl::buffer &unit_diag, - cl::sycl::buffer &m, - cl::sycl::buffer &n, - cl::sycl::buffer &alpha, cl::sycl::buffer &a, - cl::sycl::buffer &lda, cl::sycl::buffer &b, - cl::sycl::buffer &ldb, std::int64_t group_count, - cl::sycl::buffer &group_size); -template <> -void trsm_batch( - cl::sycl::queue &queue, cl::sycl::buffer &left_right, - cl::sycl::buffer &upper_lower, cl::sycl::buffer &trans, - cl::sycl::buffer &unit_diag, cl::sycl::buffer &m, - cl::sycl::buffer &n, cl::sycl::buffer &alpha, - cl::sycl::buffer &a, cl::sycl::buffer &lda, - cl::sycl::buffer &b, cl::sycl::buffer &ldb, std::int64_t group_count, - cl::sycl::buffer &group_size) { - trsm_batch_precondition(queue, left_right, upper_lower, trans, unit_diag, m, n, alpha, a, lda, - b, ldb, group_count, group_size); - 
onemkl::mklgpu::trsm_batch(queue, left_right, upper_lower, trans, unit_diag, m, n, alpha, a, - lda, b, ldb, group_count, group_size); - trsm_batch_postcondition(queue, left_right, upper_lower, trans, unit_diag, m, n, alpha, a, lda, - b, ldb, group_count, group_size); -} - -template -static inline void trsm_batch( - cl::sycl::queue &queue, cl::sycl::buffer &left_right, - cl::sycl::buffer &upper_lower, cl::sycl::buffer &trans, - cl::sycl::buffer &unit_diag, cl::sycl::buffer &m, - cl::sycl::buffer &n, cl::sycl::buffer &alpha, - cl::sycl::buffer &a, cl::sycl::buffer &lda, - cl::sycl::buffer &b, cl::sycl::buffer &ldb, - std::int64_t group_count, cl::sycl::buffer &group_size); -template <> -void trsm_batch( - cl::sycl::queue &queue, cl::sycl::buffer &left_right, - cl::sycl::buffer &upper_lower, cl::sycl::buffer &trans, - cl::sycl::buffer &unit_diag, cl::sycl::buffer &m, - cl::sycl::buffer &n, cl::sycl::buffer &alpha, - cl::sycl::buffer &a, cl::sycl::buffer &lda, - cl::sycl::buffer &b, cl::sycl::buffer &ldb, - std::int64_t group_count, cl::sycl::buffer &group_size) { - trsm_batch_precondition(queue, left_right, upper_lower, trans, unit_diag, m, n, alpha, a, lda, - b, ldb, group_count, group_size); - onemkl::mklgpu::trsm_batch(queue, left_right, upper_lower, trans, unit_diag, m, n, alpha, a, - lda, b, ldb, group_count, group_size); - trsm_batch_postcondition(queue, left_right, upper_lower, trans, unit_diag, m, n, alpha, a, lda, - b, ldb, group_count, group_size); -} - -template -static inline void trsm_batch( - cl::sycl::queue &queue, cl::sycl::buffer &left_right, - cl::sycl::buffer &upper_lower, cl::sycl::buffer &trans, - cl::sycl::buffer &unit_diag, cl::sycl::buffer &m, - cl::sycl::buffer &n, cl::sycl::buffer, 1> &alpha, - cl::sycl::buffer, 1> &a, cl::sycl::buffer &lda, - cl::sycl::buffer, 1> &b, cl::sycl::buffer &ldb, - std::int64_t group_count, cl::sycl::buffer &group_size); -template <> -void trsm_batch( - cl::sycl::queue &queue, cl::sycl::buffer &left_right, - 
cl::sycl::buffer &upper_lower, cl::sycl::buffer &trans, - cl::sycl::buffer &unit_diag, cl::sycl::buffer &m, - cl::sycl::buffer &n, cl::sycl::buffer, 1> &alpha, - cl::sycl::buffer, 1> &a, cl::sycl::buffer &lda, - cl::sycl::buffer, 1> &b, cl::sycl::buffer &ldb, - std::int64_t group_count, cl::sycl::buffer &group_size) { - trsm_batch_precondition(queue, left_right, upper_lower, trans, unit_diag, m, n, alpha, a, lda, - b, ldb, group_count, group_size); - onemkl::mklgpu::trsm_batch(queue, left_right, upper_lower, trans, unit_diag, m, n, alpha, a, - lda, b, ldb, group_count, group_size); - trsm_batch_postcondition(queue, left_right, upper_lower, trans, unit_diag, m, n, alpha, a, lda, - b, ldb, group_count, group_size); -} - -template -static inline void trsm_batch( - cl::sycl::queue &queue, cl::sycl::buffer &left_right, - cl::sycl::buffer &upper_lower, cl::sycl::buffer &trans, - cl::sycl::buffer &unit_diag, cl::sycl::buffer &m, - cl::sycl::buffer &n, cl::sycl::buffer, 1> &alpha, - cl::sycl::buffer, 1> &a, cl::sycl::buffer &lda, - cl::sycl::buffer, 1> &b, cl::sycl::buffer &ldb, - std::int64_t group_count, cl::sycl::buffer &group_size); -template <> -void trsm_batch( - cl::sycl::queue &queue, cl::sycl::buffer &left_right, - cl::sycl::buffer &upper_lower, cl::sycl::buffer &trans, - cl::sycl::buffer &unit_diag, cl::sycl::buffer &m, - cl::sycl::buffer &n, cl::sycl::buffer, 1> &alpha, - cl::sycl::buffer, 1> &a, cl::sycl::buffer &lda, - cl::sycl::buffer, 1> &b, cl::sycl::buffer &ldb, - std::int64_t group_count, cl::sycl::buffer &group_size) { - trsm_batch_precondition(queue, left_right, upper_lower, trans, unit_diag, m, n, alpha, a, lda, - b, ldb, group_count, group_size); - onemkl::mklgpu::trsm_batch(queue, left_right, upper_lower, trans, unit_diag, m, n, alpha, a, - lda, b, ldb, group_count, group_size); - trsm_batch_postcondition(queue, left_right, upper_lower, trans, unit_diag, m, n, alpha, a, lda, - b, ldb, group_count, group_size); -} - -template -static inline void 
trsm_batch(cl::sycl::queue &queue, side left_right, uplo upper_lower, - transpose trans, diag unit_diag, std::int64_t m, std::int64_t n, - float alpha, cl::sycl::buffer &a, std::int64_t lda, - std::int64_t stride_a, cl::sycl::buffer &b, - std::int64_t ldb, std::int64_t stride_b, std::int64_t batch_size); template <> void trsm_batch( cl::sycl::queue &queue, side left_right, uplo upper_lower, transpose trans, diag unit_diag, @@ -2737,12 +1766,6 @@ void trsm_batch( stride_a, b, ldb, stride_b, batch_size); } -template -static inline void trsm_batch(cl::sycl::queue &queue, side left_right, uplo upper_lower, - transpose trans, diag unit_diag, std::int64_t m, std::int64_t n, - double alpha, cl::sycl::buffer &a, std::int64_t lda, - std::int64_t stride_a, cl::sycl::buffer &b, - std::int64_t ldb, std::int64_t stride_b, std::int64_t batch_size); template <> void trsm_batch( cl::sycl::queue &queue, side left_right, uplo upper_lower, transpose trans, diag unit_diag, @@ -2757,13 +1780,6 @@ void trsm_batch( stride_a, b, ldb, stride_b, batch_size); } -template -static inline void trsm_batch(cl::sycl::queue &queue, side left_right, uplo upper_lower, - transpose trans, diag unit_diag, std::int64_t m, std::int64_t n, - std::complex alpha, - cl::sycl::buffer, 1> &a, std::int64_t lda, - std::int64_t stride_a, cl::sycl::buffer, 1> &b, - std::int64_t ldb, std::int64_t stride_b, std::int64_t batch_size); template <> void trsm_batch( cl::sycl::queue &queue, side left_right, uplo upper_lower, transpose trans, diag unit_diag, @@ -2779,13 +1795,6 @@ void trsm_batch( stride_a, b, ldb, stride_b, batch_size); } -template -static inline void trsm_batch(cl::sycl::queue &queue, side left_right, uplo upper_lower, - transpose trans, diag unit_diag, std::int64_t m, std::int64_t n, - std::complex alpha, - cl::sycl::buffer, 1> &a, std::int64_t lda, - std::int64_t stride_a, cl::sycl::buffer, 1> &b, - std::int64_t ldb, std::int64_t stride_b, std::int64_t batch_size); template <> void trsm_batch( 
cl::sycl::queue &queue, side left_right, uplo upper_lower, transpose trans, diag unit_diag, @@ -2801,10 +1810,6 @@ void trsm_batch( stride_a, b, ldb, stride_b, batch_size); } -template -static inline void rotm(cl::sycl::queue &queue, std::int64_t n, cl::sycl::buffer &x, - std::int64_t incx, cl::sycl::buffer &y, std::int64_t incy, - cl::sycl::buffer ¶m); template <> void rotm(cl::sycl::queue &queue, std::int64_t n, cl::sycl::buffer &x, std::int64_t incx, @@ -2815,10 +1820,6 @@ void rotm(cl::sycl::queue &queue, std::int rotm_postcondition(queue, n, x, incx, y, incy, param); } -template -static inline void rotm(cl::sycl::queue &queue, std::int64_t n, cl::sycl::buffer &x, - std::int64_t incx, cl::sycl::buffer &y, std::int64_t incy, - cl::sycl::buffer ¶m); template <> void rotm(cl::sycl::queue &queue, std::int64_t n, cl::sycl::buffer &x, std::int64_t incx, @@ -2829,10 +1830,6 @@ void rotm(cl::sycl::queue &queue, std::int rotm_postcondition(queue, n, x, incx, y, incy, param); } -template -static inline void rotg(cl::sycl::queue &queue, cl::sycl::buffer &a, - cl::sycl::buffer &b, cl::sycl::buffer &c, - cl::sycl::buffer &s); template <> void rotg(cl::sycl::queue &queue, cl::sycl::buffer &a, @@ -2844,10 +1841,6 @@ void rotg(cl::sycl::queue &queue, rotg_postcondition(queue, a, b, c, s); } -template -static inline void rotg(cl::sycl::queue &queue, cl::sycl::buffer &a, - cl::sycl::buffer &b, cl::sycl::buffer &c, - cl::sycl::buffer &s); template <> void rotg(cl::sycl::queue &queue, cl::sycl::buffer &a, @@ -2859,10 +1852,6 @@ void rotg(cl::sycl::queue &queue, rotg_postcondition(queue, a, b, c, s); } -template -static inline void rotg(cl::sycl::queue &queue, cl::sycl::buffer, 1> &a, - cl::sycl::buffer, 1> &b, cl::sycl::buffer &c, - cl::sycl::buffer, 1> &s); template <> void rotg(cl::sycl::queue &queue, cl::sycl::buffer, 1> &a, @@ -2874,11 +1863,6 @@ void rotg(cl::sycl::queue &queue, rotg_postcondition(queue, a, b, c, s); } -template -static inline void rotg(cl::sycl::queue 
&queue, cl::sycl::buffer, 1> &a, - cl::sycl::buffer, 1> &b, - cl::sycl::buffer &c, - cl::sycl::buffer, 1> &s); template <> void rotg(cl::sycl::queue &queue, cl::sycl::buffer, 1> &a, @@ -2890,11 +1874,6 @@ void rotg(cl::sycl::queue &queue, rotg_postcondition(queue, a, b, c, s); } -template -static inline void sdsdot(cl::sycl::queue &queue, std::int64_t n, float sb, - cl::sycl::buffer &x, std::int64_t incx, - cl::sycl::buffer &y, std::int64_t incy, - cl::sycl::buffer &result); template <> void sdsdot(cl::sycl::queue &queue, std::int64_t n, float sb, cl::sycl::buffer &x, std::int64_t incx, @@ -2905,12 +1884,6 @@ void sdsdot(cl::sycl::queue &queue, std::i sdsdot_postcondition(queue, n, sb, x, incx, y, incy, result); } -template -static inline void her2k(cl::sycl::queue &queue, uplo upper_lower, transpose trans, std::int64_t n, - std::int64_t k, std::complex alpha, - cl::sycl::buffer, 1> &a, std::int64_t lda, - cl::sycl::buffer, 1> &b, std::int64_t ldb, float beta, - cl::sycl::buffer, 1> &c, std::int64_t ldc); template <> void her2k( cl::sycl::queue &queue, uplo upper_lower, transpose trans, std::int64_t n, std::int64_t k, @@ -2922,13 +1895,6 @@ void her2k( her2k_postcondition(queue, upper_lower, trans, n, k, alpha, a, lda, b, ldb, beta, c, ldc); } -template -static inline void her2k(cl::sycl::queue &queue, uplo upper_lower, transpose trans, std::int64_t n, - std::int64_t k, std::complex alpha, - cl::sycl::buffer, 1> &a, std::int64_t lda, - cl::sycl::buffer, 1> &b, std::int64_t ldb, - double beta, cl::sycl::buffer, 1> &c, - std::int64_t ldc); template <> void her2k( cl::sycl::queue &queue, uplo upper_lower, transpose trans, std::int64_t n, std::int64_t k, @@ -2940,10 +1906,6 @@ void her2k( her2k_postcondition(queue, upper_lower, trans, n, k, alpha, a, lda, b, ldb, beta, c, ldc); } -template -static inline void dot(cl::sycl::queue &queue, std::int64_t n, cl::sycl::buffer &x, - std::int64_t incx, cl::sycl::buffer &y, std::int64_t incy, - cl::sycl::buffer &result); 
template <> void dot(cl::sycl::queue &queue, std::int64_t n, cl::sycl::buffer &x, std::int64_t incx, @@ -2954,10 +1916,6 @@ void dot(cl::sycl::queue &queue, std::int6 dot_postcondition(queue, n, x, incx, y, incy, result); } -template -static inline void dot(cl::sycl::queue &queue, std::int64_t n, cl::sycl::buffer &x, - std::int64_t incx, cl::sycl::buffer &y, std::int64_t incy, - cl::sycl::buffer &result); template <> void dot(cl::sycl::queue &queue, std::int64_t n, cl::sycl::buffer &x, std::int64_t incx, @@ -2968,10 +1926,6 @@ void dot(cl::sycl::queue &queue, std::int6 dot_postcondition(queue, n, x, incx, y, incy, result); } -template -static inline void dot(cl::sycl::queue &queue, std::int64_t n, cl::sycl::buffer &x, - std::int64_t incx, cl::sycl::buffer &y, std::int64_t incy, - cl::sycl::buffer &result); template <> void dot(cl::sycl::queue &queue, std::int64_t n, cl::sycl::buffer &x, std::int64_t incx, @@ -2982,11 +1936,6 @@ void dot(cl::sycl::queue &queue, std::int6 dot_postcondition(queue, n, x, incx, y, incy, result); } -template -static inline void symv(cl::sycl::queue &queue, uplo upper_lower, std::int64_t n, float alpha, - cl::sycl::buffer &a, std::int64_t lda, - cl::sycl::buffer &x, std::int64_t incx, float beta, - cl::sycl::buffer &y, std::int64_t incy); template <> void symv(cl::sycl::queue &queue, uplo upper_lower, std::int64_t n, float alpha, @@ -2999,11 +1948,6 @@ void symv(cl::sycl::queue &queue, uplo upp symv_postcondition(queue, upper_lower, n, alpha, a, lda, x, incx, beta, y, incy); } -template -static inline void symv(cl::sycl::queue &queue, uplo upper_lower, std::int64_t n, double alpha, - cl::sycl::buffer &a, std::int64_t lda, - cl::sycl::buffer &x, std::int64_t incx, double beta, - cl::sycl::buffer &y, std::int64_t incy); template <> void symv(cl::sycl::queue &queue, uplo upper_lower, std::int64_t n, double alpha, @@ -3016,6 +1960,2067 @@ void symv(cl::sycl::queue &queue, uplo upp symv_postcondition(queue, upper_lower, n, alpha, a, lda, x, 
incx, beta, y, incy); } +// USM APIs + +template <> +cl::sycl::event syr2( + cl::sycl::queue &queue, uplo upper_lower, std::int64_t n, float alpha, const float *x, + std::int64_t incx, const float *y, std::int64_t incy, float *a, std::int64_t lda, + const cl::sycl::vector_class &dependencies) { + syr2_precondition(queue, upper_lower, n, alpha, x, incx, y, incy, a, lda, dependencies); + auto done = + onemkl::mklgpu::syr2(queue, upper_lower, n, alpha, x, incx, y, incy, a, lda, dependencies); + syr2_postcondition(queue, upper_lower, n, alpha, x, incx, y, incy, a, lda, dependencies); + return done; +} + +template <> +cl::sycl::event syr2( + cl::sycl::queue &queue, uplo upper_lower, std::int64_t n, double alpha, const double *x, + std::int64_t incx, const double *y, std::int64_t incy, double *a, std::int64_t lda, + const cl::sycl::vector_class &dependencies) { + syr2_precondition(queue, upper_lower, n, alpha, x, incx, y, incy, a, lda, dependencies); + auto done = + onemkl::mklgpu::syr2(queue, upper_lower, n, alpha, x, incx, y, incy, a, lda, dependencies); + syr2_postcondition(queue, upper_lower, n, alpha, x, incx, y, incy, a, lda, dependencies); + return done; +} + +template <> +cl::sycl::event scal( + cl::sycl::queue &queue, std::int64_t n, float alpha, float *x, std::int64_t incx, + const cl::sycl::vector_class &dependencies) { + scal_precondition(queue, n, alpha, x, incx, dependencies); + auto done = onemkl::mklgpu::scal(queue, n, alpha, x, incx, dependencies); + scal_postcondition(queue, n, alpha, x, incx, dependencies); + return done; +} + +template <> +cl::sycl::event scal( + cl::sycl::queue &queue, std::int64_t n, double alpha, double *x, std::int64_t incx, + const cl::sycl::vector_class &dependencies) { + scal_precondition(queue, n, alpha, x, incx, dependencies); + auto done = onemkl::mklgpu::scal(queue, n, alpha, x, incx, dependencies); + scal_postcondition(queue, n, alpha, x, incx, dependencies); + return done; +} + +template <> +cl::sycl::event scal( + 
cl::sycl::queue &queue, std::int64_t n, std::complex alpha, std::complex *x, + std::int64_t incx, const cl::sycl::vector_class &dependencies) { + scal_precondition(queue, n, alpha, x, incx, dependencies); + auto done = onemkl::mklgpu::scal(queue, n, alpha, x, incx, dependencies); + scal_postcondition(queue, n, alpha, x, incx, dependencies); + return done; +} + +template <> +cl::sycl::event scal( + cl::sycl::queue &queue, std::int64_t n, std::complex alpha, std::complex *x, + std::int64_t incx, const cl::sycl::vector_class &dependencies) { + scal_precondition(queue, n, alpha, x, incx, dependencies); + auto done = onemkl::mklgpu::scal(queue, n, alpha, x, incx, dependencies); + scal_postcondition(queue, n, alpha, x, incx, dependencies); + return done; +} + +template <> +cl::sycl::event scal( + cl::sycl::queue &queue, std::int64_t n, float alpha, std::complex *x, std::int64_t incx, + const cl::sycl::vector_class &dependencies) { + scal_precondition(queue, n, alpha, x, incx, dependencies); + auto done = onemkl::mklgpu::scal(queue, n, alpha, x, incx, dependencies); + scal_postcondition(queue, n, alpha, x, incx, dependencies); + return done; +} + +template <> +cl::sycl::event scal( + cl::sycl::queue &queue, std::int64_t n, double alpha, std::complex *x, + std::int64_t incx, const cl::sycl::vector_class &dependencies) { + scal_precondition(queue, n, alpha, x, incx, dependencies); + auto done = onemkl::mklgpu::scal(queue, n, alpha, x, incx, dependencies); + scal_postcondition(queue, n, alpha, x, incx, dependencies); + return done; +} + +template <> +cl::sycl::event trmv( + cl::sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag, std::int64_t n, + const float *a, std::int64_t lda, float *x, std::int64_t incx, + const cl::sycl::vector_class &dependencies) { + trmv_precondition(queue, upper_lower, trans, unit_diag, n, a, lda, x, incx, dependencies); + auto done = onemkl::mklgpu::trmv(queue, upper_lower, trans, unit_diag, n, a, lda, x, incx, + dependencies); + 
trmv_postcondition(queue, upper_lower, trans, unit_diag, n, a, lda, x, incx, dependencies); + return done; +} + +template <> +cl::sycl::event trmv( + cl::sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag, std::int64_t n, + const double *a, std::int64_t lda, double *x, std::int64_t incx, + const cl::sycl::vector_class &dependencies) { + trmv_precondition(queue, upper_lower, trans, unit_diag, n, a, lda, x, incx, dependencies); + auto done = onemkl::mklgpu::trmv(queue, upper_lower, trans, unit_diag, n, a, lda, x, incx, + dependencies); + trmv_postcondition(queue, upper_lower, trans, unit_diag, n, a, lda, x, incx, dependencies); + return done; +} + +template <> +cl::sycl::event trmv( + cl::sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag, std::int64_t n, + const std::complex *a, std::int64_t lda, std::complex *x, std::int64_t incx, + const cl::sycl::vector_class &dependencies) { + trmv_precondition(queue, upper_lower, trans, unit_diag, n, a, lda, x, incx, dependencies); + auto done = onemkl::mklgpu::trmv(queue, upper_lower, trans, unit_diag, n, a, lda, x, incx, + dependencies); + trmv_postcondition(queue, upper_lower, trans, unit_diag, n, a, lda, x, incx, dependencies); + return done; +} + +template <> +cl::sycl::event trmv( + cl::sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag, std::int64_t n, + const std::complex *a, std::int64_t lda, std::complex *x, std::int64_t incx, + const cl::sycl::vector_class &dependencies) { + trmv_precondition(queue, upper_lower, trans, unit_diag, n, a, lda, x, incx, dependencies); + auto done = onemkl::mklgpu::trmv(queue, upper_lower, trans, unit_diag, n, a, lda, x, incx, + dependencies); + trmv_postcondition(queue, upper_lower, trans, unit_diag, n, a, lda, x, incx, dependencies); + return done; +} + +template <> +cl::sycl::event tpmv( + cl::sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag, std::int64_t n, + const float *a, float *x, std::int64_t incx, + 
const cl::sycl::vector_class &dependencies) { + tpmv_precondition(queue, upper_lower, trans, unit_diag, n, a, x, incx, dependencies); + auto done = + onemkl::mklgpu::tpmv(queue, upper_lower, trans, unit_diag, n, a, x, incx, dependencies); + tpmv_postcondition(queue, upper_lower, trans, unit_diag, n, a, x, incx, dependencies); + return done; +} + +template <> +cl::sycl::event tpmv( + cl::sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag, std::int64_t n, + const double *a, double *x, std::int64_t incx, + const cl::sycl::vector_class &dependencies) { + tpmv_precondition(queue, upper_lower, trans, unit_diag, n, a, x, incx, dependencies); + auto done = + onemkl::mklgpu::tpmv(queue, upper_lower, trans, unit_diag, n, a, x, incx, dependencies); + tpmv_postcondition(queue, upper_lower, trans, unit_diag, n, a, x, incx, dependencies); + return done; +} + +template <> +cl::sycl::event tpmv( + cl::sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag, std::int64_t n, + const std::complex *a, std::complex *x, std::int64_t incx, + const cl::sycl::vector_class &dependencies) { + tpmv_precondition(queue, upper_lower, trans, unit_diag, n, a, x, incx, dependencies); + auto done = + onemkl::mklgpu::tpmv(queue, upper_lower, trans, unit_diag, n, a, x, incx, dependencies); + tpmv_postcondition(queue, upper_lower, trans, unit_diag, n, a, x, incx, dependencies); + return done; +} + +template <> +cl::sycl::event tpmv( + cl::sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag, std::int64_t n, + const std::complex *a, std::complex *x, std::int64_t incx, + const cl::sycl::vector_class &dependencies) { + tpmv_precondition(queue, upper_lower, trans, unit_diag, n, a, x, incx, dependencies); + auto done = + onemkl::mklgpu::tpmv(queue, upper_lower, trans, unit_diag, n, a, x, incx, dependencies); + tpmv_postcondition(queue, upper_lower, trans, unit_diag, n, a, x, incx, dependencies); + return done; +} + +template <> +cl::sycl::event spr( + 
cl::sycl::queue &queue, uplo upper_lower, std::int64_t n, float alpha, const float *x, + std::int64_t incx, float *a, const cl::sycl::vector_class &dependencies) { + spr_precondition(queue, upper_lower, n, alpha, x, incx, a, dependencies); + auto done = onemkl::mklgpu::spr(queue, upper_lower, n, alpha, x, incx, a, dependencies); + spr_postcondition(queue, upper_lower, n, alpha, x, incx, a, dependencies); + return done; +} + +template <> +cl::sycl::event spr( + cl::sycl::queue &queue, uplo upper_lower, std::int64_t n, double alpha, const double *x, + std::int64_t incx, double *a, const cl::sycl::vector_class &dependencies) { + spr_precondition(queue, upper_lower, n, alpha, x, incx, a, dependencies); + auto done = onemkl::mklgpu::spr(queue, upper_lower, n, alpha, x, incx, a, dependencies); + spr_postcondition(queue, upper_lower, n, alpha, x, incx, a, dependencies); + return done; +} + +template <> +cl::sycl::event hpmv( + cl::sycl::queue &queue, uplo upper_lower, std::int64_t n, std::complex alpha, + const std::complex *a, const std::complex *x, std::int64_t incx, + std::complex beta, std::complex *y, std::int64_t incy, + const cl::sycl::vector_class &dependencies) { + hpmv_precondition(queue, upper_lower, n, alpha, a, x, incx, beta, y, incy, dependencies); + auto done = + onemkl::mklgpu::hpmv(queue, upper_lower, n, alpha, a, x, incx, beta, y, incy, dependencies); + hpmv_postcondition(queue, upper_lower, n, alpha, a, x, incx, beta, y, incy, dependencies); + return done; +} + +template <> +cl::sycl::event hpmv( + cl::sycl::queue &queue, uplo upper_lower, std::int64_t n, std::complex alpha, + const std::complex *a, const std::complex *x, std::int64_t incx, + std::complex beta, std::complex *y, std::int64_t incy, + const cl::sycl::vector_class &dependencies) { + hpmv_precondition(queue, upper_lower, n, alpha, a, x, incx, beta, y, incy, dependencies); + auto done = + onemkl::mklgpu::hpmv(queue, upper_lower, n, alpha, a, x, incx, beta, y, incy, dependencies); + 
hpmv_postcondition(queue, upper_lower, n, alpha, a, x, incx, beta, y, incy, dependencies); + return done; +} + +template <> +cl::sycl::event syrk( + cl::sycl::queue &queue, uplo upper_lower, transpose trans, std::int64_t n, std::int64_t k, + float alpha, const float *a, std::int64_t lda, float beta, float *c, std::int64_t ldc, + const cl::sycl::vector_class &dependencies) { + syrk_precondition(queue, upper_lower, trans, n, k, alpha, a, lda, beta, c, ldc, dependencies); + auto done = onemkl::mklgpu::syrk(queue, upper_lower, trans, n, k, alpha, a, lda, beta, c, ldc, + dependencies); + syrk_postcondition(queue, upper_lower, trans, n, k, alpha, a, lda, beta, c, ldc, dependencies); + return done; +} + +template <> +cl::sycl::event syrk( + cl::sycl::queue &queue, uplo upper_lower, transpose trans, std::int64_t n, std::int64_t k, + double alpha, const double *a, std::int64_t lda, double beta, double *c, std::int64_t ldc, + const cl::sycl::vector_class &dependencies) { + syrk_precondition(queue, upper_lower, trans, n, k, alpha, a, lda, beta, c, ldc, dependencies); + auto done = onemkl::mklgpu::syrk(queue, upper_lower, trans, n, k, alpha, a, lda, beta, c, ldc, + dependencies); + syrk_postcondition(queue, upper_lower, trans, n, k, alpha, a, lda, beta, c, ldc, dependencies); + return done; +} + +template <> +cl::sycl::event syrk( + cl::sycl::queue &queue, uplo upper_lower, transpose trans, std::int64_t n, std::int64_t k, + std::complex alpha, const std::complex *a, std::int64_t lda, + std::complex beta, std::complex *c, std::int64_t ldc, + const cl::sycl::vector_class &dependencies) { + syrk_precondition(queue, upper_lower, trans, n, k, alpha, a, lda, beta, c, ldc, dependencies); + auto done = onemkl::mklgpu::syrk(queue, upper_lower, trans, n, k, alpha, a, lda, beta, c, ldc, + dependencies); + syrk_postcondition(queue, upper_lower, trans, n, k, alpha, a, lda, beta, c, ldc, dependencies); + return done; +} + +template <> +cl::sycl::event syrk( + cl::sycl::queue &queue, uplo 
upper_lower, transpose trans, std::int64_t n, std::int64_t k, + std::complex alpha, const std::complex *a, std::int64_t lda, + std::complex beta, std::complex *c, std::int64_t ldc, + const cl::sycl::vector_class &dependencies) { + syrk_precondition(queue, upper_lower, trans, n, k, alpha, a, lda, beta, c, ldc, dependencies); + auto done = onemkl::mklgpu::syrk(queue, upper_lower, trans, n, k, alpha, a, lda, beta, c, ldc, + dependencies); + syrk_postcondition(queue, upper_lower, trans, n, k, alpha, a, lda, beta, c, ldc, dependencies); + return done; +} + +template <> +cl::sycl::event her2( + cl::sycl::queue &queue, uplo upper_lower, std::int64_t n, std::complex alpha, + const std::complex *x, std::int64_t incx, const std::complex *y, + std::int64_t incy, std::complex *a, std::int64_t lda, + const cl::sycl::vector_class &dependencies) { + her2_precondition(queue, upper_lower, n, alpha, x, incx, y, incy, a, lda, dependencies); + auto done = + onemkl::mklgpu::her2(queue, upper_lower, n, alpha, x, incx, y, incy, a, lda, dependencies); + her2_postcondition(queue, upper_lower, n, alpha, x, incx, y, incy, a, lda, dependencies); + return done; +} + +template <> +cl::sycl::event her2( + cl::sycl::queue &queue, uplo upper_lower, std::int64_t n, std::complex alpha, + const std::complex *x, std::int64_t incx, const std::complex *y, + std::int64_t incy, std::complex *a, std::int64_t lda, + const cl::sycl::vector_class &dependencies) { + her2_precondition(queue, upper_lower, n, alpha, x, incx, y, incy, a, lda, dependencies); + auto done = + onemkl::mklgpu::her2(queue, upper_lower, n, alpha, x, incx, y, incy, a, lda, dependencies); + her2_postcondition(queue, upper_lower, n, alpha, x, incx, y, incy, a, lda, dependencies); + return done; +} + +template <> +cl::sycl::event hbmv( + cl::sycl::queue &queue, uplo upper_lower, std::int64_t n, std::int64_t k, + std::complex alpha, const std::complex *a, std::int64_t lda, + const std::complex *x, std::int64_t incx, std::complex beta, + 
std::complex *y, std::int64_t incy, + const cl::sycl::vector_class &dependencies) { + hbmv_precondition(queue, upper_lower, n, k, alpha, a, lda, x, incx, beta, y, incy, + dependencies); + auto done = onemkl::mklgpu::hbmv(queue, upper_lower, n, k, alpha, a, lda, x, incx, beta, y, + incy, dependencies); + hbmv_postcondition(queue, upper_lower, n, k, alpha, a, lda, x, incx, beta, y, incy, + dependencies); + return done; +} + +template <> +cl::sycl::event hbmv( + cl::sycl::queue &queue, uplo upper_lower, std::int64_t n, std::int64_t k, + std::complex alpha, const std::complex *a, std::int64_t lda, + const std::complex *x, std::int64_t incx, std::complex beta, + std::complex *y, std::int64_t incy, + const cl::sycl::vector_class &dependencies) { + hbmv_precondition(queue, upper_lower, n, k, alpha, a, lda, x, incx, beta, y, incy, + dependencies); + auto done = onemkl::mklgpu::hbmv(queue, upper_lower, n, k, alpha, a, lda, x, incx, beta, y, + incy, dependencies); + hbmv_postcondition(queue, upper_lower, n, k, alpha, a, lda, x, incx, beta, y, incy, + dependencies); + return done; +} + +template <> +cl::sycl::event rot( + cl::sycl::queue &queue, std::int64_t n, std::complex *x, std::int64_t incx, + std::complex *y, std::int64_t incy, float c, float s, + const cl::sycl::vector_class &dependencies) { + rot_precondition(queue, n, x, incx, y, incy, c, s, dependencies); + auto done = onemkl::mklgpu::rot(queue, n, x, incx, y, incy, c, s, dependencies); + rot_postcondition(queue, n, x, incx, y, incy, c, s, dependencies); + return done; +} + +template <> +cl::sycl::event rot( + cl::sycl::queue &queue, std::int64_t n, std::complex *x, std::int64_t incx, + std::complex *y, std::int64_t incy, double c, double s, + const cl::sycl::vector_class &dependencies) { + rot_precondition(queue, n, x, incx, y, incy, c, s, dependencies); + auto done = onemkl::mklgpu::rot(queue, n, x, incx, y, incy, c, s, dependencies); + rot_postcondition(queue, n, x, incx, y, incy, c, s, dependencies); + return 
done; +} + +template <> +cl::sycl::event rot( + cl::sycl::queue &queue, std::int64_t n, float *x, std::int64_t incx, float *y, + std::int64_t incy, float c, float s, + const cl::sycl::vector_class &dependencies) { + rot_precondition(queue, n, x, incx, y, incy, c, s, dependencies); + auto done = onemkl::mklgpu::rot(queue, n, x, incx, y, incy, c, s, dependencies); + rot_postcondition(queue, n, x, incx, y, incy, c, s, dependencies); + return done; +} + +template <> +cl::sycl::event rot( + cl::sycl::queue &queue, std::int64_t n, double *x, std::int64_t incx, double *y, + std::int64_t incy, double c, double s, + const cl::sycl::vector_class &dependencies) { + rot_precondition(queue, n, x, incx, y, incy, c, s, dependencies); + auto done = onemkl::mklgpu::rot(queue, n, x, incx, y, incy, c, s, dependencies); + rot_postcondition(queue, n, x, incx, y, incy, c, s, dependencies); + return done; +} + +template <> +cl::sycl::event axpy( + cl::sycl::queue &queue, std::int64_t n, float alpha, const float *x, std::int64_t incx, + float *y, std::int64_t incy, const cl::sycl::vector_class &dependencies) { + axpy_precondition(queue, n, alpha, x, incx, y, incy, dependencies); + auto done = onemkl::mklgpu::axpy(queue, n, alpha, x, incx, y, incy, dependencies); + axpy_postcondition(queue, n, alpha, x, incx, y, incy, dependencies); + return done; +} + +template <> +cl::sycl::event axpy( + cl::sycl::queue &queue, std::int64_t n, double alpha, const double *x, std::int64_t incx, + double *y, std::int64_t incy, const cl::sycl::vector_class &dependencies) { + axpy_precondition(queue, n, alpha, x, incx, y, incy, dependencies); + auto done = onemkl::mklgpu::axpy(queue, n, alpha, x, incx, y, incy, dependencies); + axpy_postcondition(queue, n, alpha, x, incx, y, incy, dependencies); + return done; +} + +template <> +cl::sycl::event axpy( + cl::sycl::queue &queue, std::int64_t n, std::complex alpha, const std::complex *x, + std::int64_t incx, std::complex *y, std::int64_t incy, + const 
cl::sycl::vector_class &dependencies) { + axpy_precondition(queue, n, alpha, x, incx, y, incy, dependencies); + auto done = onemkl::mklgpu::axpy(queue, n, alpha, x, incx, y, incy, dependencies); + axpy_postcondition(queue, n, alpha, x, incx, y, incy, dependencies); + return done; +} + +template <> +cl::sycl::event axpy( + cl::sycl::queue &queue, std::int64_t n, std::complex alpha, + const std::complex *x, std::int64_t incx, std::complex *y, std::int64_t incy, + const cl::sycl::vector_class &dependencies) { + axpy_precondition(queue, n, alpha, x, incx, y, incy, dependencies); + auto done = onemkl::mklgpu::axpy(queue, n, alpha, x, incx, y, incy, dependencies); + axpy_postcondition(queue, n, alpha, x, incx, y, incy, dependencies); + return done; +} + +template <> +cl::sycl::event axpy_batch( + cl::sycl::queue &queue, std::int64_t *n, float *alpha, const float **x, std::int64_t *incx, + float **y, std::int64_t *incy, std::int64_t group_count, std::int64_t *group_size, + const cl::sycl::vector_class &dependencies) { + axpy_batch_precondition(queue, n, alpha, x, incx, y, incy, group_count, group_size, + dependencies); + auto done = onemkl::mklgpu::axpy_batch(queue, n, alpha, x, incx, y, incy, group_count, + group_size, dependencies); + axpy_batch_postcondition(queue, n, alpha, x, incx, y, incy, group_count, group_size, + dependencies); + return done; +} + +template <> +cl::sycl::event axpy_batch( + cl::sycl::queue &queue, std::int64_t *n, double *alpha, const double **x, std::int64_t *incx, + double **y, std::int64_t *incy, std::int64_t group_count, std::int64_t *group_size, + const cl::sycl::vector_class &dependencies) { + axpy_batch_precondition(queue, n, alpha, x, incx, y, incy, group_count, group_size, + dependencies); + auto done = onemkl::mklgpu::axpy_batch(queue, n, alpha, x, incx, y, incy, group_count, + group_size, dependencies); + axpy_batch_postcondition(queue, n, alpha, x, incx, y, incy, group_count, group_size, + dependencies); + return done; +} + +template 
<> +cl::sycl::event axpy_batch( + cl::sycl::queue &queue, std::int64_t *n, std::complex *alpha, + const std::complex **x, std::int64_t *incx, std::complex **y, std::int64_t *incy, + std::int64_t group_count, std::int64_t *group_size, + const cl::sycl::vector_class &dependencies) { + axpy_batch_precondition(queue, n, alpha, x, incx, y, incy, group_count, group_size, + dependencies); + auto done = onemkl::mklgpu::axpy_batch(queue, n, alpha, x, incx, y, incy, group_count, + group_size, dependencies); + axpy_batch_postcondition(queue, n, alpha, x, incx, y, incy, group_count, group_size, + dependencies); + return done; +} + +template <> +cl::sycl::event axpy_batch( + cl::sycl::queue &queue, std::int64_t *n, std::complex *alpha, + const std::complex **x, std::int64_t *incx, std::complex **y, + std::int64_t *incy, std::int64_t group_count, std::int64_t *group_size, + const cl::sycl::vector_class &dependencies) { + axpy_batch_precondition(queue, n, alpha, x, incx, y, incy, group_count, group_size, + dependencies); + auto done = onemkl::mklgpu::axpy_batch(queue, n, alpha, x, incx, y, incy, group_count, + group_size, dependencies); + axpy_batch_postcondition(queue, n, alpha, x, incx, y, incy, group_count, group_size, + dependencies); + return done; +} + +template <> +cl::sycl::event gerc( + cl::sycl::queue &queue, std::int64_t m, std::int64_t n, std::complex alpha, + const std::complex *x, std::int64_t incx, const std::complex *y, + std::int64_t incy, std::complex *a, std::int64_t lda, + const cl::sycl::vector_class &dependencies) { + gerc_precondition(queue, m, n, alpha, x, incx, y, incy, a, lda, dependencies); + auto done = onemkl::mklgpu::gerc(queue, m, n, alpha, x, incx, y, incy, a, lda, dependencies); + gerc_postcondition(queue, m, n, alpha, x, incx, y, incy, a, lda, dependencies); + return done; +} + +template <> +cl::sycl::event gerc( + cl::sycl::queue &queue, std::int64_t m, std::int64_t n, std::complex alpha, + const std::complex *x, std::int64_t incx, const 
std::complex *y, + std::int64_t incy, std::complex *a, std::int64_t lda, + const cl::sycl::vector_class &dependencies) { + gerc_precondition(queue, m, n, alpha, x, incx, y, incy, a, lda, dependencies); + auto done = onemkl::mklgpu::gerc(queue, m, n, alpha, x, incx, y, incy, a, lda, dependencies); + gerc_postcondition(queue, m, n, alpha, x, incx, y, incy, a, lda, dependencies); + return done; +} + +template <> +cl::sycl::event syr2k( + cl::sycl::queue &queue, uplo upper_lower, transpose trans, std::int64_t n, std::int64_t k, + float alpha, const float *a, std::int64_t lda, const float *b, std::int64_t ldb, float beta, + float *c, std::int64_t ldc, const cl::sycl::vector_class &dependencies) { + syr2k_precondition(queue, upper_lower, trans, n, k, alpha, a, lda, b, ldb, beta, c, ldc, + dependencies); + auto done = onemkl::mklgpu::syr2k(queue, upper_lower, trans, n, k, alpha, a, lda, b, ldb, beta, + c, ldc, dependencies); + syr2k_postcondition(queue, upper_lower, trans, n, k, alpha, a, lda, b, ldb, beta, c, ldc, + dependencies); + return done; +} + +template <> +cl::sycl::event syr2k( + cl::sycl::queue &queue, uplo upper_lower, transpose trans, std::int64_t n, std::int64_t k, + double alpha, const double *a, std::int64_t lda, const double *b, std::int64_t ldb, double beta, + double *c, std::int64_t ldc, const cl::sycl::vector_class &dependencies) { + syr2k_precondition(queue, upper_lower, trans, n, k, alpha, a, lda, b, ldb, beta, c, ldc, + dependencies); + auto done = onemkl::mklgpu::syr2k(queue, upper_lower, trans, n, k, alpha, a, lda, b, ldb, beta, + c, ldc, dependencies); + syr2k_postcondition(queue, upper_lower, trans, n, k, alpha, a, lda, b, ldb, beta, c, ldc, + dependencies); + return done; +} + +template <> +cl::sycl::event syr2k( + cl::sycl::queue &queue, uplo upper_lower, transpose trans, std::int64_t n, std::int64_t k, + std::complex alpha, const std::complex *a, std::int64_t lda, + const std::complex *b, std::int64_t ldb, std::complex beta, + std::complex 
*c, std::int64_t ldc, + const cl::sycl::vector_class &dependencies) { + syr2k_precondition(queue, upper_lower, trans, n, k, alpha, a, lda, b, ldb, beta, c, ldc, + dependencies); + auto done = onemkl::mklgpu::syr2k(queue, upper_lower, trans, n, k, alpha, a, lda, b, ldb, beta, + c, ldc, dependencies); + syr2k_postcondition(queue, upper_lower, trans, n, k, alpha, a, lda, b, ldb, beta, c, ldc, + dependencies); + return done; +} + +template <> +cl::sycl::event syr2k( + cl::sycl::queue &queue, uplo upper_lower, transpose trans, std::int64_t n, std::int64_t k, + std::complex alpha, const std::complex *a, std::int64_t lda, + const std::complex *b, std::int64_t ldb, std::complex beta, + std::complex *c, std::int64_t ldc, + const cl::sycl::vector_class &dependencies) { + syr2k_precondition(queue, upper_lower, trans, n, k, alpha, a, lda, b, ldb, beta, c, ldc, + dependencies); + auto done = onemkl::mklgpu::syr2k(queue, upper_lower, trans, n, k, alpha, a, lda, b, ldb, beta, + c, ldc, dependencies); + syr2k_postcondition(queue, upper_lower, trans, n, k, alpha, a, lda, b, ldb, beta, c, ldc, + dependencies); + return done; +} + +template <> +cl::sycl::event gemv( + cl::sycl::queue &queue, transpose trans, std::int64_t m, std::int64_t n, float alpha, + const float *a, std::int64_t lda, const float *x, std::int64_t incx, float beta, float *y, + std::int64_t incy, const cl::sycl::vector_class &dependencies) { + gemv_precondition(queue, trans, m, n, alpha, a, lda, x, incx, beta, y, incy, dependencies); + auto done = onemkl::mklgpu::gemv(queue, trans, m, n, alpha, a, lda, x, incx, beta, y, incy, + dependencies); + gemv_postcondition(queue, trans, m, n, alpha, a, lda, x, incx, beta, y, incy, dependencies); + return done; +} + +template <> +cl::sycl::event gemv( + cl::sycl::queue &queue, transpose trans, std::int64_t m, std::int64_t n, double alpha, + const double *a, std::int64_t lda, const double *x, std::int64_t incx, double beta, double *y, + std::int64_t incy, const 
cl::sycl::vector_class &dependencies) { + gemv_precondition(queue, trans, m, n, alpha, a, lda, x, incx, beta, y, incy, dependencies); + auto done = onemkl::mklgpu::gemv(queue, trans, m, n, alpha, a, lda, x, incx, beta, y, incy, + dependencies); + gemv_postcondition(queue, trans, m, n, alpha, a, lda, x, incx, beta, y, incy, dependencies); + return done; +} + +template <> +cl::sycl::event gemv( + cl::sycl::queue &queue, transpose trans, std::int64_t m, std::int64_t n, + std::complex alpha, const std::complex *a, std::int64_t lda, + const std::complex *x, std::int64_t incx, std::complex beta, + std::complex *y, std::int64_t incy, + const cl::sycl::vector_class &dependencies) { + gemv_precondition(queue, trans, m, n, alpha, a, lda, x, incx, beta, y, incy, dependencies); + auto done = onemkl::mklgpu::gemv(queue, trans, m, n, alpha, a, lda, x, incx, beta, y, incy, + dependencies); + gemv_postcondition(queue, trans, m, n, alpha, a, lda, x, incx, beta, y, incy, dependencies); + return done; +} + +template <> +cl::sycl::event gemv( + cl::sycl::queue &queue, transpose trans, std::int64_t m, std::int64_t n, + std::complex alpha, const std::complex *a, std::int64_t lda, + const std::complex *x, std::int64_t incx, std::complex beta, + std::complex *y, std::int64_t incy, + const cl::sycl::vector_class &dependencies) { + gemv_precondition(queue, trans, m, n, alpha, a, lda, x, incx, beta, y, incy, dependencies); + auto done = onemkl::mklgpu::gemv(queue, trans, m, n, alpha, a, lda, x, incx, beta, y, incy, + dependencies); + gemv_postcondition(queue, trans, m, n, alpha, a, lda, x, incx, beta, y, incy, dependencies); + return done; +} + +template <> +cl::sycl::event her( + cl::sycl::queue &queue, uplo upper_lower, std::int64_t n, float alpha, + const std::complex *x, std::int64_t incx, std::complex *a, std::int64_t lda, + const cl::sycl::vector_class &dependencies) { + her_precondition(queue, upper_lower, n, alpha, x, incx, a, lda, dependencies); + auto done = 
onemkl::mklgpu::her(queue, upper_lower, n, alpha, x, incx, a, lda, dependencies); + her_postcondition(queue, upper_lower, n, alpha, x, incx, a, lda, dependencies); + return done; +} + +template <> +cl::sycl::event her( + cl::sycl::queue &queue, uplo upper_lower, std::int64_t n, double alpha, + const std::complex *x, std::int64_t incx, std::complex *a, std::int64_t lda, + const cl::sycl::vector_class &dependencies) { + her_precondition(queue, upper_lower, n, alpha, x, incx, a, lda, dependencies); + auto done = onemkl::mklgpu::her(queue, upper_lower, n, alpha, x, incx, a, lda, dependencies); + her_postcondition(queue, upper_lower, n, alpha, x, incx, a, lda, dependencies); + return done; +} + +template <> +cl::sycl::event hpr( + cl::sycl::queue &queue, uplo upper_lower, std::int64_t n, float alpha, + const std::complex *x, std::int64_t incx, std::complex *a, + const cl::sycl::vector_class &dependencies) { + hpr_precondition(queue, upper_lower, n, alpha, x, incx, a, dependencies); + auto done = onemkl::mklgpu::hpr(queue, upper_lower, n, alpha, x, incx, a, dependencies); + hpr_postcondition(queue, upper_lower, n, alpha, x, incx, a, dependencies); + return done; +} + +template <> +cl::sycl::event hpr( + cl::sycl::queue &queue, uplo upper_lower, std::int64_t n, double alpha, + const std::complex *x, std::int64_t incx, std::complex *a, + const cl::sycl::vector_class &dependencies) { + hpr_precondition(queue, upper_lower, n, alpha, x, incx, a, dependencies); + auto done = onemkl::mklgpu::hpr(queue, upper_lower, n, alpha, x, incx, a, dependencies); + hpr_postcondition(queue, upper_lower, n, alpha, x, incx, a, dependencies); + return done; +} + +template <> +cl::sycl::event iamin( + cl::sycl::queue &queue, std::int64_t n, const float *x, std::int64_t incx, std::int64_t *result, + const cl::sycl::vector_class &dependencies) { + iamin_precondition(queue, n, x, incx, result, dependencies); + auto done = onemkl::mklgpu::iamin(queue, n, x, incx, result, dependencies); + 
iamin_postcondition(queue, n, x, incx, result, dependencies); + return done; +} + +template <> +cl::sycl::event iamin( + cl::sycl::queue &queue, std::int64_t n, const double *x, std::int64_t incx, + std::int64_t *result, const cl::sycl::vector_class &dependencies) { + iamin_precondition(queue, n, x, incx, result, dependencies); + auto done = onemkl::mklgpu::iamin(queue, n, x, incx, result, dependencies); + iamin_postcondition(queue, n, x, incx, result, dependencies); + return done; +} + +template <> +cl::sycl::event iamin( + cl::sycl::queue &queue, std::int64_t n, const std::complex *x, std::int64_t incx, + std::int64_t *result, const cl::sycl::vector_class &dependencies) { + iamin_precondition(queue, n, x, incx, result, dependencies); + auto done = onemkl::mklgpu::iamin(queue, n, x, incx, result, dependencies); + iamin_postcondition(queue, n, x, incx, result, dependencies); + return done; +} + +template <> +cl::sycl::event iamin( + cl::sycl::queue &queue, std::int64_t n, const std::complex *x, std::int64_t incx, + std::int64_t *result, const cl::sycl::vector_class &dependencies) { + iamin_precondition(queue, n, x, incx, result, dependencies); + auto done = onemkl::mklgpu::iamin(queue, n, x, incx, result, dependencies); + iamin_postcondition(queue, n, x, incx, result, dependencies); + return done; +} +template <> +cl::sycl::event gemm_batch( + cl::sycl::queue &queue, transpose *transa, transpose *transb, std::int64_t *m, std::int64_t *n, + std::int64_t *k, float *alpha, const float **a, std::int64_t *lda, const float **b, + std::int64_t *ldb, float *beta, float **c, std::int64_t *ldc, std::int64_t group_count, + std::int64_t *group_size, const cl::sycl::vector_class &dependencies) { + gemm_batch_precondition(queue, transa, transb, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc, + group_count, group_size, dependencies); + auto done = onemkl::mklgpu::gemm_batch(queue, transa, transb, m, n, k, alpha, a, lda, b, ldb, + beta, c, ldc, group_count, group_size, 
dependencies); + gemm_batch_postcondition(queue, transa, transb, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc, + group_count, group_size, dependencies); + return done; +} + +template <> +cl::sycl::event gemm_batch( + cl::sycl::queue &queue, transpose *transa, transpose *transb, std::int64_t *m, std::int64_t *n, + std::int64_t *k, double *alpha, const double **a, std::int64_t *lda, const double **b, + std::int64_t *ldb, double *beta, double **c, std::int64_t *ldc, std::int64_t group_count, + std::int64_t *group_size, const cl::sycl::vector_class &dependencies) { + gemm_batch_precondition(queue, transa, transb, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc, + group_count, group_size, dependencies); + auto done = onemkl::mklgpu::gemm_batch(queue, transa, transb, m, n, k, alpha, a, lda, b, ldb, + beta, c, ldc, group_count, group_size, dependencies); + gemm_batch_postcondition(queue, transa, transb, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc, + group_count, group_size, dependencies); + return done; +} + +template <> +cl::sycl::event gemm_batch( + cl::sycl::queue &queue, transpose *transa, transpose *transb, std::int64_t *m, std::int64_t *n, + std::int64_t *k, std::complex *alpha, const std::complex **a, std::int64_t *lda, + const std::complex **b, std::int64_t *ldb, std::complex *beta, + std::complex **c, std::int64_t *ldc, std::int64_t group_count, std::int64_t *group_size, + const cl::sycl::vector_class &dependencies) { + gemm_batch_precondition(queue, transa, transb, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc, + group_count, group_size, dependencies); + auto done = onemkl::mklgpu::gemm_batch(queue, transa, transb, m, n, k, alpha, a, lda, b, ldb, + beta, c, ldc, group_count, group_size, dependencies); + gemm_batch_postcondition(queue, transa, transb, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc, + group_count, group_size, dependencies); + return done; +} + +template <> +cl::sycl::event gemm_batch( + cl::sycl::queue &queue, transpose *transa, transpose *transb, 
std::int64_t *m, std::int64_t *n, + std::int64_t *k, std::complex *alpha, const std::complex **a, std::int64_t *lda, + const std::complex **b, std::int64_t *ldb, std::complex *beta, + std::complex **c, std::int64_t *ldc, std::int64_t group_count, std::int64_t *group_size, + const cl::sycl::vector_class &dependencies) { + gemm_batch_precondition(queue, transa, transb, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc, + group_count, group_size, dependencies); + auto done = onemkl::mklgpu::gemm_batch(queue, transa, transb, m, n, k, alpha, a, lda, b, ldb, + beta, c, ldc, group_count, group_size, dependencies); + gemm_batch_postcondition(queue, transa, transb, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc, + group_count, group_size, dependencies); + return done; +} + +template <> +cl::sycl::event gemm_batch( + cl::sycl::queue &queue, transpose transa, transpose transb, std::int64_t m, std::int64_t n, + std::int64_t k, float alpha, const float *a, std::int64_t lda, std::int64_t stride_a, + const float *b, std::int64_t ldb, std::int64_t stride_b, float beta, float *c, std::int64_t ldc, + std::int64_t stride_c, std::int64_t batch_size, + const cl::sycl::vector_class &dependencies) { + gemm_batch_precondition(queue, transa, transb, m, n, k, alpha, a, lda, stride_a, b, ldb, + stride_b, beta, c, ldc, stride_c, batch_size, dependencies); + auto done = + onemkl::mklgpu::gemm_batch(queue, transa, transb, m, n, k, alpha, a, lda, stride_a, b, ldb, + stride_b, beta, c, ldc, stride_c, batch_size, dependencies); + gemm_batch_postcondition(queue, transa, transb, m, n, k, alpha, a, lda, stride_a, b, ldb, + stride_b, beta, c, ldc, stride_c, batch_size, dependencies); + return done; +} + +template <> +cl::sycl::event gemm_batch( + cl::sycl::queue &queue, transpose transa, transpose transb, std::int64_t m, std::int64_t n, + std::int64_t k, double alpha, const double *a, std::int64_t lda, std::int64_t stride_a, + const double *b, std::int64_t ldb, std::int64_t stride_b, double beta, double *c, 
+ std::int64_t ldc, std::int64_t stride_c, std::int64_t batch_size, + const cl::sycl::vector_class &dependencies) { + gemm_batch_precondition(queue, transa, transb, m, n, k, alpha, a, lda, stride_a, b, ldb, + stride_b, beta, c, ldc, stride_c, batch_size, dependencies); + auto done = + onemkl::mklgpu::gemm_batch(queue, transa, transb, m, n, k, alpha, a, lda, stride_a, b, ldb, + stride_b, beta, c, ldc, stride_c, batch_size, dependencies); + gemm_batch_postcondition(queue, transa, transb, m, n, k, alpha, a, lda, stride_a, b, ldb, + stride_b, beta, c, ldc, stride_c, batch_size, dependencies); + return done; +} + +template <> +cl::sycl::event gemm_batch( + cl::sycl::queue &queue, transpose transa, transpose transb, std::int64_t m, std::int64_t n, + std::int64_t k, std::complex alpha, const std::complex *a, std::int64_t lda, + std::int64_t stride_a, const std::complex *b, std::int64_t ldb, std::int64_t stride_b, + std::complex beta, std::complex *c, std::int64_t ldc, std::int64_t stride_c, + std::int64_t batch_size, const cl::sycl::vector_class &dependencies) { + gemm_batch_precondition(queue, transa, transb, m, n, k, alpha, a, lda, stride_a, b, ldb, + stride_b, beta, c, ldc, stride_c, batch_size, dependencies); + auto done = + onemkl::mklgpu::gemm_batch(queue, transa, transb, m, n, k, alpha, a, lda, stride_a, b, ldb, + stride_b, beta, c, ldc, stride_c, batch_size, dependencies); + gemm_batch_postcondition(queue, transa, transb, m, n, k, alpha, a, lda, stride_a, b, ldb, + stride_b, beta, c, ldc, stride_c, batch_size, dependencies); + return done; +} + +template <> +cl::sycl::event gemm_batch( + cl::sycl::queue &queue, transpose transa, transpose transb, std::int64_t m, std::int64_t n, + std::int64_t k, std::complex alpha, const std::complex *a, std::int64_t lda, + std::int64_t stride_a, const std::complex *b, std::int64_t ldb, std::int64_t stride_b, + std::complex beta, std::complex *c, std::int64_t ldc, std::int64_t stride_c, + std::int64_t batch_size, const 
cl::sycl::vector_class &dependencies) { + gemm_batch_precondition(queue, transa, transb, m, n, k, alpha, a, lda, stride_a, b, ldb, + stride_b, beta, c, ldc, stride_c, batch_size, dependencies); + auto done = + onemkl::mklgpu::gemm_batch(queue, transa, transb, m, n, k, alpha, a, lda, stride_a, b, ldb, + stride_b, beta, c, ldc, stride_c, batch_size, dependencies); + gemm_batch_postcondition(queue, transa, transb, m, n, k, alpha, a, lda, stride_a, b, ldb, + stride_b, beta, c, ldc, stride_c, batch_size, dependencies); + return done; +} + +template <> +cl::sycl::event spmv( + cl::sycl::queue &queue, uplo upper_lower, std::int64_t n, float alpha, const float *a, + const float *x, std::int64_t incx, float beta, float *y, std::int64_t incy, + const cl::sycl::vector_class &dependencies) { + spmv_precondition(queue, upper_lower, n, alpha, a, x, incx, beta, y, incy, dependencies); + auto done = + onemkl::mklgpu::spmv(queue, upper_lower, n, alpha, a, x, incx, beta, y, incy, dependencies); + spmv_postcondition(queue, upper_lower, n, alpha, a, x, incx, beta, y, incy, dependencies); + return done; +} + +template <> +cl::sycl::event spmv( + cl::sycl::queue &queue, uplo upper_lower, std::int64_t n, double alpha, const double *a, + const double *x, std::int64_t incx, double beta, double *y, std::int64_t incy, + const cl::sycl::vector_class &dependencies) { + spmv_precondition(queue, upper_lower, n, alpha, a, x, incx, beta, y, incy, dependencies); + auto done = + onemkl::mklgpu::spmv(queue, upper_lower, n, alpha, a, x, incx, beta, y, incy, dependencies); + spmv_postcondition(queue, upper_lower, n, alpha, a, x, incx, beta, y, incy, dependencies); + return done; +} + +template <> +cl::sycl::event swap( + cl::sycl::queue &queue, std::int64_t n, float *x, std::int64_t incx, float *y, + std::int64_t incy, const cl::sycl::vector_class &dependencies) { + swap_precondition(queue, n, x, incx, y, incy, dependencies); + auto done = onemkl::mklgpu::swap(queue, n, x, incx, y, incy, dependencies); 
+ swap_postcondition(queue, n, x, incx, y, incy, dependencies); + return done; +} + +template <> +cl::sycl::event swap( + cl::sycl::queue &queue, std::int64_t n, double *x, std::int64_t incx, double *y, + std::int64_t incy, const cl::sycl::vector_class &dependencies) { + swap_precondition(queue, n, x, incx, y, incy, dependencies); + auto done = onemkl::mklgpu::swap(queue, n, x, incx, y, incy, dependencies); + swap_postcondition(queue, n, x, incx, y, incy, dependencies); + return done; +} + +template <> +cl::sycl::event swap( + cl::sycl::queue &queue, std::int64_t n, std::complex *x, std::int64_t incx, + std::complex *y, std::int64_t incy, + const cl::sycl::vector_class &dependencies) { + swap_precondition(queue, n, x, incx, y, incy, dependencies); + auto done = onemkl::mklgpu::swap(queue, n, x, incx, y, incy, dependencies); + swap_postcondition(queue, n, x, incx, y, incy, dependencies); + return done; +} + +template <> +cl::sycl::event swap( + cl::sycl::queue &queue, std::int64_t n, std::complex *x, std::int64_t incx, + std::complex *y, std::int64_t incy, + const cl::sycl::vector_class &dependencies) { + swap_precondition(queue, n, x, incx, y, incy, dependencies); + auto done = onemkl::mklgpu::swap(queue, n, x, incx, y, incy, dependencies); + swap_postcondition(queue, n, x, incx, y, incy, dependencies); + return done; +} + +template <> +cl::sycl::event geru( + cl::sycl::queue &queue, std::int64_t m, std::int64_t n, std::complex alpha, + const std::complex *x, std::int64_t incx, const std::complex *y, + std::int64_t incy, std::complex *a, std::int64_t lda, + const cl::sycl::vector_class &dependencies) { + geru_precondition(queue, m, n, alpha, x, incx, y, incy, a, lda, dependencies); + auto done = onemkl::mklgpu::geru(queue, m, n, alpha, x, incx, y, incy, a, lda, dependencies); + geru_postcondition(queue, m, n, alpha, x, incx, y, incy, a, lda, dependencies); + return done; +} + +template <> +cl::sycl::event geru( + cl::sycl::queue &queue, std::int64_t m, std::int64_t 
n, std::complex alpha, + const std::complex *x, std::int64_t incx, const std::complex *y, + std::int64_t incy, std::complex *a, std::int64_t lda, + const cl::sycl::vector_class &dependencies) { + geru_precondition(queue, m, n, alpha, x, incx, y, incy, a, lda, dependencies); + auto done = onemkl::mklgpu::geru(queue, m, n, alpha, x, incx, y, incy, a, lda, dependencies); + geru_postcondition(queue, m, n, alpha, x, incx, y, incy, a, lda, dependencies); + return done; +} + +template <> +cl::sycl::event nrm2( + cl::sycl::queue &queue, std::int64_t n, const std::complex *x, std::int64_t incx, + float *result, const cl::sycl::vector_class &dependencies) { + nrm2_precondition(queue, n, x, incx, result, dependencies); + auto done = onemkl::mklgpu::nrm2(queue, n, x, incx, result, dependencies); + nrm2_postcondition(queue, n, x, incx, result, dependencies); + return done; +} + +template <> +cl::sycl::event nrm2( + cl::sycl::queue &queue, std::int64_t n, const std::complex *x, std::int64_t incx, + double *result, const cl::sycl::vector_class &dependencies) { + nrm2_precondition(queue, n, x, incx, result, dependencies); + auto done = onemkl::mklgpu::nrm2(queue, n, x, incx, result, dependencies); + nrm2_postcondition(queue, n, x, incx, result, dependencies); + return done; +} + +template <> +cl::sycl::event nrm2( + cl::sycl::queue &queue, std::int64_t n, const float *x, std::int64_t incx, float *result, + const cl::sycl::vector_class &dependencies) { + nrm2_precondition(queue, n, x, incx, result, dependencies); + auto done = onemkl::mklgpu::nrm2(queue, n, x, incx, result, dependencies); + nrm2_postcondition(queue, n, x, incx, result, dependencies); + return done; +} + +template <> +cl::sycl::event nrm2( + cl::sycl::queue &queue, std::int64_t n, const double *x, std::int64_t incx, double *result, + const cl::sycl::vector_class &dependencies) { + nrm2_precondition(queue, n, x, incx, result, dependencies); + auto done = onemkl::mklgpu::nrm2(queue, n, x, incx, result, dependencies); 
+ nrm2_postcondition(queue, n, x, incx, result, dependencies); + return done; +} + +template <> +cl::sycl::event gemm( + cl::sycl::queue &queue, transpose transa, transpose transb, std::int64_t m, std::int64_t n, + std::int64_t k, float alpha, const float *a, std::int64_t lda, const float *b, std::int64_t ldb, + float beta, float *c, std::int64_t ldc, + const cl::sycl::vector_class &dependencies) { + gemm_precondition(queue, transa, transb, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc, + dependencies); + auto done = onemkl::mklgpu::gemm(queue, transa, transb, m, n, k, alpha, a, lda, b, ldb, beta, c, + ldc, dependencies); + gemm_postcondition(queue, transa, transb, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc, + dependencies); + return done; +} + +template <> +cl::sycl::event gemm( + cl::sycl::queue &queue, transpose transa, transpose transb, std::int64_t m, std::int64_t n, + std::int64_t k, double alpha, const double *a, std::int64_t lda, const double *b, + std::int64_t ldb, double beta, double *c, std::int64_t ldc, + const cl::sycl::vector_class &dependencies) { + gemm_precondition(queue, transa, transb, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc, + dependencies); + auto done = onemkl::mklgpu::gemm(queue, transa, transb, m, n, k, alpha, a, lda, b, ldb, beta, c, + ldc, dependencies); + gemm_postcondition(queue, transa, transb, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc, + dependencies); + return done; +} + +template <> +cl::sycl::event gemm( + cl::sycl::queue &queue, transpose transa, transpose transb, std::int64_t m, std::int64_t n, + std::int64_t k, std::complex alpha, const std::complex *a, std::int64_t lda, + const std::complex *b, std::int64_t ldb, std::complex beta, + std::complex *c, std::int64_t ldc, + const cl::sycl::vector_class &dependencies) { + gemm_precondition(queue, transa, transb, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc, + dependencies); + auto done = onemkl::mklgpu::gemm(queue, transa, transb, m, n, k, alpha, a, lda, b, ldb, beta, c, + 
ldc, dependencies); + gemm_postcondition(queue, transa, transb, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc, + dependencies); + return done; +} + +template <> +cl::sycl::event gemm( + cl::sycl::queue &queue, transpose transa, transpose transb, std::int64_t m, std::int64_t n, + std::int64_t k, std::complex alpha, const std::complex *a, std::int64_t lda, + const std::complex *b, std::int64_t ldb, std::complex beta, + std::complex *c, std::int64_t ldc, + const cl::sycl::vector_class &dependencies) { + gemm_precondition(queue, transa, transb, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc, + dependencies); + auto done = onemkl::mklgpu::gemm(queue, transa, transb, m, n, k, alpha, a, lda, b, ldb, beta, c, + ldc, dependencies); + gemm_postcondition(queue, transa, transb, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc, + dependencies); + return done; +} + +template <> +cl::sycl::event herk( + cl::sycl::queue &queue, uplo upper_lower, transpose trans, std::int64_t n, std::int64_t k, + float alpha, const std::complex *a, std::int64_t lda, float beta, std::complex *c, + std::int64_t ldc, const cl::sycl::vector_class &dependencies) { + herk_precondition(queue, upper_lower, trans, n, k, alpha, a, lda, beta, c, ldc, dependencies); + auto done = onemkl::mklgpu::herk(queue, upper_lower, trans, n, k, alpha, a, lda, beta, c, ldc, + dependencies); + herk_postcondition(queue, upper_lower, trans, n, k, alpha, a, lda, beta, c, ldc, dependencies); + return done; +} + +template <> +cl::sycl::event herk( + cl::sycl::queue &queue, uplo upper_lower, transpose trans, std::int64_t n, std::int64_t k, + double alpha, const std::complex *a, std::int64_t lda, double beta, + std::complex *c, std::int64_t ldc, + const cl::sycl::vector_class &dependencies) { + herk_precondition(queue, upper_lower, trans, n, k, alpha, a, lda, beta, c, ldc, dependencies); + auto done = onemkl::mklgpu::herk(queue, upper_lower, trans, n, k, alpha, a, lda, beta, c, ldc, + dependencies); + herk_postcondition(queue, 
upper_lower, trans, n, k, alpha, a, lda, beta, c, ldc, dependencies); + return done; +} + +template <> +cl::sycl::event ger( + cl::sycl::queue &queue, std::int64_t m, std::int64_t n, float alpha, const float *x, + std::int64_t incx, const float *y, std::int64_t incy, float *a, std::int64_t lda, + const cl::sycl::vector_class &dependencies) { + ger_precondition(queue, m, n, alpha, x, incx, y, incy, a, lda, dependencies); + auto done = onemkl::mklgpu::ger(queue, m, n, alpha, x, incx, y, incy, a, lda, dependencies); + ger_postcondition(queue, m, n, alpha, x, incx, y, incy, a, lda, dependencies); + return done; +} + +template <> +cl::sycl::event ger( + cl::sycl::queue &queue, std::int64_t m, std::int64_t n, double alpha, const double *x, + std::int64_t incx, const double *y, std::int64_t incy, double *a, std::int64_t lda, + const cl::sycl::vector_class &dependencies) { + ger_precondition(queue, m, n, alpha, x, incx, y, incy, a, lda, dependencies); + auto done = onemkl::mklgpu::ger(queue, m, n, alpha, x, incx, y, incy, a, lda, dependencies); + ger_postcondition(queue, m, n, alpha, x, incx, y, incy, a, lda, dependencies); + return done; +} + +template <> +cl::sycl::event trsm( + cl::sycl::queue &queue, side left_right, uplo upper_lower, transpose trans, diag unit_diag, + std::int64_t m, std::int64_t n, float alpha, const float *a, std::int64_t lda, float *b, + std::int64_t ldb, const cl::sycl::vector_class &dependencies) { + trsm_precondition(queue, left_right, upper_lower, trans, unit_diag, m, n, alpha, a, lda, b, ldb, + dependencies); + auto done = onemkl::mklgpu::trsm(queue, left_right, upper_lower, trans, unit_diag, m, n, alpha, + a, lda, b, ldb, dependencies); + trsm_postcondition(queue, left_right, upper_lower, trans, unit_diag, m, n, alpha, a, lda, b, + ldb, dependencies); + return done; +} + +template <> +cl::sycl::event trsm( + cl::sycl::queue &queue, side left_right, uplo upper_lower, transpose trans, diag unit_diag, + std::int64_t m, std::int64_t n, double 
alpha, const double *a, std::int64_t lda, double *b, + std::int64_t ldb, const cl::sycl::vector_class &dependencies) { + trsm_precondition(queue, left_right, upper_lower, trans, unit_diag, m, n, alpha, a, lda, b, ldb, + dependencies); + auto done = onemkl::mklgpu::trsm(queue, left_right, upper_lower, trans, unit_diag, m, n, alpha, + a, lda, b, ldb, dependencies); + trsm_postcondition(queue, left_right, upper_lower, trans, unit_diag, m, n, alpha, a, lda, b, + ldb, dependencies); + return done; +} + +template <> +cl::sycl::event trsm( + cl::sycl::queue &queue, side left_right, uplo upper_lower, transpose trans, diag unit_diag, + std::int64_t m, std::int64_t n, std::complex alpha, const std::complex *a, + std::int64_t lda, std::complex *b, std::int64_t ldb, + const cl::sycl::vector_class &dependencies) { + trsm_precondition(queue, left_right, upper_lower, trans, unit_diag, m, n, alpha, a, lda, b, ldb, + dependencies); + auto done = onemkl::mklgpu::trsm(queue, left_right, upper_lower, trans, unit_diag, m, n, alpha, + a, lda, b, ldb, dependencies); + trsm_postcondition(queue, left_right, upper_lower, trans, unit_diag, m, n, alpha, a, lda, b, + ldb, dependencies); + return done; +} + +template <> +cl::sycl::event trsm( + cl::sycl::queue &queue, side left_right, uplo upper_lower, transpose trans, diag unit_diag, + std::int64_t m, std::int64_t n, std::complex alpha, const std::complex *a, + std::int64_t lda, std::complex *b, std::int64_t ldb, + const cl::sycl::vector_class &dependencies) { + trsm_precondition(queue, left_right, upper_lower, trans, unit_diag, m, n, alpha, a, lda, b, ldb, + dependencies); + auto done = onemkl::mklgpu::trsm(queue, left_right, upper_lower, trans, unit_diag, m, n, alpha, + a, lda, b, ldb, dependencies); + trsm_postcondition(queue, left_right, upper_lower, trans, unit_diag, m, n, alpha, a, lda, b, + ldb, dependencies); + return done; +} + +template <> +cl::sycl::event dotu( + cl::sycl::queue &queue, std::int64_t n, const std::complex *x, 
std::int64_t incx, + const std::complex *y, std::int64_t incy, std::complex *result, + const cl::sycl::vector_class &dependencies) { + dotu_precondition(queue, n, x, incx, y, incy, result, dependencies); + auto done = onemkl::mklgpu::dotu(queue, n, x, incx, y, incy, result, dependencies); + dotu_postcondition(queue, n, x, incx, y, incy, result, dependencies); + return done; +} + +template <> +cl::sycl::event dotu( + cl::sycl::queue &queue, std::int64_t n, const std::complex *x, std::int64_t incx, + const std::complex *y, std::int64_t incy, std::complex *result, + const cl::sycl::vector_class &dependencies) { + dotu_precondition(queue, n, x, incx, y, incy, result, dependencies); + auto done = onemkl::mklgpu::dotu(queue, n, x, incx, y, incy, result, dependencies); + dotu_postcondition(queue, n, x, incx, y, incy, result, dependencies); + return done; +} + +template <> +cl::sycl::event hemm( + cl::sycl::queue &queue, side left_right, uplo upper_lower, std::int64_t m, std::int64_t n, + std::complex alpha, const std::complex *a, std::int64_t lda, + const std::complex *b, std::int64_t ldb, std::complex beta, + std::complex *c, std::int64_t ldc, + const cl::sycl::vector_class &dependencies) { + hemm_precondition(queue, left_right, upper_lower, m, n, alpha, a, lda, b, ldb, beta, c, ldc, + dependencies); + auto done = onemkl::mklgpu::hemm(queue, left_right, upper_lower, m, n, alpha, a, lda, b, ldb, + beta, c, ldc, dependencies); + hemm_postcondition(queue, left_right, upper_lower, m, n, alpha, a, lda, b, ldb, beta, c, ldc, + dependencies); + return done; +} + +template <> +cl::sycl::event hemm( + cl::sycl::queue &queue, side left_right, uplo upper_lower, std::int64_t m, std::int64_t n, + std::complex alpha, const std::complex *a, std::int64_t lda, + const std::complex *b, std::int64_t ldb, std::complex beta, + std::complex *c, std::int64_t ldc, + const cl::sycl::vector_class &dependencies) { + hemm_precondition(queue, left_right, upper_lower, m, n, alpha, a, lda, b, ldb, 
beta, c, ldc, + dependencies); + auto done = onemkl::mklgpu::hemm(queue, left_right, upper_lower, m, n, alpha, a, lda, b, ldb, + beta, c, ldc, dependencies); + hemm_postcondition(queue, left_right, upper_lower, m, n, alpha, a, lda, b, ldb, beta, c, ldc, + dependencies); + return done; +} + +template <> +cl::sycl::event hpr2( + cl::sycl::queue &queue, uplo upper_lower, std::int64_t n, std::complex alpha, + const std::complex *x, std::int64_t incx, const std::complex *y, + std::int64_t incy, std::complex *a, + const cl::sycl::vector_class &dependencies) { + hpr2_precondition(queue, upper_lower, n, alpha, x, incx, y, incy, a, dependencies); + auto done = + onemkl::mklgpu::hpr2(queue, upper_lower, n, alpha, x, incx, y, incy, a, dependencies); + hpr2_postcondition(queue, upper_lower, n, alpha, x, incx, y, incy, a, dependencies); + return done; +} + +template <> +cl::sycl::event hpr2( + cl::sycl::queue &queue, uplo upper_lower, std::int64_t n, std::complex alpha, + const std::complex *x, std::int64_t incx, const std::complex *y, + std::int64_t incy, std::complex *a, + const cl::sycl::vector_class &dependencies) { + hpr2_precondition(queue, upper_lower, n, alpha, x, incx, y, incy, a, dependencies); + auto done = + onemkl::mklgpu::hpr2(queue, upper_lower, n, alpha, x, incx, y, incy, a, dependencies); + hpr2_postcondition(queue, upper_lower, n, alpha, x, incx, y, incy, a, dependencies); + return done; +} + +template <> +cl::sycl::event gbmv( + cl::sycl::queue &queue, transpose trans, std::int64_t m, std::int64_t n, std::int64_t kl, + std::int64_t ku, float alpha, const float *a, std::int64_t lda, const float *x, + std::int64_t incx, float beta, float *y, std::int64_t incy, + const cl::sycl::vector_class &dependencies) { + gbmv_precondition(queue, trans, m, n, kl, ku, alpha, a, lda, x, incx, beta, y, incy, + dependencies); + auto done = onemkl::mklgpu::gbmv(queue, trans, m, n, kl, ku, alpha, a, lda, x, incx, beta, y, + incy, dependencies); + gbmv_postcondition(queue, trans, 
m, n, kl, ku, alpha, a, lda, x, incx, beta, y, incy, + dependencies); + return done; +} + +template <> +cl::sycl::event gbmv( + cl::sycl::queue &queue, transpose trans, std::int64_t m, std::int64_t n, std::int64_t kl, + std::int64_t ku, double alpha, const double *a, std::int64_t lda, const double *x, + std::int64_t incx, double beta, double *y, std::int64_t incy, + const cl::sycl::vector_class &dependencies) { + gbmv_precondition(queue, trans, m, n, kl, ku, alpha, a, lda, x, incx, beta, y, incy, + dependencies); + auto done = onemkl::mklgpu::gbmv(queue, trans, m, n, kl, ku, alpha, a, lda, x, incx, beta, y, + incy, dependencies); + gbmv_postcondition(queue, trans, m, n, kl, ku, alpha, a, lda, x, incx, beta, y, incy, + dependencies); + return done; +} + +template <> +cl::sycl::event gbmv( + cl::sycl::queue &queue, transpose trans, std::int64_t m, std::int64_t n, std::int64_t kl, + std::int64_t ku, std::complex alpha, const std::complex *a, std::int64_t lda, + const std::complex *x, std::int64_t incx, std::complex beta, + std::complex *y, std::int64_t incy, + const cl::sycl::vector_class &dependencies) { + gbmv_precondition(queue, trans, m, n, kl, ku, alpha, a, lda, x, incx, beta, y, incy, + dependencies); + auto done = onemkl::mklgpu::gbmv(queue, trans, m, n, kl, ku, alpha, a, lda, x, incx, beta, y, + incy, dependencies); + gbmv_postcondition(queue, trans, m, n, kl, ku, alpha, a, lda, x, incx, beta, y, incy, + dependencies); + return done; +} + +template <> +cl::sycl::event gbmv( + cl::sycl::queue &queue, transpose trans, std::int64_t m, std::int64_t n, std::int64_t kl, + std::int64_t ku, std::complex alpha, const std::complex *a, std::int64_t lda, + const std::complex *x, std::int64_t incx, std::complex beta, + std::complex *y, std::int64_t incy, + const cl::sycl::vector_class &dependencies) { + gbmv_precondition(queue, trans, m, n, kl, ku, alpha, a, lda, x, incx, beta, y, incy, + dependencies); + auto done = onemkl::mklgpu::gbmv(queue, trans, m, n, kl, ku, alpha, 
a, lda, x, incx, beta, y, + incy, dependencies); + gbmv_postcondition(queue, trans, m, n, kl, ku, alpha, a, lda, x, incx, beta, y, incy, + dependencies); + return done; +} + +template <> +cl::sycl::event tbmv( + cl::sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag, std::int64_t n, + std::int64_t k, const float *a, std::int64_t lda, float *x, std::int64_t incx, + const cl::sycl::vector_class &dependencies) { + tbmv_precondition(queue, upper_lower, trans, unit_diag, n, k, a, lda, x, incx, dependencies); + auto done = onemkl::mklgpu::tbmv(queue, upper_lower, trans, unit_diag, n, k, a, lda, x, incx, + dependencies); + tbmv_postcondition(queue, upper_lower, trans, unit_diag, n, k, a, lda, x, incx, dependencies); + return done; +} + +template <> +cl::sycl::event tbmv( + cl::sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag, std::int64_t n, + std::int64_t k, const double *a, std::int64_t lda, double *x, std::int64_t incx, + const cl::sycl::vector_class &dependencies) { + tbmv_precondition(queue, upper_lower, trans, unit_diag, n, k, a, lda, x, incx, dependencies); + auto done = onemkl::mklgpu::tbmv(queue, upper_lower, trans, unit_diag, n, k, a, lda, x, incx, + dependencies); + tbmv_postcondition(queue, upper_lower, trans, unit_diag, n, k, a, lda, x, incx, dependencies); + return done; +} + +template <> +cl::sycl::event tbmv( + cl::sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag, std::int64_t n, + std::int64_t k, const std::complex *a, std::int64_t lda, std::complex *x, + std::int64_t incx, const cl::sycl::vector_class &dependencies) { + tbmv_precondition(queue, upper_lower, trans, unit_diag, n, k, a, lda, x, incx, dependencies); + auto done = onemkl::mklgpu::tbmv(queue, upper_lower, trans, unit_diag, n, k, a, lda, x, incx, + dependencies); + tbmv_postcondition(queue, upper_lower, trans, unit_diag, n, k, a, lda, x, incx, dependencies); + return done; +} + +template <> +cl::sycl::event tbmv( + cl::sycl::queue 
&queue, uplo upper_lower, transpose trans, diag unit_diag, std::int64_t n, + std::int64_t k, const std::complex *a, std::int64_t lda, std::complex *x, + std::int64_t incx, const cl::sycl::vector_class &dependencies) { + tbmv_precondition(queue, upper_lower, trans, unit_diag, n, k, a, lda, x, incx, dependencies); + auto done = onemkl::mklgpu::tbmv(queue, upper_lower, trans, unit_diag, n, k, a, lda, x, incx, + dependencies); + tbmv_postcondition(queue, upper_lower, trans, unit_diag, n, k, a, lda, x, incx, dependencies); + return done; +} + +template <> +cl::sycl::event symm( + cl::sycl::queue &queue, side left_right, uplo upper_lower, std::int64_t m, std::int64_t n, + float alpha, const float *a, std::int64_t lda, const float *b, std::int64_t ldb, float beta, + float *c, std::int64_t ldc, const cl::sycl::vector_class &dependencies) { + symm_precondition(queue, left_right, upper_lower, m, n, alpha, a, lda, b, ldb, beta, c, ldc, + dependencies); + auto done = onemkl::mklgpu::symm(queue, left_right, upper_lower, m, n, alpha, a, lda, b, ldb, + beta, c, ldc, dependencies); + symm_postcondition(queue, left_right, upper_lower, m, n, alpha, a, lda, b, ldb, beta, c, ldc, + dependencies); + return done; +} + +template <> +cl::sycl::event symm( + cl::sycl::queue &queue, side left_right, uplo upper_lower, std::int64_t m, std::int64_t n, + double alpha, const double *a, std::int64_t lda, const double *b, std::int64_t ldb, double beta, + double *c, std::int64_t ldc, const cl::sycl::vector_class &dependencies) { + symm_precondition(queue, left_right, upper_lower, m, n, alpha, a, lda, b, ldb, beta, c, ldc, + dependencies); + auto done = onemkl::mklgpu::symm(queue, left_right, upper_lower, m, n, alpha, a, lda, b, ldb, + beta, c, ldc, dependencies); + symm_postcondition(queue, left_right, upper_lower, m, n, alpha, a, lda, b, ldb, beta, c, ldc, + dependencies); + return done; +} + +template <> +cl::sycl::event symm( + cl::sycl::queue &queue, side left_right, uplo upper_lower, 
std::int64_t m, std::int64_t n, + std::complex alpha, const std::complex *a, std::int64_t lda, + const std::complex *b, std::int64_t ldb, std::complex beta, + std::complex *c, std::int64_t ldc, + const cl::sycl::vector_class &dependencies) { + symm_precondition(queue, left_right, upper_lower, m, n, alpha, a, lda, b, ldb, beta, c, ldc, + dependencies); + auto done = onemkl::mklgpu::symm(queue, left_right, upper_lower, m, n, alpha, a, lda, b, ldb, + beta, c, ldc, dependencies); + symm_postcondition(queue, left_right, upper_lower, m, n, alpha, a, lda, b, ldb, beta, c, ldc, + dependencies); + return done; +} + +template <> +cl::sycl::event symm( + cl::sycl::queue &queue, side left_right, uplo upper_lower, std::int64_t m, std::int64_t n, + std::complex alpha, const std::complex *a, std::int64_t lda, + const std::complex *b, std::int64_t ldb, std::complex beta, + std::complex *c, std::int64_t ldc, + const cl::sycl::vector_class &dependencies) { + symm_precondition(queue, left_right, upper_lower, m, n, alpha, a, lda, b, ldb, beta, c, ldc, + dependencies); + auto done = onemkl::mklgpu::symm(queue, left_right, upper_lower, m, n, alpha, a, lda, b, ldb, + beta, c, ldc, dependencies); + symm_postcondition(queue, left_right, upper_lower, m, n, alpha, a, lda, b, ldb, beta, c, ldc, + dependencies); + return done; +} + +template <> +cl::sycl::event dotc( + cl::sycl::queue &queue, std::int64_t n, const std::complex *x, std::int64_t incx, + const std::complex *y, std::int64_t incy, std::complex *result, + const cl::sycl::vector_class &dependencies) { + dotc_precondition(queue, n, x, incx, y, incy, result, dependencies); + auto done = onemkl::mklgpu::dotc(queue, n, x, incx, y, incy, result, dependencies); + dotc_postcondition(queue, n, x, incx, y, incy, result, dependencies); + return done; +} + +template <> +cl::sycl::event dotc( + cl::sycl::queue &queue, std::int64_t n, const std::complex *x, std::int64_t incx, + const std::complex *y, std::int64_t incy, std::complex *result, + 
const cl::sycl::vector_class &dependencies) { + dotc_precondition(queue, n, x, incx, y, incy, result, dependencies); + auto done = onemkl::mklgpu::dotc(queue, n, x, incx, y, incy, result, dependencies); + dotc_postcondition(queue, n, x, incx, y, incy, result, dependencies); + return done; +} + +template <> +cl::sycl::event syr( + cl::sycl::queue &queue, uplo upper_lower, std::int64_t n, float alpha, const float *x, + std::int64_t incx, float *a, std::int64_t lda, + const cl::sycl::vector_class &dependencies) { + syr_precondition(queue, upper_lower, n, alpha, x, incx, a, lda, dependencies); + auto done = onemkl::mklgpu::syr(queue, upper_lower, n, alpha, x, incx, a, lda, dependencies); + syr_postcondition(queue, upper_lower, n, alpha, x, incx, a, lda, dependencies); + return done; +} + +template <> +cl::sycl::event syr( + cl::sycl::queue &queue, uplo upper_lower, std::int64_t n, double alpha, const double *x, + std::int64_t incx, double *a, std::int64_t lda, + const cl::sycl::vector_class &dependencies) { + syr_precondition(queue, upper_lower, n, alpha, x, incx, a, lda, dependencies); + auto done = onemkl::mklgpu::syr(queue, upper_lower, n, alpha, x, incx, a, lda, dependencies); + syr_postcondition(queue, upper_lower, n, alpha, x, incx, a, lda, dependencies); + return done; +} + +template <> +cl::sycl::event trmm( + cl::sycl::queue &queue, side left_right, uplo upper_lower, transpose trans, diag unit_diag, + std::int64_t m, std::int64_t n, float alpha, const float *a, std::int64_t lda, float *b, + std::int64_t ldb, const cl::sycl::vector_class &dependencies) { + trmm_precondition(queue, left_right, upper_lower, trans, unit_diag, m, n, alpha, a, lda, b, ldb, + dependencies); + auto done = onemkl::mklgpu::trmm(queue, left_right, upper_lower, trans, unit_diag, m, n, alpha, + a, lda, b, ldb, dependencies); + trmm_postcondition(queue, left_right, upper_lower, trans, unit_diag, m, n, alpha, a, lda, b, + ldb, dependencies); + return done; +} + +template <> +cl::sycl::event 
trmm( + cl::sycl::queue &queue, side left_right, uplo upper_lower, transpose trans, diag unit_diag, + std::int64_t m, std::int64_t n, double alpha, const double *a, std::int64_t lda, double *b, + std::int64_t ldb, const cl::sycl::vector_class &dependencies) { + trmm_precondition(queue, left_right, upper_lower, trans, unit_diag, m, n, alpha, a, lda, b, ldb, + dependencies); + auto done = onemkl::mklgpu::trmm(queue, left_right, upper_lower, trans, unit_diag, m, n, alpha, + a, lda, b, ldb, dependencies); + trmm_postcondition(queue, left_right, upper_lower, trans, unit_diag, m, n, alpha, a, lda, b, + ldb, dependencies); + return done; +} + +template <> +cl::sycl::event trmm( + cl::sycl::queue &queue, side left_right, uplo upper_lower, transpose trans, diag unit_diag, + std::int64_t m, std::int64_t n, std::complex alpha, const std::complex *a, + std::int64_t lda, std::complex *b, std::int64_t ldb, + const cl::sycl::vector_class &dependencies) { + trmm_precondition(queue, left_right, upper_lower, trans, unit_diag, m, n, alpha, a, lda, b, ldb, + dependencies); + auto done = onemkl::mklgpu::trmm(queue, left_right, upper_lower, trans, unit_diag, m, n, alpha, + a, lda, b, ldb, dependencies); + trmm_postcondition(queue, left_right, upper_lower, trans, unit_diag, m, n, alpha, a, lda, b, + ldb, dependencies); + return done; +} + +template <> +cl::sycl::event trmm( + cl::sycl::queue &queue, side left_right, uplo upper_lower, transpose trans, diag unit_diag, + std::int64_t m, std::int64_t n, std::complex alpha, const std::complex *a, + std::int64_t lda, std::complex *b, std::int64_t ldb, + const cl::sycl::vector_class &dependencies) { + trmm_precondition(queue, left_right, upper_lower, trans, unit_diag, m, n, alpha, a, lda, b, ldb, + dependencies); + auto done = onemkl::mklgpu::trmm(queue, left_right, upper_lower, trans, unit_diag, m, n, alpha, + a, lda, b, ldb, dependencies); + trmm_postcondition(queue, left_right, upper_lower, trans, unit_diag, m, n, alpha, a, lda, b, + ldb, 
dependencies); + return done; +} + +template <> +cl::sycl::event rotmg( + cl::sycl::queue &queue, float *d1, float *d2, float *x1, float y1, float *param, + const cl::sycl::vector_class &dependencies) { + rotmg_precondition(queue, d1, d2, x1, y1, param, dependencies); + auto done = onemkl::mklgpu::rotmg(queue, d1, d2, x1, y1, param, dependencies); + rotmg_postcondition(queue, d1, d2, x1, y1, param, dependencies); + return done; +} + +template <> +cl::sycl::event rotmg( + cl::sycl::queue &queue, double *d1, double *d2, double *x1, double y1, double *param, + const cl::sycl::vector_class &dependencies) { + rotmg_precondition(queue, d1, d2, x1, y1, param, dependencies); + auto done = onemkl::mklgpu::rotmg(queue, d1, d2, x1, y1, param, dependencies); + rotmg_postcondition(queue, d1, d2, x1, y1, param, dependencies); + return done; +} + +template <> +cl::sycl::event tpsv( + cl::sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag, std::int64_t n, + const float *a, float *x, std::int64_t incx, + const cl::sycl::vector_class &dependencies) { + tpsv_precondition(queue, upper_lower, trans, unit_diag, n, a, x, incx, dependencies); + auto done = + onemkl::mklgpu::tpsv(queue, upper_lower, trans, unit_diag, n, a, x, incx, dependencies); + tpsv_postcondition(queue, upper_lower, trans, unit_diag, n, a, x, incx, dependencies); + return done; +} + +template <> +cl::sycl::event tpsv( + cl::sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag, std::int64_t n, + const double *a, double *x, std::int64_t incx, + const cl::sycl::vector_class &dependencies) { + tpsv_precondition(queue, upper_lower, trans, unit_diag, n, a, x, incx, dependencies); + auto done = + onemkl::mklgpu::tpsv(queue, upper_lower, trans, unit_diag, n, a, x, incx, dependencies); + tpsv_postcondition(queue, upper_lower, trans, unit_diag, n, a, x, incx, dependencies); + return done; +} + +template <> +cl::sycl::event tpsv( + cl::sycl::queue &queue, uplo upper_lower, transpose trans, diag 
unit_diag, std::int64_t n, + const std::complex *a, std::complex *x, std::int64_t incx, + const cl::sycl::vector_class &dependencies) { + tpsv_precondition(queue, upper_lower, trans, unit_diag, n, a, x, incx, dependencies); + auto done = + onemkl::mklgpu::tpsv(queue, upper_lower, trans, unit_diag, n, a, x, incx, dependencies); + tpsv_postcondition(queue, upper_lower, trans, unit_diag, n, a, x, incx, dependencies); + return done; +} + +template <> +cl::sycl::event tpsv( + cl::sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag, std::int64_t n, + const std::complex *a, std::complex *x, std::int64_t incx, + const cl::sycl::vector_class &dependencies) { + tpsv_precondition(queue, upper_lower, trans, unit_diag, n, a, x, incx, dependencies); + auto done = + onemkl::mklgpu::tpsv(queue, upper_lower, trans, unit_diag, n, a, x, incx, dependencies); + tpsv_postcondition(queue, upper_lower, trans, unit_diag, n, a, x, incx, dependencies); + return done; +} + +template <> +cl::sycl::event trsv( + cl::sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag, std::int64_t n, + const float *a, std::int64_t lda, float *x, std::int64_t incx, + const cl::sycl::vector_class &dependencies) { + trsv_precondition(queue, upper_lower, trans, unit_diag, n, a, lda, x, incx, dependencies); + auto done = onemkl::mklgpu::trsv(queue, upper_lower, trans, unit_diag, n, a, lda, x, incx, + dependencies); + trsv_postcondition(queue, upper_lower, trans, unit_diag, n, a, lda, x, incx, dependencies); + return done; +} + +template <> +cl::sycl::event trsv( + cl::sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag, std::int64_t n, + const double *a, std::int64_t lda, double *x, std::int64_t incx, + const cl::sycl::vector_class &dependencies) { + trsv_precondition(queue, upper_lower, trans, unit_diag, n, a, lda, x, incx, dependencies); + auto done = onemkl::mklgpu::trsv(queue, upper_lower, trans, unit_diag, n, a, lda, x, incx, + dependencies); + 
trsv_postcondition(queue, upper_lower, trans, unit_diag, n, a, lda, x, incx, dependencies); + return done; +} + +template <> +cl::sycl::event trsv( + cl::sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag, std::int64_t n, + const std::complex *a, std::int64_t lda, std::complex *x, std::int64_t incx, + const cl::sycl::vector_class &dependencies) { + trsv_precondition(queue, upper_lower, trans, unit_diag, n, a, lda, x, incx, dependencies); + auto done = onemkl::mklgpu::trsv(queue, upper_lower, trans, unit_diag, n, a, lda, x, incx, + dependencies); + trsv_postcondition(queue, upper_lower, trans, unit_diag, n, a, lda, x, incx, dependencies); + return done; +} + +template <> +cl::sycl::event trsv( + cl::sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag, std::int64_t n, + const std::complex *a, std::int64_t lda, std::complex *x, std::int64_t incx, + const cl::sycl::vector_class &dependencies) { + trsv_precondition(queue, upper_lower, trans, unit_diag, n, a, lda, x, incx, dependencies); + auto done = onemkl::mklgpu::trsv(queue, upper_lower, trans, unit_diag, n, a, lda, x, incx, + dependencies); + trsv_postcondition(queue, upper_lower, trans, unit_diag, n, a, lda, x, incx, dependencies); + return done; +} + +template <> +cl::sycl::event copy( + cl::sycl::queue &queue, std::int64_t n, const float *x, std::int64_t incx, float *y, + std::int64_t incy, const cl::sycl::vector_class &dependencies) { + copy_precondition(queue, n, x, incx, y, incy, dependencies); + auto done = onemkl::mklgpu::copy(queue, n, x, incx, y, incy, dependencies); + copy_postcondition(queue, n, x, incx, y, incy, dependencies); + return done; +} + +template <> +cl::sycl::event copy( + cl::sycl::queue &queue, std::int64_t n, const double *x, std::int64_t incx, double *y, + std::int64_t incy, const cl::sycl::vector_class &dependencies) { + copy_precondition(queue, n, x, incx, y, incy, dependencies); + auto done = onemkl::mklgpu::copy(queue, n, x, incx, y, incy, 
dependencies); + copy_postcondition(queue, n, x, incx, y, incy, dependencies); + return done; +} + +template <> +cl::sycl::event copy( + cl::sycl::queue &queue, std::int64_t n, const std::complex *x, std::int64_t incx, + std::complex *y, std::int64_t incy, + const cl::sycl::vector_class &dependencies) { + copy_precondition(queue, n, x, incx, y, incy, dependencies); + auto done = onemkl::mklgpu::copy(queue, n, x, incx, y, incy, dependencies); + copy_postcondition(queue, n, x, incx, y, incy, dependencies); + return done; +} + +template <> +cl::sycl::event copy( + cl::sycl::queue &queue, std::int64_t n, const std::complex *x, std::int64_t incx, + std::complex *y, std::int64_t incy, + const cl::sycl::vector_class &dependencies) { + copy_precondition(queue, n, x, incx, y, incy, dependencies); + auto done = onemkl::mklgpu::copy(queue, n, x, incx, y, incy, dependencies); + copy_postcondition(queue, n, x, incx, y, incy, dependencies); + return done; +} + +template <> +cl::sycl::event hemv( + cl::sycl::queue &queue, uplo upper_lower, std::int64_t n, std::complex alpha, + const std::complex *a, std::int64_t lda, const std::complex *x, std::int64_t incx, + std::complex beta, std::complex *y, std::int64_t incy, + const cl::sycl::vector_class &dependencies) { + hemv_precondition(queue, upper_lower, n, alpha, a, lda, x, incx, beta, y, incy, dependencies); + auto done = onemkl::mklgpu::hemv(queue, upper_lower, n, alpha, a, lda, x, incx, beta, y, incy, + dependencies); + hemv_postcondition(queue, upper_lower, n, alpha, a, lda, x, incx, beta, y, incy, dependencies); + return done; +} + +template <> +cl::sycl::event hemv( + cl::sycl::queue &queue, uplo upper_lower, std::int64_t n, std::complex alpha, + const std::complex *a, std::int64_t lda, const std::complex *x, + std::int64_t incx, std::complex beta, std::complex *y, std::int64_t incy, + const cl::sycl::vector_class &dependencies) { + hemv_precondition(queue, upper_lower, n, alpha, a, lda, x, incx, beta, y, incy, dependencies); 
+ auto done = onemkl::mklgpu::hemv(queue, upper_lower, n, alpha, a, lda, x, incx, beta, y, incy, + dependencies); + hemv_postcondition(queue, upper_lower, n, alpha, a, lda, x, incx, beta, y, incy, dependencies); + return done; +} + +template <> +cl::sycl::event gemmt( + cl::sycl::queue &queue, uplo upper_lower, transpose transa, transpose transb, std::int64_t n, + std::int64_t k, float alpha, const float *a, std::int64_t lda, const float *b, std::int64_t ldb, + float beta, float *c, std::int64_t ldc, + const cl::sycl::vector_class &dependencies) { + gemmt_precondition(queue, upper_lower, transa, transb, n, k, alpha, a, lda, b, ldb, beta, c, + ldc, dependencies); + auto done = onemkl::mklgpu::gemmt(queue, upper_lower, transa, transb, n, k, alpha, a, lda, b, + ldb, beta, c, ldc, dependencies); + gemmt_postcondition(queue, upper_lower, transa, transb, n, k, alpha, a, lda, b, ldb, beta, c, + ldc, dependencies); + return done; +} + +template <> +cl::sycl::event gemmt( + cl::sycl::queue &queue, uplo upper_lower, transpose transa, transpose transb, std::int64_t n, + std::int64_t k, double alpha, const double *a, std::int64_t lda, const double *b, + std::int64_t ldb, double beta, double *c, std::int64_t ldc, + const cl::sycl::vector_class &dependencies) { + gemmt_precondition(queue, upper_lower, transa, transb, n, k, alpha, a, lda, b, ldb, beta, c, + ldc, dependencies); + auto done = onemkl::mklgpu::gemmt(queue, upper_lower, transa, transb, n, k, alpha, a, lda, b, + ldb, beta, c, ldc, dependencies); + gemmt_postcondition(queue, upper_lower, transa, transb, n, k, alpha, a, lda, b, ldb, beta, c, + ldc, dependencies); + return done; +} + +template <> +cl::sycl::event gemmt( + cl::sycl::queue &queue, uplo upper_lower, transpose transa, transpose transb, std::int64_t n, + std::int64_t k, std::complex alpha, const std::complex *a, std::int64_t lda, + const std::complex *b, std::int64_t ldb, std::complex beta, + std::complex *c, std::int64_t ldc, + const cl::sycl::vector_class 
&dependencies) { + gemmt_precondition(queue, upper_lower, transa, transb, n, k, alpha, a, lda, b, ldb, beta, c, + ldc, dependencies); + auto done = onemkl::mklgpu::gemmt(queue, upper_lower, transa, transb, n, k, alpha, a, lda, b, + ldb, beta, c, ldc, dependencies); + gemmt_postcondition(queue, upper_lower, transa, transb, n, k, alpha, a, lda, b, ldb, beta, c, + ldc, dependencies); + return done; +} + +template <> +cl::sycl::event gemmt( + cl::sycl::queue &queue, uplo upper_lower, transpose transa, transpose transb, std::int64_t n, + std::int64_t k, std::complex alpha, const std::complex *a, std::int64_t lda, + const std::complex *b, std::int64_t ldb, std::complex beta, + std::complex *c, std::int64_t ldc, + const cl::sycl::vector_class &dependencies) { + gemmt_precondition(queue, upper_lower, transa, transb, n, k, alpha, a, lda, b, ldb, beta, c, + ldc, dependencies); + auto done = onemkl::mklgpu::gemmt(queue, upper_lower, transa, transb, n, k, alpha, a, lda, b, + ldb, beta, c, ldc, dependencies); + gemmt_postcondition(queue, upper_lower, transa, transb, n, k, alpha, a, lda, b, ldb, beta, c, + ldc, dependencies); + return done; +} + +template <> +cl::sycl::event sbmv( + cl::sycl::queue &queue, uplo upper_lower, std::int64_t n, std::int64_t k, float alpha, + const float *a, std::int64_t lda, const float *x, std::int64_t incx, float beta, float *y, + std::int64_t incy, const cl::sycl::vector_class &dependencies) { + sbmv_precondition(queue, upper_lower, n, k, alpha, a, lda, x, incx, beta, y, incy, + dependencies); + auto done = onemkl::mklgpu::sbmv(queue, upper_lower, n, k, alpha, a, lda, x, incx, beta, y, + incy, dependencies); + sbmv_postcondition(queue, upper_lower, n, k, alpha, a, lda, x, incx, beta, y, incy, + dependencies); + return done; +} + +template <> +cl::sycl::event sbmv( + cl::sycl::queue &queue, uplo upper_lower, std::int64_t n, std::int64_t k, double alpha, + const double *a, std::int64_t lda, const double *x, std::int64_t incx, double beta, double *y, 
+ std::int64_t incy, const cl::sycl::vector_class &dependencies) { + sbmv_precondition(queue, upper_lower, n, k, alpha, a, lda, x, incx, beta, y, incy, + dependencies); + auto done = onemkl::mklgpu::sbmv(queue, upper_lower, n, k, alpha, a, lda, x, incx, beta, y, + incy, dependencies); + sbmv_postcondition(queue, upper_lower, n, k, alpha, a, lda, x, incx, beta, y, incy, + dependencies); + return done; +} + +template <> +cl::sycl::event asum( + cl::sycl::queue &queue, std::int64_t n, const std::complex *x, std::int64_t incx, + float *result, const cl::sycl::vector_class &dependencies) { + asum_precondition(queue, n, x, incx, result, dependencies); + auto done = onemkl::mklgpu::asum(queue, n, x, incx, result, dependencies); + asum_postcondition(queue, n, x, incx, result, dependencies); + return done; +} + +template <> +cl::sycl::event asum( + cl::sycl::queue &queue, std::int64_t n, const std::complex *x, std::int64_t incx, + double *result, const cl::sycl::vector_class &dependencies) { + asum_precondition(queue, n, x, incx, result, dependencies); + auto done = onemkl::mklgpu::asum(queue, n, x, incx, result, dependencies); + asum_postcondition(queue, n, x, incx, result, dependencies); + return done; +} + +template <> +cl::sycl::event asum( + cl::sycl::queue &queue, std::int64_t n, const float *x, std::int64_t incx, float *result, + const cl::sycl::vector_class &dependencies) { + asum_precondition(queue, n, x, incx, result, dependencies); + auto done = onemkl::mklgpu::asum(queue, n, x, incx, result, dependencies); + asum_postcondition(queue, n, x, incx, result, dependencies); + return done; +} + +template <> +cl::sycl::event asum( + cl::sycl::queue &queue, std::int64_t n, const double *x, std::int64_t incx, double *result, + const cl::sycl::vector_class &dependencies) { + asum_precondition(queue, n, x, incx, result, dependencies); + auto done = onemkl::mklgpu::asum(queue, n, x, incx, result, dependencies); + asum_postcondition(queue, n, x, incx, result, dependencies); + 
return done; +} + +template <> +cl::sycl::event tbsv( + cl::sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag, std::int64_t n, + std::int64_t k, const float *a, std::int64_t lda, float *x, std::int64_t incx, + const cl::sycl::vector_class &dependencies) { + tbsv_precondition(queue, upper_lower, trans, unit_diag, n, k, a, lda, x, incx, dependencies); + auto done = onemkl::mklgpu::tbsv(queue, upper_lower, trans, unit_diag, n, k, a, lda, x, incx, + dependencies); + tbsv_postcondition(queue, upper_lower, trans, unit_diag, n, k, a, lda, x, incx, dependencies); + return done; +} + +template <> +cl::sycl::event tbsv( + cl::sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag, std::int64_t n, + std::int64_t k, const double *a, std::int64_t lda, double *x, std::int64_t incx, + const cl::sycl::vector_class &dependencies) { + tbsv_precondition(queue, upper_lower, trans, unit_diag, n, k, a, lda, x, incx, dependencies); + auto done = onemkl::mklgpu::tbsv(queue, upper_lower, trans, unit_diag, n, k, a, lda, x, incx, + dependencies); + tbsv_postcondition(queue, upper_lower, trans, unit_diag, n, k, a, lda, x, incx, dependencies); + return done; +} + +template <> +cl::sycl::event tbsv( + cl::sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag, std::int64_t n, + std::int64_t k, const std::complex *a, std::int64_t lda, std::complex *x, + std::int64_t incx, const cl::sycl::vector_class &dependencies) { + tbsv_precondition(queue, upper_lower, trans, unit_diag, n, k, a, lda, x, incx, dependencies); + auto done = onemkl::mklgpu::tbsv(queue, upper_lower, trans, unit_diag, n, k, a, lda, x, incx, + dependencies); + tbsv_postcondition(queue, upper_lower, trans, unit_diag, n, k, a, lda, x, incx, dependencies); + return done; +} + +template <> +cl::sycl::event tbsv( + cl::sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag, std::int64_t n, + std::int64_t k, const std::complex *a, std::int64_t lda, std::complex *x, + 
std::int64_t incx, const cl::sycl::vector_class &dependencies) { + tbsv_precondition(queue, upper_lower, trans, unit_diag, n, k, a, lda, x, incx, dependencies); + auto done = onemkl::mklgpu::tbsv(queue, upper_lower, trans, unit_diag, n, k, a, lda, x, incx, + dependencies); + tbsv_postcondition(queue, upper_lower, trans, unit_diag, n, k, a, lda, x, incx, dependencies); + return done; +} + +template <> +cl::sycl::event spr2( + cl::sycl::queue &queue, uplo upper_lower, std::int64_t n, float alpha, const float *x, + std::int64_t incx, const float *y, std::int64_t incy, float *a, + const cl::sycl::vector_class &dependencies) { + spr2_precondition(queue, upper_lower, n, alpha, x, incx, y, incy, a, dependencies); + auto done = + onemkl::mklgpu::spr2(queue, upper_lower, n, alpha, x, incx, y, incy, a, dependencies); + spr2_postcondition(queue, upper_lower, n, alpha, x, incx, y, incy, a, dependencies); + return done; +} + +template <> +cl::sycl::event spr2( + cl::sycl::queue &queue, uplo upper_lower, std::int64_t n, double alpha, const double *x, + std::int64_t incx, const double *y, std::int64_t incy, double *a, + const cl::sycl::vector_class &dependencies) { + spr2_precondition(queue, upper_lower, n, alpha, x, incx, y, incy, a, dependencies); + auto done = + onemkl::mklgpu::spr2(queue, upper_lower, n, alpha, x, incx, y, incy, a, dependencies); + spr2_postcondition(queue, upper_lower, n, alpha, x, incx, y, incy, a, dependencies); + return done; +} + +template <> +cl::sycl::event iamax( + cl::sycl::queue &queue, std::int64_t n, const float *x, std::int64_t incx, std::int64_t *result, + const cl::sycl::vector_class &dependencies) { + iamax_precondition(queue, n, x, incx, result, dependencies); + auto done = onemkl::mklgpu::iamax(queue, n, x, incx, result, dependencies); + iamax_postcondition(queue, n, x, incx, result, dependencies); + return done; +} + +template <> +cl::sycl::event iamax( + cl::sycl::queue &queue, std::int64_t n, const double *x, std::int64_t incx, + 
std::int64_t *result, const cl::sycl::vector_class &dependencies) { + iamax_precondition(queue, n, x, incx, result, dependencies); + auto done = onemkl::mklgpu::iamax(queue, n, x, incx, result, dependencies); + iamax_postcondition(queue, n, x, incx, result, dependencies); + return done; +} + +template <> +cl::sycl::event iamax( + cl::sycl::queue &queue, std::int64_t n, const std::complex *x, std::int64_t incx, + std::int64_t *result, const cl::sycl::vector_class &dependencies) { + iamax_precondition(queue, n, x, incx, result, dependencies); + auto done = onemkl::mklgpu::iamax(queue, n, x, incx, result, dependencies); + iamax_postcondition(queue, n, x, incx, result, dependencies); + return done; +} + +template <> +cl::sycl::event iamax( + cl::sycl::queue &queue, std::int64_t n, const std::complex *x, std::int64_t incx, + std::int64_t *result, const cl::sycl::vector_class &dependencies) { + iamax_precondition(queue, n, x, incx, result, dependencies); + auto done = onemkl::mklgpu::iamax(queue, n, x, incx, result, dependencies); + iamax_postcondition(queue, n, x, incx, result, dependencies); + return done; +} + +template <> +cl::sycl::event rotm( + cl::sycl::queue &queue, std::int64_t n, float *x, std::int64_t incx, float *y, + std::int64_t incy, float *param, const cl::sycl::vector_class &dependencies) { + rotm_precondition(queue, n, x, incx, y, incy, param, dependencies); + auto done = onemkl::mklgpu::rotm(queue, n, x, incx, y, incy, param, dependencies); + rotm_postcondition(queue, n, x, incx, y, incy, param, dependencies); + return done; +} + +template <> +cl::sycl::event rotm( + cl::sycl::queue &queue, std::int64_t n, double *x, std::int64_t incx, double *y, + std::int64_t incy, double *param, const cl::sycl::vector_class &dependencies) { + rotm_precondition(queue, n, x, incx, y, incy, param, dependencies); + auto done = onemkl::mklgpu::rotm(queue, n, x, incx, y, incy, param, dependencies); + rotm_postcondition(queue, n, x, incx, y, incy, param, dependencies); + 
return done; +} + +template <> +cl::sycl::event rotg( + cl::sycl::queue &queue, float *a, float *b, float *c, float *s, + const cl::sycl::vector_class &dependencies) { + rotg_precondition(queue, a, b, c, s, dependencies); + auto done = onemkl::mklgpu::rotg(queue, a, b, c, s, dependencies); + rotg_postcondition(queue, a, b, c, s, dependencies); + return done; +} + +template <> +cl::sycl::event rotg( + cl::sycl::queue &queue, double *a, double *b, double *c, double *s, + const cl::sycl::vector_class &dependencies) { + rotg_precondition(queue, a, b, c, s, dependencies); + auto done = onemkl::mklgpu::rotg(queue, a, b, c, s, dependencies); + rotg_postcondition(queue, a, b, c, s, dependencies); + return done; +} + +template <> +cl::sycl::event rotg( + cl::sycl::queue &queue, std::complex *a, std::complex *b, float *c, + std::complex *s, const cl::sycl::vector_class &dependencies) { + rotg_precondition(queue, a, b, c, s, dependencies); + auto done = onemkl::mklgpu::rotg(queue, a, b, c, s, dependencies); + rotg_postcondition(queue, a, b, c, s, dependencies); + return done; +} + +template <> +cl::sycl::event rotg( + cl::sycl::queue &queue, std::complex *a, std::complex *b, double *c, + std::complex *s, const cl::sycl::vector_class &dependencies) { + rotg_precondition(queue, a, b, c, s, dependencies); + auto done = onemkl::mklgpu::rotg(queue, a, b, c, s, dependencies); + rotg_postcondition(queue, a, b, c, s, dependencies); + return done; +} + +template <> +cl::sycl::event sdsdot( + cl::sycl::queue &queue, std::int64_t n, float sb, const float *x, std::int64_t incx, + const float *y, std::int64_t incy, float *result, + const cl::sycl::vector_class &dependencies) { + sdsdot_precondition(queue, n, sb, x, incx, y, incy, result, dependencies); + auto done = onemkl::mklgpu::sdsdot(queue, n, sb, x, incx, y, incy, result, dependencies); + sdsdot_postcondition(queue, n, sb, x, incx, y, incy, result, dependencies); + return done; +} + +template <> +cl::sycl::event her2k( + 
cl::sycl::queue &queue, uplo upper_lower, transpose trans, std::int64_t n, std::int64_t k, + std::complex alpha, const std::complex *a, std::int64_t lda, + const std::complex *b, std::int64_t ldb, float beta, std::complex *c, + std::int64_t ldc, const cl::sycl::vector_class &dependencies) { + her2k_precondition(queue, upper_lower, trans, n, k, alpha, a, lda, b, ldb, beta, c, ldc, + dependencies); + auto done = onemkl::mklgpu::her2k(queue, upper_lower, trans, n, k, alpha, a, lda, b, ldb, beta, + c, ldc, dependencies); + her2k_postcondition(queue, upper_lower, trans, n, k, alpha, a, lda, b, ldb, beta, c, ldc, + dependencies); + return done; +} + +template <> +cl::sycl::event her2k( + cl::sycl::queue &queue, uplo upper_lower, transpose trans, std::int64_t n, std::int64_t k, + std::complex alpha, const std::complex *a, std::int64_t lda, + const std::complex *b, std::int64_t ldb, double beta, std::complex *c, + std::int64_t ldc, const cl::sycl::vector_class &dependencies) { + her2k_precondition(queue, upper_lower, trans, n, k, alpha, a, lda, b, ldb, beta, c, ldc, + dependencies); + auto done = onemkl::mklgpu::her2k(queue, upper_lower, trans, n, k, alpha, a, lda, b, ldb, beta, + c, ldc, dependencies); + her2k_postcondition(queue, upper_lower, trans, n, k, alpha, a, lda, b, ldb, beta, c, ldc, + dependencies); + return done; +} + +template <> +cl::sycl::event dot( + cl::sycl::queue &queue, std::int64_t n, const float *x, std::int64_t incx, const float *y, + std::int64_t incy, float *result, const cl::sycl::vector_class &dependencies) { + dot_precondition(queue, n, x, incx, y, incy, result, dependencies); + auto done = onemkl::mklgpu::dot(queue, n, x, incx, y, incy, result, dependencies); + dot_postcondition(queue, n, x, incx, y, incy, result, dependencies); + return done; +} + +template <> +cl::sycl::event dot( + cl::sycl::queue &queue, std::int64_t n, const double *x, std::int64_t incx, const double *y, + std::int64_t incy, double *result, + const cl::sycl::vector_class 
&dependencies) { + dot_precondition(queue, n, x, incx, y, incy, result, dependencies); + auto done = onemkl::mklgpu::dot(queue, n, x, incx, y, incy, result, dependencies); + dot_postcondition(queue, n, x, incx, y, incy, result, dependencies); + return done; +} + +template <> +cl::sycl::event dot( + cl::sycl::queue &queue, std::int64_t n, const float *x, std::int64_t incx, const float *y, + std::int64_t incy, double *result, + const cl::sycl::vector_class &dependencies) { + dot_precondition(queue, n, x, incx, y, incy, result, dependencies); + auto done = onemkl::mklgpu::dot(queue, n, x, incx, y, incy, result, dependencies); + dot_postcondition(queue, n, x, incx, y, incy, result, dependencies); + return done; +} + +template <> +cl::sycl::event symv( + cl::sycl::queue &queue, uplo upper_lower, std::int64_t n, float alpha, const float *a, + std::int64_t lda, const float *x, std::int64_t incx, float beta, float *y, std::int64_t incy, + const cl::sycl::vector_class &dependencies) { + symv_precondition(queue, upper_lower, n, alpha, a, lda, x, incx, beta, y, incy, dependencies); + auto done = onemkl::mklgpu::symv(queue, upper_lower, n, alpha, a, lda, x, incx, beta, y, incy, + dependencies); + symv_postcondition(queue, upper_lower, n, alpha, a, lda, x, incx, beta, y, incy, dependencies); + return done; +} + +template <> +cl::sycl::event symv( + cl::sycl::queue &queue, uplo upper_lower, std::int64_t n, double alpha, const double *a, + std::int64_t lda, const double *x, std::int64_t incx, double beta, double *y, std::int64_t incy, + const cl::sycl::vector_class &dependencies) { + symv_precondition(queue, upper_lower, n, alpha, a, lda, x, incx, beta, y, incy, dependencies); + auto done = onemkl::mklgpu::symv(queue, upper_lower, n, alpha, a, lda, x, incx, beta, y, incy, + dependencies); + symv_postcondition(queue, upper_lower, n, alpha, a, lda, x, incx, beta, y, incy, dependencies); + return done; +} + } //namespace blas } //namespace onemkl diff --git 
a/include/onemkl/blas/detail/mklgpu/onemkl_blas_mklgpu.hpp b/include/onemkl/blas/detail/mklgpu/onemkl_blas_mklgpu.hpp index 67a05eb99..431b0e2dc 100644 --- a/include/onemkl/blas/detail/mklgpu/onemkl_blas_mklgpu.hpp +++ b/include/onemkl/blas/detail/mklgpu/onemkl_blas_mklgpu.hpp @@ -30,7 +30,7 @@ namespace onemkl { namespace mklgpu { -// Level 3 +// Buffer APIs ONEMKL_EXPORT void gemm(cl::sycl::queue &queue, onemkl::transpose transa, onemkl::transpose transb, std::int64_t m, std::int64_t n, std::int64_t k, float alpha, @@ -217,8 +217,6 @@ ONEMKL_EXPORT void trsm(cl::sycl::queue &queue, onemkl::side left_right, onemkl: cl::sycl::buffer, 1> &a, std::int64_t lda, cl::sycl::buffer, 1> &b, std::int64_t ldb); -// Level 2 - ONEMKL_EXPORT void gemv(cl::sycl::queue &queue, onemkl::transpose trans, std::int64_t m, std::int64_t n, float alpha, cl::sycl::buffer &a, std::int64_t lda, cl::sycl::buffer &x, std::int64_t incx, @@ -558,8 +556,6 @@ ONEMKL_EXPORT void trsv(cl::sycl::queue &queue, onemkl::uplo upper_lower, onemkl cl::sycl::buffer, 1> &a, std::int64_t lda, cl::sycl::buffer, 1> &x, std::int64_t incx); -// Level 1 - ONEMKL_EXPORT void dotc(cl::sycl::queue &queue, std::int64_t n, cl::sycl::buffer, 1> &x, std::int64_t incx, cl::sycl::buffer, 1> &y, std::int64_t incy, @@ -766,48 +762,6 @@ ONEMKL_EXPORT void swap(cl::sycl::queue &queue, std::int64_t n, cl::sycl::buffer, 1> &x, std::int64_t incx, cl::sycl::buffer, 1> &y, std::int64_t incy); -// Batch API - -ONEMKL_EXPORT void gemm_batch( - cl::sycl::queue &queue, cl::sycl::buffer &transa, - cl::sycl::buffer &transb, cl::sycl::buffer &m, - cl::sycl::buffer &n, cl::sycl::buffer &k, - cl::sycl::buffer &alpha, cl::sycl::buffer &a, - cl::sycl::buffer &lda, cl::sycl::buffer &b, - cl::sycl::buffer &ldb, cl::sycl::buffer &beta, - cl::sycl::buffer &c, cl::sycl::buffer &ldc, std::int64_t group_count, - cl::sycl::buffer &group_size); - -ONEMKL_EXPORT void gemm_batch( - cl::sycl::queue &queue, cl::sycl::buffer &transa, - cl::sycl::buffer 
&transb, cl::sycl::buffer &m, - cl::sycl::buffer &n, cl::sycl::buffer &k, - cl::sycl::buffer &alpha, cl::sycl::buffer &a, - cl::sycl::buffer &lda, cl::sycl::buffer &b, - cl::sycl::buffer &ldb, cl::sycl::buffer &beta, - cl::sycl::buffer &c, cl::sycl::buffer &ldc, - std::int64_t group_count, cl::sycl::buffer &group_size); - -ONEMKL_EXPORT void gemm_batch( - cl::sycl::queue &queue, cl::sycl::buffer &transa, - cl::sycl::buffer &transb, cl::sycl::buffer &m, - cl::sycl::buffer &n, cl::sycl::buffer &k, - cl::sycl::buffer, 1> &alpha, cl::sycl::buffer, 1> &a, - cl::sycl::buffer &lda, cl::sycl::buffer, 1> &b, - cl::sycl::buffer &ldb, cl::sycl::buffer, 1> &beta, - cl::sycl::buffer, 1> &c, cl::sycl::buffer &ldc, - std::int64_t group_count, cl::sycl::buffer &group_size); - -ONEMKL_EXPORT void gemm_batch( - cl::sycl::queue &queue, cl::sycl::buffer &transa, - cl::sycl::buffer &transb, cl::sycl::buffer &m, - cl::sycl::buffer &n, cl::sycl::buffer &k, - cl::sycl::buffer, 1> &alpha, cl::sycl::buffer, 1> &a, - cl::sycl::buffer &lda, cl::sycl::buffer, 1> &b, - cl::sycl::buffer &ldb, cl::sycl::buffer, 1> &beta, - cl::sycl::buffer, 1> &c, cl::sycl::buffer &ldc, - std::int64_t group_count, cl::sycl::buffer &group_size); - ONEMKL_EXPORT void gemm_batch(cl::sycl::queue &queue, onemkl::transpose transa, onemkl::transpose transb, std::int64_t m, std::int64_t n, std::int64_t k, float alpha, cl::sycl::buffer &a, @@ -842,44 +796,6 @@ ONEMKL_EXPORT void gemm_batch(cl::sycl::queue &queue, onemkl::transpose transa, cl::sycl::buffer, 1> &c, std::int64_t ldc, std::int64_t stride_c, std::int64_t batch_size); -ONEMKL_EXPORT void trsm_batch(cl::sycl::queue &queue, cl::sycl::buffer &left_right, - cl::sycl::buffer &upper_lower, - cl::sycl::buffer &trans, - cl::sycl::buffer &unit_diag, - cl::sycl::buffer &m, - cl::sycl::buffer &n, - cl::sycl::buffer &alpha, cl::sycl::buffer &a, - cl::sycl::buffer &lda, cl::sycl::buffer &b, - cl::sycl::buffer &ldb, std::int64_t group_count, - cl::sycl::buffer &group_size); 
- -ONEMKL_EXPORT void trsm_batch( - cl::sycl::queue &queue, cl::sycl::buffer &left_right, - cl::sycl::buffer &upper_lower, cl::sycl::buffer &trans, - cl::sycl::buffer &unit_diag, cl::sycl::buffer &m, - cl::sycl::buffer &n, cl::sycl::buffer &alpha, - cl::sycl::buffer &a, cl::sycl::buffer &lda, - cl::sycl::buffer &b, cl::sycl::buffer &ldb, - std::int64_t group_count, cl::sycl::buffer &group_size); - -ONEMKL_EXPORT void trsm_batch( - cl::sycl::queue &queue, cl::sycl::buffer &left_right, - cl::sycl::buffer &upper_lower, cl::sycl::buffer &trans, - cl::sycl::buffer &unit_diag, cl::sycl::buffer &m, - cl::sycl::buffer &n, cl::sycl::buffer, 1> &alpha, - cl::sycl::buffer, 1> &a, cl::sycl::buffer &lda, - cl::sycl::buffer, 1> &b, cl::sycl::buffer &ldb, - std::int64_t group_count, cl::sycl::buffer &group_size); - -ONEMKL_EXPORT void trsm_batch( - cl::sycl::queue &queue, cl::sycl::buffer &left_right, - cl::sycl::buffer &upper_lower, cl::sycl::buffer &trans, - cl::sycl::buffer &unit_diag, cl::sycl::buffer &m, - cl::sycl::buffer &n, cl::sycl::buffer, 1> &alpha, - cl::sycl::buffer, 1> &a, cl::sycl::buffer &lda, - cl::sycl::buffer, 1> &b, cl::sycl::buffer &ldb, - std::int64_t group_count, cl::sycl::buffer &group_size); - ONEMKL_EXPORT void trsm_batch(cl::sycl::queue &queue, onemkl::side left_right, onemkl::uplo upper_lower, onemkl::transpose trans, onemkl::diag unit_diag, std::int64_t m, std::int64_t n, float alpha, @@ -910,8 +826,6 @@ ONEMKL_EXPORT void trsm_batch(cl::sycl::queue &queue, onemkl::side left_right, std::int64_t stride_a, cl::sycl::buffer, 1> &b, std::int64_t ldb, std::int64_t stride_b, std::int64_t batch_size); -// BLAS like - ONEMKL_EXPORT void gemmt(cl::sycl::queue &queue, onemkl::uplo upper_lower, onemkl::transpose transa, onemkl::transpose transb, std::int64_t n, std::int64_t k, float alpha, cl::sycl::buffer &a, std::int64_t lda, @@ -986,6 +900,902 @@ ONEMKL_EXPORT void gemm_ext(cl::sycl::queue &queue, onemkl::transpose transa, std::int64_t lda, cl::sycl::buffer 
&b, std::int64_t ldb, half beta, cl::sycl::buffer &c, std::int64_t ldc); +// USM APIs + +ONEMKL_EXPORT cl::sycl::event gemm( + cl::sycl::queue &queue, onemkl::transpose transa, onemkl::transpose transb, std::int64_t m, + std::int64_t n, std::int64_t k, float alpha, const float *a, std::int64_t lda, const float *b, + std::int64_t ldb, float beta, float *c, std::int64_t ldc, + const cl::sycl::vector_class &dependencies = {}); + +ONEMKL_EXPORT cl::sycl::event gemm( + cl::sycl::queue &queue, onemkl::transpose transa, onemkl::transpose transb, std::int64_t m, + std::int64_t n, std::int64_t k, double alpha, const double *a, std::int64_t lda, + const double *b, std::int64_t ldb, double beta, double *c, std::int64_t ldc, + const cl::sycl::vector_class &dependencies = {}); + +ONEMKL_EXPORT cl::sycl::event gemm( + cl::sycl::queue &queue, onemkl::transpose transa, onemkl::transpose transb, std::int64_t m, + std::int64_t n, std::int64_t k, std::complex alpha, const std::complex *a, + std::int64_t lda, const std::complex *b, std::int64_t ldb, std::complex beta, + std::complex *c, std::int64_t ldc, + const cl::sycl::vector_class &dependencies = {}); + +ONEMKL_EXPORT cl::sycl::event gemm( + cl::sycl::queue &queue, onemkl::transpose transa, onemkl::transpose transb, std::int64_t m, + std::int64_t n, std::int64_t k, std::complex alpha, const std::complex *a, + std::int64_t lda, const std::complex *b, std::int64_t ldb, std::complex beta, + std::complex *c, std::int64_t ldc, + const cl::sycl::vector_class &dependencies = {}); + +ONEMKL_EXPORT cl::sycl::event symm( + cl::sycl::queue &queue, onemkl::side left_right, onemkl::uplo upper_lower, std::int64_t m, + std::int64_t n, float alpha, const float *a, std::int64_t lda, const float *b, std::int64_t ldb, + float beta, float *c, std::int64_t ldc, + const cl::sycl::vector_class &dependencies = {}); + +ONEMKL_EXPORT cl::sycl::event symm( + cl::sycl::queue &queue, onemkl::side left_right, onemkl::uplo upper_lower, std::int64_t m, + 
std::int64_t n, double alpha, const double *a, std::int64_t lda, const double *b, + std::int64_t ldb, double beta, double *c, std::int64_t ldc, + const cl::sycl::vector_class &dependencies = {}); + +ONEMKL_EXPORT cl::sycl::event symm( + cl::sycl::queue &queue, onemkl::side left_right, onemkl::uplo upper_lower, std::int64_t m, + std::int64_t n, std::complex alpha, const std::complex *a, std::int64_t lda, + const std::complex *b, std::int64_t ldb, std::complex beta, + std::complex *c, std::int64_t ldc, + const cl::sycl::vector_class &dependencies = {}); + +ONEMKL_EXPORT cl::sycl::event symm( + cl::sycl::queue &queue, onemkl::side left_right, onemkl::uplo upper_lower, std::int64_t m, + std::int64_t n, std::complex alpha, const std::complex *a, std::int64_t lda, + const std::complex *b, std::int64_t ldb, std::complex beta, + std::complex *c, std::int64_t ldc, + const cl::sycl::vector_class &dependencies = {}); + +ONEMKL_EXPORT cl::sycl::event hemm( + cl::sycl::queue &queue, onemkl::side left_right, onemkl::uplo upper_lower, std::int64_t m, + std::int64_t n, std::complex alpha, const std::complex *a, std::int64_t lda, + const std::complex *b, std::int64_t ldb, std::complex beta, + std::complex *c, std::int64_t ldc, + const cl::sycl::vector_class &dependencies = {}); + +ONEMKL_EXPORT cl::sycl::event hemm( + cl::sycl::queue &queue, onemkl::side left_right, onemkl::uplo upper_lower, std::int64_t m, + std::int64_t n, std::complex alpha, const std::complex *a, std::int64_t lda, + const std::complex *b, std::int64_t ldb, std::complex beta, + std::complex *c, std::int64_t ldc, + const cl::sycl::vector_class &dependencies = {}); + +ONEMKL_EXPORT cl::sycl::event syrk( + cl::sycl::queue &queue, onemkl::uplo upper_lower, onemkl::transpose trans, std::int64_t n, + std::int64_t k, float alpha, const float *a, std::int64_t lda, float beta, float *c, + std::int64_t ldc, const cl::sycl::vector_class &dependencies = {}); + +ONEMKL_EXPORT cl::sycl::event syrk( + cl::sycl::queue &queue, 
onemkl::uplo upper_lower, onemkl::transpose trans, std::int64_t n, + std::int64_t k, double alpha, const double *a, std::int64_t lda, double beta, double *c, + std::int64_t ldc, const cl::sycl::vector_class &dependencies = {}); + +ONEMKL_EXPORT cl::sycl::event syrk( + cl::sycl::queue &queue, onemkl::uplo upper_lower, onemkl::transpose trans, std::int64_t n, + std::int64_t k, std::complex alpha, const std::complex *a, std::int64_t lda, + std::complex beta, std::complex *c, std::int64_t ldc, + const cl::sycl::vector_class &dependencies = {}); + +ONEMKL_EXPORT cl::sycl::event syrk( + cl::sycl::queue &queue, onemkl::uplo upper_lower, onemkl::transpose trans, std::int64_t n, + std::int64_t k, std::complex alpha, const std::complex *a, std::int64_t lda, + std::complex beta, std::complex *c, std::int64_t ldc, + const cl::sycl::vector_class &dependencies = {}); + +ONEMKL_EXPORT cl::sycl::event herk( + cl::sycl::queue &queue, onemkl::uplo upper_lower, onemkl::transpose trans, std::int64_t n, + std::int64_t k, float alpha, const std::complex *a, std::int64_t lda, float beta, + std::complex *c, std::int64_t ldc, + const cl::sycl::vector_class &dependencies = {}); + +ONEMKL_EXPORT cl::sycl::event herk( + cl::sycl::queue &queue, onemkl::uplo upper_lower, onemkl::transpose trans, std::int64_t n, + std::int64_t k, double alpha, const std::complex *a, std::int64_t lda, double beta, + std::complex *c, std::int64_t ldc, + const cl::sycl::vector_class &dependencies = {}); + +ONEMKL_EXPORT cl::sycl::event syr2k( + cl::sycl::queue &queue, onemkl::uplo upper_lower, onemkl::transpose trans, std::int64_t n, + std::int64_t k, float alpha, const float *a, std::int64_t lda, const float *b, std::int64_t ldb, + float beta, float *c, std::int64_t ldc, + const cl::sycl::vector_class &dependencies = {}); + +ONEMKL_EXPORT cl::sycl::event syr2k( + cl::sycl::queue &queue, onemkl::uplo upper_lower, onemkl::transpose trans, std::int64_t n, + std::int64_t k, double alpha, const double *a, std::int64_t 
lda, const double *b, + std::int64_t ldb, double beta, double *c, std::int64_t ldc, + const cl::sycl::vector_class &dependencies = {}); + +ONEMKL_EXPORT cl::sycl::event syr2k( + cl::sycl::queue &queue, onemkl::uplo upper_lower, onemkl::transpose trans, std::int64_t n, + std::int64_t k, std::complex alpha, const std::complex *a, std::int64_t lda, + const std::complex *b, std::int64_t ldb, std::complex beta, + std::complex *c, std::int64_t ldc, + const cl::sycl::vector_class &dependencies = {}); + +ONEMKL_EXPORT cl::sycl::event syr2k( + cl::sycl::queue &queue, onemkl::uplo upper_lower, onemkl::transpose trans, std::int64_t n, + std::int64_t k, std::complex alpha, const std::complex *a, std::int64_t lda, + const std::complex *b, std::int64_t ldb, std::complex beta, + std::complex *c, std::int64_t ldc, + const cl::sycl::vector_class &dependencies = {}); + +ONEMKL_EXPORT cl::sycl::event her2k( + cl::sycl::queue &queue, onemkl::uplo upper_lower, onemkl::transpose trans, std::int64_t n, + std::int64_t k, std::complex alpha, const std::complex *a, std::int64_t lda, + const std::complex *b, std::int64_t ldb, float beta, std::complex *c, + std::int64_t ldc, const cl::sycl::vector_class &dependencies = {}); + +ONEMKL_EXPORT cl::sycl::event her2k( + cl::sycl::queue &queue, onemkl::uplo upper_lower, onemkl::transpose trans, std::int64_t n, + std::int64_t k, std::complex alpha, const std::complex *a, std::int64_t lda, + const std::complex *b, std::int64_t ldb, double beta, std::complex *c, + std::int64_t ldc, const cl::sycl::vector_class &dependencies = {}); + +ONEMKL_EXPORT cl::sycl::event trmm( + cl::sycl::queue &queue, onemkl::side left_right, onemkl::uplo upper_lower, + onemkl::transpose trans, onemkl::diag unit_diag, std::int64_t m, std::int64_t n, float alpha, + const float *a, std::int64_t lda, float *b, std::int64_t ldb, + const cl::sycl::vector_class &dependencies = {}); + +ONEMKL_EXPORT cl::sycl::event trmm( + cl::sycl::queue &queue, onemkl::side left_right, 
onemkl::uplo upper_lower, + onemkl::transpose trans, onemkl::diag unit_diag, std::int64_t m, std::int64_t n, double alpha, + const double *a, std::int64_t lda, double *b, std::int64_t ldb, + const cl::sycl::vector_class &dependencies = {}); + +ONEMKL_EXPORT cl::sycl::event trmm( + cl::sycl::queue &queue, onemkl::side left_right, onemkl::uplo upper_lower, + onemkl::transpose trans, onemkl::diag unit_diag, std::int64_t m, std::int64_t n, + std::complex alpha, const std::complex *a, std::int64_t lda, + std::complex *b, std::int64_t ldb, + const cl::sycl::vector_class &dependencies = {}); + +ONEMKL_EXPORT cl::sycl::event trmm( + cl::sycl::queue &queue, onemkl::side left_right, onemkl::uplo upper_lower, + onemkl::transpose trans, onemkl::diag unit_diag, std::int64_t m, std::int64_t n, + std::complex alpha, const std::complex *a, std::int64_t lda, + std::complex *b, std::int64_t ldb, + const cl::sycl::vector_class &dependencies = {}); + +ONEMKL_EXPORT cl::sycl::event trsm( + cl::sycl::queue &queue, onemkl::side left_right, onemkl::uplo upper_lower, + onemkl::transpose trans, onemkl::diag unit_diag, std::int64_t m, std::int64_t n, float alpha, + const float *a, std::int64_t lda, float *b, std::int64_t ldb, + const cl::sycl::vector_class &dependencies = {}); + +ONEMKL_EXPORT cl::sycl::event trsm( + cl::sycl::queue &queue, onemkl::side left_right, onemkl::uplo upper_lower, + onemkl::transpose trans, onemkl::diag unit_diag, std::int64_t m, std::int64_t n, double alpha, + const double *a, std::int64_t lda, double *b, std::int64_t ldb, + const cl::sycl::vector_class &dependencies = {}); + +ONEMKL_EXPORT cl::sycl::event trsm( + cl::sycl::queue &queue, onemkl::side left_right, onemkl::uplo upper_lower, + onemkl::transpose trans, onemkl::diag unit_diag, std::int64_t m, std::int64_t n, + std::complex alpha, const std::complex *a, std::int64_t lda, + std::complex *b, std::int64_t ldb, + const cl::sycl::vector_class &dependencies = {}); + +ONEMKL_EXPORT cl::sycl::event trsm( + 
cl::sycl::queue &queue, onemkl::side left_right, onemkl::uplo upper_lower, + onemkl::transpose trans, onemkl::diag unit_diag, std::int64_t m, std::int64_t n, + std::complex alpha, const std::complex *a, std::int64_t lda, + std::complex *b, std::int64_t ldb, + const cl::sycl::vector_class &dependencies = {}); + +ONEMKL_EXPORT cl::sycl::event gemv( + cl::sycl::queue &queue, onemkl::transpose trans, std::int64_t m, std::int64_t n, float alpha, + const float *a, std::int64_t lda, const float *x, std::int64_t incx, float beta, float *y, + std::int64_t incy, const cl::sycl::vector_class &dependencies = {}); + +ONEMKL_EXPORT cl::sycl::event gemv( + cl::sycl::queue &queue, onemkl::transpose trans, std::int64_t m, std::int64_t n, double alpha, + const double *a, std::int64_t lda, const double *x, std::int64_t incx, double beta, double *y, + std::int64_t incy, const cl::sycl::vector_class &dependencies = {}); + +ONEMKL_EXPORT cl::sycl::event gemv( + cl::sycl::queue &queue, onemkl::transpose trans, std::int64_t m, std::int64_t n, + std::complex alpha, const std::complex *a, std::int64_t lda, + const std::complex *x, std::int64_t incx, std::complex beta, + std::complex *y, std::int64_t incy, + const cl::sycl::vector_class &dependencies = {}); + +ONEMKL_EXPORT cl::sycl::event gemv( + cl::sycl::queue &queue, onemkl::transpose trans, std::int64_t m, std::int64_t n, + std::complex alpha, const std::complex *a, std::int64_t lda, + const std::complex *x, std::int64_t incx, std::complex beta, + std::complex *y, std::int64_t incy, + const cl::sycl::vector_class &dependencies = {}); + +ONEMKL_EXPORT cl::sycl::event gbmv( + cl::sycl::queue &queue, onemkl::transpose trans, std::int64_t m, std::int64_t n, + std::int64_t kl, std::int64_t ku, float alpha, const float *a, std::int64_t lda, const float *x, + std::int64_t incx, float beta, float *y, std::int64_t incy, + const cl::sycl::vector_class &dependencies = {}); + +ONEMKL_EXPORT cl::sycl::event gbmv( + cl::sycl::queue &queue, 
onemkl::transpose trans, std::int64_t m, std::int64_t n, + std::int64_t kl, std::int64_t ku, double alpha, const double *a, std::int64_t lda, + const double *x, std::int64_t incx, double beta, double *y, std::int64_t incy, + const cl::sycl::vector_class &dependencies = {}); + +ONEMKL_EXPORT cl::sycl::event gbmv( + cl::sycl::queue &queue, onemkl::transpose trans, std::int64_t m, std::int64_t n, + std::int64_t kl, std::int64_t ku, std::complex alpha, const std::complex *a, + std::int64_t lda, const std::complex *x, std::int64_t incx, std::complex beta, + std::complex *y, std::int64_t incy, + const cl::sycl::vector_class &dependencies = {}); + +ONEMKL_EXPORT cl::sycl::event gbmv( + cl::sycl::queue &queue, onemkl::transpose trans, std::int64_t m, std::int64_t n, + std::int64_t kl, std::int64_t ku, std::complex alpha, const std::complex *a, + std::int64_t lda, const std::complex *x, std::int64_t incx, std::complex beta, + std::complex *y, std::int64_t incy, + const cl::sycl::vector_class &dependencies = {}); + +ONEMKL_EXPORT cl::sycl::event ger(cl::sycl::queue &queue, std::int64_t m, std::int64_t n, + float alpha, const float *x, std::int64_t incx, const float *y, + std::int64_t incy, float *a, std::int64_t lda, + const cl::sycl::vector_class &dependencies = {}); + +ONEMKL_EXPORT cl::sycl::event ger(cl::sycl::queue &queue, std::int64_t m, std::int64_t n, + double alpha, const double *x, std::int64_t incx, const double *y, + std::int64_t incy, double *a, std::int64_t lda, + const cl::sycl::vector_class &dependencies = {}); + +ONEMKL_EXPORT cl::sycl::event gerc( + cl::sycl::queue &queue, std::int64_t m, std::int64_t n, std::complex alpha, + const std::complex *x, std::int64_t incx, const std::complex *y, + std::int64_t incy, std::complex *a, std::int64_t lda, + const cl::sycl::vector_class &dependencies = {}); + +ONEMKL_EXPORT cl::sycl::event gerc( + cl::sycl::queue &queue, std::int64_t m, std::int64_t n, std::complex alpha, + const std::complex *x, std::int64_t incx, 
const std::complex *y, + std::int64_t incy, std::complex *a, std::int64_t lda, + const cl::sycl::vector_class &dependencies = {}); + +ONEMKL_EXPORT cl::sycl::event geru( + cl::sycl::queue &queue, std::int64_t m, std::int64_t n, std::complex alpha, + const std::complex *x, std::int64_t incx, const std::complex *y, + std::int64_t incy, std::complex *a, std::int64_t lda, + const cl::sycl::vector_class &dependencies = {}); + +ONEMKL_EXPORT cl::sycl::event geru( + cl::sycl::queue &queue, std::int64_t m, std::int64_t n, std::complex alpha, + const std::complex *x, std::int64_t incx, const std::complex *y, + std::int64_t incy, std::complex *a, std::int64_t lda, + const cl::sycl::vector_class &dependencies = {}); + +ONEMKL_EXPORT cl::sycl::event hbmv( + cl::sycl::queue &queue, onemkl::uplo upper_lower, std::int64_t n, std::int64_t k, + std::complex alpha, const std::complex *a, std::int64_t lda, + const std::complex *x, std::int64_t incx, std::complex beta, + std::complex *y, std::int64_t incy, + const cl::sycl::vector_class &dependencies = {}); + +ONEMKL_EXPORT cl::sycl::event hbmv( + cl::sycl::queue &queue, onemkl::uplo upper_lower, std::int64_t n, std::int64_t k, + std::complex alpha, const std::complex *a, std::int64_t lda, + const std::complex *x, std::int64_t incx, std::complex beta, + std::complex *y, std::int64_t incy, + const cl::sycl::vector_class &dependencies = {}); + +ONEMKL_EXPORT cl::sycl::event hemv( + cl::sycl::queue &queue, onemkl::uplo upper_lower, std::int64_t n, std::complex alpha, + const std::complex *a, std::int64_t lda, const std::complex *x, std::int64_t incx, + std::complex beta, std::complex *y, std::int64_t incy, + const cl::sycl::vector_class &dependencies = {}); + +ONEMKL_EXPORT cl::sycl::event hemv( + cl::sycl::queue &queue, onemkl::uplo upper_lower, std::int64_t n, std::complex alpha, + const std::complex *a, std::int64_t lda, const std::complex *x, + std::int64_t incx, std::complex beta, std::complex *y, std::int64_t incy, + const 
cl::sycl::vector_class &dependencies = {}); + +ONEMKL_EXPORT cl::sycl::event her(cl::sycl::queue &queue, onemkl::uplo upper_lower, std::int64_t n, + float alpha, const std::complex *x, std::int64_t incx, + std::complex *a, std::int64_t lda, + const cl::sycl::vector_class &dependencies = {}); + +ONEMKL_EXPORT cl::sycl::event her(cl::sycl::queue &queue, onemkl::uplo upper_lower, std::int64_t n, + double alpha, const std::complex *x, std::int64_t incx, + std::complex *a, std::int64_t lda, + const cl::sycl::vector_class &dependencies = {}); + +ONEMKL_EXPORT cl::sycl::event her2( + cl::sycl::queue &queue, onemkl::uplo upper_lower, std::int64_t n, std::complex alpha, + const std::complex *x, std::int64_t incx, const std::complex *y, + std::int64_t incy, std::complex *a, std::int64_t lda, + const cl::sycl::vector_class &dependencies = {}); + +ONEMKL_EXPORT cl::sycl::event her2( + cl::sycl::queue &queue, onemkl::uplo upper_lower, std::int64_t n, std::complex alpha, + const std::complex *x, std::int64_t incx, const std::complex *y, + std::int64_t incy, std::complex *a, std::int64_t lda, + const cl::sycl::vector_class &dependencies = {}); + +ONEMKL_EXPORT cl::sycl::event hpmv( + cl::sycl::queue &queue, onemkl::uplo upper_lower, std::int64_t n, std::complex alpha, + const std::complex *a, const std::complex *x, std::int64_t incx, + std::complex beta, std::complex *y, std::int64_t incy, + const cl::sycl::vector_class &dependencies = {}); + +ONEMKL_EXPORT cl::sycl::event hpmv( + cl::sycl::queue &queue, onemkl::uplo upper_lower, std::int64_t n, std::complex alpha, + const std::complex *a, const std::complex *x, std::int64_t incx, + std::complex beta, std::complex *y, std::int64_t incy, + const cl::sycl::vector_class &dependencies = {}); + +ONEMKL_EXPORT cl::sycl::event hpr(cl::sycl::queue &queue, onemkl::uplo upper_lower, std::int64_t n, + float alpha, const std::complex *x, std::int64_t incx, + std::complex *a, + const cl::sycl::vector_class &dependencies = {}); + 
+ONEMKL_EXPORT cl::sycl::event hpr(cl::sycl::queue &queue, onemkl::uplo upper_lower, std::int64_t n, + double alpha, const std::complex *x, std::int64_t incx, + std::complex *a, + const cl::sycl::vector_class &dependencies = {}); + +ONEMKL_EXPORT cl::sycl::event hpr2( + cl::sycl::queue &queue, onemkl::uplo upper_lower, std::int64_t n, std::complex alpha, + const std::complex *x, std::int64_t incx, const std::complex *y, + std::int64_t incy, std::complex *a, + const cl::sycl::vector_class &dependencies = {}); + +ONEMKL_EXPORT cl::sycl::event hpr2( + cl::sycl::queue &queue, onemkl::uplo upper_lower, std::int64_t n, std::complex alpha, + const std::complex *x, std::int64_t incx, const std::complex *y, + std::int64_t incy, std::complex *a, + const cl::sycl::vector_class &dependencies = {}); + +ONEMKL_EXPORT cl::sycl::event sbmv( + cl::sycl::queue &queue, onemkl::uplo upper_lower, std::int64_t n, std::int64_t k, float alpha, + const float *a, std::int64_t lda, const float *x, std::int64_t incx, float beta, float *y, + std::int64_t incy, const cl::sycl::vector_class &dependencies = {}); + +ONEMKL_EXPORT cl::sycl::event sbmv( + cl::sycl::queue &queue, onemkl::uplo upper_lower, std::int64_t n, std::int64_t k, double alpha, + const double *a, std::int64_t lda, const double *x, std::int64_t incx, double beta, double *y, + std::int64_t incy, const cl::sycl::vector_class &dependencies = {}); + +ONEMKL_EXPORT cl::sycl::event symv( + cl::sycl::queue &queue, onemkl::uplo upper_lower, std::int64_t n, float alpha, const float *a, + std::int64_t lda, const float *x, std::int64_t incx, float beta, float *y, std::int64_t incy, + const cl::sycl::vector_class &dependencies = {}); + +ONEMKL_EXPORT cl::sycl::event symv( + cl::sycl::queue &queue, onemkl::uplo upper_lower, std::int64_t n, double alpha, const double *a, + std::int64_t lda, const double *x, std::int64_t incx, double beta, double *y, std::int64_t incy, + const cl::sycl::vector_class &dependencies = {}); + +ONEMKL_EXPORT 
cl::sycl::event syr(cl::sycl::queue &queue, onemkl::uplo upper_lower, std::int64_t n, + float alpha, const float *x, std::int64_t incx, float *a, + std::int64_t lda, + const cl::sycl::vector_class &dependencies = {}); + +ONEMKL_EXPORT cl::sycl::event syr(cl::sycl::queue &queue, onemkl::uplo upper_lower, std::int64_t n, + double alpha, const double *x, std::int64_t incx, double *a, + std::int64_t lda, + const cl::sycl::vector_class &dependencies = {}); + +ONEMKL_EXPORT cl::sycl::event syr2( + cl::sycl::queue &queue, onemkl::uplo upper_lower, std::int64_t n, float alpha, const float *x, + std::int64_t incx, const float *y, std::int64_t incy, float *a, std::int64_t lda, + const cl::sycl::vector_class &dependencies = {}); + +ONEMKL_EXPORT cl::sycl::event syr2( + cl::sycl::queue &queue, onemkl::uplo upper_lower, std::int64_t n, double alpha, const double *x, + std::int64_t incx, const double *y, std::int64_t incy, double *a, std::int64_t lda, + const cl::sycl::vector_class &dependencies = {}); + +ONEMKL_EXPORT cl::sycl::event spmv( + cl::sycl::queue &queue, onemkl::uplo upper_lower, std::int64_t n, float alpha, const float *a, + const float *x, std::int64_t incx, float beta, float *y, std::int64_t incy, + const cl::sycl::vector_class &dependencies = {}); + +ONEMKL_EXPORT cl::sycl::event spmv( + cl::sycl::queue &queue, onemkl::uplo upper_lower, std::int64_t n, double alpha, const double *a, + const double *x, std::int64_t incx, double beta, double *y, std::int64_t incy, + const cl::sycl::vector_class &dependencies = {}); + +ONEMKL_EXPORT cl::sycl::event spr(cl::sycl::queue &queue, onemkl::uplo upper_lower, std::int64_t n, + float alpha, const float *x, std::int64_t incx, float *a, + const cl::sycl::vector_class &dependencies = {}); + +ONEMKL_EXPORT cl::sycl::event spr(cl::sycl::queue &queue, onemkl::uplo upper_lower, std::int64_t n, + double alpha, const double *x, std::int64_t incx, double *a, + const cl::sycl::vector_class &dependencies = {}); + +ONEMKL_EXPORT 
cl::sycl::event spr2( + cl::sycl::queue &queue, onemkl::uplo upper_lower, std::int64_t n, float alpha, const float *x, + std::int64_t incx, const float *y, std::int64_t incy, float *a, + const cl::sycl::vector_class &dependencies = {}); + +ONEMKL_EXPORT cl::sycl::event spr2( + cl::sycl::queue &queue, onemkl::uplo upper_lower, std::int64_t n, double alpha, const double *x, + std::int64_t incx, const double *y, std::int64_t incy, double *a, + const cl::sycl::vector_class &dependencies = {}); + +ONEMKL_EXPORT cl::sycl::event tbmv( + cl::sycl::queue &queue, onemkl::uplo upper_lower, onemkl::transpose trans, + onemkl::diag unit_diag, std::int64_t n, std::int64_t k, const float *a, std::int64_t lda, + float *x, std::int64_t incx, const cl::sycl::vector_class &dependencies = {}); + +ONEMKL_EXPORT cl::sycl::event tbmv( + cl::sycl::queue &queue, onemkl::uplo upper_lower, onemkl::transpose trans, + onemkl::diag unit_diag, std::int64_t n, std::int64_t k, const double *a, std::int64_t lda, + double *x, std::int64_t incx, const cl::sycl::vector_class &dependencies = {}); + +ONEMKL_EXPORT cl::sycl::event tbmv( + cl::sycl::queue &queue, onemkl::uplo upper_lower, onemkl::transpose trans, + onemkl::diag unit_diag, std::int64_t n, std::int64_t k, const std::complex *a, + std::int64_t lda, std::complex *x, std::int64_t incx, + const cl::sycl::vector_class &dependencies = {}); + +ONEMKL_EXPORT cl::sycl::event tbmv( + cl::sycl::queue &queue, onemkl::uplo upper_lower, onemkl::transpose trans, + onemkl::diag unit_diag, std::int64_t n, std::int64_t k, const std::complex *a, + std::int64_t lda, std::complex *x, std::int64_t incx, + const cl::sycl::vector_class &dependencies = {}); + +ONEMKL_EXPORT cl::sycl::event tbsv( + cl::sycl::queue &queue, onemkl::uplo upper_lower, onemkl::transpose trans, + onemkl::diag unit_diag, std::int64_t n, std::int64_t k, const float *a, std::int64_t lda, + float *x, std::int64_t incx, const cl::sycl::vector_class &dependencies = {}); + +ONEMKL_EXPORT 
cl::sycl::event tbsv( + cl::sycl::queue &queue, onemkl::uplo upper_lower, onemkl::transpose trans, + onemkl::diag unit_diag, std::int64_t n, std::int64_t k, const double *a, std::int64_t lda, + double *x, std::int64_t incx, const cl::sycl::vector_class &dependencies = {}); + +ONEMKL_EXPORT cl::sycl::event tbsv( + cl::sycl::queue &queue, onemkl::uplo upper_lower, onemkl::transpose trans, + onemkl::diag unit_diag, std::int64_t n, std::int64_t k, const std::complex *a, + std::int64_t lda, std::complex *x, std::int64_t incx, + const cl::sycl::vector_class &dependencies = {}); + +ONEMKL_EXPORT cl::sycl::event tbsv( + cl::sycl::queue &queue, onemkl::uplo upper_lower, onemkl::transpose trans, + onemkl::diag unit_diag, std::int64_t n, std::int64_t k, const std::complex *a, + std::int64_t lda, std::complex *x, std::int64_t incx, + const cl::sycl::vector_class &dependencies = {}); + +ONEMKL_EXPORT cl::sycl::event tpmv( + cl::sycl::queue &queue, onemkl::uplo upper_lower, onemkl::transpose trans, + onemkl::diag unit_diag, std::int64_t n, const float *a, float *x, std::int64_t incx, + const cl::sycl::vector_class &dependencies = {}); + +ONEMKL_EXPORT cl::sycl::event tpmv( + cl::sycl::queue &queue, onemkl::uplo upper_lower, onemkl::transpose trans, + onemkl::diag unit_diag, std::int64_t n, const double *a, double *x, std::int64_t incx, + const cl::sycl::vector_class &dependencies = {}); + +ONEMKL_EXPORT cl::sycl::event tpmv( + cl::sycl::queue &queue, onemkl::uplo upper_lower, onemkl::transpose trans, + onemkl::diag unit_diag, std::int64_t n, const std::complex *a, std::complex *x, + std::int64_t incx, const cl::sycl::vector_class &dependencies = {}); + +ONEMKL_EXPORT cl::sycl::event tpmv( + cl::sycl::queue &queue, onemkl::uplo upper_lower, onemkl::transpose trans, + onemkl::diag unit_diag, std::int64_t n, const std::complex *a, std::complex *x, + std::int64_t incx, const cl::sycl::vector_class &dependencies = {}); + +ONEMKL_EXPORT cl::sycl::event tpsv( + cl::sycl::queue &queue, 
onemkl::uplo upper_lower, onemkl::transpose trans, + onemkl::diag unit_diag, std::int64_t n, const float *a, float *x, std::int64_t incx, + const cl::sycl::vector_class &dependencies = {}); + +ONEMKL_EXPORT cl::sycl::event tpsv( + cl::sycl::queue &queue, onemkl::uplo upper_lower, onemkl::transpose trans, + onemkl::diag unit_diag, std::int64_t n, const double *a, double *x, std::int64_t incx, + const cl::sycl::vector_class &dependencies = {}); + +ONEMKL_EXPORT cl::sycl::event tpsv( + cl::sycl::queue &queue, onemkl::uplo upper_lower, onemkl::transpose trans, + onemkl::diag unit_diag, std::int64_t n, const std::complex *a, std::complex *x, + std::int64_t incx, const cl::sycl::vector_class &dependencies = {}); + +ONEMKL_EXPORT cl::sycl::event tpsv( + cl::sycl::queue &queue, onemkl::uplo upper_lower, onemkl::transpose trans, + onemkl::diag unit_diag, std::int64_t n, const std::complex *a, std::complex *x, + std::int64_t incx, const cl::sycl::vector_class &dependencies = {}); + +ONEMKL_EXPORT cl::sycl::event trmv( + cl::sycl::queue &queue, onemkl::uplo upper_lower, onemkl::transpose trans, + onemkl::diag unit_diag, std::int64_t n, const float *a, std::int64_t lda, float *x, + std::int64_t incx, const cl::sycl::vector_class &dependencies = {}); + +ONEMKL_EXPORT cl::sycl::event trmv( + cl::sycl::queue &queue, onemkl::uplo upper_lower, onemkl::transpose trans, + onemkl::diag unit_diag, std::int64_t n, const double *a, std::int64_t lda, double *x, + std::int64_t incx, const cl::sycl::vector_class &dependencies = {}); + +ONEMKL_EXPORT cl::sycl::event trmv( + cl::sycl::queue &queue, onemkl::uplo upper_lower, onemkl::transpose trans, + onemkl::diag unit_diag, std::int64_t n, const std::complex *a, std::int64_t lda, + std::complex *x, std::int64_t incx, + const cl::sycl::vector_class &dependencies = {}); + +ONEMKL_EXPORT cl::sycl::event trmv( + cl::sycl::queue &queue, onemkl::uplo upper_lower, onemkl::transpose trans, + onemkl::diag unit_diag, std::int64_t n, const std::complex 
*a, std::int64_t lda, + std::complex *x, std::int64_t incx, + const cl::sycl::vector_class &dependencies = {}); + +ONEMKL_EXPORT cl::sycl::event trsv( + cl::sycl::queue &queue, onemkl::uplo upper_lower, onemkl::transpose trans, + onemkl::diag unit_diag, std::int64_t n, const float *a, std::int64_t lda, float *x, + std::int64_t incx, const cl::sycl::vector_class &dependencies = {}); + +ONEMKL_EXPORT cl::sycl::event trsv( + cl::sycl::queue &queue, onemkl::uplo upper_lower, onemkl::transpose trans, + onemkl::diag unit_diag, std::int64_t n, const double *a, std::int64_t lda, double *x, + std::int64_t incx, const cl::sycl::vector_class &dependencies = {}); + +ONEMKL_EXPORT cl::sycl::event trsv( + cl::sycl::queue &queue, onemkl::uplo upper_lower, onemkl::transpose trans, + onemkl::diag unit_diag, std::int64_t n, const std::complex *a, std::int64_t lda, + std::complex *x, std::int64_t incx, + const cl::sycl::vector_class &dependencies = {}); + +ONEMKL_EXPORT cl::sycl::event trsv( + cl::sycl::queue &queue, onemkl::uplo upper_lower, onemkl::transpose trans, + onemkl::diag unit_diag, std::int64_t n, const std::complex *a, std::int64_t lda, + std::complex *x, std::int64_t incx, + const cl::sycl::vector_class &dependencies = {}); + +ONEMKL_EXPORT cl::sycl::event dotc( + cl::sycl::queue &queue, std::int64_t n, const std::complex *x, std::int64_t incx, + const std::complex *y, std::int64_t incy, std::complex *result, + const cl::sycl::vector_class &dependencies = {}); + +ONEMKL_EXPORT cl::sycl::event dotc( + cl::sycl::queue &queue, std::int64_t n, const std::complex *x, std::int64_t incx, + const std::complex *y, std::int64_t incy, std::complex *result, + const cl::sycl::vector_class &dependencies = {}); + +ONEMKL_EXPORT cl::sycl::event dotu( + cl::sycl::queue &queue, std::int64_t n, const std::complex *x, std::int64_t incx, + const std::complex *y, std::int64_t incy, std::complex *result, + const cl::sycl::vector_class &dependencies = {}); + +ONEMKL_EXPORT cl::sycl::event dotu( 
+ cl::sycl::queue &queue, std::int64_t n, const std::complex *x, std::int64_t incx, + const std::complex *y, std::int64_t incy, std::complex *result, + const cl::sycl::vector_class &dependencies = {}); + +ONEMKL_EXPORT cl::sycl::event iamax( + cl::sycl::queue &queue, std::int64_t n, const float *x, std::int64_t incx, std::int64_t *result, + const cl::sycl::vector_class &dependencies = {}); + +ONEMKL_EXPORT cl::sycl::event iamax( + cl::sycl::queue &queue, std::int64_t n, const double *x, std::int64_t incx, + std::int64_t *result, const cl::sycl::vector_class &dependencies = {}); + +ONEMKL_EXPORT cl::sycl::event iamax( + cl::sycl::queue &queue, std::int64_t n, const std::complex *x, std::int64_t incx, + std::int64_t *result, const cl::sycl::vector_class &dependencies = {}); + +ONEMKL_EXPORT cl::sycl::event iamax( + cl::sycl::queue &queue, std::int64_t n, const std::complex *x, std::int64_t incx, + std::int64_t *result, const cl::sycl::vector_class &dependencies = {}); + +ONEMKL_EXPORT cl::sycl::event iamin( + cl::sycl::queue &queue, std::int64_t n, const float *x, std::int64_t incx, std::int64_t *result, + const cl::sycl::vector_class &dependencies = {}); + +ONEMKL_EXPORT cl::sycl::event iamin( + cl::sycl::queue &queue, std::int64_t n, const double *x, std::int64_t incx, + std::int64_t *result, const cl::sycl::vector_class &dependencies = {}); + +ONEMKL_EXPORT cl::sycl::event iamin( + cl::sycl::queue &queue, std::int64_t n, const std::complex *x, std::int64_t incx, + std::int64_t *result, const cl::sycl::vector_class &dependencies = {}); + +ONEMKL_EXPORT cl::sycl::event iamin( + cl::sycl::queue &queue, std::int64_t n, const std::complex *x, std::int64_t incx, + std::int64_t *result, const cl::sycl::vector_class &dependencies = {}); + +ONEMKL_EXPORT cl::sycl::event asum( + cl::sycl::queue &queue, std::int64_t n, const std::complex *x, std::int64_t incx, + float *result, const cl::sycl::vector_class &dependencies = {}); + +ONEMKL_EXPORT cl::sycl::event asum( + 
cl::sycl::queue &queue, std::int64_t n, const std::complex *x, std::int64_t incx, + double *result, const cl::sycl::vector_class &dependencies = {}); + +ONEMKL_EXPORT cl::sycl::event asum( + cl::sycl::queue &queue, std::int64_t n, const float *x, std::int64_t incx, float *result, + const cl::sycl::vector_class &dependencies = {}); + +ONEMKL_EXPORT cl::sycl::event asum( + cl::sycl::queue &queue, std::int64_t n, const double *x, std::int64_t incx, double *result, + const cl::sycl::vector_class &dependencies = {}); + +ONEMKL_EXPORT cl::sycl::event axpy( + cl::sycl::queue &queue, std::int64_t n, float alpha, const float *x, std::int64_t incx, + float *y, std::int64_t incy, const cl::sycl::vector_class &dependencies = {}); + +ONEMKL_EXPORT cl::sycl::event axpy( + cl::sycl::queue &queue, std::int64_t n, double alpha, const double *x, std::int64_t incx, + double *y, std::int64_t incy, const cl::sycl::vector_class &dependencies = {}); + +ONEMKL_EXPORT cl::sycl::event axpy( + cl::sycl::queue &queue, std::int64_t n, std::complex alpha, const std::complex *x, + std::int64_t incx, std::complex *y, std::int64_t incy, + const cl::sycl::vector_class &dependencies = {}); + +ONEMKL_EXPORT cl::sycl::event axpy( + cl::sycl::queue &queue, std::int64_t n, std::complex alpha, + const std::complex *x, std::int64_t incx, std::complex *y, std::int64_t incy, + const cl::sycl::vector_class &dependencies = {}); + +ONEMKL_EXPORT cl::sycl::event axpy_batch( + cl::sycl::queue &queue, std::int64_t *n, float *alpha, const float **x, std::int64_t *incx, + float **y, std::int64_t *incy, std::int64_t group_count, std::int64_t *group_size, + const cl::sycl::vector_class &dependencies = {}); + +ONEMKL_EXPORT cl::sycl::event axpy_batch( + cl::sycl::queue &queue, std::int64_t *n, double *alpha, const double **x, std::int64_t *incx, + double **y, std::int64_t *incy, std::int64_t group_count, std::int64_t *group_size, + const cl::sycl::vector_class &dependencies = {}); + +ONEMKL_EXPORT cl::sycl::event 
axpy_batch( + cl::sycl::queue &queue, std::int64_t *n, std::complex *alpha, + const std::complex **x, std::int64_t *incx, std::complex **y, std::int64_t *incy, + std::int64_t group_count, std::int64_t *group_size, + const cl::sycl::vector_class &dependencies = {}); + +ONEMKL_EXPORT cl::sycl::event axpy_batch( + cl::sycl::queue &queue, std::int64_t *n, std::complex *alpha, + const std::complex **x, std::int64_t *incx, std::complex **y, + std::int64_t *incy, std::int64_t group_count, std::int64_t *group_size, + const cl::sycl::vector_class &dependencies = {}); + +ONEMKL_EXPORT cl::sycl::event copy( + cl::sycl::queue &queue, std::int64_t n, const float *x, std::int64_t incx, float *y, + std::int64_t incy, const cl::sycl::vector_class &dependencies = {}); + +ONEMKL_EXPORT cl::sycl::event copy( + cl::sycl::queue &queue, std::int64_t n, const double *x, std::int64_t incx, double *y, + std::int64_t incy, const cl::sycl::vector_class &dependencies = {}); + +ONEMKL_EXPORT cl::sycl::event copy( + cl::sycl::queue &queue, std::int64_t n, const std::complex *x, std::int64_t incx, + std::complex *y, std::int64_t incy, + const cl::sycl::vector_class &dependencies = {}); + +ONEMKL_EXPORT cl::sycl::event copy( + cl::sycl::queue &queue, std::int64_t n, const std::complex *x, std::int64_t incx, + std::complex *y, std::int64_t incy, + const cl::sycl::vector_class &dependencies = {}); + +ONEMKL_EXPORT cl::sycl::event dot(cl::sycl::queue &queue, std::int64_t n, const float *x, + std::int64_t incx, const float *y, std::int64_t incy, + float *result, + const cl::sycl::vector_class &dependencies = {}); + +ONEMKL_EXPORT cl::sycl::event dot(cl::sycl::queue &queue, std::int64_t n, const double *x, + std::int64_t incx, const double *y, std::int64_t incy, + double *result, + const cl::sycl::vector_class &dependencies = {}); + +ONEMKL_EXPORT cl::sycl::event sdsdot( + cl::sycl::queue &queue, std::int64_t n, float sb, const float *x, std::int64_t incx, + const float *y, std::int64_t incy, float 
*result, + const cl::sycl::vector_class &dependencies = {}); + +ONEMKL_EXPORT cl::sycl::event dot(cl::sycl::queue &queue, std::int64_t n, const float *x, + std::int64_t incx, const float *y, std::int64_t incy, + double *result, + const cl::sycl::vector_class &dependencies = {}); + +ONEMKL_EXPORT cl::sycl::event nrm2( + cl::sycl::queue &queue, std::int64_t n, const std::complex *x, std::int64_t incx, + float *result, const cl::sycl::vector_class &dependencies = {}); + +ONEMKL_EXPORT cl::sycl::event nrm2( + cl::sycl::queue &queue, std::int64_t n, const std::complex *x, std::int64_t incx, + double *result, const cl::sycl::vector_class &dependencies = {}); + +ONEMKL_EXPORT cl::sycl::event nrm2( + cl::sycl::queue &queue, std::int64_t n, const float *x, std::int64_t incx, float *result, + const cl::sycl::vector_class &dependencies = {}); + +ONEMKL_EXPORT cl::sycl::event nrm2( + cl::sycl::queue &queue, std::int64_t n, const double *x, std::int64_t incx, double *result, + const cl::sycl::vector_class &dependencies = {}); + +ONEMKL_EXPORT cl::sycl::event rot(cl::sycl::queue &queue, std::int64_t n, std::complex *x, + std::int64_t incx, std::complex *y, std::int64_t incy, + float c, float s, + const cl::sycl::vector_class &dependencies = {}); + +ONEMKL_EXPORT cl::sycl::event rot(cl::sycl::queue &queue, std::int64_t n, std::complex *x, + std::int64_t incx, std::complex *y, std::int64_t incy, + double c, double s, + const cl::sycl::vector_class &dependencies = {}); + +ONEMKL_EXPORT cl::sycl::event rot(cl::sycl::queue &queue, std::int64_t n, float *x, + std::int64_t incx, float *y, std::int64_t incy, float c, float s, + const cl::sycl::vector_class &dependencies = {}); + +ONEMKL_EXPORT cl::sycl::event rot(cl::sycl::queue &queue, std::int64_t n, double *x, + std::int64_t incx, double *y, std::int64_t incy, double c, + double s, + const cl::sycl::vector_class &dependencies = {}); + +ONEMKL_EXPORT cl::sycl::event rotg( + cl::sycl::queue &queue, float *a, float *b, float *c, float 
*s, + const cl::sycl::vector_class &dependencies = {}); + +ONEMKL_EXPORT cl::sycl::event rotg( + cl::sycl::queue &queue, double *a, double *b, double *c, double *s, + const cl::sycl::vector_class &dependencies = {}); + +ONEMKL_EXPORT cl::sycl::event rotg( + cl::sycl::queue &queue, std::complex *a, std::complex *b, float *c, + std::complex *s, const cl::sycl::vector_class &dependencies = {}); + +ONEMKL_EXPORT cl::sycl::event rotg( + cl::sycl::queue &queue, std::complex *a, std::complex *b, double *c, + std::complex *s, const cl::sycl::vector_class &dependencies = {}); + +ONEMKL_EXPORT cl::sycl::event rotm( + cl::sycl::queue &queue, std::int64_t n, float *x, std::int64_t incx, float *y, + std::int64_t incy, float *param, + const cl::sycl::vector_class &dependencies = {}); + +ONEMKL_EXPORT cl::sycl::event rotm( + cl::sycl::queue &queue, std::int64_t n, double *x, std::int64_t incx, double *y, + std::int64_t incy, double *param, + const cl::sycl::vector_class &dependencies = {}); + +ONEMKL_EXPORT cl::sycl::event rotmg( + cl::sycl::queue &queue, float *d1, float *d2, float *x1, float y1, float *param, + const cl::sycl::vector_class &dependencies = {}); + +ONEMKL_EXPORT cl::sycl::event rotmg( + cl::sycl::queue &queue, double *d1, double *d2, double *x1, double y1, double *param, + const cl::sycl::vector_class &dependencies = {}); + +ONEMKL_EXPORT cl::sycl::event scal( + cl::sycl::queue &queue, std::int64_t n, float alpha, float *x, std::int64_t incx, + const cl::sycl::vector_class &dependencies = {}); + +ONEMKL_EXPORT cl::sycl::event scal( + cl::sycl::queue &queue, std::int64_t n, double alpha, double *x, std::int64_t incx, + const cl::sycl::vector_class &dependencies = {}); + +ONEMKL_EXPORT cl::sycl::event scal( + cl::sycl::queue &queue, std::int64_t n, std::complex alpha, std::complex *x, + std::int64_t incx, const cl::sycl::vector_class &dependencies = {}); + +ONEMKL_EXPORT cl::sycl::event scal( + cl::sycl::queue &queue, std::int64_t n, std::complex alpha, 
std::complex *x, + std::int64_t incx, const cl::sycl::vector_class &dependencies = {}); + +ONEMKL_EXPORT cl::sycl::event scal( + cl::sycl::queue &queue, std::int64_t n, float alpha, std::complex *x, std::int64_t incx, + const cl::sycl::vector_class &dependencies = {}); + +ONEMKL_EXPORT cl::sycl::event scal( + cl::sycl::queue &queue, std::int64_t n, double alpha, std::complex *x, + std::int64_t incx, const cl::sycl::vector_class &dependencies = {}); + +ONEMKL_EXPORT cl::sycl::event swap( + cl::sycl::queue &queue, std::int64_t n, float *x, std::int64_t incx, float *y, + std::int64_t incy, const cl::sycl::vector_class &dependencies = {}); + +ONEMKL_EXPORT cl::sycl::event swap( + cl::sycl::queue &queue, std::int64_t n, double *x, std::int64_t incx, double *y, + std::int64_t incy, const cl::sycl::vector_class &dependencies = {}); + +ONEMKL_EXPORT cl::sycl::event swap( + cl::sycl::queue &queue, std::int64_t n, std::complex *x, std::int64_t incx, + std::complex *y, std::int64_t incy, + const cl::sycl::vector_class &dependencies = {}); + +ONEMKL_EXPORT cl::sycl::event swap( + cl::sycl::queue &queue, std::int64_t n, std::complex *x, std::int64_t incx, + std::complex *y, std::int64_t incy, + const cl::sycl::vector_class &dependencies = {}); + +ONEMKL_EXPORT cl::sycl::event gemm_batch( + cl::sycl::queue &queue, onemkl::transpose *transa, onemkl::transpose *transb, std::int64_t *m, + std::int64_t *n, std::int64_t *k, float *alpha, const float **a, std::int64_t *lda, + const float **b, std::int64_t *ldb, float *beta, float **c, std::int64_t *ldc, + std::int64_t group_count, std::int64_t *group_size, + const cl::sycl::vector_class &dependencies = {}); + +ONEMKL_EXPORT cl::sycl::event gemm_batch( + cl::sycl::queue &queue, onemkl::transpose *transa, onemkl::transpose *transb, std::int64_t *m, + std::int64_t *n, std::int64_t *k, double *alpha, const double **a, std::int64_t *lda, + const double **b, std::int64_t *ldb, double *beta, double **c, std::int64_t *ldc, + std::int64_t 
group_count, std::int64_t *group_size, + const cl::sycl::vector_class &dependencies = {}); + +ONEMKL_EXPORT cl::sycl::event gemm_batch( + cl::sycl::queue &queue, onemkl::transpose *transa, onemkl::transpose *transb, std::int64_t *m, + std::int64_t *n, std::int64_t *k, std::complex *alpha, const std::complex **a, + std::int64_t *lda, const std::complex **b, std::int64_t *ldb, std::complex *beta, + std::complex **c, std::int64_t *ldc, std::int64_t group_count, std::int64_t *group_size, + const cl::sycl::vector_class &dependencies = {}); + +ONEMKL_EXPORT cl::sycl::event gemm_batch( + cl::sycl::queue &queue, onemkl::transpose *transa, onemkl::transpose *transb, std::int64_t *m, + std::int64_t *n, std::int64_t *k, std::complex *alpha, const std::complex **a, + std::int64_t *lda, const std::complex **b, std::int64_t *ldb, + std::complex *beta, std::complex **c, std::int64_t *ldc, + std::int64_t group_count, std::int64_t *group_size, + const cl::sycl::vector_class &dependencies = {}); + +ONEMKL_EXPORT cl::sycl::event gemm_batch( + cl::sycl::queue &queue, onemkl::transpose transa, onemkl::transpose transb, std::int64_t m, + std::int64_t n, std::int64_t k, float alpha, const float *a, std::int64_t lda, + std::int64_t stride_a, const float *b, std::int64_t ldb, std::int64_t stride_b, float beta, + float *c, std::int64_t ldc, std::int64_t stride_c, std::int64_t batch_size, + const cl::sycl::vector_class &dependencies = {}); + +ONEMKL_EXPORT cl::sycl::event gemm_batch( + cl::sycl::queue &queue, onemkl::transpose transa, onemkl::transpose transb, std::int64_t m, + std::int64_t n, std::int64_t k, double alpha, const double *a, std::int64_t lda, + std::int64_t stride_a, const double *b, std::int64_t ldb, std::int64_t stride_b, double beta, + double *c, std::int64_t ldc, std::int64_t stride_c, std::int64_t batch_size, + const cl::sycl::vector_class &dependencies = {}); + +ONEMKL_EXPORT cl::sycl::event gemm_batch( + cl::sycl::queue &queue, onemkl::transpose transa, 
onemkl::transpose transb, std::int64_t m, + std::int64_t n, std::int64_t k, std::complex alpha, const std::complex *a, + std::int64_t lda, std::int64_t stride_a, const std::complex *b, std::int64_t ldb, + std::int64_t stride_b, std::complex beta, std::complex *c, std::int64_t ldc, + std::int64_t stride_c, std::int64_t batch_size, + const cl::sycl::vector_class &dependencies = {}); + +ONEMKL_EXPORT cl::sycl::event gemm_batch( + cl::sycl::queue &queue, onemkl::transpose transa, onemkl::transpose transb, std::int64_t m, + std::int64_t n, std::int64_t k, std::complex alpha, const std::complex *a, + std::int64_t lda, std::int64_t stride_a, const std::complex *b, std::int64_t ldb, + std::int64_t stride_b, std::complex beta, std::complex *c, std::int64_t ldc, + std::int64_t stride_c, std::int64_t batch_size, + const cl::sycl::vector_class &dependencies = {}); + +ONEMKL_EXPORT cl::sycl::event gemmt( + cl::sycl::queue &queue, onemkl::uplo upper_lower, onemkl::transpose transa, + onemkl::transpose transb, std::int64_t n, std::int64_t k, float alpha, const float *a, + std::int64_t lda, const float *b, std::int64_t ldb, float beta, float *c, std::int64_t ldc, + const cl::sycl::vector_class &dependencies = {}); + +ONEMKL_EXPORT cl::sycl::event gemmt( + cl::sycl::queue &queue, onemkl::uplo upper_lower, onemkl::transpose transa, + onemkl::transpose transb, std::int64_t n, std::int64_t k, double alpha, const double *a, + std::int64_t lda, const double *b, std::int64_t ldb, double beta, double *c, std::int64_t ldc, + const cl::sycl::vector_class &dependencies = {}); + +ONEMKL_EXPORT cl::sycl::event gemmt( + cl::sycl::queue &queue, onemkl::uplo upper_lower, onemkl::transpose transa, + onemkl::transpose transb, std::int64_t n, std::int64_t k, std::complex alpha, + const std::complex *a, std::int64_t lda, const std::complex *b, std::int64_t ldb, + std::complex beta, std::complex *c, std::int64_t ldc, + const cl::sycl::vector_class &dependencies = {}); + +ONEMKL_EXPORT cl::sycl::event 
gemmt( + cl::sycl::queue &queue, onemkl::uplo upper_lower, onemkl::transpose transa, + onemkl::transpose transb, std::int64_t n, std::int64_t k, std::complex alpha, + const std::complex *a, std::int64_t lda, const std::complex *b, + std::int64_t ldb, std::complex beta, std::complex *c, std::int64_t ldc, + const cl::sycl::vector_class &dependencies = {}); + } //namespace mklgpu } //namespace onemkl diff --git a/include/onemkl/blas/predicates.hpp b/include/onemkl/blas/predicates.hpp index d485bd1c6..b16e8153e 100644 --- a/include/onemkl/blas/predicates.hpp +++ b/include/onemkl/blas/predicates.hpp @@ -30,6 +30,8 @@ namespace onemkl { namespace blas { +// Buffer APIs + inline void herk_precondition(cl::sycl::queue &queue, uplo upper_lower, transpose trans, std::int64_t n, std::int64_t k, float alpha, cl::sycl::buffer, 1> &a, std::int64_t lda, @@ -323,118 +325,6 @@ inline void spr_postcondition(cl::sycl::queue &queue, uplo upper_lower, std::int #endif } -inline void gemm_batch_precondition( - cl::sycl::queue &queue, cl::sycl::buffer &transa, - cl::sycl::buffer &transb, cl::sycl::buffer &m, - cl::sycl::buffer &n, cl::sycl::buffer &k, - cl::sycl::buffer &alpha, cl::sycl::buffer &a, - cl::sycl::buffer &lda, cl::sycl::buffer &b, - cl::sycl::buffer &ldb, cl::sycl::buffer &beta, - cl::sycl::buffer &c, cl::sycl::buffer &ldc, std::int64_t group_count, - cl::sycl::buffer &group_size) { -#ifndef ONEMKL_DISABLE_PREDICATES - /* add prechecks to queue here for input args. 
*/ -#endif -} - -inline void gemm_batch_postcondition( - cl::sycl::queue &queue, cl::sycl::buffer &transa, - cl::sycl::buffer &transb, cl::sycl::buffer &m, - cl::sycl::buffer &n, cl::sycl::buffer &k, - cl::sycl::buffer &alpha, cl::sycl::buffer &a, - cl::sycl::buffer &lda, cl::sycl::buffer &b, - cl::sycl::buffer &ldb, cl::sycl::buffer &beta, - cl::sycl::buffer &c, cl::sycl::buffer &ldc, std::int64_t group_count, - cl::sycl::buffer &group_size) { -#ifndef ONEMKL_DISABLE_PREDICATES - /* add postchecks to queue here for input args. */ -#endif -} - -inline void gemm_batch_precondition( - cl::sycl::queue &queue, cl::sycl::buffer &transa, - cl::sycl::buffer &transb, cl::sycl::buffer &m, - cl::sycl::buffer &n, cl::sycl::buffer &k, - cl::sycl::buffer &alpha, cl::sycl::buffer &a, - cl::sycl::buffer &lda, cl::sycl::buffer &b, - cl::sycl::buffer &ldb, cl::sycl::buffer &beta, - cl::sycl::buffer &c, cl::sycl::buffer &ldc, - std::int64_t group_count, cl::sycl::buffer &group_size) { -#ifndef ONEMKL_DISABLE_PREDICATES - /* add prechecks to queue here for input args. */ -#endif -} - -inline void gemm_batch_postcondition( - cl::sycl::queue &queue, cl::sycl::buffer &transa, - cl::sycl::buffer &transb, cl::sycl::buffer &m, - cl::sycl::buffer &n, cl::sycl::buffer &k, - cl::sycl::buffer &alpha, cl::sycl::buffer &a, - cl::sycl::buffer &lda, cl::sycl::buffer &b, - cl::sycl::buffer &ldb, cl::sycl::buffer &beta, - cl::sycl::buffer &c, cl::sycl::buffer &ldc, - std::int64_t group_count, cl::sycl::buffer &group_size) { -#ifndef ONEMKL_DISABLE_PREDICATES - /* add postchecks to queue here for input args. 
*/ -#endif -} - -inline void gemm_batch_precondition( - cl::sycl::queue &queue, cl::sycl::buffer &transa, - cl::sycl::buffer &transb, cl::sycl::buffer &m, - cl::sycl::buffer &n, cl::sycl::buffer &k, - cl::sycl::buffer, 1> &alpha, cl::sycl::buffer, 1> &a, - cl::sycl::buffer &lda, cl::sycl::buffer, 1> &b, - cl::sycl::buffer &ldb, cl::sycl::buffer, 1> &beta, - cl::sycl::buffer, 1> &c, cl::sycl::buffer &ldc, - std::int64_t group_count, cl::sycl::buffer &group_size) { -#ifndef ONEMKL_DISABLE_PREDICATES - /* add prechecks to queue here for input args. */ -#endif -} - -inline void gemm_batch_postcondition( - cl::sycl::queue &queue, cl::sycl::buffer &transa, - cl::sycl::buffer &transb, cl::sycl::buffer &m, - cl::sycl::buffer &n, cl::sycl::buffer &k, - cl::sycl::buffer, 1> &alpha, cl::sycl::buffer, 1> &a, - cl::sycl::buffer &lda, cl::sycl::buffer, 1> &b, - cl::sycl::buffer &ldb, cl::sycl::buffer, 1> &beta, - cl::sycl::buffer, 1> &c, cl::sycl::buffer &ldc, - std::int64_t group_count, cl::sycl::buffer &group_size) { -#ifndef ONEMKL_DISABLE_PREDICATES - /* add postchecks to queue here for input args. */ -#endif -} - -inline void gemm_batch_precondition( - cl::sycl::queue &queue, cl::sycl::buffer &transa, - cl::sycl::buffer &transb, cl::sycl::buffer &m, - cl::sycl::buffer &n, cl::sycl::buffer &k, - cl::sycl::buffer, 1> &alpha, cl::sycl::buffer, 1> &a, - cl::sycl::buffer &lda, cl::sycl::buffer, 1> &b, - cl::sycl::buffer &ldb, cl::sycl::buffer, 1> &beta, - cl::sycl::buffer, 1> &c, cl::sycl::buffer &ldc, - std::int64_t group_count, cl::sycl::buffer &group_size) { -#ifndef ONEMKL_DISABLE_PREDICATES - /* add prechecks to queue here for input args. 
*/ -#endif -} - -inline void gemm_batch_postcondition( - cl::sycl::queue &queue, cl::sycl::buffer &transa, - cl::sycl::buffer &transb, cl::sycl::buffer &m, - cl::sycl::buffer &n, cl::sycl::buffer &k, - cl::sycl::buffer, 1> &alpha, cl::sycl::buffer, 1> &a, - cl::sycl::buffer &lda, cl::sycl::buffer, 1> &b, - cl::sycl::buffer &ldb, cl::sycl::buffer, 1> &beta, - cl::sycl::buffer, 1> &c, cl::sycl::buffer &ldc, - std::int64_t group_count, cl::sycl::buffer &group_size) { -#ifndef ONEMKL_DISABLE_PREDICATES - /* add postchecks to queue here for input args. */ -#endif -} - inline void gemm_batch_precondition(cl::sycl::queue &queue, transpose transa, transpose transb, std::int64_t m, std::int64_t n, std::int64_t k, float alpha, cl::sycl::buffer &a, std::int64_t lda, @@ -2996,110 +2886,6 @@ inline void spr2_postcondition(cl::sycl::queue &queue, uplo upper_lower, std::in #endif } -inline void trsm_batch_precondition( - cl::sycl::queue &queue, cl::sycl::buffer &left_right, - cl::sycl::buffer &upper_lower, cl::sycl::buffer &trans, - cl::sycl::buffer &unit_diag, cl::sycl::buffer &m, - cl::sycl::buffer &n, cl::sycl::buffer &alpha, - cl::sycl::buffer &a, cl::sycl::buffer &lda, - cl::sycl::buffer &b, cl::sycl::buffer &ldb, std::int64_t group_count, - cl::sycl::buffer &group_size) { -#ifndef ONEMKL_DISABLE_PREDICATES - /* add prechecks to queue here for input args. */ -#endif -} - -inline void trsm_batch_postcondition( - cl::sycl::queue &queue, cl::sycl::buffer &left_right, - cl::sycl::buffer &upper_lower, cl::sycl::buffer &trans, - cl::sycl::buffer &unit_diag, cl::sycl::buffer &m, - cl::sycl::buffer &n, cl::sycl::buffer &alpha, - cl::sycl::buffer &a, cl::sycl::buffer &lda, - cl::sycl::buffer &b, cl::sycl::buffer &ldb, std::int64_t group_count, - cl::sycl::buffer &group_size) { -#ifndef ONEMKL_DISABLE_PREDICATES - /* add postchecks to queue here for input args. 
*/ -#endif -} - -inline void trsm_batch_precondition( - cl::sycl::queue &queue, cl::sycl::buffer &left_right, - cl::sycl::buffer &upper_lower, cl::sycl::buffer &trans, - cl::sycl::buffer &unit_diag, cl::sycl::buffer &m, - cl::sycl::buffer &n, cl::sycl::buffer &alpha, - cl::sycl::buffer &a, cl::sycl::buffer &lda, - cl::sycl::buffer &b, cl::sycl::buffer &ldb, - std::int64_t group_count, cl::sycl::buffer &group_size) { -#ifndef ONEMKL_DISABLE_PREDICATES - /* add prechecks to queue here for input args. */ -#endif -} - -inline void trsm_batch_postcondition( - cl::sycl::queue &queue, cl::sycl::buffer &left_right, - cl::sycl::buffer &upper_lower, cl::sycl::buffer &trans, - cl::sycl::buffer &unit_diag, cl::sycl::buffer &m, - cl::sycl::buffer &n, cl::sycl::buffer &alpha, - cl::sycl::buffer &a, cl::sycl::buffer &lda, - cl::sycl::buffer &b, cl::sycl::buffer &ldb, - std::int64_t group_count, cl::sycl::buffer &group_size) { -#ifndef ONEMKL_DISABLE_PREDICATES - /* add postchecks to queue here for input args. */ -#endif -} - -inline void trsm_batch_precondition( - cl::sycl::queue &queue, cl::sycl::buffer &left_right, - cl::sycl::buffer &upper_lower, cl::sycl::buffer &trans, - cl::sycl::buffer &unit_diag, cl::sycl::buffer &m, - cl::sycl::buffer &n, cl::sycl::buffer, 1> &alpha, - cl::sycl::buffer, 1> &a, cl::sycl::buffer &lda, - cl::sycl::buffer, 1> &b, cl::sycl::buffer &ldb, - std::int64_t group_count, cl::sycl::buffer &group_size) { -#ifndef ONEMKL_DISABLE_PREDICATES - /* add prechecks to queue here for input args. 
*/ -#endif -} - -inline void trsm_batch_postcondition( - cl::sycl::queue &queue, cl::sycl::buffer &left_right, - cl::sycl::buffer &upper_lower, cl::sycl::buffer &trans, - cl::sycl::buffer &unit_diag, cl::sycl::buffer &m, - cl::sycl::buffer &n, cl::sycl::buffer, 1> &alpha, - cl::sycl::buffer, 1> &a, cl::sycl::buffer &lda, - cl::sycl::buffer, 1> &b, cl::sycl::buffer &ldb, - std::int64_t group_count, cl::sycl::buffer &group_size) { -#ifndef ONEMKL_DISABLE_PREDICATES - /* add postchecks to queue here for input args. */ -#endif -} - -inline void trsm_batch_precondition( - cl::sycl::queue &queue, cl::sycl::buffer &left_right, - cl::sycl::buffer &upper_lower, cl::sycl::buffer &trans, - cl::sycl::buffer &unit_diag, cl::sycl::buffer &m, - cl::sycl::buffer &n, cl::sycl::buffer, 1> &alpha, - cl::sycl::buffer, 1> &a, cl::sycl::buffer &lda, - cl::sycl::buffer, 1> &b, cl::sycl::buffer &ldb, - std::int64_t group_count, cl::sycl::buffer &group_size) { -#ifndef ONEMKL_DISABLE_PREDICATES - /* add prechecks to queue here for input args. */ -#endif -} - -inline void trsm_batch_postcondition( - cl::sycl::queue &queue, cl::sycl::buffer &left_right, - cl::sycl::buffer &upper_lower, cl::sycl::buffer &trans, - cl::sycl::buffer &unit_diag, cl::sycl::buffer &m, - cl::sycl::buffer &n, cl::sycl::buffer, 1> &alpha, - cl::sycl::buffer, 1> &a, cl::sycl::buffer &lda, - cl::sycl::buffer, 1> &b, cl::sycl::buffer &ldb, - std::int64_t group_count, cl::sycl::buffer &group_size) { -#ifndef ONEMKL_DISABLE_PREDICATES - /* add postchecks to queue here for input args. 
*/ -#endif -} - inline void trsm_batch_precondition(cl::sycl::queue &queue, side left_right, uplo upper_lower, transpose trans, diag unit_diag, std::int64_t m, std::int64_t n, float alpha, cl::sycl::buffer &a, std::int64_t lda, @@ -3406,6 +3192,3143 @@ inline void rotg_postcondition(cl::sycl::queue &queue, cl::sycl::buffer *a, std::int64_t lda, float beta, + std::complex *c, std::int64_t ldc, + const cl::sycl::vector_class &dependencies) { +#ifndef ONEMKL_DISABLE_PREDICATES + /* add prechecks to queue here for input args. */ +#endif +} + +inline void herk_postcondition(cl::sycl::queue &queue, uplo upper_lower, transpose trans, + std::int64_t n, std::int64_t k, float alpha, + const std::complex *a, std::int64_t lda, float beta, + std::complex *c, std::int64_t ldc, + const cl::sycl::vector_class &dependencies) { +#ifndef ONEMKL_DISABLE_PREDICATES + /* add postchecks to queue here for input args. */ +#endif +} + +inline void herk_precondition(cl::sycl::queue &queue, uplo upper_lower, transpose trans, + std::int64_t n, std::int64_t k, double alpha, + const std::complex *a, std::int64_t lda, double beta, + std::complex *c, std::int64_t ldc, + const cl::sycl::vector_class &dependencies) { +#ifndef ONEMKL_DISABLE_PREDICATES + /* add prechecks to queue here for input args. */ +#endif +} + +inline void herk_postcondition(cl::sycl::queue &queue, uplo upper_lower, transpose trans, + std::int64_t n, std::int64_t k, double alpha, + const std::complex *a, std::int64_t lda, double beta, + std::complex *c, std::int64_t ldc, + const cl::sycl::vector_class &dependencies) { +#ifndef ONEMKL_DISABLE_PREDICATES + /* add postchecks to queue here for input args. */ +#endif +} + +inline void scal_precondition(cl::sycl::queue &queue, std::int64_t n, float alpha, float *x, + std::int64_t incx, + const cl::sycl::vector_class &dependencies) { +#ifndef ONEMKL_DISABLE_PREDICATES + /* add prechecks to queue here for input args. 
*/ +#endif +} + +inline void scal_postcondition(cl::sycl::queue &queue, std::int64_t n, float alpha, float *x, + std::int64_t incx, + const cl::sycl::vector_class &dependencies) { +#ifndef ONEMKL_DISABLE_PREDICATES + /* add postchecks to queue here for input args. */ +#endif +} + +inline void scal_precondition(cl::sycl::queue &queue, std::int64_t n, double alpha, double *x, + std::int64_t incx, + const cl::sycl::vector_class &dependencies) { +#ifndef ONEMKL_DISABLE_PREDICATES + /* add prechecks to queue here for input args. */ +#endif +} + +inline void scal_postcondition(cl::sycl::queue &queue, std::int64_t n, double alpha, double *x, + std::int64_t incx, + const cl::sycl::vector_class &dependencies) { +#ifndef ONEMKL_DISABLE_PREDICATES + /* add postchecks to queue here for input args. */ +#endif +} + +inline void scal_precondition(cl::sycl::queue &queue, std::int64_t n, std::complex alpha, + std::complex *x, std::int64_t incx, + const cl::sycl::vector_class &dependencies) { +#ifndef ONEMKL_DISABLE_PREDICATES + /* add prechecks to queue here for input args. */ +#endif +} + +inline void scal_postcondition(cl::sycl::queue &queue, std::int64_t n, std::complex alpha, + std::complex *x, std::int64_t incx, + const cl::sycl::vector_class &dependencies) { +#ifndef ONEMKL_DISABLE_PREDICATES + /* add postchecks to queue here for input args. */ +#endif +} + +inline void scal_precondition(cl::sycl::queue &queue, std::int64_t n, std::complex alpha, + std::complex *x, std::int64_t incx, + const cl::sycl::vector_class &dependencies) { +#ifndef ONEMKL_DISABLE_PREDICATES + /* add prechecks to queue here for input args. */ +#endif +} + +inline void scal_postcondition(cl::sycl::queue &queue, std::int64_t n, std::complex alpha, + std::complex *x, std::int64_t incx, + const cl::sycl::vector_class &dependencies) { +#ifndef ONEMKL_DISABLE_PREDICATES + /* add postchecks to queue here for input args. 
*/ +#endif +} + +inline void scal_precondition(cl::sycl::queue &queue, std::int64_t n, float alpha, + std::complex *x, std::int64_t incx, + const cl::sycl::vector_class &dependencies) { +#ifndef ONEMKL_DISABLE_PREDICATES + /* add prechecks to queue here for input args. */ +#endif +} + +inline void scal_postcondition(cl::sycl::queue &queue, std::int64_t n, float alpha, + std::complex *x, std::int64_t incx, + const cl::sycl::vector_class &dependencies) { +#ifndef ONEMKL_DISABLE_PREDICATES + /* add postchecks to queue here for input args. */ +#endif +} + +inline void scal_precondition(cl::sycl::queue &queue, std::int64_t n, double alpha, + std::complex *x, std::int64_t incx, + const cl::sycl::vector_class &dependencies) { +#ifndef ONEMKL_DISABLE_PREDICATES + /* add prechecks to queue here for input args. */ +#endif +} + +inline void scal_postcondition(cl::sycl::queue &queue, std::int64_t n, double alpha, + std::complex *x, std::int64_t incx, + const cl::sycl::vector_class &dependencies) { +#ifndef ONEMKL_DISABLE_PREDICATES + /* add postchecks to queue here for input args. */ +#endif +} + +inline void trmv_precondition(cl::sycl::queue &queue, uplo upper_lower, transpose trans, + diag unit_diag, std::int64_t n, const float *a, std::int64_t lda, + float *x, std::int64_t incx, + const cl::sycl::vector_class &dependencies) { +#ifndef ONEMKL_DISABLE_PREDICATES + /* add prechecks to queue here for input args. */ +#endif +} + +inline void trmv_postcondition(cl::sycl::queue &queue, uplo upper_lower, transpose trans, + diag unit_diag, std::int64_t n, const float *a, std::int64_t lda, + float *x, std::int64_t incx, + const cl::sycl::vector_class &dependencies) { +#ifndef ONEMKL_DISABLE_PREDICATES + /* add postchecks to queue here for input args. 
*/ +#endif +} + +inline void trmv_precondition(cl::sycl::queue &queue, uplo upper_lower, transpose trans, + diag unit_diag, std::int64_t n, const double *a, std::int64_t lda, + double *x, std::int64_t incx, + const cl::sycl::vector_class &dependencies) { +#ifndef ONEMKL_DISABLE_PREDICATES + /* add prechecks to queue here for input args. */ +#endif +} + +inline void trmv_postcondition(cl::sycl::queue &queue, uplo upper_lower, transpose trans, + diag unit_diag, std::int64_t n, const double *a, std::int64_t lda, + double *x, std::int64_t incx, + const cl::sycl::vector_class &dependencies) { +#ifndef ONEMKL_DISABLE_PREDICATES + /* add postchecks to queue here for input args. */ +#endif +} + +inline void trmv_precondition(cl::sycl::queue &queue, uplo upper_lower, transpose trans, + diag unit_diag, std::int64_t n, const std::complex *a, + std::int64_t lda, std::complex *x, std::int64_t incx, + const cl::sycl::vector_class &dependencies) { +#ifndef ONEMKL_DISABLE_PREDICATES + /* add prechecks to queue here for input args. */ +#endif +} + +inline void trmv_postcondition(cl::sycl::queue &queue, uplo upper_lower, transpose trans, + diag unit_diag, std::int64_t n, const std::complex *a, + std::int64_t lda, std::complex *x, std::int64_t incx, + const cl::sycl::vector_class &dependencies) { +#ifndef ONEMKL_DISABLE_PREDICATES + /* add postchecks to queue here for input args. */ +#endif +} + +inline void trmv_precondition(cl::sycl::queue &queue, uplo upper_lower, transpose trans, + diag unit_diag, std::int64_t n, const std::complex *a, + std::int64_t lda, std::complex *x, std::int64_t incx, + const cl::sycl::vector_class &dependencies) { +#ifndef ONEMKL_DISABLE_PREDICATES + /* add prechecks to queue here for input args. 
*/ +#endif +} + +inline void trmv_postcondition(cl::sycl::queue &queue, uplo upper_lower, transpose trans, + diag unit_diag, std::int64_t n, const std::complex *a, + std::int64_t lda, std::complex *x, std::int64_t incx, + const cl::sycl::vector_class &dependencies) { +#ifndef ONEMKL_DISABLE_PREDICATES + /* add postchecks to queue here for input args. */ +#endif +} + +inline void tpmv_precondition(cl::sycl::queue &queue, uplo upper_lower, transpose trans, + diag unit_diag, std::int64_t n, const float *a, float *x, + std::int64_t incx, + const cl::sycl::vector_class &dependencies) { +#ifndef ONEMKL_DISABLE_PREDICATES + /* add prechecks to queue here for input args. */ +#endif +} + +inline void tpmv_postcondition(cl::sycl::queue &queue, uplo upper_lower, transpose trans, + diag unit_diag, std::int64_t n, const float *a, float *x, + std::int64_t incx, + const cl::sycl::vector_class &dependencies) { +#ifndef ONEMKL_DISABLE_PREDICATES + /* add postchecks to queue here for input args. */ +#endif +} + +inline void tpmv_precondition(cl::sycl::queue &queue, uplo upper_lower, transpose trans, + diag unit_diag, std::int64_t n, const double *a, double *x, + std::int64_t incx, + const cl::sycl::vector_class &dependencies) { +#ifndef ONEMKL_DISABLE_PREDICATES + /* add prechecks to queue here for input args. */ +#endif +} + +inline void tpmv_postcondition(cl::sycl::queue &queue, uplo upper_lower, transpose trans, + diag unit_diag, std::int64_t n, const double *a, double *x, + std::int64_t incx, + const cl::sycl::vector_class &dependencies) { +#ifndef ONEMKL_DISABLE_PREDICATES + /* add postchecks to queue here for input args. */ +#endif +} + +inline void tpmv_precondition(cl::sycl::queue &queue, uplo upper_lower, transpose trans, + diag unit_diag, std::int64_t n, const std::complex *a, + std::complex *x, std::int64_t incx, + const cl::sycl::vector_class &dependencies) { +#ifndef ONEMKL_DISABLE_PREDICATES + /* add prechecks to queue here for input args. 
*/ +#endif +} + +inline void tpmv_postcondition(cl::sycl::queue &queue, uplo upper_lower, transpose trans, + diag unit_diag, std::int64_t n, const std::complex<float> *a, + std::complex<float> *x, std::int64_t incx, + const cl::sycl::vector_class<cl::sycl::event> &dependencies) { +#ifndef ONEMKL_DISABLE_PREDICATES + /* add postchecks to queue here for input args. */ +#endif +} + +inline void tpmv_precondition(cl::sycl::queue &queue, uplo upper_lower, transpose trans, + diag unit_diag, std::int64_t n, const std::complex<double> *a, + std::complex<double> *x, std::int64_t incx, + const cl::sycl::vector_class<cl::sycl::event> &dependencies) { +#ifndef ONEMKL_DISABLE_PREDICATES + /* add prechecks to queue here for input args. */ +#endif +} + +inline void tpmv_postcondition(cl::sycl::queue &queue, uplo upper_lower, transpose trans, + diag unit_diag, std::int64_t n, const std::complex<double> *a, + std::complex<double> *x, std::int64_t incx, + const cl::sycl::vector_class<cl::sycl::event> &dependencies) { +#ifndef ONEMKL_DISABLE_PREDICATES + /* add postchecks to queue here for input args. */ +#endif +} + +inline void spr_precondition(cl::sycl::queue &queue, uplo upper_lower, std::int64_t n, float alpha, + const float *x, std::int64_t incx, float *a, + const cl::sycl::vector_class<cl::sycl::event> &dependencies) { +#ifndef ONEMKL_DISABLE_PREDICATES + /* add prechecks to queue here for input args. */ +#endif +} + +inline void spr_postcondition(cl::sycl::queue &queue, uplo upper_lower, std::int64_t n, float alpha, + const float *x, std::int64_t incx, float *a, + const cl::sycl::vector_class<cl::sycl::event> &dependencies) { +#ifndef ONEMKL_DISABLE_PREDICATES + /* add postchecks to queue here for input args. */ +#endif +} + +inline void spr_precondition(cl::sycl::queue &queue, uplo upper_lower, std::int64_t n, double alpha, + const double *x, std::int64_t incx, double *a, + const cl::sycl::vector_class<cl::sycl::event> &dependencies) { +#ifndef ONEMKL_DISABLE_PREDICATES + /* add prechecks to queue here for input args. 
*/ +#endif +} + +inline void spr_postcondition(cl::sycl::queue &queue, uplo upper_lower, std::int64_t n, + double alpha, const double *x, std::int64_t incx, double *a, + const cl::sycl::vector_class<cl::sycl::event> &dependencies) { +#ifndef ONEMKL_DISABLE_PREDICATES + /* add postchecks to queue here for input args. */ +#endif +} + +inline void gemm_batch_precondition(cl::sycl::queue &queue, transpose *transa, transpose *transb, + std::int64_t *m, std::int64_t *n, std::int64_t *k, float *alpha, + const float **a, std::int64_t *lda, const float **b, + std::int64_t *ldb, float *beta, float **c, std::int64_t *ldc, + std::int64_t group_count, std::int64_t *group_size, + const cl::sycl::vector_class<cl::sycl::event> &dependencies) { +#ifndef ONEMKL_DISABLE_PREDICATES + /* add prechecks to queue here for input args. */ +#endif +} + +inline void gemm_batch_postcondition(cl::sycl::queue &queue, transpose *transa, transpose *transb, + std::int64_t *m, std::int64_t *n, std::int64_t *k, + float *alpha, const float **a, std::int64_t *lda, + const float **b, std::int64_t *ldb, float *beta, float **c, + std::int64_t *ldc, std::int64_t group_count, + std::int64_t *group_size, + const cl::sycl::vector_class<cl::sycl::event> &dependencies) { +#ifndef ONEMKL_DISABLE_PREDICATES + /* add postchecks to queue here for input args. */ +#endif +} + +inline void gemm_batch_precondition(cl::sycl::queue &queue, transpose *transa, transpose *transb, + std::int64_t *m, std::int64_t *n, std::int64_t *k, + double *alpha, const double **a, std::int64_t *lda, + const double **b, std::int64_t *ldb, double *beta, double **c, + std::int64_t *ldc, std::int64_t group_count, + std::int64_t *group_size, + const cl::sycl::vector_class<cl::sycl::event> &dependencies) { +#ifndef ONEMKL_DISABLE_PREDICATES + /* add prechecks to queue here for input args. 
*/ +#endif +} + +inline void gemm_batch_postcondition(cl::sycl::queue &queue, transpose *transa, transpose *transb, + std::int64_t *m, std::int64_t *n, std::int64_t *k, + double *alpha, const double **a, std::int64_t *lda, + const double **b, std::int64_t *ldb, double *beta, double **c, + std::int64_t *ldc, std::int64_t group_count, + std::int64_t *group_size, + const cl::sycl::vector_class<cl::sycl::event> &dependencies) { +#ifndef ONEMKL_DISABLE_PREDICATES + /* add postchecks to queue here for input args. */ +#endif +} + +inline void gemm_batch_precondition(cl::sycl::queue &queue, transpose *transa, transpose *transb, + std::int64_t *m, std::int64_t *n, std::int64_t *k, + std::complex<float> *alpha, const std::complex<float> **a, + std::int64_t *lda, const std::complex<float> **b, + std::int64_t *ldb, std::complex<float> *beta, + std::complex<float> **c, std::int64_t *ldc, + std::int64_t group_count, std::int64_t *group_size, + const cl::sycl::vector_class<cl::sycl::event> &dependencies) { +#ifndef ONEMKL_DISABLE_PREDICATES + /* add prechecks to queue here for input args. */ +#endif +} + +inline void gemm_batch_postcondition(cl::sycl::queue &queue, transpose *transa, transpose *transb, + std::int64_t *m, std::int64_t *n, std::int64_t *k, + std::complex<float> *alpha, const std::complex<float> **a, + std::int64_t *lda, const std::complex<float> **b, + std::int64_t *ldb, std::complex<float> *beta, + std::complex<float> **c, std::int64_t *ldc, + std::int64_t group_count, std::int64_t *group_size, + const cl::sycl::vector_class<cl::sycl::event> &dependencies) { +#ifndef ONEMKL_DISABLE_PREDICATES + /* add postchecks to queue here for input args. 
*/ +#endif +} + +inline void gemm_batch_precondition(cl::sycl::queue &queue, transpose *transa, transpose *transb, + std::int64_t *m, std::int64_t *n, std::int64_t *k, + std::complex<double> *alpha, const std::complex<double> **a, + std::int64_t *lda, const std::complex<double> **b, + std::int64_t *ldb, std::complex<double> *beta, + std::complex<double> **c, std::int64_t *ldc, + std::int64_t group_count, std::int64_t *group_size, + const cl::sycl::vector_class<cl::sycl::event> &dependencies) { +#ifndef ONEMKL_DISABLE_PREDICATES + /* add prechecks to queue here for input args. */ +#endif +} + +inline void gemm_batch_postcondition(cl::sycl::queue &queue, transpose *transa, transpose *transb, + std::int64_t *m, std::int64_t *n, std::int64_t *k, + std::complex<double> *alpha, const std::complex<double> **a, + std::int64_t *lda, const std::complex<double> **b, + std::int64_t *ldb, std::complex<double> *beta, + std::complex<double> **c, std::int64_t *ldc, + std::int64_t group_count, std::int64_t *group_size, + const cl::sycl::vector_class<cl::sycl::event> &dependencies) { +#ifndef ONEMKL_DISABLE_PREDICATES + /* add postchecks to queue here for input args. */ +#endif +} + +inline void gemm_batch_precondition(cl::sycl::queue &queue, transpose transa, transpose transb, + std::int64_t m, std::int64_t n, std::int64_t k, float alpha, + const float *a, std::int64_t lda, std::int64_t stride_a, + const float *b, std::int64_t ldb, std::int64_t stride_b, + float beta, float *c, std::int64_t ldc, std::int64_t stride_c, + std::int64_t batch_size, + const cl::sycl::vector_class<cl::sycl::event> &dependencies) { +#ifndef ONEMKL_DISABLE_PREDICATES + /* add prechecks to queue here for input args. 
*/ +#endif +} + +inline void gemm_batch_postcondition(cl::sycl::queue &queue, transpose transa, transpose transb, + std::int64_t m, std::int64_t n, std::int64_t k, float alpha, + const float *a, std::int64_t lda, std::int64_t stride_a, + const float *b, std::int64_t ldb, std::int64_t stride_b, + float beta, float *c, std::int64_t ldc, std::int64_t stride_c, + std::int64_t batch_size, + const cl::sycl::vector_class<cl::sycl::event> &dependencies) { +#ifndef ONEMKL_DISABLE_PREDICATES + /* add postchecks to queue here for input args. */ +#endif +} + +inline void gemm_batch_precondition(cl::sycl::queue &queue, transpose transa, transpose transb, + std::int64_t m, std::int64_t n, std::int64_t k, double alpha, + const double *a, std::int64_t lda, std::int64_t stride_a, + const double *b, std::int64_t ldb, std::int64_t stride_b, + double beta, double *c, std::int64_t ldc, std::int64_t stride_c, + std::int64_t batch_size, + const cl::sycl::vector_class<cl::sycl::event> &dependencies) { +#ifndef ONEMKL_DISABLE_PREDICATES + /* add prechecks to queue here for input args. */ +#endif +} + +inline void gemm_batch_postcondition(cl::sycl::queue &queue, transpose transa, transpose transb, + std::int64_t m, std::int64_t n, std::int64_t k, double alpha, + const double *a, std::int64_t lda, std::int64_t stride_a, + const double *b, std::int64_t ldb, std::int64_t stride_b, + double beta, double *c, std::int64_t ldc, + std::int64_t stride_c, std::int64_t batch_size, + const cl::sycl::vector_class<cl::sycl::event> &dependencies) { +#ifndef ONEMKL_DISABLE_PREDICATES + /* add postchecks to queue here for input args. 
*/ +#endif +} + +inline void gemm_batch_precondition( + cl::sycl::queue &queue, transpose transa, transpose transb, std::int64_t m, std::int64_t n, + std::int64_t k, std::complex<float> alpha, const std::complex<float> *a, std::int64_t lda, + std::int64_t stride_a, const std::complex<float> *b, std::int64_t ldb, std::int64_t stride_b, + std::complex<float> beta, std::complex<float> *c, std::int64_t ldc, std::int64_t stride_c, + std::int64_t batch_size, const cl::sycl::vector_class<cl::sycl::event> &dependencies) { +#ifndef ONEMKL_DISABLE_PREDICATES + /* add prechecks to queue here for input args. */ +#endif +} + +inline void gemm_batch_postcondition( + cl::sycl::queue &queue, transpose transa, transpose transb, std::int64_t m, std::int64_t n, + std::int64_t k, std::complex<float> alpha, const std::complex<float> *a, std::int64_t lda, + std::int64_t stride_a, const std::complex<float> *b, std::int64_t ldb, std::int64_t stride_b, + std::complex<float> beta, std::complex<float> *c, std::int64_t ldc, std::int64_t stride_c, + std::int64_t batch_size, const cl::sycl::vector_class<cl::sycl::event> &dependencies) { +#ifndef ONEMKL_DISABLE_PREDICATES + /* add postchecks to queue here for input args. */ +#endif +} + +inline void gemm_batch_precondition( + cl::sycl::queue &queue, transpose transa, transpose transb, std::int64_t m, std::int64_t n, + std::int64_t k, std::complex<double> alpha, const std::complex<double> *a, std::int64_t lda, + std::int64_t stride_a, const std::complex<double> *b, std::int64_t ldb, std::int64_t stride_b, + std::complex<double> beta, std::complex<double> *c, std::int64_t ldc, std::int64_t stride_c, + std::int64_t batch_size, const cl::sycl::vector_class<cl::sycl::event> &dependencies) { +#ifndef ONEMKL_DISABLE_PREDICATES + /* add prechecks to queue here for input args. 
*/ +#endif +} + +inline void gemm_batch_postcondition( + cl::sycl::queue &queue, transpose transa, transpose transb, std::int64_t m, std::int64_t n, + std::int64_t k, std::complex<double> alpha, const std::complex<double> *a, std::int64_t lda, + std::int64_t stride_a, const std::complex<double> *b, std::int64_t ldb, std::int64_t stride_b, + std::complex<double> beta, std::complex<double> *c, std::int64_t ldc, std::int64_t stride_c, + std::int64_t batch_size, const cl::sycl::vector_class<cl::sycl::event> &dependencies) { +#ifndef ONEMKL_DISABLE_PREDICATES + /* add postchecks to queue here for input args. */ +#endif +} + +inline void syrk_precondition(cl::sycl::queue &queue, uplo upper_lower, transpose trans, + std::int64_t n, std::int64_t k, float alpha, const float *a, + std::int64_t lda, float beta, float *c, std::int64_t ldc, + const cl::sycl::vector_class<cl::sycl::event> &dependencies) { +#ifndef ONEMKL_DISABLE_PREDICATES + /* add prechecks to queue here for input args. */ +#endif +} + +inline void syrk_postcondition(cl::sycl::queue &queue, uplo upper_lower, transpose trans, + std::int64_t n, std::int64_t k, float alpha, const float *a, + std::int64_t lda, float beta, float *c, std::int64_t ldc, + const cl::sycl::vector_class<cl::sycl::event> &dependencies) { +#ifndef ONEMKL_DISABLE_PREDICATES + /* add postchecks to queue here for input args. */ +#endif +} + +inline void syrk_precondition(cl::sycl::queue &queue, uplo upper_lower, transpose trans, + std::int64_t n, std::int64_t k, double alpha, const double *a, + std::int64_t lda, double beta, double *c, std::int64_t ldc, + const cl::sycl::vector_class<cl::sycl::event> &dependencies) { +#ifndef ONEMKL_DISABLE_PREDICATES + /* add prechecks to queue here for input args. 
*/ +#endif +} + +inline void syrk_postcondition(cl::sycl::queue &queue, uplo upper_lower, transpose trans, + std::int64_t n, std::int64_t k, double alpha, const double *a, + std::int64_t lda, double beta, double *c, std::int64_t ldc, + const cl::sycl::vector_class<cl::sycl::event> &dependencies) { +#ifndef ONEMKL_DISABLE_PREDICATES + /* add postchecks to queue here for input args. */ +#endif +} + +inline void syrk_precondition(cl::sycl::queue &queue, uplo upper_lower, transpose trans, + std::int64_t n, std::int64_t k, std::complex<float> alpha, + const std::complex<float> *a, std::int64_t lda, + std::complex<float> beta, std::complex<float> *c, std::int64_t ldc, + const cl::sycl::vector_class<cl::sycl::event> &dependencies) { +#ifndef ONEMKL_DISABLE_PREDICATES + /* add prechecks to queue here for input args. */ +#endif +} + +inline void syrk_postcondition(cl::sycl::queue &queue, uplo upper_lower, transpose trans, + std::int64_t n, std::int64_t k, std::complex<float> alpha, + const std::complex<float> *a, std::int64_t lda, + std::complex<float> beta, std::complex<float> *c, std::int64_t ldc, + const cl::sycl::vector_class<cl::sycl::event> &dependencies) { +#ifndef ONEMKL_DISABLE_PREDICATES + /* add postchecks to queue here for input args. */ +#endif +} + +inline void syrk_precondition(cl::sycl::queue &queue, uplo upper_lower, transpose trans, + std::int64_t n, std::int64_t k, std::complex<double> alpha, + const std::complex<double> *a, std::int64_t lda, + std::complex<double> beta, std::complex<double> *c, std::int64_t ldc, + const cl::sycl::vector_class<cl::sycl::event> &dependencies) { +#ifndef ONEMKL_DISABLE_PREDICATES + /* add prechecks to queue here for input args. */ +#endif +} + +inline void syrk_postcondition(cl::sycl::queue &queue, uplo upper_lower, transpose trans, + std::int64_t n, std::int64_t k, std::complex<double> alpha, + const std::complex<double> *a, std::int64_t lda, + std::complex<double> beta, std::complex<double> *c, std::int64_t ldc, + const cl::sycl::vector_class<cl::sycl::event> &dependencies) { +#ifndef ONEMKL_DISABLE_PREDICATES + /* add postchecks to queue here for input args. 
*/ +#endif +} + +inline void her2_precondition(cl::sycl::queue &queue, uplo upper_lower, std::int64_t n, + std::complex<float> alpha, const std::complex<float> *x, + std::int64_t incx, const std::complex<float> *y, std::int64_t incy, + std::complex<float> *a, std::int64_t lda, + const cl::sycl::vector_class<cl::sycl::event> &dependencies) { +#ifndef ONEMKL_DISABLE_PREDICATES + /* add prechecks to queue here for input args. */ +#endif +} + +inline void her2_postcondition(cl::sycl::queue &queue, uplo upper_lower, std::int64_t n, + std::complex<float> alpha, const std::complex<float> *x, + std::int64_t incx, const std::complex<float> *y, std::int64_t incy, + std::complex<float> *a, std::int64_t lda, + const cl::sycl::vector_class<cl::sycl::event> &dependencies) { +#ifndef ONEMKL_DISABLE_PREDICATES + /* add postchecks to queue here for input args. */ +#endif +} + +inline void her2_precondition(cl::sycl::queue &queue, uplo upper_lower, std::int64_t n, + std::complex<double> alpha, const std::complex<double> *x, + std::int64_t incx, const std::complex<double> *y, std::int64_t incy, + std::complex<double> *a, std::int64_t lda, + const cl::sycl::vector_class<cl::sycl::event> &dependencies) { +#ifndef ONEMKL_DISABLE_PREDICATES + /* add prechecks to queue here for input args. */ +#endif +} + +inline void her2_postcondition(cl::sycl::queue &queue, uplo upper_lower, std::int64_t n, + std::complex<double> alpha, const std::complex<double> *x, + std::int64_t incx, const std::complex<double> *y, std::int64_t incy, + std::complex<double> *a, std::int64_t lda, + const cl::sycl::vector_class<cl::sycl::event> &dependencies) { +#ifndef ONEMKL_DISABLE_PREDICATES + /* add postchecks to queue here for input args. */ +#endif +} + +inline void hbmv_precondition(cl::sycl::queue &queue, uplo upper_lower, std::int64_t n, + std::int64_t k, std::complex<float> alpha, + const std::complex<float> *a, std::int64_t lda, + const std::complex<float> *x, std::int64_t incx, + std::complex<float> beta, std::complex<float> *y, std::int64_t incy, + const cl::sycl::vector_class<cl::sycl::event> &dependencies) { +#ifndef ONEMKL_DISABLE_PREDICATES + /* add prechecks to queue here for input args. 
*/ +#endif +} + +inline void hbmv_postcondition(cl::sycl::queue &queue, uplo upper_lower, std::int64_t n, + std::int64_t k, std::complex<float> alpha, + const std::complex<float> *a, std::int64_t lda, + const std::complex<float> *x, std::int64_t incx, + std::complex<float> beta, std::complex<float> *y, std::int64_t incy, + const cl::sycl::vector_class<cl::sycl::event> &dependencies) { +#ifndef ONEMKL_DISABLE_PREDICATES + /* add postchecks to queue here for input args. */ +#endif +} + +inline void hbmv_precondition(cl::sycl::queue &queue, uplo upper_lower, std::int64_t n, + std::int64_t k, std::complex<double> alpha, + const std::complex<double> *a, std::int64_t lda, + const std::complex<double> *x, std::int64_t incx, + std::complex<double> beta, std::complex<double> *y, std::int64_t incy, + const cl::sycl::vector_class<cl::sycl::event> &dependencies) { +#ifndef ONEMKL_DISABLE_PREDICATES + /* add prechecks to queue here for input args. */ +#endif +} + +inline void hbmv_postcondition(cl::sycl::queue &queue, uplo upper_lower, std::int64_t n, + std::int64_t k, std::complex<double> alpha, + const std::complex<double> *a, std::int64_t lda, + const std::complex<double> *x, std::int64_t incx, + std::complex<double> beta, std::complex<double> *y, + std::int64_t incy, + const cl::sycl::vector_class<cl::sycl::event> &dependencies) { +#ifndef ONEMKL_DISABLE_PREDICATES + /* add postchecks to queue here for input args. */ +#endif +} + +inline void rot_precondition(cl::sycl::queue &queue, std::int64_t n, std::complex<float> *x, + std::int64_t incx, std::complex<float> *y, std::int64_t incy, float c, + float s, const cl::sycl::vector_class<cl::sycl::event> &dependencies) { +#ifndef ONEMKL_DISABLE_PREDICATES + /* add prechecks to queue here for input args. */ +#endif +} + +inline void rot_postcondition(cl::sycl::queue &queue, std::int64_t n, std::complex<float> *x, + std::int64_t incx, std::complex<float> *y, std::int64_t incy, float c, + float s, + const cl::sycl::vector_class<cl::sycl::event> &dependencies) { +#ifndef ONEMKL_DISABLE_PREDICATES + /* add postchecks to queue here for input args. 
*/ +#endif +} + +inline void rot_precondition(cl::sycl::queue &queue, std::int64_t n, std::complex<double> *x, + std::int64_t incx, std::complex<double> *y, std::int64_t incy, + double c, double s, + const cl::sycl::vector_class<cl::sycl::event> &dependencies) { +#ifndef ONEMKL_DISABLE_PREDICATES + /* add prechecks to queue here for input args. */ +#endif +} + +inline void rot_postcondition(cl::sycl::queue &queue, std::int64_t n, std::complex<double> *x, + std::int64_t incx, std::complex<double> *y, std::int64_t incy, + double c, double s, + const cl::sycl::vector_class<cl::sycl::event> &dependencies) { +#ifndef ONEMKL_DISABLE_PREDICATES + /* add postchecks to queue here for input args. */ +#endif +} + +inline void rot_precondition(cl::sycl::queue &queue, std::int64_t n, float *x, std::int64_t incx, + float *y, std::int64_t incy, float c, float s, + const cl::sycl::vector_class<cl::sycl::event> &dependencies) { +#ifndef ONEMKL_DISABLE_PREDICATES + /* add prechecks to queue here for input args. */ +#endif +} + +inline void rot_postcondition(cl::sycl::queue &queue, std::int64_t n, float *x, std::int64_t incx, + float *y, std::int64_t incy, float c, float s, + const cl::sycl::vector_class<cl::sycl::event> &dependencies) { +#ifndef ONEMKL_DISABLE_PREDICATES + /* add postchecks to queue here for input args. */ +#endif +} + +inline void rot_precondition(cl::sycl::queue &queue, std::int64_t n, double *x, std::int64_t incx, + double *y, std::int64_t incy, double c, double s, + const cl::sycl::vector_class<cl::sycl::event> &dependencies) { +#ifndef ONEMKL_DISABLE_PREDICATES + /* add prechecks to queue here for input args. */ +#endif +} + +inline void rot_postcondition(cl::sycl::queue &queue, std::int64_t n, double *x, std::int64_t incx, + double *y, std::int64_t incy, double c, double s, + const cl::sycl::vector_class<cl::sycl::event> &dependencies) { +#ifndef ONEMKL_DISABLE_PREDICATES + /* add postchecks to queue here for input args. 
*/ +#endif +} + +inline void axpy_precondition(cl::sycl::queue &queue, std::int64_t n, float alpha, const float *x, + std::int64_t incx, float *y, std::int64_t incy, + const cl::sycl::vector_class<cl::sycl::event> &dependencies) { +#ifndef ONEMKL_DISABLE_PREDICATES + /* add prechecks to queue here for input args. */ +#endif +} + +inline void axpy_postcondition(cl::sycl::queue &queue, std::int64_t n, float alpha, const float *x, + std::int64_t incx, float *y, std::int64_t incy, + const cl::sycl::vector_class<cl::sycl::event> &dependencies) { +#ifndef ONEMKL_DISABLE_PREDICATES + /* add postchecks to queue here for input args. */ +#endif +} + +inline void axpy_precondition(cl::sycl::queue &queue, std::int64_t n, double alpha, const double *x, + std::int64_t incx, double *y, std::int64_t incy, + const cl::sycl::vector_class<cl::sycl::event> &dependencies) { +#ifndef ONEMKL_DISABLE_PREDICATES + /* add prechecks to queue here for input args. */ +#endif +} + +inline void axpy_postcondition(cl::sycl::queue &queue, std::int64_t n, double alpha, + const double *x, std::int64_t incx, double *y, std::int64_t incy, + const cl::sycl::vector_class<cl::sycl::event> &dependencies) { +#ifndef ONEMKL_DISABLE_PREDICATES + /* add postchecks to queue here for input args. */ +#endif +} + +inline void axpy_precondition(cl::sycl::queue &queue, std::int64_t n, std::complex<float> alpha, + const std::complex<float> *x, std::int64_t incx, + std::complex<float> *y, std::int64_t incy, + const cl::sycl::vector_class<cl::sycl::event> &dependencies) { +#ifndef ONEMKL_DISABLE_PREDICATES + /* add prechecks to queue here for input args. */ +#endif +} + +inline void axpy_postcondition(cl::sycl::queue &queue, std::int64_t n, std::complex<float> alpha, + const std::complex<float> *x, std::int64_t incx, + std::complex<float> *y, std::int64_t incy, + const cl::sycl::vector_class<cl::sycl::event> &dependencies) { +#ifndef ONEMKL_DISABLE_PREDICATES + /* add postchecks to queue here for input args. 
*/ +#endif +} + +inline void axpy_precondition(cl::sycl::queue &queue, std::int64_t n, std::complex<double> alpha, + const std::complex<double> *x, std::int64_t incx, + std::complex<double> *y, std::int64_t incy, + const cl::sycl::vector_class<cl::sycl::event> &dependencies) { +#ifndef ONEMKL_DISABLE_PREDICATES + /* add prechecks to queue here for input args. */ +#endif +} + +inline void axpy_postcondition(cl::sycl::queue &queue, std::int64_t n, std::complex<double> alpha, + const std::complex<double> *x, std::int64_t incx, + std::complex<double> *y, std::int64_t incy, + const cl::sycl::vector_class<cl::sycl::event> &dependencies) { +#ifndef ONEMKL_DISABLE_PREDICATES + /* add postchecks to queue here for input args. */ +#endif +} + +inline void axpy_batch_precondition(cl::sycl::queue &queue, std::int64_t *n, float *alpha, + const float **x, std::int64_t *incx, float **y, + std::int64_t *incy, std::int64_t group_count, + std::int64_t *group_size, + const cl::sycl::vector_class<cl::sycl::event> &dependencies) { +#ifndef ONEMKL_DISABLE_PREDICATES + /* add prechecks to queue here for input args. */ +#endif +} + +inline void axpy_batch_postcondition(cl::sycl::queue &queue, std::int64_t *n, float *alpha, + const float **x, std::int64_t *incx, float **y, + std::int64_t *incy, std::int64_t group_count, + std::int64_t *group_size, + const cl::sycl::vector_class<cl::sycl::event> &dependencies) { +#ifndef ONEMKL_DISABLE_PREDICATES + /* add postchecks to queue here for input args. */ +#endif +} + +inline void axpy_batch_precondition(cl::sycl::queue &queue, std::int64_t *n, double *alpha, + const double **x, std::int64_t *incx, double **y, + std::int64_t *incy, std::int64_t group_count, + std::int64_t *group_size, + const cl::sycl::vector_class<cl::sycl::event> &dependencies) { +#ifndef ONEMKL_DISABLE_PREDICATES + /* add prechecks to queue here for input args. 
*/ +#endif +} + +inline void axpy_batch_postcondition(cl::sycl::queue &queue, std::int64_t *n, double *alpha, + const double **x, std::int64_t *incx, double **y, + std::int64_t *incy, std::int64_t group_count, + std::int64_t *group_size, + const cl::sycl::vector_class<cl::sycl::event> &dependencies) { +#ifndef ONEMKL_DISABLE_PREDICATES + /* add postchecks to queue here for input args. */ +#endif +} + +inline void axpy_batch_precondition(cl::sycl::queue &queue, std::int64_t *n, + std::complex<float> *alpha, const std::complex<float> **x, + std::int64_t *incx, std::complex<float> **y, std::int64_t *incy, + std::int64_t group_count, std::int64_t *group_size, + const cl::sycl::vector_class<cl::sycl::event> &dependencies) { +#ifndef ONEMKL_DISABLE_PREDICATES + /* add prechecks to queue here for input args. */ +#endif +} + +inline void axpy_batch_postcondition(cl::sycl::queue &queue, std::int64_t *n, + std::complex<float> *alpha, const std::complex<float> **x, + std::int64_t *incx, std::complex<float> **y, + std::int64_t *incy, std::int64_t group_count, + std::int64_t *group_size, + const cl::sycl::vector_class<cl::sycl::event> &dependencies) { +#ifndef ONEMKL_DISABLE_PREDICATES + /* add postchecks to queue here for input args. */ +#endif +} + +inline void axpy_batch_precondition(cl::sycl::queue &queue, std::int64_t *n, + std::complex<double> *alpha, const std::complex<double> **x, + std::int64_t *incx, std::complex<double> **y, + std::int64_t *incy, std::int64_t group_count, + std::int64_t *group_size, + const cl::sycl::vector_class<cl::sycl::event> &dependencies) { +#ifndef ONEMKL_DISABLE_PREDICATES + /* add prechecks to queue here for input args. */ +#endif +} + +inline void axpy_batch_postcondition(cl::sycl::queue &queue, std::int64_t *n, + std::complex<double> *alpha, const std::complex<double> **x, + std::int64_t *incx, std::complex<double> **y, + std::int64_t *incy, std::int64_t group_count, + std::int64_t *group_size, + const cl::sycl::vector_class<cl::sycl::event> &dependencies) { +#ifndef ONEMKL_DISABLE_PREDICATES + /* add postchecks to queue here for input args. 
*/ +#endif +} + +inline void gerc_precondition(cl::sycl::queue &queue, std::int64_t m, std::int64_t n, + std::complex<float> alpha, const std::complex<float> *x, + std::int64_t incx, const std::complex<float> *y, std::int64_t incy, + std::complex<float> *a, std::int64_t lda, + const cl::sycl::vector_class<cl::sycl::event> &dependencies) { +#ifndef ONEMKL_DISABLE_PREDICATES + /* add prechecks to queue here for input args. */ +#endif +} + +inline void gerc_postcondition(cl::sycl::queue &queue, std::int64_t m, std::int64_t n, + std::complex<float> alpha, const std::complex<float> *x, + std::int64_t incx, const std::complex<float> *y, std::int64_t incy, + std::complex<float> *a, std::int64_t lda, + const cl::sycl::vector_class<cl::sycl::event> &dependencies) { +#ifndef ONEMKL_DISABLE_PREDICATES + /* add postchecks to queue here for input args. */ +#endif +} + +inline void gerc_precondition(cl::sycl::queue &queue, std::int64_t m, std::int64_t n, + std::complex<double> alpha, const std::complex<double> *x, + std::int64_t incx, const std::complex<double> *y, std::int64_t incy, + std::complex<double> *a, std::int64_t lda, + const cl::sycl::vector_class<cl::sycl::event> &dependencies) { +#ifndef ONEMKL_DISABLE_PREDICATES + /* add prechecks to queue here for input args. */ +#endif +} + +inline void gerc_postcondition(cl::sycl::queue &queue, std::int64_t m, std::int64_t n, + std::complex<double> alpha, const std::complex<double> *x, + std::int64_t incx, const std::complex<double> *y, std::int64_t incy, + std::complex<double> *a, std::int64_t lda, + const cl::sycl::vector_class<cl::sycl::event> &dependencies) { +#ifndef ONEMKL_DISABLE_PREDICATES + /* add postchecks to queue here for input args. */ +#endif +} + +inline void syr2k_precondition(cl::sycl::queue &queue, uplo upper_lower, transpose trans, + std::int64_t n, std::int64_t k, float alpha, const float *a, + std::int64_t lda, const float *b, std::int64_t ldb, float beta, + float *c, std::int64_t ldc, + const cl::sycl::vector_class<cl::sycl::event> &dependencies) { +#ifndef ONEMKL_DISABLE_PREDICATES + /* add prechecks to queue here for input args. 
*/ +#endif +} + +inline void syr2k_postcondition(cl::sycl::queue &queue, uplo upper_lower, transpose trans, + std::int64_t n, std::int64_t k, float alpha, const float *a, + std::int64_t lda, const float *b, std::int64_t ldb, float beta, + float *c, std::int64_t ldc, + const cl::sycl::vector_class<cl::sycl::event> &dependencies) { +#ifndef ONEMKL_DISABLE_PREDICATES + /* add postchecks to queue here for input args. */ +#endif +} + +inline void syr2k_precondition(cl::sycl::queue &queue, uplo upper_lower, transpose trans, + std::int64_t n, std::int64_t k, double alpha, const double *a, + std::int64_t lda, const double *b, std::int64_t ldb, double beta, + double *c, std::int64_t ldc, + const cl::sycl::vector_class<cl::sycl::event> &dependencies) { +#ifndef ONEMKL_DISABLE_PREDICATES + /* add prechecks to queue here for input args. */ +#endif +} + +inline void syr2k_postcondition(cl::sycl::queue &queue, uplo upper_lower, transpose trans, + std::int64_t n, std::int64_t k, double alpha, const double *a, + std::int64_t lda, const double *b, std::int64_t ldb, double beta, + double *c, std::int64_t ldc, + const cl::sycl::vector_class<cl::sycl::event> &dependencies) { +#ifndef ONEMKL_DISABLE_PREDICATES + /* add postchecks to queue here for input args. */ +#endif +} + +inline void syr2k_precondition(cl::sycl::queue &queue, uplo upper_lower, transpose trans, + std::int64_t n, std::int64_t k, std::complex<float> alpha, + const std::complex<float> *a, std::int64_t lda, + const std::complex<float> *b, std::int64_t ldb, + std::complex<float> beta, std::complex<float> *c, std::int64_t ldc, + const cl::sycl::vector_class<cl::sycl::event> &dependencies) { +#ifndef ONEMKL_DISABLE_PREDICATES + /* add prechecks to queue here for input args. 
*/ +#endif +} + +inline void syr2k_postcondition(cl::sycl::queue &queue, uplo upper_lower, transpose trans, + std::int64_t n, std::int64_t k, std::complex<float> alpha, + const std::complex<float> *a, std::int64_t lda, + const std::complex<float> *b, std::int64_t ldb, + std::complex<float> beta, std::complex<float> *c, std::int64_t ldc, + const cl::sycl::vector_class<cl::sycl::event> &dependencies) { +#ifndef ONEMKL_DISABLE_PREDICATES + /* add postchecks to queue here for input args. */ +#endif +} + +inline void syr2k_precondition(cl::sycl::queue &queue, uplo upper_lower, transpose trans, + std::int64_t n, std::int64_t k, std::complex<double> alpha, + const std::complex<double> *a, std::int64_t lda, + const std::complex<double> *b, std::int64_t ldb, + std::complex<double> beta, std::complex<double> *c, std::int64_t ldc, + const cl::sycl::vector_class<cl::sycl::event> &dependencies) { +#ifndef ONEMKL_DISABLE_PREDICATES + /* add prechecks to queue here for input args. */ +#endif +} + +inline void syr2k_postcondition(cl::sycl::queue &queue, uplo upper_lower, transpose trans, + std::int64_t n, std::int64_t k, std::complex<double> alpha, + const std::complex<double> *a, std::int64_t lda, + const std::complex<double> *b, std::int64_t ldb, + std::complex<double> beta, std::complex<double> *c, + std::int64_t ldc, + const cl::sycl::vector_class<cl::sycl::event> &dependencies) { +#ifndef ONEMKL_DISABLE_PREDICATES + /* add postchecks to queue here for input args. */ +#endif +} + +inline void gemv_precondition(cl::sycl::queue &queue, transpose trans, std::int64_t m, + std::int64_t n, float alpha, const float *a, std::int64_t lda, + const float *x, std::int64_t incx, float beta, float *y, + std::int64_t incy, + const cl::sycl::vector_class<cl::sycl::event> &dependencies) { +#ifndef ONEMKL_DISABLE_PREDICATES + /* add prechecks to queue here for input args. 
*/ +#endif +} + +inline void gemv_postcondition(cl::sycl::queue &queue, transpose trans, std::int64_t m, + std::int64_t n, float alpha, const float *a, std::int64_t lda, + const float *x, std::int64_t incx, float beta, float *y, + std::int64_t incy, + const cl::sycl::vector_class<cl::sycl::event> &dependencies) { +#ifndef ONEMKL_DISABLE_PREDICATES + /* add postchecks to queue here for input args. */ +#endif +} + +inline void gemv_precondition(cl::sycl::queue &queue, transpose trans, std::int64_t m, + std::int64_t n, double alpha, const double *a, std::int64_t lda, + const double *x, std::int64_t incx, double beta, double *y, + std::int64_t incy, + const cl::sycl::vector_class<cl::sycl::event> &dependencies) { +#ifndef ONEMKL_DISABLE_PREDICATES + /* add prechecks to queue here for input args. */ +#endif +} + +inline void gemv_postcondition(cl::sycl::queue &queue, transpose trans, std::int64_t m, + std::int64_t n, double alpha, const double *a, std::int64_t lda, + const double *x, std::int64_t incx, double beta, double *y, + std::int64_t incy, + const cl::sycl::vector_class<cl::sycl::event> &dependencies) { +#ifndef ONEMKL_DISABLE_PREDICATES + /* add postchecks to queue here for input args. */ +#endif +} + +inline void gemv_precondition(cl::sycl::queue &queue, transpose trans, std::int64_t m, + std::int64_t n, std::complex<float> alpha, + const std::complex<float> *a, std::int64_t lda, + const std::complex<float> *x, std::int64_t incx, + std::complex<float> beta, std::complex<float> *y, std::int64_t incy, + const cl::sycl::vector_class<cl::sycl::event> &dependencies) { +#ifndef ONEMKL_DISABLE_PREDICATES + /* add prechecks to queue here for input args. */ +#endif +} + +inline void gemv_postcondition(cl::sycl::queue &queue, transpose trans, std::int64_t m, + std::int64_t n, std::complex<float> alpha, + const std::complex<float> *a, std::int64_t lda, + const std::complex<float> *x, std::int64_t incx, + std::complex<float> beta, std::complex<float> *y, std::int64_t incy, + const cl::sycl::vector_class<cl::sycl::event> &dependencies) { +#ifndef ONEMKL_DISABLE_PREDICATES + /* add postchecks to queue here for input args. 
*/ +#endif +} + +inline void gemv_precondition(cl::sycl::queue &queue, transpose trans, std::int64_t m, + std::int64_t n, std::complex alpha, + const std::complex *a, std::int64_t lda, + const std::complex *x, std::int64_t incx, + std::complex beta, std::complex *y, std::int64_t incy, + const cl::sycl::vector_class &dependencies) { +#ifndef ONEMKL_DISABLE_PREDICATES + /* add prechecks to queue here for input args. */ +#endif +} + +inline void gemv_postcondition(cl::sycl::queue &queue, transpose trans, std::int64_t m, + std::int64_t n, std::complex alpha, + const std::complex *a, std::int64_t lda, + const std::complex *x, std::int64_t incx, + std::complex beta, std::complex *y, + std::int64_t incy, + const cl::sycl::vector_class &dependencies) { +#ifndef ONEMKL_DISABLE_PREDICATES + /* add postchecks to queue here for input args. */ +#endif +} + +inline void her_precondition(cl::sycl::queue &queue, uplo upper_lower, std::int64_t n, float alpha, + const std::complex *x, std::int64_t incx, + std::complex *a, std::int64_t lda, + const cl::sycl::vector_class &dependencies) { +#ifndef ONEMKL_DISABLE_PREDICATES + /* add prechecks to queue here for input args. */ +#endif +} + +inline void her_postcondition(cl::sycl::queue &queue, uplo upper_lower, std::int64_t n, float alpha, + const std::complex *x, std::int64_t incx, + std::complex *a, std::int64_t lda, + const cl::sycl::vector_class &dependencies) { +#ifndef ONEMKL_DISABLE_PREDICATES + /* add postchecks to queue here for input args. */ +#endif +} + +inline void her_precondition(cl::sycl::queue &queue, uplo upper_lower, std::int64_t n, double alpha, + const std::complex *x, std::int64_t incx, + std::complex *a, std::int64_t lda, + const cl::sycl::vector_class &dependencies) { +#ifndef ONEMKL_DISABLE_PREDICATES + /* add prechecks to queue here for input args. 
*/ +#endif +} + +inline void her_postcondition(cl::sycl::queue &queue, uplo upper_lower, std::int64_t n, + double alpha, const std::complex *x, std::int64_t incx, + std::complex *a, std::int64_t lda, + const cl::sycl::vector_class &dependencies) { +#ifndef ONEMKL_DISABLE_PREDICATES + /* add postchecks to queue here for input args. */ +#endif +} + +inline void hpr_precondition(cl::sycl::queue &queue, uplo upper_lower, std::int64_t n, float alpha, + const std::complex *x, std::int64_t incx, + std::complex *a, + const cl::sycl::vector_class &dependencies) { +#ifndef ONEMKL_DISABLE_PREDICATES + /* add prechecks to queue here for input args. */ +#endif +} + +inline void hpr_postcondition(cl::sycl::queue &queue, uplo upper_lower, std::int64_t n, float alpha, + const std::complex *x, std::int64_t incx, + std::complex *a, + const cl::sycl::vector_class &dependencies) { +#ifndef ONEMKL_DISABLE_PREDICATES + /* add postchecks to queue here for input args. */ +#endif +} + +inline void hpr_precondition(cl::sycl::queue &queue, uplo upper_lower, std::int64_t n, double alpha, + const std::complex *x, std::int64_t incx, + std::complex *a, + const cl::sycl::vector_class &dependencies) { +#ifndef ONEMKL_DISABLE_PREDICATES + /* add prechecks to queue here for input args. */ +#endif +} + +inline void hpr_postcondition(cl::sycl::queue &queue, uplo upper_lower, std::int64_t n, + double alpha, const std::complex *x, std::int64_t incx, + std::complex *a, + const cl::sycl::vector_class &dependencies) { +#ifndef ONEMKL_DISABLE_PREDICATES + /* add postchecks to queue here for input args. */ +#endif +} + +inline void iamin_precondition(cl::sycl::queue &queue, std::int64_t n, const float *x, + std::int64_t incx, std::int64_t *result, + const cl::sycl::vector_class &dependencies) { +#ifndef ONEMKL_DISABLE_PREDICATES + /* add prechecks to queue here for input args. 
*/ +#endif +} + +inline void iamin_postcondition(cl::sycl::queue &queue, std::int64_t n, const float *x, + std::int64_t incx, std::int64_t *result, + const cl::sycl::vector_class &dependencies) { +#ifndef ONEMKL_DISABLE_PREDICATES + /* add postchecks to queue here for input args. */ +#endif +} + +inline void iamin_precondition(cl::sycl::queue &queue, std::int64_t n, const double *x, + std::int64_t incx, std::int64_t *result, + const cl::sycl::vector_class &dependencies) { +#ifndef ONEMKL_DISABLE_PREDICATES + /* add prechecks to queue here for input args. */ +#endif +} + +inline void iamin_postcondition(cl::sycl::queue &queue, std::int64_t n, const double *x, + std::int64_t incx, std::int64_t *result, + const cl::sycl::vector_class &dependencies) { +#ifndef ONEMKL_DISABLE_PREDICATES + /* add postchecks to queue here for input args. */ +#endif +} + +inline void iamin_precondition(cl::sycl::queue &queue, std::int64_t n, const std::complex *x, + std::int64_t incx, std::int64_t *result, + const cl::sycl::vector_class &dependencies) { +#ifndef ONEMKL_DISABLE_PREDICATES + /* add prechecks to queue here for input args. */ +#endif +} + +inline void iamin_postcondition(cl::sycl::queue &queue, std::int64_t n, + const std::complex *x, std::int64_t incx, + std::int64_t *result, + const cl::sycl::vector_class &dependencies) { +#ifndef ONEMKL_DISABLE_PREDICATES + /* add postchecks to queue here for input args. */ +#endif +} + +inline void iamin_precondition(cl::sycl::queue &queue, std::int64_t n, + const std::complex *x, std::int64_t incx, + std::int64_t *result, + const cl::sycl::vector_class &dependencies) { +#ifndef ONEMKL_DISABLE_PREDICATES + /* add prechecks to queue here for input args. */ +#endif +} + +inline void iamin_postcondition(cl::sycl::queue &queue, std::int64_t n, + const std::complex *x, std::int64_t incx, + std::int64_t *result, + const cl::sycl::vector_class &dependencies) { +#ifndef ONEMKL_DISABLE_PREDICATES + /* add postchecks to queue here for input args. 
*/ +#endif +} + +inline void hpmv_precondition(cl::sycl::queue &queue, uplo upper_lower, std::int64_t n, + std::complex alpha, const std::complex *a, + const std::complex *x, std::int64_t incx, + std::complex beta, std::complex *y, std::int64_t incy, + const cl::sycl::vector_class &dependencies) { +#ifndef ONEMKL_DISABLE_PREDICATES + /* add prechecks to queue here for input args. */ +#endif +} + +inline void hpmv_postcondition(cl::sycl::queue &queue, uplo upper_lower, std::int64_t n, + std::complex alpha, const std::complex *a, + const std::complex *x, std::int64_t incx, + std::complex beta, std::complex *y, std::int64_t incy, + const cl::sycl::vector_class &dependencies) { +#ifndef ONEMKL_DISABLE_PREDICATES + /* add postchecks to queue here for input args. */ +#endif +} + +inline void hpmv_precondition(cl::sycl::queue &queue, uplo upper_lower, std::int64_t n, + std::complex alpha, const std::complex *a, + const std::complex *x, std::int64_t incx, + std::complex beta, std::complex *y, std::int64_t incy, + const cl::sycl::vector_class &dependencies) { +#ifndef ONEMKL_DISABLE_PREDICATES + /* add prechecks to queue here for input args. */ +#endif +} + +inline void hpmv_postcondition(cl::sycl::queue &queue, uplo upper_lower, std::int64_t n, + std::complex alpha, const std::complex *a, + const std::complex *x, std::int64_t incx, + std::complex beta, std::complex *y, + std::int64_t incy, + const cl::sycl::vector_class &dependencies) { +#ifndef ONEMKL_DISABLE_PREDICATES + /* add postchecks to queue here for input args. */ +#endif +} + +inline void spmv_precondition(cl::sycl::queue &queue, uplo upper_lower, std::int64_t n, float alpha, + const float *a, const float *x, std::int64_t incx, float beta, + float *y, std::int64_t incy, + const cl::sycl::vector_class &dependencies) { +#ifndef ONEMKL_DISABLE_PREDICATES + /* add prechecks to queue here for input args. 
*/ +#endif +} + +inline void spmv_postcondition(cl::sycl::queue &queue, uplo upper_lower, std::int64_t n, + float alpha, const float *a, const float *x, std::int64_t incx, + float beta, float *y, std::int64_t incy, + const cl::sycl::vector_class &dependencies) { +#ifndef ONEMKL_DISABLE_PREDICATES + /* add postchecks to queue here for input args. */ +#endif +} + +inline void spmv_precondition(cl::sycl::queue &queue, uplo upper_lower, std::int64_t n, + double alpha, const double *a, const double *x, std::int64_t incx, + double beta, double *y, std::int64_t incy, + const cl::sycl::vector_class &dependencies) { +#ifndef ONEMKL_DISABLE_PREDICATES + /* add prechecks to queue here for input args. */ +#endif +} + +inline void spmv_postcondition(cl::sycl::queue &queue, uplo upper_lower, std::int64_t n, + double alpha, const double *a, const double *x, std::int64_t incx, + double beta, double *y, std::int64_t incy, + const cl::sycl::vector_class &dependencies) { +#ifndef ONEMKL_DISABLE_PREDICATES + /* add postchecks to queue here for input args. */ +#endif +} + +inline void rotmg_precondition(cl::sycl::queue &queue, float *d1, float *d2, float *x1, float y1, + float *param, + const cl::sycl::vector_class &dependencies) { +#ifndef ONEMKL_DISABLE_PREDICATES + /* add prechecks to queue here for input args. */ +#endif +} + +inline void rotmg_postcondition(cl::sycl::queue &queue, float *d1, float *d2, float *x1, float y1, + float *param, + const cl::sycl::vector_class &dependencies) { +#ifndef ONEMKL_DISABLE_PREDICATES + /* add postchecks to queue here for input args. */ +#endif +} + +inline void rotmg_precondition(cl::sycl::queue &queue, double *d1, double *d2, double *x1, + double y1, double *param, + const cl::sycl::vector_class &dependencies) { +#ifndef ONEMKL_DISABLE_PREDICATES + /* add prechecks to queue here for input args. 
*/ +#endif +} + +inline void rotmg_postcondition(cl::sycl::queue &queue, double *d1, double *d2, double *x1, + double y1, double *param, + const cl::sycl::vector_class &dependencies) { +#ifndef ONEMKL_DISABLE_PREDICATES + /* add postchecks to queue here for input args. */ +#endif +} + +inline void swap_precondition(cl::sycl::queue &queue, std::int64_t n, float *x, std::int64_t incx, + float *y, std::int64_t incy, + const cl::sycl::vector_class &dependencies) { +#ifndef ONEMKL_DISABLE_PREDICATES + /* add prechecks to queue here for input args. */ +#endif +} + +inline void swap_postcondition(cl::sycl::queue &queue, std::int64_t n, float *x, std::int64_t incx, + float *y, std::int64_t incy, + const cl::sycl::vector_class &dependencies) { +#ifndef ONEMKL_DISABLE_PREDICATES + /* add postchecks to queue here for input args. */ +#endif +} + +inline void swap_precondition(cl::sycl::queue &queue, std::int64_t n, double *x, std::int64_t incx, + double *y, std::int64_t incy, + const cl::sycl::vector_class &dependencies) { +#ifndef ONEMKL_DISABLE_PREDICATES + /* add prechecks to queue here for input args. */ +#endif +} + +inline void swap_postcondition(cl::sycl::queue &queue, std::int64_t n, double *x, std::int64_t incx, + double *y, std::int64_t incy, + const cl::sycl::vector_class &dependencies) { +#ifndef ONEMKL_DISABLE_PREDICATES + /* add postchecks to queue here for input args. */ +#endif +} + +inline void swap_precondition(cl::sycl::queue &queue, std::int64_t n, std::complex *x, + std::int64_t incx, std::complex *y, std::int64_t incy, + const cl::sycl::vector_class &dependencies) { +#ifndef ONEMKL_DISABLE_PREDICATES + /* add prechecks to queue here for input args. */ +#endif +} + +inline void swap_postcondition(cl::sycl::queue &queue, std::int64_t n, std::complex *x, + std::int64_t incx, std::complex *y, std::int64_t incy, + const cl::sycl::vector_class &dependencies) { +#ifndef ONEMKL_DISABLE_PREDICATES + /* add postchecks to queue here for input args. 
*/ +#endif +} + +inline void swap_precondition(cl::sycl::queue &queue, std::int64_t n, std::complex *x, + std::int64_t incx, std::complex *y, std::int64_t incy, + const cl::sycl::vector_class &dependencies) { +#ifndef ONEMKL_DISABLE_PREDICATES + /* add prechecks to queue here for input args. */ +#endif +} + +inline void swap_postcondition(cl::sycl::queue &queue, std::int64_t n, std::complex *x, + std::int64_t incx, std::complex *y, std::int64_t incy, + const cl::sycl::vector_class &dependencies) { +#ifndef ONEMKL_DISABLE_PREDICATES + /* add postchecks to queue here for input args. */ +#endif +} + +inline void geru_precondition(cl::sycl::queue &queue, std::int64_t m, std::int64_t n, + std::complex alpha, const std::complex *x, + std::int64_t incx, const std::complex *y, std::int64_t incy, + std::complex *a, std::int64_t lda, + const cl::sycl::vector_class &dependencies) { +#ifndef ONEMKL_DISABLE_PREDICATES + /* add prechecks to queue here for input args. */ +#endif +} + +inline void geru_postcondition(cl::sycl::queue &queue, std::int64_t m, std::int64_t n, + std::complex alpha, const std::complex *x, + std::int64_t incx, const std::complex *y, std::int64_t incy, + std::complex *a, std::int64_t lda, + const cl::sycl::vector_class &dependencies) { +#ifndef ONEMKL_DISABLE_PREDICATES + /* add postchecks to queue here for input args. */ +#endif +} + +inline void geru_precondition(cl::sycl::queue &queue, std::int64_t m, std::int64_t n, + std::complex alpha, const std::complex *x, + std::int64_t incx, const std::complex *y, std::int64_t incy, + std::complex *a, std::int64_t lda, + const cl::sycl::vector_class &dependencies) { +#ifndef ONEMKL_DISABLE_PREDICATES + /* add prechecks to queue here for input args. 
*/ +#endif +} + +inline void geru_postcondition(cl::sycl::queue &queue, std::int64_t m, std::int64_t n, + std::complex alpha, const std::complex *x, + std::int64_t incx, const std::complex *y, std::int64_t incy, + std::complex *a, std::int64_t lda, + const cl::sycl::vector_class &dependencies) { +#ifndef ONEMKL_DISABLE_PREDICATES + /* add postchecks to queue here for input args. */ +#endif +} + +inline void nrm2_precondition(cl::sycl::queue &queue, std::int64_t n, const std::complex *x, + std::int64_t incx, float *result, + const cl::sycl::vector_class &dependencies) { +#ifndef ONEMKL_DISABLE_PREDICATES + /* add prechecks to queue here for input args. */ +#endif +} + +inline void nrm2_postcondition(cl::sycl::queue &queue, std::int64_t n, const std::complex *x, + std::int64_t incx, float *result, + const cl::sycl::vector_class &dependencies) { +#ifndef ONEMKL_DISABLE_PREDICATES + /* add postchecks to queue here for input args. */ +#endif +} + +inline void nrm2_precondition(cl::sycl::queue &queue, std::int64_t n, const std::complex *x, + std::int64_t incx, double *result, + const cl::sycl::vector_class &dependencies) { +#ifndef ONEMKL_DISABLE_PREDICATES + /* add prechecks to queue here for input args. */ +#endif +} + +inline void nrm2_postcondition(cl::sycl::queue &queue, std::int64_t n, + const std::complex *x, std::int64_t incx, double *result, + const cl::sycl::vector_class &dependencies) { +#ifndef ONEMKL_DISABLE_PREDICATES + /* add postchecks to queue here for input args. */ +#endif +} + +inline void nrm2_precondition(cl::sycl::queue &queue, std::int64_t n, const float *x, + std::int64_t incx, float *result, + const cl::sycl::vector_class &dependencies) { +#ifndef ONEMKL_DISABLE_PREDICATES + /* add prechecks to queue here for input args. 
*/ +#endif +} + +inline void nrm2_postcondition(cl::sycl::queue &queue, std::int64_t n, const float *x, + std::int64_t incx, float *result, + const cl::sycl::vector_class &dependencies) { +#ifndef ONEMKL_DISABLE_PREDICATES + /* add postchecks to queue here for input args. */ +#endif +} + +inline void nrm2_precondition(cl::sycl::queue &queue, std::int64_t n, const double *x, + std::int64_t incx, double *result, + const cl::sycl::vector_class &dependencies) { +#ifndef ONEMKL_DISABLE_PREDICATES + /* add prechecks to queue here for input args. */ +#endif +} + +inline void nrm2_postcondition(cl::sycl::queue &queue, std::int64_t n, const double *x, + std::int64_t incx, double *result, + const cl::sycl::vector_class &dependencies) { +#ifndef ONEMKL_DISABLE_PREDICATES + /* add postchecks to queue here for input args. */ +#endif +} + +inline void gemmt_precondition(cl::sycl::queue &queue, uplo upper_lower, transpose transa, + transpose transb, std::int64_t n, std::int64_t k, float alpha, + const float *a, std::int64_t lda, const float *b, std::int64_t ldb, + float beta, float *c, std::int64_t ldc, + const cl::sycl::vector_class &dependencies) { +#ifndef ONEMKL_DISABLE_PREDICATES + /* add prechecks to queue here for input args. */ +#endif +} + +inline void gemmt_postcondition(cl::sycl::queue &queue, uplo upper_lower, transpose transa, + transpose transb, std::int64_t n, std::int64_t k, float alpha, + const float *a, std::int64_t lda, const float *b, std::int64_t ldb, + float beta, float *c, std::int64_t ldc, + const cl::sycl::vector_class &dependencies) { +#ifndef ONEMKL_DISABLE_PREDICATES + /* add postchecks to queue here for input args. 
*/ +#endif +} + +inline void gemmt_precondition(cl::sycl::queue &queue, uplo upper_lower, transpose transa, + transpose transb, std::int64_t n, std::int64_t k, double alpha, + const double *a, std::int64_t lda, const double *b, std::int64_t ldb, + double beta, double *c, std::int64_t ldc, + const cl::sycl::vector_class &dependencies) { +#ifndef ONEMKL_DISABLE_PREDICATES + /* add prechecks to queue here for input args. */ +#endif +} + +inline void gemmt_postcondition(cl::sycl::queue &queue, uplo upper_lower, transpose transa, + transpose transb, std::int64_t n, std::int64_t k, double alpha, + const double *a, std::int64_t lda, const double *b, + std::int64_t ldb, double beta, double *c, std::int64_t ldc, + const cl::sycl::vector_class &dependencies) { +#ifndef ONEMKL_DISABLE_PREDICATES + /* add postchecks to queue here for input args. */ +#endif +} + +inline void gemmt_precondition(cl::sycl::queue &queue, uplo upper_lower, transpose transa, + transpose transb, std::int64_t n, std::int64_t k, + std::complex alpha, const std::complex *a, + std::int64_t lda, const std::complex *b, std::int64_t ldb, + std::complex beta, std::complex *c, std::int64_t ldc, + const cl::sycl::vector_class &dependencies) { +#ifndef ONEMKL_DISABLE_PREDICATES + /* add prechecks to queue here for input args. */ +#endif +} + +inline void gemmt_postcondition(cl::sycl::queue &queue, uplo upper_lower, transpose transa, + transpose transb, std::int64_t n, std::int64_t k, + std::complex alpha, const std::complex *a, + std::int64_t lda, const std::complex *b, std::int64_t ldb, + std::complex beta, std::complex *c, std::int64_t ldc, + const cl::sycl::vector_class &dependencies) { +#ifndef ONEMKL_DISABLE_PREDICATES + /* add postchecks to queue here for input args. 
*/ +#endif +} + +inline void gemmt_precondition(cl::sycl::queue &queue, uplo upper_lower, transpose transa, + transpose transb, std::int64_t n, std::int64_t k, + std::complex alpha, const std::complex *a, + std::int64_t lda, const std::complex *b, std::int64_t ldb, + std::complex beta, std::complex *c, std::int64_t ldc, + const cl::sycl::vector_class &dependencies) { +#ifndef ONEMKL_DISABLE_PREDICATES + /* add prechecks to queue here for input args. */ +#endif +} + +inline void gemmt_postcondition(cl::sycl::queue &queue, uplo upper_lower, transpose transa, + transpose transb, std::int64_t n, std::int64_t k, + std::complex alpha, const std::complex *a, + std::int64_t lda, const std::complex *b, std::int64_t ldb, + std::complex beta, std::complex *c, + std::int64_t ldc, + const cl::sycl::vector_class &dependencies) { +#ifndef ONEMKL_DISABLE_PREDICATES + /* add postchecks to queue here for input args. */ +#endif +} + +inline void gemm_precondition(cl::sycl::queue &queue, transpose transa, transpose transb, + std::int64_t m, std::int64_t n, std::int64_t k, float alpha, + const float *a, std::int64_t lda, const float *b, std::int64_t ldb, + float beta, float *c, std::int64_t ldc, + const cl::sycl::vector_class &dependencies) { +#ifndef ONEMKL_DISABLE_PREDICATES + /* add prechecks to queue here for input args. */ +#endif +} + +inline void gemm_postcondition(cl::sycl::queue &queue, transpose transa, transpose transb, + std::int64_t m, std::int64_t n, std::int64_t k, float alpha, + const float *a, std::int64_t lda, const float *b, std::int64_t ldb, + float beta, float *c, std::int64_t ldc, + const cl::sycl::vector_class &dependencies) { +#ifndef ONEMKL_DISABLE_PREDICATES + /* add postchecks to queue here for input args. 
*/ +#endif +} + +inline void gemm_precondition(cl::sycl::queue &queue, transpose transa, transpose transb, + std::int64_t m, std::int64_t n, std::int64_t k, double alpha, + const double *a, std::int64_t lda, const double *b, std::int64_t ldb, + double beta, double *c, std::int64_t ldc, + const cl::sycl::vector_class &dependencies) { +#ifndef ONEMKL_DISABLE_PREDICATES + /* add prechecks to queue here for input args. */ +#endif +} + +inline void gemm_postcondition(cl::sycl::queue &queue, transpose transa, transpose transb, + std::int64_t m, std::int64_t n, std::int64_t k, double alpha, + const double *a, std::int64_t lda, const double *b, std::int64_t ldb, + double beta, double *c, std::int64_t ldc, + const cl::sycl::vector_class &dependencies) { +#ifndef ONEMKL_DISABLE_PREDICATES + /* add postchecks to queue here for input args. */ +#endif +} + +inline void gemm_precondition(cl::sycl::queue &queue, transpose transa, transpose transb, + std::int64_t m, std::int64_t n, std::int64_t k, + std::complex alpha, const std::complex *a, + std::int64_t lda, const std::complex *b, std::int64_t ldb, + std::complex beta, std::complex *c, std::int64_t ldc, + const cl::sycl::vector_class &dependencies) { +#ifndef ONEMKL_DISABLE_PREDICATES + /* add prechecks to queue here for input args. */ +#endif +} + +inline void gemm_postcondition(cl::sycl::queue &queue, transpose transa, transpose transb, + std::int64_t m, std::int64_t n, std::int64_t k, + std::complex alpha, const std::complex *a, + std::int64_t lda, const std::complex *b, std::int64_t ldb, + std::complex beta, std::complex *c, std::int64_t ldc, + const cl::sycl::vector_class &dependencies) { +#ifndef ONEMKL_DISABLE_PREDICATES + /* add postchecks to queue here for input args. 
*/ +#endif +} + +inline void gemm_precondition(cl::sycl::queue &queue, transpose transa, transpose transb, + std::int64_t m, std::int64_t n, std::int64_t k, + std::complex alpha, const std::complex *a, + std::int64_t lda, const std::complex *b, std::int64_t ldb, + std::complex beta, std::complex *c, std::int64_t ldc, + const cl::sycl::vector_class &dependencies) { +#ifndef ONEMKL_DISABLE_PREDICATES + /* add prechecks to queue here for input args. */ +#endif +} + +inline void gemm_postcondition(cl::sycl::queue &queue, transpose transa, transpose transb, + std::int64_t m, std::int64_t n, std::int64_t k, + std::complex alpha, const std::complex *a, + std::int64_t lda, const std::complex *b, std::int64_t ldb, + std::complex beta, std::complex *c, std::int64_t ldc, + const cl::sycl::vector_class &dependencies) { +#ifndef ONEMKL_DISABLE_PREDICATES + /* add postchecks to queue here for input args. */ +#endif +} + +inline void gemm_precondition(cl::sycl::queue &queue, transpose transa, transpose transb, + std::int64_t m, std::int64_t n, std::int64_t k, half alpha, + const half *a, std::int64_t lda, const half *b, std::int64_t ldb, + half beta, half *c, std::int64_t ldc, + const cl::sycl::vector_class &dependencies) { +#ifndef ONEMKL_DISABLE_PREDICATES + /* add prechecks to queue here for input args. */ +#endif +} + +inline void gemm_postcondition(cl::sycl::queue &queue, transpose transa, transpose transb, + std::int64_t m, std::int64_t n, std::int64_t k, half alpha, + const half *a, std::int64_t lda, const half *b, std::int64_t ldb, + half beta, half *c, std::int64_t ldc, + const cl::sycl::vector_class &dependencies) { +#ifndef ONEMKL_DISABLE_PREDICATES + /* add postchecks to queue here for input args. 
*/ +#endif +} + +inline void syr2_precondition(cl::sycl::queue &queue, uplo upper_lower, std::int64_t n, float alpha, + const float *x, std::int64_t incx, const float *y, std::int64_t incy, + float *a, std::int64_t lda, + const cl::sycl::vector_class &dependencies) { +#ifndef ONEMKL_DISABLE_PREDICATES + /* add prechecks to queue here for input args. */ +#endif +} + +inline void syr2_postcondition(cl::sycl::queue &queue, uplo upper_lower, std::int64_t n, + float alpha, const float *x, std::int64_t incx, const float *y, + std::int64_t incy, float *a, std::int64_t lda, + const cl::sycl::vector_class &dependencies) { +#ifndef ONEMKL_DISABLE_PREDICATES + /* add postchecks to queue here for input args. */ +#endif +} + +inline void syr2_precondition(cl::sycl::queue &queue, uplo upper_lower, std::int64_t n, + double alpha, const double *x, std::int64_t incx, const double *y, + std::int64_t incy, double *a, std::int64_t lda, + const cl::sycl::vector_class &dependencies) { +#ifndef ONEMKL_DISABLE_PREDICATES + /* add prechecks to queue here for input args. */ +#endif +} + +inline void syr2_postcondition(cl::sycl::queue &queue, uplo upper_lower, std::int64_t n, + double alpha, const double *x, std::int64_t incx, const double *y, + std::int64_t incy, double *a, std::int64_t lda, + const cl::sycl::vector_class &dependencies) { +#ifndef ONEMKL_DISABLE_PREDICATES + /* add postchecks to queue here for input args. */ +#endif +} + +inline void ger_precondition(cl::sycl::queue &queue, std::int64_t m, std::int64_t n, float alpha, + const float *x, std::int64_t incx, const float *y, std::int64_t incy, + float *a, std::int64_t lda, + const cl::sycl::vector_class &dependencies) { +#ifndef ONEMKL_DISABLE_PREDICATES + /* add prechecks to queue here for input args. 
*/ +#endif +} + +inline void ger_postcondition(cl::sycl::queue &queue, std::int64_t m, std::int64_t n, float alpha, + const float *x, std::int64_t incx, const float *y, std::int64_t incy, + float *a, std::int64_t lda, + const cl::sycl::vector_class &dependencies) { +#ifndef ONEMKL_DISABLE_PREDICATES + /* add postchecks to queue here for input args. */ +#endif +} + +inline void ger_precondition(cl::sycl::queue &queue, std::int64_t m, std::int64_t n, double alpha, + const double *x, std::int64_t incx, const double *y, std::int64_t incy, + double *a, std::int64_t lda, + const cl::sycl::vector_class &dependencies) { +#ifndef ONEMKL_DISABLE_PREDICATES + /* add prechecks to queue here for input args. */ +#endif +} + +inline void ger_postcondition(cl::sycl::queue &queue, std::int64_t m, std::int64_t n, double alpha, + const double *x, std::int64_t incx, const double *y, + std::int64_t incy, double *a, std::int64_t lda, + const cl::sycl::vector_class &dependencies) { +#ifndef ONEMKL_DISABLE_PREDICATES + /* add postchecks to queue here for input args. */ +#endif +} + +inline void trsm_precondition(cl::sycl::queue &queue, side left_right, uplo upper_lower, + transpose trans, diag unit_diag, std::int64_t m, std::int64_t n, + float alpha, const float *a, std::int64_t lda, float *b, + std::int64_t ldb, + const cl::sycl::vector_class &dependencies) { +#ifndef ONEMKL_DISABLE_PREDICATES + /* add prechecks to queue here for input args. */ +#endif +} + +inline void trsm_postcondition(cl::sycl::queue &queue, side left_right, uplo upper_lower, + transpose trans, diag unit_diag, std::int64_t m, std::int64_t n, + float alpha, const float *a, std::int64_t lda, float *b, + std::int64_t ldb, + const cl::sycl::vector_class &dependencies) { +#ifndef ONEMKL_DISABLE_PREDICATES + /* add postchecks to queue here for input args. 
*/ +#endif +} + +inline void trsm_precondition(cl::sycl::queue &queue, side left_right, uplo upper_lower, + transpose trans, diag unit_diag, std::int64_t m, std::int64_t n, + double alpha, const double *a, std::int64_t lda, double *b, + std::int64_t ldb, + const cl::sycl::vector_class &dependencies) { +#ifndef ONEMKL_DISABLE_PREDICATES + /* add prechecks to queue here for input args. */ +#endif +} + +inline void trsm_postcondition(cl::sycl::queue &queue, side left_right, uplo upper_lower, + transpose trans, diag unit_diag, std::int64_t m, std::int64_t n, + double alpha, const double *a, std::int64_t lda, double *b, + std::int64_t ldb, + const cl::sycl::vector_class &dependencies) { +#ifndef ONEMKL_DISABLE_PREDICATES + /* add postchecks to queue here for input args. */ +#endif +} + +inline void trsm_precondition(cl::sycl::queue &queue, side left_right, uplo upper_lower, + transpose trans, diag unit_diag, std::int64_t m, std::int64_t n, + std::complex alpha, const std::complex *a, + std::int64_t lda, std::complex *b, std::int64_t ldb, + const cl::sycl::vector_class &dependencies) { +#ifndef ONEMKL_DISABLE_PREDICATES + /* add prechecks to queue here for input args. */ +#endif +} + +inline void trsm_postcondition(cl::sycl::queue &queue, side left_right, uplo upper_lower, + transpose trans, diag unit_diag, std::int64_t m, std::int64_t n, + std::complex alpha, const std::complex *a, + std::int64_t lda, std::complex *b, std::int64_t ldb, + const cl::sycl::vector_class &dependencies) { +#ifndef ONEMKL_DISABLE_PREDICATES + /* add postchecks to queue here for input args. */ +#endif +} + +inline void trsm_precondition(cl::sycl::queue &queue, side left_right, uplo upper_lower, + transpose trans, diag unit_diag, std::int64_t m, std::int64_t n, + std::complex alpha, const std::complex *a, + std::int64_t lda, std::complex *b, std::int64_t ldb, + const cl::sycl::vector_class &dependencies) { +#ifndef ONEMKL_DISABLE_PREDICATES + /* add prechecks to queue here for input args. 
*/ +#endif +} + +inline void trsm_postcondition(cl::sycl::queue &queue, side left_right, uplo upper_lower, + transpose trans, diag unit_diag, std::int64_t m, std::int64_t n, + std::complex alpha, const std::complex *a, + std::int64_t lda, std::complex *b, std::int64_t ldb, + const cl::sycl::vector_class &dependencies) { +#ifndef ONEMKL_DISABLE_PREDICATES + /* add postchecks to queue here for input args. */ +#endif +} + +inline void dotu_precondition(cl::sycl::queue &queue, std::int64_t n, const std::complex *x, + std::int64_t incx, const std::complex *y, std::int64_t incy, + std::complex *result, + const cl::sycl::vector_class &dependencies) { +#ifndef ONEMKL_DISABLE_PREDICATES + /* add prechecks to queue here for input args. */ +#endif +} + +inline void dotu_postcondition(cl::sycl::queue &queue, std::int64_t n, const std::complex *x, + std::int64_t incx, const std::complex *y, std::int64_t incy, + std::complex *result, + const cl::sycl::vector_class &dependencies) { +#ifndef ONEMKL_DISABLE_PREDICATES + /* add postchecks to queue here for input args. */ +#endif +} + +inline void dotu_precondition(cl::sycl::queue &queue, std::int64_t n, const std::complex *x, + std::int64_t incx, const std::complex *y, std::int64_t incy, + std::complex *result, + const cl::sycl::vector_class &dependencies) { +#ifndef ONEMKL_DISABLE_PREDICATES + /* add prechecks to queue here for input args. */ +#endif +} + +inline void dotu_postcondition(cl::sycl::queue &queue, std::int64_t n, + const std::complex *x, std::int64_t incx, + const std::complex *y, std::int64_t incy, + std::complex *result, + const cl::sycl::vector_class &dependencies) { +#ifndef ONEMKL_DISABLE_PREDICATES + /* add postchecks to queue here for input args. 
*/ +#endif +} + +inline void hemm_precondition(cl::sycl::queue &queue, side left_right, uplo upper_lower, + std::int64_t m, std::int64_t n, std::complex alpha, + const std::complex *a, std::int64_t lda, + const std::complex *b, std::int64_t ldb, + std::complex beta, std::complex *c, std::int64_t ldc, + const cl::sycl::vector_class &dependencies) { +#ifndef ONEMKL_DISABLE_PREDICATES + /* add prechecks to queue here for input args. */ +#endif +} + +inline void hemm_postcondition(cl::sycl::queue &queue, side left_right, uplo upper_lower, + std::int64_t m, std::int64_t n, std::complex alpha, + const std::complex *a, std::int64_t lda, + const std::complex *b, std::int64_t ldb, + std::complex beta, std::complex *c, std::int64_t ldc, + const cl::sycl::vector_class &dependencies) { +#ifndef ONEMKL_DISABLE_PREDICATES + /* add postchecks to queue here for input args. */ +#endif +} + +inline void hemm_precondition(cl::sycl::queue &queue, side left_right, uplo upper_lower, + std::int64_t m, std::int64_t n, std::complex alpha, + const std::complex *a, std::int64_t lda, + const std::complex *b, std::int64_t ldb, + std::complex beta, std::complex *c, std::int64_t ldc, + const cl::sycl::vector_class &dependencies) { +#ifndef ONEMKL_DISABLE_PREDICATES + /* add prechecks to queue here for input args. */ +#endif +} + +inline void hemm_postcondition(cl::sycl::queue &queue, side left_right, uplo upper_lower, + std::int64_t m, std::int64_t n, std::complex alpha, + const std::complex *a, std::int64_t lda, + const std::complex *b, std::int64_t ldb, + std::complex beta, std::complex *c, std::int64_t ldc, + const cl::sycl::vector_class &dependencies) { +#ifndef ONEMKL_DISABLE_PREDICATES + /* add postchecks to queue here for input args. 
*/ +#endif +} + +inline void hpr2_precondition(cl::sycl::queue &queue, uplo upper_lower, std::int64_t n, + std::complex alpha, const std::complex *x, + std::int64_t incx, const std::complex *y, std::int64_t incy, + std::complex *a, + const cl::sycl::vector_class &dependencies) { +#ifndef ONEMKL_DISABLE_PREDICATES + /* add prechecks to queue here for input args. */ +#endif +} + +inline void hpr2_postcondition(cl::sycl::queue &queue, uplo upper_lower, std::int64_t n, + std::complex alpha, const std::complex *x, + std::int64_t incx, const std::complex *y, std::int64_t incy, + std::complex *a, + const cl::sycl::vector_class &dependencies) { +#ifndef ONEMKL_DISABLE_PREDICATES + /* add postchecks to queue here for input args. */ +#endif +} + +inline void hpr2_precondition(cl::sycl::queue &queue, uplo upper_lower, std::int64_t n, + std::complex alpha, const std::complex *x, + std::int64_t incx, const std::complex *y, std::int64_t incy, + std::complex *a, + const cl::sycl::vector_class &dependencies) { +#ifndef ONEMKL_DISABLE_PREDICATES + /* add prechecks to queue here for input args. */ +#endif +} + +inline void hpr2_postcondition(cl::sycl::queue &queue, uplo upper_lower, std::int64_t n, + std::complex alpha, const std::complex *x, + std::int64_t incx, const std::complex *y, std::int64_t incy, + std::complex *a, + const cl::sycl::vector_class &dependencies) { +#ifndef ONEMKL_DISABLE_PREDICATES + /* add postchecks to queue here for input args. */ +#endif +} + +inline void gbmv_precondition(cl::sycl::queue &queue, transpose trans, std::int64_t m, + std::int64_t n, std::int64_t kl, std::int64_t ku, float alpha, + const float *a, std::int64_t lda, const float *x, std::int64_t incx, + float beta, float *y, std::int64_t incy, + const cl::sycl::vector_class &dependencies) { +#ifndef ONEMKL_DISABLE_PREDICATES + /* add prechecks to queue here for input args. 
*/ +#endif +} + +inline void gbmv_postcondition(cl::sycl::queue &queue, transpose trans, std::int64_t m, + std::int64_t n, std::int64_t kl, std::int64_t ku, float alpha, + const float *a, std::int64_t lda, const float *x, std::int64_t incx, + float beta, float *y, std::int64_t incy, + const cl::sycl::vector_class &dependencies) { +#ifndef ONEMKL_DISABLE_PREDICATES + /* add postchecks to queue here for input args. */ +#endif +} + +inline void gbmv_precondition(cl::sycl::queue &queue, transpose trans, std::int64_t m, + std::int64_t n, std::int64_t kl, std::int64_t ku, double alpha, + const double *a, std::int64_t lda, const double *x, std::int64_t incx, + double beta, double *y, std::int64_t incy, + const cl::sycl::vector_class &dependencies) { +#ifndef ONEMKL_DISABLE_PREDICATES + /* add prechecks to queue here for input args. */ +#endif +} + +inline void gbmv_postcondition(cl::sycl::queue &queue, transpose trans, std::int64_t m, + std::int64_t n, std::int64_t kl, std::int64_t ku, double alpha, + const double *a, std::int64_t lda, const double *x, + std::int64_t incx, double beta, double *y, std::int64_t incy, + const cl::sycl::vector_class &dependencies) { +#ifndef ONEMKL_DISABLE_PREDICATES + /* add postchecks to queue here for input args. */ +#endif +} + +inline void gbmv_precondition(cl::sycl::queue &queue, transpose trans, std::int64_t m, + std::int64_t n, std::int64_t kl, std::int64_t ku, + std::complex alpha, const std::complex *a, + std::int64_t lda, const std::complex *x, std::int64_t incx, + std::complex beta, std::complex *y, std::int64_t incy, + const cl::sycl::vector_class &dependencies) { +#ifndef ONEMKL_DISABLE_PREDICATES + /* add prechecks to queue here for input args. 
*/ +#endif +} + +inline void gbmv_postcondition(cl::sycl::queue &queue, transpose trans, std::int64_t m, + std::int64_t n, std::int64_t kl, std::int64_t ku, + std::complex alpha, const std::complex *a, + std::int64_t lda, const std::complex *x, std::int64_t incx, + std::complex beta, std::complex *y, std::int64_t incy, + const cl::sycl::vector_class &dependencies) { +#ifndef ONEMKL_DISABLE_PREDICATES + /* add postchecks to queue here for input args. */ +#endif +} + +inline void gbmv_precondition(cl::sycl::queue &queue, transpose trans, std::int64_t m, + std::int64_t n, std::int64_t kl, std::int64_t ku, + std::complex alpha, const std::complex *a, + std::int64_t lda, const std::complex *x, std::int64_t incx, + std::complex beta, std::complex *y, std::int64_t incy, + const cl::sycl::vector_class &dependencies) { +#ifndef ONEMKL_DISABLE_PREDICATES + /* add prechecks to queue here for input args. */ +#endif +} + +inline void gbmv_postcondition(cl::sycl::queue &queue, transpose trans, std::int64_t m, + std::int64_t n, std::int64_t kl, std::int64_t ku, + std::complex alpha, const std::complex *a, + std::int64_t lda, const std::complex *x, std::int64_t incx, + std::complex beta, std::complex *y, + std::int64_t incy, + const cl::sycl::vector_class &dependencies) { +#ifndef ONEMKL_DISABLE_PREDICATES + /* add postchecks to queue here for input args. */ +#endif +} + +inline void tbmv_precondition(cl::sycl::queue &queue, uplo upper_lower, transpose trans, + diag unit_diag, std::int64_t n, std::int64_t k, const float *a, + std::int64_t lda, float *x, std::int64_t incx, + const cl::sycl::vector_class &dependencies) { +#ifndef ONEMKL_DISABLE_PREDICATES + /* add prechecks to queue here for input args. 
*/ +#endif +} + +inline void tbmv_postcondition(cl::sycl::queue &queue, uplo upper_lower, transpose trans, + diag unit_diag, std::int64_t n, std::int64_t k, const float *a, + std::int64_t lda, float *x, std::int64_t incx, + const cl::sycl::vector_class &dependencies) { +#ifndef ONEMKL_DISABLE_PREDICATES + /* add postchecks to queue here for input args. */ +#endif +} + +inline void tbmv_precondition(cl::sycl::queue &queue, uplo upper_lower, transpose trans, + diag unit_diag, std::int64_t n, std::int64_t k, const double *a, + std::int64_t lda, double *x, std::int64_t incx, + const cl::sycl::vector_class &dependencies) { +#ifndef ONEMKL_DISABLE_PREDICATES + /* add prechecks to queue here for input args. */ +#endif +} + +inline void tbmv_postcondition(cl::sycl::queue &queue, uplo upper_lower, transpose trans, + diag unit_diag, std::int64_t n, std::int64_t k, const double *a, + std::int64_t lda, double *x, std::int64_t incx, + const cl::sycl::vector_class &dependencies) { +#ifndef ONEMKL_DISABLE_PREDICATES + /* add postchecks to queue here for input args. */ +#endif +} + +inline void tbmv_precondition(cl::sycl::queue &queue, uplo upper_lower, transpose trans, + diag unit_diag, std::int64_t n, std::int64_t k, + const std::complex *a, std::int64_t lda, + std::complex *x, std::int64_t incx, + const cl::sycl::vector_class &dependencies) { +#ifndef ONEMKL_DISABLE_PREDICATES + /* add prechecks to queue here for input args. */ +#endif +} + +inline void tbmv_postcondition(cl::sycl::queue &queue, uplo upper_lower, transpose trans, + diag unit_diag, std::int64_t n, std::int64_t k, + const std::complex *a, std::int64_t lda, + std::complex *x, std::int64_t incx, + const cl::sycl::vector_class &dependencies) { +#ifndef ONEMKL_DISABLE_PREDICATES + /* add postchecks to queue here for input args. 
*/ +#endif +} + +inline void tbmv_precondition(cl::sycl::queue &queue, uplo upper_lower, transpose trans, + diag unit_diag, std::int64_t n, std::int64_t k, + const std::complex *a, std::int64_t lda, + std::complex *x, std::int64_t incx, + const cl::sycl::vector_class &dependencies) { +#ifndef ONEMKL_DISABLE_PREDICATES + /* add prechecks to queue here for input args. */ +#endif +} + +inline void tbmv_postcondition(cl::sycl::queue &queue, uplo upper_lower, transpose trans, + diag unit_diag, std::int64_t n, std::int64_t k, + const std::complex *a, std::int64_t lda, + std::complex *x, std::int64_t incx, + const cl::sycl::vector_class &dependencies) { +#ifndef ONEMKL_DISABLE_PREDICATES + /* add postchecks to queue here for input args. */ +#endif +} + +inline void symm_precondition(cl::sycl::queue &queue, side left_right, uplo upper_lower, + std::int64_t m, std::int64_t n, float alpha, const float *a, + std::int64_t lda, const float *b, std::int64_t ldb, float beta, + float *c, std::int64_t ldc, + const cl::sycl::vector_class &dependencies) { +#ifndef ONEMKL_DISABLE_PREDICATES + /* add prechecks to queue here for input args. */ +#endif +} + +inline void symm_postcondition(cl::sycl::queue &queue, side left_right, uplo upper_lower, + std::int64_t m, std::int64_t n, float alpha, const float *a, + std::int64_t lda, const float *b, std::int64_t ldb, float beta, + float *c, std::int64_t ldc, + const cl::sycl::vector_class &dependencies) { +#ifndef ONEMKL_DISABLE_PREDICATES + /* add postchecks to queue here for input args. */ +#endif +} + +inline void symm_precondition(cl::sycl::queue &queue, side left_right, uplo upper_lower, + std::int64_t m, std::int64_t n, double alpha, const double *a, + std::int64_t lda, const double *b, std::int64_t ldb, double beta, + double *c, std::int64_t ldc, + const cl::sycl::vector_class &dependencies) { +#ifndef ONEMKL_DISABLE_PREDICATES + /* add prechecks to queue here for input args. 
*/ +#endif +} + +inline void symm_postcondition(cl::sycl::queue &queue, side left_right, uplo upper_lower, + std::int64_t m, std::int64_t n, double alpha, const double *a, + std::int64_t lda, const double *b, std::int64_t ldb, double beta, + double *c, std::int64_t ldc, + const cl::sycl::vector_class &dependencies) { +#ifndef ONEMKL_DISABLE_PREDICATES + /* add postchecks to queue here for input args. */ +#endif +} + +inline void symm_precondition(cl::sycl::queue &queue, side left_right, uplo upper_lower, + std::int64_t m, std::int64_t n, std::complex alpha, + const std::complex *a, std::int64_t lda, + const std::complex *b, std::int64_t ldb, + std::complex beta, std::complex *c, std::int64_t ldc, + const cl::sycl::vector_class &dependencies) { +#ifndef ONEMKL_DISABLE_PREDICATES + /* add prechecks to queue here for input args. */ +#endif +} + +inline void symm_postcondition(cl::sycl::queue &queue, side left_right, uplo upper_lower, + std::int64_t m, std::int64_t n, std::complex alpha, + const std::complex *a, std::int64_t lda, + const std::complex *b, std::int64_t ldb, + std::complex beta, std::complex *c, std::int64_t ldc, + const cl::sycl::vector_class &dependencies) { +#ifndef ONEMKL_DISABLE_PREDICATES + /* add postchecks to queue here for input args. */ +#endif +} + +inline void symm_precondition(cl::sycl::queue &queue, side left_right, uplo upper_lower, + std::int64_t m, std::int64_t n, std::complex alpha, + const std::complex *a, std::int64_t lda, + const std::complex *b, std::int64_t ldb, + std::complex beta, std::complex *c, std::int64_t ldc, + const cl::sycl::vector_class &dependencies) { +#ifndef ONEMKL_DISABLE_PREDICATES + /* add prechecks to queue here for input args. 
*/ +#endif +} + +inline void symm_postcondition(cl::sycl::queue &queue, side left_right, uplo upper_lower, + std::int64_t m, std::int64_t n, std::complex alpha, + const std::complex *a, std::int64_t lda, + const std::complex *b, std::int64_t ldb, + std::complex beta, std::complex *c, std::int64_t ldc, + const cl::sycl::vector_class &dependencies) { +#ifndef ONEMKL_DISABLE_PREDICATES + /* add postchecks to queue here for input args. */ +#endif +} + +inline void dotc_precondition(cl::sycl::queue &queue, std::int64_t n, const std::complex *x, + std::int64_t incx, const std::complex *y, std::int64_t incy, + std::complex *result, + const cl::sycl::vector_class &dependencies) { +#ifndef ONEMKL_DISABLE_PREDICATES + /* add prechecks to queue here for input args. */ +#endif +} + +inline void dotc_postcondition(cl::sycl::queue &queue, std::int64_t n, const std::complex *x, + std::int64_t incx, const std::complex *y, std::int64_t incy, + std::complex *result, + const cl::sycl::vector_class &dependencies) { +#ifndef ONEMKL_DISABLE_PREDICATES + /* add postchecks to queue here for input args. */ +#endif +} + +inline void dotc_precondition(cl::sycl::queue &queue, std::int64_t n, const std::complex *x, + std::int64_t incx, const std::complex *y, std::int64_t incy, + std::complex *result, + const cl::sycl::vector_class &dependencies) { +#ifndef ONEMKL_DISABLE_PREDICATES + /* add prechecks to queue here for input args. */ +#endif +} + +inline void dotc_postcondition(cl::sycl::queue &queue, std::int64_t n, + const std::complex *x, std::int64_t incx, + const std::complex *y, std::int64_t incy, + std::complex *result, + const cl::sycl::vector_class &dependencies) { +#ifndef ONEMKL_DISABLE_PREDICATES + /* add postchecks to queue here for input args. 
*/ +#endif +} + +inline void syr_precondition(cl::sycl::queue &queue, uplo upper_lower, std::int64_t n, float alpha, + const float *x, std::int64_t incx, float *a, std::int64_t lda, + const cl::sycl::vector_class &dependencies) { +#ifndef ONEMKL_DISABLE_PREDICATES + /* add prechecks to queue here for input args. */ +#endif +} + +inline void syr_postcondition(cl::sycl::queue &queue, uplo upper_lower, std::int64_t n, float alpha, + const float *x, std::int64_t incx, float *a, std::int64_t lda, + const cl::sycl::vector_class &dependencies) { +#ifndef ONEMKL_DISABLE_PREDICATES + /* add postchecks to queue here for input args. */ +#endif +} + +inline void syr_precondition(cl::sycl::queue &queue, uplo upper_lower, std::int64_t n, double alpha, + const double *x, std::int64_t incx, double *a, std::int64_t lda, + const cl::sycl::vector_class &dependencies) { +#ifndef ONEMKL_DISABLE_PREDICATES + /* add prechecks to queue here for input args. */ +#endif +} + +inline void syr_postcondition(cl::sycl::queue &queue, uplo upper_lower, std::int64_t n, + double alpha, const double *x, std::int64_t incx, double *a, + std::int64_t lda, + const cl::sycl::vector_class &dependencies) { +#ifndef ONEMKL_DISABLE_PREDICATES + /* add postchecks to queue here for input args. */ +#endif +} + +inline void trmm_precondition(cl::sycl::queue &queue, side left_right, uplo upper_lower, + transpose trans, diag unit_diag, std::int64_t m, std::int64_t n, + float alpha, const float *a, std::int64_t lda, float *b, + std::int64_t ldb, + const cl::sycl::vector_class &dependencies) { +#ifndef ONEMKL_DISABLE_PREDICATES + /* add prechecks to queue here for input args. 
*/ +#endif +} + +inline void trmm_postcondition(cl::sycl::queue &queue, side left_right, uplo upper_lower, + transpose trans, diag unit_diag, std::int64_t m, std::int64_t n, + float alpha, const float *a, std::int64_t lda, float *b, + std::int64_t ldb, + const cl::sycl::vector_class &dependencies) { +#ifndef ONEMKL_DISABLE_PREDICATES + /* add postchecks to queue here for input args. */ +#endif +} + +inline void trmm_precondition(cl::sycl::queue &queue, side left_right, uplo upper_lower, + transpose trans, diag unit_diag, std::int64_t m, std::int64_t n, + double alpha, const double *a, std::int64_t lda, double *b, + std::int64_t ldb, + const cl::sycl::vector_class &dependencies) { +#ifndef ONEMKL_DISABLE_PREDICATES + /* add prechecks to queue here for input args. */ +#endif +} + +inline void trmm_postcondition(cl::sycl::queue &queue, side left_right, uplo upper_lower, + transpose trans, diag unit_diag, std::int64_t m, std::int64_t n, + double alpha, const double *a, std::int64_t lda, double *b, + std::int64_t ldb, + const cl::sycl::vector_class &dependencies) { +#ifndef ONEMKL_DISABLE_PREDICATES + /* add postchecks to queue here for input args. */ +#endif +} + +inline void trmm_precondition(cl::sycl::queue &queue, side left_right, uplo upper_lower, + transpose trans, diag unit_diag, std::int64_t m, std::int64_t n, + std::complex alpha, const std::complex *a, + std::int64_t lda, std::complex *b, std::int64_t ldb, + const cl::sycl::vector_class &dependencies) { +#ifndef ONEMKL_DISABLE_PREDICATES + /* add prechecks to queue here for input args. */ +#endif +} + +inline void trmm_postcondition(cl::sycl::queue &queue, side left_right, uplo upper_lower, + transpose trans, diag unit_diag, std::int64_t m, std::int64_t n, + std::complex alpha, const std::complex *a, + std::int64_t lda, std::complex *b, std::int64_t ldb, + const cl::sycl::vector_class &dependencies) { +#ifndef ONEMKL_DISABLE_PREDICATES + /* add postchecks to queue here for input args. 
*/ +#endif +} + +inline void trmm_precondition(cl::sycl::queue &queue, side left_right, uplo upper_lower, + transpose trans, diag unit_diag, std::int64_t m, std::int64_t n, + std::complex alpha, const std::complex *a, + std::int64_t lda, std::complex *b, std::int64_t ldb, + const cl::sycl::vector_class &dependencies) { +#ifndef ONEMKL_DISABLE_PREDICATES + /* add prechecks to queue here for input args. */ +#endif +} + +inline void trmm_postcondition(cl::sycl::queue &queue, side left_right, uplo upper_lower, + transpose trans, diag unit_diag, std::int64_t m, std::int64_t n, + std::complex alpha, const std::complex *a, + std::int64_t lda, std::complex *b, std::int64_t ldb, + const cl::sycl::vector_class &dependencies) { +#ifndef ONEMKL_DISABLE_PREDICATES + /* add postchecks to queue here for input args. */ +#endif +} + +inline void symv_precondition(cl::sycl::queue &queue, uplo upper_lower, std::int64_t n, float alpha, + const float *a, std::int64_t lda, const float *x, std::int64_t incx, + float beta, float *y, std::int64_t incy, + const cl::sycl::vector_class &dependencies) { +#ifndef ONEMKL_DISABLE_PREDICATES + /* add prechecks to queue here for input args. */ +#endif +} + +inline void symv_postcondition(cl::sycl::queue &queue, uplo upper_lower, std::int64_t n, + float alpha, const float *a, std::int64_t lda, const float *x, + std::int64_t incx, float beta, float *y, std::int64_t incy, + const cl::sycl::vector_class &dependencies) { +#ifndef ONEMKL_DISABLE_PREDICATES + /* add postchecks to queue here for input args. */ +#endif +} + +inline void symv_precondition(cl::sycl::queue &queue, uplo upper_lower, std::int64_t n, + double alpha, const double *a, std::int64_t lda, const double *x, + std::int64_t incx, double beta, double *y, std::int64_t incy, + const cl::sycl::vector_class &dependencies) { +#ifndef ONEMKL_DISABLE_PREDICATES + /* add prechecks to queue here for input args. 
*/ +#endif +} + +inline void symv_postcondition(cl::sycl::queue &queue, uplo upper_lower, std::int64_t n, + double alpha, const double *a, std::int64_t lda, const double *x, + std::int64_t incx, double beta, double *y, std::int64_t incy, + const cl::sycl::vector_class &dependencies) { +#ifndef ONEMKL_DISABLE_PREDICATES + /* add postchecks to queue here for input args. */ +#endif +} + +inline void tpsv_precondition(cl::sycl::queue &queue, uplo upper_lower, transpose trans, + diag unit_diag, std::int64_t n, const float *a, float *x, + std::int64_t incx, + const cl::sycl::vector_class &dependencies) { +#ifndef ONEMKL_DISABLE_PREDICATES + /* add prechecks to queue here for input args. */ +#endif +} + +inline void tpsv_postcondition(cl::sycl::queue &queue, uplo upper_lower, transpose trans, + diag unit_diag, std::int64_t n, const float *a, float *x, + std::int64_t incx, + const cl::sycl::vector_class &dependencies) { +#ifndef ONEMKL_DISABLE_PREDICATES + /* add postchecks to queue here for input args. */ +#endif +} + +inline void tpsv_precondition(cl::sycl::queue &queue, uplo upper_lower, transpose trans, + diag unit_diag, std::int64_t n, const double *a, double *x, + std::int64_t incx, + const cl::sycl::vector_class &dependencies) { +#ifndef ONEMKL_DISABLE_PREDICATES + /* add prechecks to queue here for input args. */ +#endif +} + +inline void tpsv_postcondition(cl::sycl::queue &queue, uplo upper_lower, transpose trans, + diag unit_diag, std::int64_t n, const double *a, double *x, + std::int64_t incx, + const cl::sycl::vector_class &dependencies) { +#ifndef ONEMKL_DISABLE_PREDICATES + /* add postchecks to queue here for input args. */ +#endif +} + +inline void tpsv_precondition(cl::sycl::queue &queue, uplo upper_lower, transpose trans, + diag unit_diag, std::int64_t n, const std::complex *a, + std::complex *x, std::int64_t incx, + const cl::sycl::vector_class &dependencies) { +#ifndef ONEMKL_DISABLE_PREDICATES + /* add prechecks to queue here for input args. 
*/ +#endif +} + +inline void tpsv_postcondition(cl::sycl::queue &queue, uplo upper_lower, transpose trans, + diag unit_diag, std::int64_t n, const std::complex *a, + std::complex *x, std::int64_t incx, + const cl::sycl::vector_class &dependencies) { +#ifndef ONEMKL_DISABLE_PREDICATES + /* add postchecks to queue here for input args. */ +#endif +} + +inline void tpsv_precondition(cl::sycl::queue &queue, uplo upper_lower, transpose trans, + diag unit_diag, std::int64_t n, const std::complex *a, + std::complex *x, std::int64_t incx, + const cl::sycl::vector_class &dependencies) { +#ifndef ONEMKL_DISABLE_PREDICATES + /* add prechecks to queue here for input args. */ +#endif +} + +inline void tpsv_postcondition(cl::sycl::queue &queue, uplo upper_lower, transpose trans, + diag unit_diag, std::int64_t n, const std::complex *a, + std::complex *x, std::int64_t incx, + const cl::sycl::vector_class &dependencies) { +#ifndef ONEMKL_DISABLE_PREDICATES + /* add postchecks to queue here for input args. */ +#endif +} + +inline void trsv_precondition(cl::sycl::queue &queue, uplo upper_lower, transpose trans, + diag unit_diag, std::int64_t n, const float *a, std::int64_t lda, + float *x, std::int64_t incx, + const cl::sycl::vector_class &dependencies) { +#ifndef ONEMKL_DISABLE_PREDICATES + /* add prechecks to queue here for input args. */ +#endif +} + +inline void trsv_postcondition(cl::sycl::queue &queue, uplo upper_lower, transpose trans, + diag unit_diag, std::int64_t n, const float *a, std::int64_t lda, + float *x, std::int64_t incx, + const cl::sycl::vector_class &dependencies) { +#ifndef ONEMKL_DISABLE_PREDICATES + /* add postchecks to queue here for input args. 
*/ +#endif +} + +inline void trsv_precondition(cl::sycl::queue &queue, uplo upper_lower, transpose trans, + diag unit_diag, std::int64_t n, const double *a, std::int64_t lda, + double *x, std::int64_t incx, + const cl::sycl::vector_class &dependencies) { +#ifndef ONEMKL_DISABLE_PREDICATES + /* add prechecks to queue here for input args. */ +#endif +} + +inline void trsv_postcondition(cl::sycl::queue &queue, uplo upper_lower, transpose trans, + diag unit_diag, std::int64_t n, const double *a, std::int64_t lda, + double *x, std::int64_t incx, + const cl::sycl::vector_class &dependencies) { +#ifndef ONEMKL_DISABLE_PREDICATES + /* add postchecks to queue here for input args. */ +#endif +} + +inline void trsv_precondition(cl::sycl::queue &queue, uplo upper_lower, transpose trans, + diag unit_diag, std::int64_t n, const std::complex *a, + std::int64_t lda, std::complex *x, std::int64_t incx, + const cl::sycl::vector_class &dependencies) { +#ifndef ONEMKL_DISABLE_PREDICATES + /* add prechecks to queue here for input args. */ +#endif +} + +inline void trsv_postcondition(cl::sycl::queue &queue, uplo upper_lower, transpose trans, + diag unit_diag, std::int64_t n, const std::complex *a, + std::int64_t lda, std::complex *x, std::int64_t incx, + const cl::sycl::vector_class &dependencies) { +#ifndef ONEMKL_DISABLE_PREDICATES + /* add postchecks to queue here for input args. */ +#endif +} + +inline void trsv_precondition(cl::sycl::queue &queue, uplo upper_lower, transpose trans, + diag unit_diag, std::int64_t n, const std::complex *a, + std::int64_t lda, std::complex *x, std::int64_t incx, + const cl::sycl::vector_class &dependencies) { +#ifndef ONEMKL_DISABLE_PREDICATES + /* add prechecks to queue here for input args. 
*/ +#endif +} + +inline void trsv_postcondition(cl::sycl::queue &queue, uplo upper_lower, transpose trans, + diag unit_diag, std::int64_t n, const std::complex *a, + std::int64_t lda, std::complex *x, std::int64_t incx, + const cl::sycl::vector_class &dependencies) { +#ifndef ONEMKL_DISABLE_PREDICATES + /* add postchecks to queue here for input args. */ +#endif +} + +inline void copy_precondition(cl::sycl::queue &queue, std::int64_t n, const float *x, + std::int64_t incx, float *y, std::int64_t incy, + const cl::sycl::vector_class &dependencies) { +#ifndef ONEMKL_DISABLE_PREDICATES + /* add prechecks to queue here for input args. */ +#endif +} + +inline void copy_postcondition(cl::sycl::queue &queue, std::int64_t n, const float *x, + std::int64_t incx, float *y, std::int64_t incy, + const cl::sycl::vector_class &dependencies) { +#ifndef ONEMKL_DISABLE_PREDICATES + /* add postchecks to queue here for input args. */ +#endif +} + +inline void copy_precondition(cl::sycl::queue &queue, std::int64_t n, const double *x, + std::int64_t incx, double *y, std::int64_t incy, + const cl::sycl::vector_class &dependencies) { +#ifndef ONEMKL_DISABLE_PREDICATES + /* add prechecks to queue here for input args. */ +#endif +} + +inline void copy_postcondition(cl::sycl::queue &queue, std::int64_t n, const double *x, + std::int64_t incx, double *y, std::int64_t incy, + const cl::sycl::vector_class &dependencies) { +#ifndef ONEMKL_DISABLE_PREDICATES + /* add postchecks to queue here for input args. */ +#endif +} + +inline void copy_precondition(cl::sycl::queue &queue, std::int64_t n, const std::complex *x, + std::int64_t incx, std::complex *y, std::int64_t incy, + const cl::sycl::vector_class &dependencies) { +#ifndef ONEMKL_DISABLE_PREDICATES + /* add prechecks to queue here for input args. 
*/ +#endif +} + +inline void copy_postcondition(cl::sycl::queue &queue, std::int64_t n, const std::complex *x, + std::int64_t incx, std::complex *y, std::int64_t incy, + const cl::sycl::vector_class &dependencies) { +#ifndef ONEMKL_DISABLE_PREDICATES + /* add postchecks to queue here for input args. */ +#endif +} + +inline void copy_precondition(cl::sycl::queue &queue, std::int64_t n, const std::complex *x, + std::int64_t incx, std::complex *y, std::int64_t incy, + const cl::sycl::vector_class &dependencies) { +#ifndef ONEMKL_DISABLE_PREDICATES + /* add prechecks to queue here for input args. */ +#endif +} + +inline void copy_postcondition(cl::sycl::queue &queue, std::int64_t n, + const std::complex *x, std::int64_t incx, + std::complex *y, std::int64_t incy, + const cl::sycl::vector_class &dependencies) { +#ifndef ONEMKL_DISABLE_PREDICATES + /* add postchecks to queue here for input args. */ +#endif +} + +inline void hemv_precondition(cl::sycl::queue &queue, uplo upper_lower, std::int64_t n, + std::complex alpha, const std::complex *a, + std::int64_t lda, const std::complex *x, std::int64_t incx, + std::complex beta, std::complex *y, std::int64_t incy, + const cl::sycl::vector_class &dependencies) { +#ifndef ONEMKL_DISABLE_PREDICATES + /* add prechecks to queue here for input args. */ +#endif +} + +inline void hemv_postcondition(cl::sycl::queue &queue, uplo upper_lower, std::int64_t n, + std::complex alpha, const std::complex *a, + std::int64_t lda, const std::complex *x, std::int64_t incx, + std::complex beta, std::complex *y, std::int64_t incy, + const cl::sycl::vector_class &dependencies) { +#ifndef ONEMKL_DISABLE_PREDICATES + /* add postchecks to queue here for input args. 
*/ +#endif +} + +inline void hemv_precondition(cl::sycl::queue &queue, uplo upper_lower, std::int64_t n, + std::complex alpha, const std::complex *a, + std::int64_t lda, const std::complex *x, std::int64_t incx, + std::complex beta, std::complex *y, std::int64_t incy, + const cl::sycl::vector_class &dependencies) { +#ifndef ONEMKL_DISABLE_PREDICATES + /* add prechecks to queue here for input args. */ +#endif +} + +inline void hemv_postcondition(cl::sycl::queue &queue, uplo upper_lower, std::int64_t n, + std::complex alpha, const std::complex *a, + std::int64_t lda, const std::complex *x, std::int64_t incx, + std::complex beta, std::complex *y, + std::int64_t incy, + const cl::sycl::vector_class &dependencies) { +#ifndef ONEMKL_DISABLE_PREDICATES + /* add postchecks to queue here for input args. */ +#endif +} + +inline void iamax_precondition(cl::sycl::queue &queue, std::int64_t n, const float *x, + std::int64_t incx, std::int64_t *result, + const cl::sycl::vector_class &dependencies) { +#ifndef ONEMKL_DISABLE_PREDICATES + /* add prechecks to queue here for input args. */ +#endif +} + +inline void iamax_postcondition(cl::sycl::queue &queue, std::int64_t n, const float *x, + std::int64_t incx, std::int64_t *result, + const cl::sycl::vector_class &dependencies) { +#ifndef ONEMKL_DISABLE_PREDICATES + /* add postchecks to queue here for input args. */ +#endif +} + +inline void iamax_precondition(cl::sycl::queue &queue, std::int64_t n, const double *x, + std::int64_t incx, std::int64_t *result, + const cl::sycl::vector_class &dependencies) { +#ifndef ONEMKL_DISABLE_PREDICATES + /* add prechecks to queue here for input args. */ +#endif +} + +inline void iamax_postcondition(cl::sycl::queue &queue, std::int64_t n, const double *x, + std::int64_t incx, std::int64_t *result, + const cl::sycl::vector_class &dependencies) { +#ifndef ONEMKL_DISABLE_PREDICATES + /* add postchecks to queue here for input args. 
*/ +#endif +} + +inline void iamax_precondition(cl::sycl::queue &queue, std::int64_t n, const std::complex *x, + std::int64_t incx, std::int64_t *result, + const cl::sycl::vector_class &dependencies) { +#ifndef ONEMKL_DISABLE_PREDICATES + /* add prechecks to queue here for input args. */ +#endif +} + +inline void iamax_postcondition(cl::sycl::queue &queue, std::int64_t n, + const std::complex *x, std::int64_t incx, + std::int64_t *result, + const cl::sycl::vector_class &dependencies) { +#ifndef ONEMKL_DISABLE_PREDICATES + /* add postchecks to queue here for input args. */ +#endif +} + +inline void iamax_precondition(cl::sycl::queue &queue, std::int64_t n, + const std::complex *x, std::int64_t incx, + std::int64_t *result, + const cl::sycl::vector_class &dependencies) { +#ifndef ONEMKL_DISABLE_PREDICATES + /* add prechecks to queue here for input args. */ +#endif +} + +inline void iamax_postcondition(cl::sycl::queue &queue, std::int64_t n, + const std::complex *x, std::int64_t incx, + std::int64_t *result, + const cl::sycl::vector_class &dependencies) { +#ifndef ONEMKL_DISABLE_PREDICATES + /* add postchecks to queue here for input args. */ +#endif +} + +inline void sbmv_precondition(cl::sycl::queue &queue, uplo upper_lower, std::int64_t n, + std::int64_t k, float alpha, const float *a, std::int64_t lda, + const float *x, std::int64_t incx, float beta, float *y, + std::int64_t incy, + const cl::sycl::vector_class &dependencies) { +#ifndef ONEMKL_DISABLE_PREDICATES + /* add prechecks to queue here for input args. */ +#endif +} + +inline void sbmv_postcondition(cl::sycl::queue &queue, uplo upper_lower, std::int64_t n, + std::int64_t k, float alpha, const float *a, std::int64_t lda, + const float *x, std::int64_t incx, float beta, float *y, + std::int64_t incy, + const cl::sycl::vector_class &dependencies) { +#ifndef ONEMKL_DISABLE_PREDICATES + /* add postchecks to queue here for input args. 
*/ +#endif +} + +inline void sbmv_precondition(cl::sycl::queue &queue, uplo upper_lower, std::int64_t n, + std::int64_t k, double alpha, const double *a, std::int64_t lda, + const double *x, std::int64_t incx, double beta, double *y, + std::int64_t incy, + const cl::sycl::vector_class &dependencies) { +#ifndef ONEMKL_DISABLE_PREDICATES + /* add prechecks to queue here for input args. */ +#endif +} + +inline void sbmv_postcondition(cl::sycl::queue &queue, uplo upper_lower, std::int64_t n, + std::int64_t k, double alpha, const double *a, std::int64_t lda, + const double *x, std::int64_t incx, double beta, double *y, + std::int64_t incy, + const cl::sycl::vector_class &dependencies) { +#ifndef ONEMKL_DISABLE_PREDICATES + /* add postchecks to queue here for input args. */ +#endif +} + +inline void asum_precondition(cl::sycl::queue &queue, std::int64_t n, const std::complex *x, + std::int64_t incx, float *result, + const cl::sycl::vector_class &dependencies) { +#ifndef ONEMKL_DISABLE_PREDICATES + /* add prechecks to queue here for input args. */ +#endif +} + +inline void asum_postcondition(cl::sycl::queue &queue, std::int64_t n, const std::complex *x, + std::int64_t incx, float *result, + const cl::sycl::vector_class &dependencies) { +#ifndef ONEMKL_DISABLE_PREDICATES + /* add postchecks to queue here for input args. */ +#endif +} + +inline void asum_precondition(cl::sycl::queue &queue, std::int64_t n, const std::complex *x, + std::int64_t incx, double *result, + const cl::sycl::vector_class &dependencies) { +#ifndef ONEMKL_DISABLE_PREDICATES + /* add prechecks to queue here for input args. */ +#endif +} + +inline void asum_postcondition(cl::sycl::queue &queue, std::int64_t n, + const std::complex *x, std::int64_t incx, double *result, + const cl::sycl::vector_class &dependencies) { +#ifndef ONEMKL_DISABLE_PREDICATES + /* add postchecks to queue here for input args. 
*/ +#endif +} + +inline void asum_precondition(cl::sycl::queue &queue, std::int64_t n, const float *x, + std::int64_t incx, float *result, + const cl::sycl::vector_class &dependencies) { +#ifndef ONEMKL_DISABLE_PREDICATES + /* add prechecks to queue here for input args. */ +#endif +} + +inline void asum_postcondition(cl::sycl::queue &queue, std::int64_t n, const float *x, + std::int64_t incx, float *result, + const cl::sycl::vector_class &dependencies) { +#ifndef ONEMKL_DISABLE_PREDICATES + /* add postchecks to queue here for input args. */ +#endif +} + +inline void asum_precondition(cl::sycl::queue &queue, std::int64_t n, const double *x, + std::int64_t incx, double *result, + const cl::sycl::vector_class &dependencies) { +#ifndef ONEMKL_DISABLE_PREDICATES + /* add prechecks to queue here for input args. */ +#endif +} + +inline void asum_postcondition(cl::sycl::queue &queue, std::int64_t n, const double *x, + std::int64_t incx, double *result, + const cl::sycl::vector_class &dependencies) { +#ifndef ONEMKL_DISABLE_PREDICATES + /* add postchecks to queue here for input args. */ +#endif +} + +inline void tbsv_precondition(cl::sycl::queue &queue, uplo upper_lower, transpose trans, + diag unit_diag, std::int64_t n, std::int64_t k, const float *a, + std::int64_t lda, float *x, std::int64_t incx, + const cl::sycl::vector_class &dependencies) { +#ifndef ONEMKL_DISABLE_PREDICATES + /* add prechecks to queue here for input args. */ +#endif +} + +inline void tbsv_postcondition(cl::sycl::queue &queue, uplo upper_lower, transpose trans, + diag unit_diag, std::int64_t n, std::int64_t k, const float *a, + std::int64_t lda, float *x, std::int64_t incx, + const cl::sycl::vector_class &dependencies) { +#ifndef ONEMKL_DISABLE_PREDICATES + /* add postchecks to queue here for input args. 
*/ +#endif +} + +inline void tbsv_precondition(cl::sycl::queue &queue, uplo upper_lower, transpose trans, + diag unit_diag, std::int64_t n, std::int64_t k, const double *a, + std::int64_t lda, double *x, std::int64_t incx, + const cl::sycl::vector_class &dependencies) { +#ifndef ONEMKL_DISABLE_PREDICATES + /* add prechecks to queue here for input args. */ +#endif +} + +inline void tbsv_postcondition(cl::sycl::queue &queue, uplo upper_lower, transpose trans, + diag unit_diag, std::int64_t n, std::int64_t k, const double *a, + std::int64_t lda, double *x, std::int64_t incx, + const cl::sycl::vector_class &dependencies) { +#ifndef ONEMKL_DISABLE_PREDICATES + /* add postchecks to queue here for input args. */ +#endif +} + +inline void tbsv_precondition(cl::sycl::queue &queue, uplo upper_lower, transpose trans, + diag unit_diag, std::int64_t n, std::int64_t k, + const std::complex *a, std::int64_t lda, + std::complex *x, std::int64_t incx, + const cl::sycl::vector_class &dependencies) { +#ifndef ONEMKL_DISABLE_PREDICATES + /* add prechecks to queue here for input args. */ +#endif +} + +inline void tbsv_postcondition(cl::sycl::queue &queue, uplo upper_lower, transpose trans, + diag unit_diag, std::int64_t n, std::int64_t k, + const std::complex *a, std::int64_t lda, + std::complex *x, std::int64_t incx, + const cl::sycl::vector_class &dependencies) { +#ifndef ONEMKL_DISABLE_PREDICATES + /* add postchecks to queue here for input args. */ +#endif +} + +inline void tbsv_precondition(cl::sycl::queue &queue, uplo upper_lower, transpose trans, + diag unit_diag, std::int64_t n, std::int64_t k, + const std::complex *a, std::int64_t lda, + std::complex *x, std::int64_t incx, + const cl::sycl::vector_class &dependencies) { +#ifndef ONEMKL_DISABLE_PREDICATES + /* add prechecks to queue here for input args. 
*/ +#endif +} + +inline void tbsv_postcondition(cl::sycl::queue &queue, uplo upper_lower, transpose trans, + diag unit_diag, std::int64_t n, std::int64_t k, + const std::complex *a, std::int64_t lda, + std::complex *x, std::int64_t incx, + const cl::sycl::vector_class &dependencies) { +#ifndef ONEMKL_DISABLE_PREDICATES + /* add postchecks to queue here for input args. */ +#endif +} + +inline void spr2_precondition(cl::sycl::queue &queue, uplo upper_lower, std::int64_t n, float alpha, + const float *x, std::int64_t incx, const float *y, std::int64_t incy, + float *a, + const cl::sycl::vector_class &dependencies) { +#ifndef ONEMKL_DISABLE_PREDICATES + /* add prechecks to queue here for input args. */ +#endif +} + +inline void spr2_postcondition(cl::sycl::queue &queue, uplo upper_lower, std::int64_t n, + float alpha, const float *x, std::int64_t incx, const float *y, + std::int64_t incy, float *a, + const cl::sycl::vector_class &dependencies) { +#ifndef ONEMKL_DISABLE_PREDICATES + /* add postchecks to queue here for input args. */ +#endif +} + +inline void spr2_precondition(cl::sycl::queue &queue, uplo upper_lower, std::int64_t n, + double alpha, const double *x, std::int64_t incx, const double *y, + std::int64_t incy, double *a, + const cl::sycl::vector_class &dependencies) { +#ifndef ONEMKL_DISABLE_PREDICATES + /* add prechecks to queue here for input args. */ +#endif +} + +inline void spr2_postcondition(cl::sycl::queue &queue, uplo upper_lower, std::int64_t n, + double alpha, const double *x, std::int64_t incx, const double *y, + std::int64_t incy, double *a, + const cl::sycl::vector_class &dependencies) { +#ifndef ONEMKL_DISABLE_PREDICATES + /* add postchecks to queue here for input args. 
*/ +#endif +} + +inline void rotm_precondition(cl::sycl::queue &queue, std::int64_t n, float *x, std::int64_t incx, + float *y, std::int64_t incy, float *param, + const cl::sycl::vector_class &dependencies) { +#ifndef ONEMKL_DISABLE_PREDICATES + /* add prechecks to queue here for input args. */ +#endif +} + +inline void rotm_postcondition(cl::sycl::queue &queue, std::int64_t n, float *x, std::int64_t incx, + float *y, std::int64_t incy, float *param, + const cl::sycl::vector_class &dependencies) { +#ifndef ONEMKL_DISABLE_PREDICATES + /* add postchecks to queue here for input args. */ +#endif +} + +inline void rotm_precondition(cl::sycl::queue &queue, std::int64_t n, double *x, std::int64_t incx, + double *y, std::int64_t incy, double *param, + const cl::sycl::vector_class &dependencies) { +#ifndef ONEMKL_DISABLE_PREDICATES + /* add prechecks to queue here for input args. */ +#endif +} + +inline void rotm_postcondition(cl::sycl::queue &queue, std::int64_t n, double *x, std::int64_t incx, + double *y, std::int64_t incy, double *param, + const cl::sycl::vector_class &dependencies) { +#ifndef ONEMKL_DISABLE_PREDICATES + /* add postchecks to queue here for input args. */ +#endif +} + +inline void dot_precondition(cl::sycl::queue &queue, std::int64_t n, const float *x, + std::int64_t incx, const float *y, std::int64_t incy, float *result, + const cl::sycl::vector_class &dependencies) { +#ifndef ONEMKL_DISABLE_PREDICATES + /* add prechecks to queue here for input args. */ +#endif +} + +inline void dot_postcondition(cl::sycl::queue &queue, std::int64_t n, const float *x, + std::int64_t incx, const float *y, std::int64_t incy, float *result, + const cl::sycl::vector_class &dependencies) { +#ifndef ONEMKL_DISABLE_PREDICATES + /* add postchecks to queue here for input args. 
*/ +#endif +} + +inline void dot_precondition(cl::sycl::queue &queue, std::int64_t n, const double *x, + std::int64_t incx, const double *y, std::int64_t incy, double *result, + const cl::sycl::vector_class &dependencies) { +#ifndef ONEMKL_DISABLE_PREDICATES + /* add prechecks to queue here for input args. */ +#endif +} + +inline void dot_postcondition(cl::sycl::queue &queue, std::int64_t n, const double *x, + std::int64_t incx, const double *y, std::int64_t incy, double *result, + const cl::sycl::vector_class &dependencies) { +#ifndef ONEMKL_DISABLE_PREDICATES + /* add postchecks to queue here for input args. */ +#endif +} + +inline void dot_precondition(cl::sycl::queue &queue, std::int64_t n, const float *x, + std::int64_t incx, const float *y, std::int64_t incy, double *result, + const cl::sycl::vector_class &dependencies) { +#ifndef ONEMKL_DISABLE_PREDICATES + /* add prechecks to queue here for input args. */ +#endif +} + +inline void dot_postcondition(cl::sycl::queue &queue, std::int64_t n, const float *x, + std::int64_t incx, const float *y, std::int64_t incy, double *result, + const cl::sycl::vector_class &dependencies) { +#ifndef ONEMKL_DISABLE_PREDICATES + /* add postchecks to queue here for input args. */ +#endif +} + +inline void sdsdot_precondition(cl::sycl::queue &queue, std::int64_t n, float sb, const float *x, + std::int64_t incx, const float *y, std::int64_t incy, float *result, + const cl::sycl::vector_class &dependencies) { +#ifndef ONEMKL_DISABLE_PREDICATES + /* add prechecks to queue here for input args. */ +#endif +} + +inline void sdsdot_postcondition(cl::sycl::queue &queue, std::int64_t n, float sb, const float *x, + std::int64_t incx, const float *y, std::int64_t incy, + float *result, + const cl::sycl::vector_class &dependencies) { +#ifndef ONEMKL_DISABLE_PREDICATES + /* add postchecks to queue here for input args. 
*/ +#endif +} + +inline void her2k_precondition(cl::sycl::queue &queue, uplo upper_lower, transpose trans, + std::int64_t n, std::int64_t k, std::complex alpha, + const std::complex *a, std::int64_t lda, + const std::complex *b, std::int64_t ldb, float beta, + std::complex *c, std::int64_t ldc, + const cl::sycl::vector_class &dependencies) { +#ifndef ONEMKL_DISABLE_PREDICATES + /* add prechecks to queue here for input args. */ +#endif +} + +inline void her2k_postcondition(cl::sycl::queue &queue, uplo upper_lower, transpose trans, + std::int64_t n, std::int64_t k, std::complex alpha, + const std::complex *a, std::int64_t lda, + const std::complex *b, std::int64_t ldb, float beta, + std::complex *c, std::int64_t ldc, + const cl::sycl::vector_class &dependencies) { +#ifndef ONEMKL_DISABLE_PREDICATES + /* add postchecks to queue here for input args. */ +#endif +} + +inline void her2k_precondition(cl::sycl::queue &queue, uplo upper_lower, transpose trans, + std::int64_t n, std::int64_t k, std::complex alpha, + const std::complex *a, std::int64_t lda, + const std::complex *b, std::int64_t ldb, double beta, + std::complex *c, std::int64_t ldc, + const cl::sycl::vector_class &dependencies) { +#ifndef ONEMKL_DISABLE_PREDICATES + /* add prechecks to queue here for input args. */ +#endif +} + +inline void her2k_postcondition(cl::sycl::queue &queue, uplo upper_lower, transpose trans, + std::int64_t n, std::int64_t k, std::complex alpha, + const std::complex *a, std::int64_t lda, + const std::complex *b, std::int64_t ldb, double beta, + std::complex *c, std::int64_t ldc, + const cl::sycl::vector_class &dependencies) { +#ifndef ONEMKL_DISABLE_PREDICATES + /* add postchecks to queue here for input args. */ +#endif +} + +inline void rotg_precondition(cl::sycl::queue &queue, float *a, float *b, float *c, float *s, + const cl::sycl::vector_class &dependencies) { +#ifndef ONEMKL_DISABLE_PREDICATES + /* add prechecks to queue here for input args. 
*/ +#endif +} + +inline void rotg_postcondition(cl::sycl::queue &queue, float *a, float *b, float *c, float *s, + const cl::sycl::vector_class &dependencies) { +#ifndef ONEMKL_DISABLE_PREDICATES + /* add postchecks to queue here for input args. */ +#endif +} + +inline void rotg_precondition(cl::sycl::queue &queue, double *a, double *b, double *c, double *s, + const cl::sycl::vector_class &dependencies) { +#ifndef ONEMKL_DISABLE_PREDICATES + /* add prechecks to queue here for input args. */ +#endif +} + +inline void rotg_postcondition(cl::sycl::queue &queue, double *a, double *b, double *c, double *s, + const cl::sycl::vector_class &dependencies) { +#ifndef ONEMKL_DISABLE_PREDICATES + /* add postchecks to queue here for input args. */ +#endif +} + +inline void rotg_precondition(cl::sycl::queue &queue, std::complex *a, + std::complex *b, float *c, std::complex *s, + const cl::sycl::vector_class &dependencies) { +#ifndef ONEMKL_DISABLE_PREDICATES + /* add prechecks to queue here for input args. */ +#endif +} + +inline void rotg_postcondition(cl::sycl::queue &queue, std::complex *a, + std::complex *b, float *c, std::complex *s, + const cl::sycl::vector_class &dependencies) { +#ifndef ONEMKL_DISABLE_PREDICATES + /* add postchecks to queue here for input args. */ +#endif +} + +inline void rotg_precondition(cl::sycl::queue &queue, std::complex *a, + std::complex *b, double *c, std::complex *s, + const cl::sycl::vector_class &dependencies) { +#ifndef ONEMKL_DISABLE_PREDICATES + /* add prechecks to queue here for input args. */ +#endif +} + +inline void rotg_postcondition(cl::sycl::queue &queue, std::complex *a, + std::complex *b, double *c, std::complex *s, + const cl::sycl::vector_class &dependencies) { +#ifndef ONEMKL_DISABLE_PREDICATES + /* add postchecks to queue here for input args. 
*/ +#endif +} + } //namespace blas } //namespace onemkl diff --git a/src/blas/backends/cublas/cublas_batch.cpp b/src/blas/backends/cublas/cublas_batch.cpp index e04b86b62..39a710048 100644 --- a/src/blas/backends/cublas/cublas_batch.cpp +++ b/src/blas/backends/cublas/cublas_batch.cpp @@ -16,64 +16,21 @@ * limitations under the License. * **************************************************************************/ -#include +#include "cublas_helper.hpp" +#include "include/exceptions_helper.hpp" #include "onemkl/blas/detail/cublas/onemkl_blas_cublas.hpp" namespace onemkl { namespace cublas { -void gemm_batch(cl::sycl::queue &queue, cl::sycl::buffer &transa, - cl::sycl::buffer &transb, cl::sycl::buffer &m, - cl::sycl::buffer &n, cl::sycl::buffer &k, - cl::sycl::buffer &alpha, cl::sycl::buffer &a, - cl::sycl::buffer &lda, cl::sycl::buffer &b, - cl::sycl::buffer &ldb, cl::sycl::buffer &beta, - cl::sycl::buffer &c, cl::sycl::buffer &ldc, - std::int64_t group_count, cl::sycl::buffer &group_size) { - throw std::runtime_error("Not implemented for cublas"); -} - -void gemm_batch(cl::sycl::queue &queue, cl::sycl::buffer &transa, - cl::sycl::buffer &transb, cl::sycl::buffer &m, - cl::sycl::buffer &n, cl::sycl::buffer &k, - cl::sycl::buffer &alpha, cl::sycl::buffer &a, - cl::sycl::buffer &lda, cl::sycl::buffer &b, - cl::sycl::buffer &ldb, cl::sycl::buffer &beta, - cl::sycl::buffer &c, cl::sycl::buffer &ldc, - std::int64_t group_count, cl::sycl::buffer &group_size) { - throw std::runtime_error("Not implemented for cublas"); -} - -void gemm_batch(cl::sycl::queue &queue, cl::sycl::buffer &transa, - cl::sycl::buffer &transb, cl::sycl::buffer &m, - cl::sycl::buffer &n, cl::sycl::buffer &k, - cl::sycl::buffer, 1> &alpha, - cl::sycl::buffer, 1> &a, cl::sycl::buffer &lda, - cl::sycl::buffer, 1> &b, cl::sycl::buffer &ldb, - cl::sycl::buffer, 1> &beta, - cl::sycl::buffer, 1> &c, cl::sycl::buffer &ldc, - std::int64_t group_count, cl::sycl::buffer &group_size) { - throw 
std::runtime_error("Not implemented for cublas"); -} - -void gemm_batch( - cl::sycl::queue &queue, cl::sycl::buffer &transa, - cl::sycl::buffer &transb, cl::sycl::buffer &m, - cl::sycl::buffer &n, cl::sycl::buffer &k, - cl::sycl::buffer, 1> &alpha, cl::sycl::buffer, 1> &a, - cl::sycl::buffer &lda, cl::sycl::buffer, 1> &b, - cl::sycl::buffer &ldb, cl::sycl::buffer, 1> &beta, - cl::sycl::buffer, 1> &c, cl::sycl::buffer &ldc, - std::int64_t group_count, cl::sycl::buffer &group_size) { - throw std::runtime_error("Not implemented for cublas"); -} +// Buffer APIs void gemm_batch(cl::sycl::queue &queue, transpose transa, transpose transb, std::int64_t m, std::int64_t n, std::int64_t k, float alpha, cl::sycl::buffer &a, std::int64_t lda, std::int64_t stride_a, cl::sycl::buffer &b, std::int64_t ldb, std::int64_t stride_b, float beta, cl::sycl::buffer &c, std::int64_t ldc, std::int64_t stride_c, std::int64_t batch_size) { - throw std::runtime_error("Not implemented for cublas"); + throw backend_unsupported_exception(); } void gemm_batch(cl::sycl::queue &queue, transpose transa, transpose transb, std::int64_t m, @@ -82,7 +39,7 @@ void gemm_batch(cl::sycl::queue &queue, transpose transa, transpose transb, std: std::int64_t ldb, std::int64_t stride_b, double beta, cl::sycl::buffer &c, std::int64_t ldc, std::int64_t stride_c, std::int64_t batch_size) { - throw std::runtime_error("Not implemented for cublas"); + throw backend_unsupported_exception(); } void gemm_batch(cl::sycl::queue &queue, transpose transa, transpose transb, std::int64_t m, @@ -92,7 +49,7 @@ void gemm_batch(cl::sycl::queue &queue, transpose transa, transpose transb, std: std::int64_t ldb, std::int64_t stride_b, std::complex beta, cl::sycl::buffer, 1> &c, std::int64_t ldc, std::int64_t stride_c, std::int64_t batch_size) { - throw std::runtime_error("Not implemented for cublas"); + throw backend_unsupported_exception(); } void gemm_batch(cl::sycl::queue &queue, transpose transa, transpose transb, std::int64_t m, 
@@ -102,51 +59,7 @@ void gemm_batch(cl::sycl::queue &queue, transpose transa, transpose transb, std: std::int64_t ldb, std::int64_t stride_b, std::complex beta, cl::sycl::buffer, 1> &c, std::int64_t ldc, std::int64_t stride_c, std::int64_t batch_size) { - throw std::runtime_error("Not implemented for cublas"); -} - -void trsm_batch(cl::sycl::queue &queue, cl::sycl::buffer &left_right, - cl::sycl::buffer &upper_lower, cl::sycl::buffer &trans, - cl::sycl::buffer &unit_diag, cl::sycl::buffer &m, - cl::sycl::buffer &n, cl::sycl::buffer &alpha, - cl::sycl::buffer &a, cl::sycl::buffer &lda, - cl::sycl::buffer &b, cl::sycl::buffer &ldb, - std::int64_t group_count, cl::sycl::buffer &group_size) { - throw std::runtime_error("Not implemented for cublas"); -} - -void trsm_batch(cl::sycl::queue &queue, cl::sycl::buffer &left_right, - cl::sycl::buffer &upper_lower, cl::sycl::buffer &trans, - cl::sycl::buffer &unit_diag, cl::sycl::buffer &m, - cl::sycl::buffer &n, cl::sycl::buffer &alpha, - cl::sycl::buffer &a, cl::sycl::buffer &lda, - cl::sycl::buffer &b, cl::sycl::buffer &ldb, - std::int64_t group_count, cl::sycl::buffer &group_size) { - throw std::runtime_error("Not implemented for cublas"); -} - -void trsm_batch(cl::sycl::queue &queue, cl::sycl::buffer &left_right, - cl::sycl::buffer &upper_lower, cl::sycl::buffer &trans, - cl::sycl::buffer &unit_diag, cl::sycl::buffer &m, - cl::sycl::buffer &n, - cl::sycl::buffer, 1> &alpha, - cl::sycl::buffer, 1> &a, cl::sycl::buffer &lda, - cl::sycl::buffer, 1> &b, cl::sycl::buffer &ldb, - std::int64_t group_count, cl::sycl::buffer &group_size) { - throw std::runtime_error("Not implemented for cublas"); -} - -void trsm_batch(cl::sycl::queue &queue, cl::sycl::buffer &left_right, - cl::sycl::buffer &upper_lower, cl::sycl::buffer &trans, - cl::sycl::buffer &unit_diag, cl::sycl::buffer &m, - cl::sycl::buffer &n, - cl::sycl::buffer, 1> &alpha, - cl::sycl::buffer, 1> &a, - cl::sycl::buffer &lda, - cl::sycl::buffer, 1> &b, - cl::sycl::buffer 
&ldb, std::int64_t group_count, - cl::sycl::buffer &group_size) { - throw std::runtime_error("Not implemented for cublas"); + throw backend_unsupported_exception(); } void trsm_batch(cl::sycl::queue &queue, side left_right, uplo upper_lower, transpose trans, @@ -154,7 +67,7 @@ void trsm_batch(cl::sycl::queue &queue, side left_right, uplo upper_lower, trans cl::sycl::buffer &a, std::int64_t lda, std::int64_t stride_a, cl::sycl::buffer &b, std::int64_t ldb, std::int64_t stride_b, std::int64_t batch_size) { - throw std::runtime_error("Not implemented for cublas"); + throw backend_unsupported_exception(); } void trsm_batch(cl::sycl::queue &queue, side left_right, uplo upper_lower, transpose trans, @@ -162,7 +75,7 @@ void trsm_batch(cl::sycl::queue &queue, side left_right, uplo upper_lower, trans cl::sycl::buffer &a, std::int64_t lda, std::int64_t stride_a, cl::sycl::buffer &b, std::int64_t ldb, std::int64_t stride_b, std::int64_t batch_size) { - throw std::runtime_error("Not implemented for cublas"); + throw backend_unsupported_exception(); } void trsm_batch(cl::sycl::queue &queue, side left_right, uplo upper_lower, transpose trans, @@ -170,7 +83,7 @@ void trsm_batch(cl::sycl::queue &queue, side left_right, uplo upper_lower, trans cl::sycl::buffer, 1> &a, std::int64_t lda, std::int64_t stride_a, cl::sycl::buffer, 1> &b, std::int64_t ldb, std::int64_t stride_b, std::int64_t batch_size) { - throw std::runtime_error("Not implemented for cublas"); + throw backend_unsupported_exception(); } void trsm_batch(cl::sycl::queue &queue, side left_right, uplo upper_lower, transpose trans, @@ -178,7 +91,111 @@ void trsm_batch(cl::sycl::queue &queue, side left_right, uplo upper_lower, trans cl::sycl::buffer, 1> &a, std::int64_t lda, std::int64_t stride_a, cl::sycl::buffer, 1> &b, std::int64_t ldb, std::int64_t stride_b, std::int64_t batch_size) { - throw std::runtime_error("Not implemented for cublas"); + throw backend_unsupported_exception(); +} + +// USM APIs + +cl::sycl::event 
gemm_batch(cl::sycl::queue &queue, transpose *transa, transpose *transb, int64_t *m, + int64_t *n, int64_t *k, float *alpha, const float **a, int64_t *lda, + const float **b, int64_t *ldb, float *beta, float **c, int64_t *ldc, + int64_t group_count, int64_t *groupsize, + const cl::sycl::vector_class &dependencies) { + throw backend_unsupported_exception(); +} + +cl::sycl::event gemm_batch(cl::sycl::queue &queue, transpose *transa, transpose *transb, int64_t *m, + int64_t *n, int64_t *k, double *alpha, const double **a, int64_t *lda, + const double **b, int64_t *ldb, double *beta, double **c, int64_t *ldc, + int64_t group_count, int64_t *groupsize, + const cl::sycl::vector_class &dependencies) { + throw backend_unsupported_exception(); +} + +cl::sycl::event gemm_batch(cl::sycl::queue &queue, transpose *transa, transpose *transb, int64_t *m, + int64_t *n, int64_t *k, std::complex *alpha, + const std::complex **a, int64_t *lda, + const std::complex **b, int64_t *ldb, std::complex *beta, + std::complex **c, int64_t *ldc, int64_t group_count, + int64_t *groupsize, + const cl::sycl::vector_class &dependencies) { + throw backend_unsupported_exception(); +} + +cl::sycl::event gemm_batch(cl::sycl::queue &queue, transpose *transa, transpose *transb, int64_t *m, + int64_t *n, int64_t *k, std::complex *alpha, + const std::complex **a, int64_t *lda, + const std::complex **b, int64_t *ldb, std::complex *beta, + std::complex **c, int64_t *ldc, int64_t group_count, + int64_t *groupsize, + const cl::sycl::vector_class &dependencies) { + throw backend_unsupported_exception(); +} + +cl::sycl::event gemm_batch(cl::sycl::queue &queue, transpose transa, transpose transb, int64_t m, + int64_t n, int64_t k, float alpha, const float *a, int64_t lda, + int64_t stride_a, const float *b, int64_t ldb, int64_t stride_b, + float beta, float *c, int64_t ldc, int64_t stride_c, int64_t batch_size, + const cl::sycl::vector_class &dependencies) { + throw backend_unsupported_exception(); +} + 
+cl::sycl::event gemm_batch(cl::sycl::queue &queue, transpose transa, transpose transb, int64_t m, + int64_t n, int64_t k, double alpha, const double *a, int64_t lda, + int64_t stride_a, const double *b, int64_t ldb, int64_t stride_b, + double beta, double *c, int64_t ldc, int64_t stride_c, + int64_t batch_size, + const cl::sycl::vector_class &dependencies) { + throw backend_unsupported_exception(); } + +cl::sycl::event gemm_batch(cl::sycl::queue &queue, transpose transa, transpose transb, int64_t m, + int64_t n, int64_t k, std::complex alpha, + const std::complex *a, int64_t lda, int64_t stride_a, + const std::complex *b, int64_t ldb, int64_t stride_b, + std::complex beta, std::complex *c, int64_t ldc, + int64_t stride_c, int64_t batch_size, + const cl::sycl::vector_class &dependencies) { + throw backend_unsupported_exception(); +} + +cl::sycl::event gemm_batch(cl::sycl::queue &queue, transpose transa, transpose transb, int64_t m, + int64_t n, int64_t k, std::complex alpha, + const std::complex *a, int64_t lda, int64_t stride_a, + const std::complex *b, int64_t ldb, int64_t stride_b, + std::complex beta, std::complex *c, int64_t ldc, + int64_t stride_c, int64_t batch_size, + const cl::sycl::vector_class &dependencies) { + throw backend_unsupported_exception(); +} + +cl::sycl::event axpy_batch(cl::sycl::queue &queue, int64_t *n, float *alpha, const float **x, + int64_t *incx, float **y, int64_t *incy, int64_t group_count, + int64_t *group_size, + const cl::sycl::vector_class &dependencies) { + throw backend_unsupported_exception(); +} + +cl::sycl::event axpy_batch(cl::sycl::queue &queue, int64_t *n, double *alpha, const double **x, + int64_t *incx, double **y, int64_t *incy, int64_t group_count, + int64_t *group_size, + const cl::sycl::vector_class &dependencies) { + throw backend_unsupported_exception(); +} + +cl::sycl::event axpy_batch(cl::sycl::queue &queue, int64_t *n, std::complex *alpha, + const std::complex **x, int64_t *incx, std::complex **y, + int64_t 
*incy, int64_t group_count, int64_t *group_size, + const cl::sycl::vector_class &dependencies) { + throw backend_unsupported_exception(); +} + +cl::sycl::event axpy_batch(cl::sycl::queue &queue, int64_t *n, std::complex *alpha, + const std::complex **x, int64_t *incx, std::complex **y, + int64_t *incy, int64_t group_count, int64_t *group_size, + const cl::sycl::vector_class &dependencies) { + throw backend_unsupported_exception(); +} + } // namespace cublas } // namespace onemkl diff --git a/src/blas/backends/cublas/cublas_extensions.cpp b/src/blas/backends/cublas/cublas_extensions.cpp index 6d17b34e5..b75ce5e5c 100644 --- a/src/blas/backends/cublas/cublas_extensions.cpp +++ b/src/blas/backends/cublas/cublas_extensions.cpp @@ -16,26 +16,29 @@ * limitations under the License. * **************************************************************************/ -#include +#include "cublas_helper.hpp" +#include "include/exceptions_helper.hpp" #include "onemkl/blas/detail/cublas/onemkl_blas_cublas.hpp" namespace onemkl { namespace cublas { +// Buffer APIs + // BLAS-like extensions void gemmt(cl::sycl::queue &queue, uplo upper_lower, transpose transa, transpose transb, std::int64_t n, std::int64_t k, float alpha, cl::sycl::buffer &a, std::int64_t lda, cl::sycl::buffer &b, std::int64_t ldb, float beta, cl::sycl::buffer &c, std::int64_t ldc) { - throw std::runtime_error("Not implemented for cublas"); + throw backend_unsupported_exception(); } void gemmt(cl::sycl::queue &queue, uplo upper_lower, transpose transa, transpose transb, std::int64_t n, std::int64_t k, double alpha, cl::sycl::buffer &a, std::int64_t lda, cl::sycl::buffer &b, std::int64_t ldb, double beta, cl::sycl::buffer &c, std::int64_t ldc) { - throw std::runtime_error("Not implemented for cublas"); + throw backend_unsupported_exception(); } void gemmt(cl::sycl::queue &queue, uplo upper_lower, transpose transa, transpose transb, @@ -43,7 +46,7 @@ void gemmt(cl::sycl::queue &queue, uplo upper_lower, transpose transa, 
transpose cl::sycl::buffer, 1> &a, std::int64_t lda, cl::sycl::buffer, 1> &b, std::int64_t ldb, std::complex beta, cl::sycl::buffer, 1> &c, std::int64_t ldc) { - throw std::runtime_error("Not implemented for cublas"); + throw backend_unsupported_exception(); } void gemmt(cl::sycl::queue &queue, uplo upper_lower, transpose transa, transpose transb, @@ -52,14 +55,14 @@ void gemmt(cl::sycl::queue &queue, uplo upper_lower, transpose transa, transpose cl::sycl::buffer, 1> &b, std::int64_t ldb, std::complex beta, cl::sycl::buffer, 1> &c, std::int64_t ldc) { - throw std::runtime_error("Not implemented for cublas"); + throw backend_unsupported_exception(); } void gemm_ext(cl::sycl::queue &queue, transpose transa, transpose transb, std::int64_t m, std::int64_t n, std::int64_t k, float alpha, cl::sycl::buffer &a, std::int64_t lda, cl::sycl::buffer &b, std::int64_t ldb, float beta, cl::sycl::buffer &c, std::int64_t ldc) { - throw std::runtime_error("Not implemented for cublas"); + throw backend_unsupported_exception(); } void gemm_ext(cl::sycl::queue &queue, transpose transa, transpose transb, offset offsetc, @@ -67,21 +70,21 @@ void gemm_ext(cl::sycl::queue &queue, transpose transa, transpose transb, offset cl::sycl::buffer &a, std::int64_t lda, int8_t ao, cl::sycl::buffer &b, std::int64_t ldb, uint8_t bo, float beta, cl::sycl::buffer &c, std::int64_t ldc, cl::sycl::buffer &co) { - throw std::runtime_error("Not implemented for cublas"); + throw backend_unsupported_exception(); } void gemm_ext(cl::sycl::queue &queue, transpose transa, transpose transb, std::int64_t m, std::int64_t n, std::int64_t k, float alpha, cl::sycl::buffer &a, std::int64_t lda, cl::sycl::buffer &b, std::int64_t ldb, float beta, cl::sycl::buffer &c, std::int64_t ldc) { - throw std::runtime_error("Not implemented for cublas"); + throw backend_unsupported_exception(); } void gemm_ext(cl::sycl::queue &queue, transpose transa, transpose transb, std::int64_t m, std::int64_t n, std::int64_t k, double alpha, 
cl::sycl::buffer &a, std::int64_t lda, cl::sycl::buffer &b, std::int64_t ldb, double beta, cl::sycl::buffer &c, std::int64_t ldc) { - throw std::runtime_error("Not implemented for cublas"); + throw backend_unsupported_exception(); } void gemm_ext(cl::sycl::queue &queue, transpose transa, transpose transb, std::int64_t m, @@ -90,7 +93,7 @@ void gemm_ext(cl::sycl::queue &queue, transpose transa, transpose transb, std::i cl::sycl::buffer, 1> &b, std::int64_t ldb, std::complex beta, cl::sycl::buffer, 1> &c, std::int64_t ldc) { - throw std::runtime_error("Not implemented for cublas"); + throw backend_unsupported_exception(); } void gemm_ext(cl::sycl::queue &queue, transpose transa, transpose transb, std::int64_t m, @@ -99,14 +102,48 @@ void gemm_ext(cl::sycl::queue &queue, transpose transa, transpose transb, std::i cl::sycl::buffer, 1> &b, std::int64_t ldb, std::complex beta, cl::sycl::buffer, 1> &c, std::int64_t ldc) { - throw std::runtime_error("Not implemented for cublas"); + throw backend_unsupported_exception(); } void gemm_ext(cl::sycl::queue &queue, transpose transa, transpose transb, std::int64_t m, std::int64_t n, std::int64_t k, half alpha, cl::sycl::buffer &a, std::int64_t lda, cl::sycl::buffer &b, std::int64_t ldb, half beta, cl::sycl::buffer &c, std::int64_t ldc) { - throw std::runtime_error("Not implemented for cublas"); + throw backend_unsupported_exception(); +} + +// USM APIs + +// BLAS-like extensions + +cl::sycl::event gemmt(cl::sycl::queue &queue, uplo upper_lower, transpose transa, transpose transb, + int64_t n, int64_t k, float alpha, const float *a, int64_t lda, + const float *b, int64_t ldb, float beta, float *c, int64_t ldc, + const cl::sycl::vector_class &dependencies) { + throw backend_unsupported_exception(); +} + +cl::sycl::event gemmt(cl::sycl::queue &queue, uplo upper_lower, transpose transa, transpose transb, + int64_t n, int64_t k, double alpha, const double *a, int64_t lda, + const double *b, int64_t ldb, double beta, double *c, int64_t 
ldc, + const cl::sycl::vector_class &dependencies) { + throw backend_unsupported_exception(); +} + +cl::sycl::event gemmt(cl::sycl::queue &queue, uplo upper_lower, transpose transa, transpose transb, + int64_t n, int64_t k, std::complex alpha, const std::complex *a, + int64_t lda, const std::complex *b, int64_t ldb, + std::complex beta, std::complex *c, int64_t ldc, + const cl::sycl::vector_class &dependencies) { + throw backend_unsupported_exception(); +} + +cl::sycl::event gemmt(cl::sycl::queue &queue, uplo upper_lower, transpose transa, transpose transb, + int64_t n, int64_t k, std::complex alpha, + const std::complex *a, int64_t lda, const std::complex *b, + int64_t ldb, std::complex beta, std::complex *c, int64_t ldc, + const cl::sycl::vector_class &dependencies) { + throw backend_unsupported_exception(); } } // namespace cublas diff --git a/src/blas/backends/cublas/cublas_helper.hpp b/src/blas/backends/cublas/cublas_helper.hpp index 1304acdba..2ae92c56a 100644 --- a/src/blas/backends/cublas/cublas_helper.hpp +++ b/src/blas/backends/cublas/cublas_helper.hpp @@ -26,8 +26,8 @@ #include #include #include -#include #include "onemkl/types.hpp" + namespace onemkl { namespace cublas { diff --git a/src/blas/backends/cublas/cublas_level1.cpp b/src/blas/backends/cublas/cublas_level1.cpp index e8bf9f937..866624f4f 100644 --- a/src/blas/backends/cublas/cublas_level1.cpp +++ b/src/blas/backends/cublas/cublas_level1.cpp @@ -18,12 +18,16 @@ **************************************************************************/ #include "cublas_helper.hpp" #include "cublas_scope_handle.hpp" +#include "include/exceptions_helper.hpp" #include "onemkl/blas/detail/cublas/onemkl_blas_cublas.hpp" #include namespace onemkl { namespace cublas { + +// Buffer APIs + // Level 1 template inline void asum(Func func, cl::sycl::queue &queue, int64_t n, cl::sycl::buffer &x, @@ -602,5 +606,269 @@ NRM2_LAUNCHER(std::complex, float, cublasScnrm2) NRM2_LAUNCHER(std::complex, double, cublasDznrm2) #undef 
NRM2_LAUNCHER +// USM APIs + +// Level 1 +template +inline cl::sycl::event asum(Func func, cl::sycl::queue &queue, int64_t n, const T1 *x, + const int64_t incx, T2 *result, + const cl::sycl::vector_class &dependencies) { + throw backend_unsupported_exception(); +} + +#define ASUM_LAUNCHER_USM(TYPE1, TYPE2, CUBLAS_ROUTINE) \ + cl::sycl::event asum(cl::sycl::queue &queue, int64_t n, const TYPE1 *x, const int64_t incx, \ + TYPE2 *result, \ + const cl::sycl::vector_class &dependencies) { \ + return asum(CUBLAS_ROUTINE, queue, n, x, incx, result, dependencies); \ + } +ASUM_LAUNCHER_USM(float, float, cublasSasum) +ASUM_LAUNCHER_USM(double, double, cublasDasum) +ASUM_LAUNCHER_USM(std::complex, float, cublasScasum) +ASUM_LAUNCHER_USM(std::complex, double, cublasDzasum) +#undef ASUM_LAUNCHER_USM + +template +inline cl::sycl::event scal(Func func, cl::sycl::queue &queue, int64_t n, T1 a, T2 *x, int64_t incx, + const cl::sycl::vector_class &dependencies) { + throw backend_unsupported_exception(); +} + +#define SCAL_LAUNCHER_USM(TYPE1, TYPE2, CUBLAS_ROUTINE) \ + cl::sycl::event scal(cl::sycl::queue &queue, int64_t n, TYPE1 a, TYPE2 *x, int64_t incx, \ + const cl::sycl::vector_class &dependencies) { \ + return scal(CUBLAS_ROUTINE, queue, n, a, x, incx, dependencies); \ + } +SCAL_LAUNCHER_USM(float, float, cublasSscal) +SCAL_LAUNCHER_USM(double, double, cublasDscal) +SCAL_LAUNCHER_USM(std::complex, std::complex, cublasCscal) +SCAL_LAUNCHER_USM(std::complex, std::complex, cublasZscal) +SCAL_LAUNCHER_USM(float, std::complex, cublasCsscal) +SCAL_LAUNCHER_USM(double, std::complex, cublasZdscal) +#undef SCAL_LAUNCHER_USM + +template +inline cl::sycl::event axpy(Func func, cl::sycl::queue &queue, int64_t n, T alpha, const T *x, + int64_t incx, T *y, int64_t incy, + const cl::sycl::vector_class &dependencies) { + throw backend_unsupported_exception(); +} + +#define AXPY_LAUNCHER_USM(TYPE, CUBLAS_ROUTINE) \ + cl::sycl::event axpy(cl::sycl::queue &queue, int64_t n, TYPE alpha, const TYPE 
*x, \ + int64_t incx, TYPE *y, int64_t incy, \ + const cl::sycl::vector_class &dependencies) { \ + return axpy(CUBLAS_ROUTINE, queue, n, alpha, x, incx, y, incy, dependencies); \ + } + +AXPY_LAUNCHER_USM(float, cublasSaxpy) +AXPY_LAUNCHER_USM(double, cublasDaxpy) +AXPY_LAUNCHER_USM(std::complex, cublasCaxpy) +AXPY_LAUNCHER_USM(std::complex, cublasZaxpy) +#undef AXPY_LAUNCHER_USM + +template +inline cl::sycl::event rotg(Func func, cl::sycl::queue &queue, T1 *a, T1 *b, T2 *c, T1 *s, + const cl::sycl::vector_class &dependencies) { + throw backend_unsupported_exception(); +} + +#define ROTG_LAUNCHER_USM(TYPE1, TYPE2, CUBLAS_ROUTINE) \ + cl::sycl::event rotg(cl::sycl::queue &queue, TYPE1 *a, TYPE1 *b, TYPE2 *c, TYPE1 *s, \ + const cl::sycl::vector_class &dependencies) { \ + return rotg(CUBLAS_ROUTINE, queue, a, b, c, s, dependencies); \ + } + +ROTG_LAUNCHER_USM(float, float, cublasSrotg) +ROTG_LAUNCHER_USM(double, double, cublasDrotg) +ROTG_LAUNCHER_USM(std::complex, float, cublasCrotg) +ROTG_LAUNCHER_USM(std::complex, double, cublasZrotg) +#undef ROTG_LAUNCHER_USM + +template +inline cl::sycl::event rotm(Func func, cl::sycl::queue &queue, int64_t n, T *x, int64_t incx, T *y, + int64_t incy, T *param, + const cl::sycl::vector_class &dependencies) { + throw backend_unsupported_exception(); +} + +#define ROTM_LAUNCHER_USM(TYPE, CUBLAS_ROUTINE) \ + cl::sycl::event rotm(cl::sycl::queue &queue, int64_t n, TYPE *x, int64_t incx, TYPE *y, \ + int64_t incy, TYPE *param, \ + const cl::sycl::vector_class &dependencies) { \ + return rotm(CUBLAS_ROUTINE, queue, n, x, incx, y, incy, param, dependencies); \ + } + +ROTM_LAUNCHER_USM(float, cublasSrotm) +ROTM_LAUNCHER_USM(double, cublasDrotm) +#undef ROTM_LAUNCHER_USM + +template +inline cl::sycl::event copy(Func func, cl::sycl::queue &queue, int64_t n, const T *x, int64_t incx, + T *y, int64_t incy, + const cl::sycl::vector_class &dependencies) { + throw backend_unsupported_exception(); +} + +#define COPY_LAUNCHER_USM(TYPE, 
CUBLAS_ROUTINE) \ + cl::sycl::event copy(cl::sycl::queue &queue, int64_t n, const TYPE *x, int64_t incx, TYPE *y, \ + int64_t incy, \ + const cl::sycl::vector_class &dependencies) { \ + return copy(CUBLAS_ROUTINE, queue, n, x, incx, y, incy, dependencies); \ + } + +COPY_LAUNCHER_USM(float, cublasScopy) +COPY_LAUNCHER_USM(double, cublasDcopy) +COPY_LAUNCHER_USM(std::complex, cublasCcopy) +COPY_LAUNCHER_USM(std::complex, cublasZcopy) +#undef COPY_LAUNCHER_USM + +template +inline cl::sycl::event dot(Func func, cl::sycl::queue &queue, int64_t n, const T *x, + const int64_t incx, const T *y, int64_t incy, T *result, + const cl::sycl::vector_class &dependencies) { + throw backend_unsupported_exception(); +} + +#define DOT_LAUNCHER_USM(EXT, TYPE, CUBLAS_ROUTINE) \ + cl::sycl::event dot##EXT(cl::sycl::queue &queue, int64_t n, const TYPE *x, const int64_t incx, \ + const TYPE *y, const int64_t incy, TYPE *result, \ + const cl::sycl::vector_class &dependencies) { \ + return dot(CUBLAS_ROUTINE, queue, n, x, incx, y, incy, result, dependencies); \ + } +DOT_LAUNCHER_USM(, float, cublasSdot) +DOT_LAUNCHER_USM(, double, cublasDdot) +DOT_LAUNCHER_USM(c, std::complex, cublasCdotc) +DOT_LAUNCHER_USM(c, std::complex, cublasZdotc) +DOT_LAUNCHER_USM(u, std::complex, cublasCdotu) +DOT_LAUNCHER_USM(u, std::complex, cublasZdotu) +#undef DOT_LAUNCHER_USM + +template +inline cl::sycl::event rot(Func func, cl::sycl::queue &queue, int64_t n, T1 *x, const int64_t incx, + T1 *y, int64_t incy, T2 c, T3 s, + const cl::sycl::vector_class &dependencies) { + throw backend_unsupported_exception(); +} + +#define ROT_LAUNCHER_USM(TYPE1, TYPE2, TYPE3, CUBLAS_ROUTINE) \ + cl::sycl::event rot(cl::sycl::queue &queue, int64_t n, TYPE1 *x, const int64_t incx, TYPE1 *y, \ + int64_t incy, TYPE2 c, TYPE3 s, \ + const cl::sycl::vector_class &dependencies) { \ + return rot(CUBLAS_ROUTINE, queue, n, x, incx, y, incy, c, s, dependencies); \ + } + +ROT_LAUNCHER_USM(float, float, float, cublasSrot) 
+ROT_LAUNCHER_USM(double, double, double, cublasDrot) +ROT_LAUNCHER_USM(std::complex, float, float, cublasCsrot) +ROT_LAUNCHER_USM(std::complex, double, double, cublasZdrot) +#undef ROT_LAUNCHER_USM + +cl::sycl::event sdsdot(cl::sycl::queue &queue, int64_t n, float sb, const float *x, int64_t incx, + const float *y, int64_t incy, float *result, + const cl::sycl::vector_class &dependencies) { + throw backend_unsupported_exception(); +} + +cl::sycl::event dot(cl::sycl::queue &queue, int64_t n, const float *x, int64_t incx, const float *y, + int64_t incy, double *result, + const cl::sycl::vector_class &dependencies) { + throw backend_unsupported_exception(); +} + +template +inline cl::sycl::event rotmg(Func func, cl::sycl::queue &queue, T *d1, T *d2, T *x1, T y1, T *param, + const cl::sycl::vector_class &dependencies) { + throw backend_unsupported_exception(); +} + +#define ROTMG_LAUNCHER_USM(TYPE, CUBLAS_ROUTINE) \ + cl::sycl::event rotmg(cl::sycl::queue &queue, TYPE *d1, TYPE *d2, TYPE *x1, TYPE y1, \ + TYPE *param, \ + const cl::sycl::vector_class &dependencies) { \ + return rotmg(CUBLAS_ROUTINE, queue, d1, d2, x1, y1, param, dependencies); \ + } + +ROTMG_LAUNCHER_USM(float, cublasSrotmg) +ROTMG_LAUNCHER_USM(double, cublasDrotmg) +#undef ROTMG_LAUNCHER_USM + +template +inline cl::sycl::event iamax(Func func, cl::sycl::queue &queue, int64_t n, const T *x, + const int64_t incx, int64_t *result, + const cl::sycl::vector_class &dependencies) { + throw backend_unsupported_exception(); +} + +#define IAMAX_LAUNCHER_USM(TYPE, CUBLAS_ROUTINE) \ + cl::sycl::event iamax(cl::sycl::queue &queue, int64_t n, const TYPE *x, const int64_t incx, \ + int64_t *result, \ + const cl::sycl::vector_class &dependencies) { \ + return iamax(CUBLAS_ROUTINE, queue, n, x, incx, result, dependencies); \ + } +IAMAX_LAUNCHER_USM(float, cublasIsamax) +IAMAX_LAUNCHER_USM(double, cublasIdamax) +IAMAX_LAUNCHER_USM(std::complex, cublasIcamax) +IAMAX_LAUNCHER_USM(std::complex, cublasIzamax) +#undef 
IAMAX_LAUNCHER_USM + +template +inline cl::sycl::event swap(Func func, cl::sycl::queue &queue, int64_t n, T *x, int64_t incx, T *y, + int64_t incy, + const cl::sycl::vector_class &dependencies) { + throw backend_unsupported_exception(); +} + +#define SWAP_LAUNCHER_USM(TYPE, CUBLAS_ROUTINE) \ + cl::sycl::event swap(cl::sycl::queue &queue, int64_t n, TYPE *x, int64_t incx, TYPE *y, \ + int64_t incy, \ + const cl::sycl::vector_class &dependencies) { \ + return swap(CUBLAS_ROUTINE, queue, n, x, incx, y, incy, dependencies); \ + } + +SWAP_LAUNCHER_USM(float, cublasSswap) +SWAP_LAUNCHER_USM(double, cublasDswap) +SWAP_LAUNCHER_USM(std::complex, cublasCswap) +SWAP_LAUNCHER_USM(std::complex, cublasZswap) +#undef SWAP_LAUNCHER_USM + +template +inline cl::sycl::event iamin(Func func, cl::sycl::queue &queue, int64_t n, const T *x, + const int64_t incx, int64_t *result, + const cl::sycl::vector_class &dependencies) { + throw backend_unsupported_exception(); +} + +#define IAMIN_LAUNCHER_USM(TYPE, CUBLAS_ROUTINE) \ + cl::sycl::event iamin(cl::sycl::queue &queue, int64_t n, const TYPE *x, const int64_t incx, \ + int64_t *result, \ + const cl::sycl::vector_class &dependencies) { \ + return iamin(CUBLAS_ROUTINE, queue, n, x, incx, result, dependencies); \ + } +IAMIN_LAUNCHER_USM(float, cublasIsamin) +IAMIN_LAUNCHER_USM(double, cublasIdamin) +IAMIN_LAUNCHER_USM(std::complex, cublasIcamin) +IAMIN_LAUNCHER_USM(std::complex, cublasIzamin) +#undef IAMIN_LAUNCHER_USM + +template +inline cl::sycl::event nrm2(Func func, cl::sycl::queue &queue, int64_t n, const T1 *x, + const int64_t incx, T2 *result, + const cl::sycl::vector_class &dependencies) { + throw backend_unsupported_exception(); +} + +#define NRM2_LAUNCHER_USM(TYPE1, TYPE2, CUBLAS_ROUTINE) \ + cl::sycl::event nrm2(cl::sycl::queue &queue, int64_t n, const TYPE1 *x, const int64_t incx, \ + TYPE2 *result, \ + const cl::sycl::vector_class &dependencies) { \ + return nrm2(CUBLAS_ROUTINE, queue, n, x, incx, result, dependencies); \ + } 
+NRM2_LAUNCHER_USM(float, float, cublasSnrm2) +NRM2_LAUNCHER_USM(double, double, cublasDnrm2) +NRM2_LAUNCHER_USM(std::complex, float, cublasScnrm2) +NRM2_LAUNCHER_USM(std::complex, double, cublasDznrm2) +#undef NRM2_LAUNCHER_USM + } // namespace cublas } // namespace onemkl diff --git a/src/blas/backends/cublas/cublas_level2.cpp b/src/blas/backends/cublas/cublas_level2.cpp index 25fda3ba5..69989c4c5 100644 --- a/src/blas/backends/cublas/cublas_level2.cpp +++ b/src/blas/backends/cublas/cublas_level2.cpp @@ -19,10 +19,14 @@ #include #include "cublas_helper.hpp" #include "cublas_scope_handle.hpp" +#include "include/exceptions_helper.hpp" #include "onemkl/blas/detail/cublas/onemkl_blas_cublas.hpp" namespace onemkl { namespace cublas { + +// Buffer APIs + template inline void gemv(Func func, cl::sycl::queue &queue, transpose trans, int64_t m, int64_t n, T alpha, cl::sycl::buffer &a, int64_t lda, cl::sycl::buffer &x, int64_t incx, @@ -840,5 +844,506 @@ TRSV_LAUNCHER(std::complex, cublasZtrsv) #undef TRSV_LAUNCHER +// USM APIs + +template +inline cl::sycl::event gemv(Func func, cl::sycl::queue &queue, transpose trans, int64_t m, + int64_t n, T alpha, const T *a, int64_t lda, const T *x, int64_t incx, + T beta, T *y, int64_t incy, + const cl::sycl::vector_class &dependencies) { + throw backend_unsupported_exception(); +} + +#define GEMV_LAUNCHER_USM(TYPE, CUBLAS_ROUTINE) \ + cl::sycl::event gemv(cl::sycl::queue &queue, transpose trans, int64_t m, int64_t n, \ + TYPE alpha, const TYPE *a, int64_t lda, const TYPE *x, int64_t incx, \ + TYPE beta, TYPE *y, int64_t incy, \ + const cl::sycl::vector_class &dependencies) { \ + return gemv(CUBLAS_ROUTINE, queue, trans, m, n, alpha, a, lda, x, incx, beta, y, incy, \ + dependencies); \ + } + +GEMV_LAUNCHER_USM(float, cublasSgemv) +GEMV_LAUNCHER_USM(double, cublasDgemv) +GEMV_LAUNCHER_USM(std::complex, cublasCgemv) +GEMV_LAUNCHER_USM(std::complex, cublasZgemv) +#undef GEMV_LAUNCHER_USM + +template +inline cl::sycl::event gbmv(Func 
func, cl::sycl::queue &queue, transpose trans, int64_t m, + int64_t n, int64_t kl, int64_t ku, T alpha, const T *a, int64_t lda, + const T *x, int64_t incx, T beta, T *y, int64_t incy, + const cl::sycl::vector_class &dependencies) { + throw backend_unsupported_exception(); +} + +#define GBMV_LAUNCHER_USM(TYPE, CUBLAS_ROUTINE) \ + cl::sycl::event gbmv(cl::sycl::queue &queue, transpose trans, int64_t m, int64_t n, \ + int64_t kl, int64_t ku, TYPE alpha, const TYPE *a, int64_t lda, \ + const TYPE *x, int64_t incx, TYPE beta, TYPE *y, int64_t incy, \ + const cl::sycl::vector_class &dependencies) { \ + return gbmv(CUBLAS_ROUTINE, queue, trans, m, n, kl, ku, alpha, a, lda, x, incx, beta, y, \ + incy, dependencies); \ + } + +GBMV_LAUNCHER_USM(float, cublasSgbmv) +GBMV_LAUNCHER_USM(double, cublasDgbmv) +GBMV_LAUNCHER_USM(std::complex, cublasCgbmv) +GBMV_LAUNCHER_USM(std::complex, cublasZgbmv) +#undef GBMV_LAUNCHER_USM + +template +inline cl::sycl::event ger(Func func, cl::sycl::queue &queue, int64_t m, int64_t n, T alpha, + const T *x, int64_t incx, const T *y, int64_t incy, T *a, int64_t lda, + const cl::sycl::vector_class &dependencies) { + throw backend_unsupported_exception(); +} + +#define GER_LAUNCHER_USM(EXT, TYPE, CUBLAS_ROUTINE) \ + cl::sycl::event ger##EXT(cl::sycl::queue &queue, int64_t m, int64_t n, TYPE alpha, \ + const TYPE *x, int64_t incx, const TYPE *y, int64_t incy, TYPE *a, \ + int64_t lda, \ + const cl::sycl::vector_class &dependencies) { \ + return ger(CUBLAS_ROUTINE, queue, m, n, alpha, x, incx, y, incy, a, lda, dependencies); \ + } + +GER_LAUNCHER_USM(, float, cublasSger) +GER_LAUNCHER_USM(, double, cublasDger) +GER_LAUNCHER_USM(u, std::complex, cublasCgeru) +GER_LAUNCHER_USM(u, std::complex, cublasZgeru) +GER_LAUNCHER_USM(c, std::complex, cublasCgerc) +GER_LAUNCHER_USM(c, std::complex, cublasZgerc) +#undef GER_LAUNCHER_USM + +template +inline cl::sycl::event hbmv(Func func, cl::sycl::queue &queue, uplo upper_lower, int64_t n, + int64_t k, T alpha, 
const T *a, int64_t lda, const T *x, int64_t incx, + T beta, T *y, int64_t incy, + const cl::sycl::vector_class &dependencies) { + throw backend_unsupported_exception(); +} + +#define HBMV_LAUNCHER_USM(TYPE, CUBLAS_ROUTINE) \ + cl::sycl::event hbmv(cl::sycl::queue &queue, uplo upper_lower, int64_t n, int64_t k, \ + TYPE alpha, const TYPE *a, int64_t lda, const TYPE *x, int64_t incx, \ + TYPE beta, TYPE *y, int64_t incy, \ + const cl::sycl::vector_class &dependencies) { \ + return hbmv(CUBLAS_ROUTINE, queue, upper_lower, n, k, alpha, a, lda, x, incx, beta, y, \ + incy, dependencies); \ + } + +HBMV_LAUNCHER_USM(std::complex, cublasChbmv) +HBMV_LAUNCHER_USM(std::complex, cublasZhbmv) +#undef HBMV_LAUNCHER_USM + +template +inline cl::sycl::event hemv(Func func, cl::sycl::queue &queue, uplo upper_lower, int64_t n, T alpha, + const T *a, int64_t lda, const T *x, int64_t incx, T beta, T *y, + int64_t incy, + const cl::sycl::vector_class &dependencies) { + throw backend_unsupported_exception(); +} + +#define HEMV_LAUNCHER_USM(TYPE, CUBLAS_ROUTINE) \ + cl::sycl::event hemv(cl::sycl::queue &queue, uplo upper_lower, int64_t n, TYPE alpha, \ + const TYPE *a, int64_t lda, const TYPE *x, int64_t incx, TYPE beta, \ + TYPE *y, int64_t incy, \ + const cl::sycl::vector_class &dependencies) { \ + return hemv(CUBLAS_ROUTINE, queue, upper_lower, n, alpha, a, lda, x, incx, beta, y, incy, \ + dependencies); \ + } + +HEMV_LAUNCHER_USM(std::complex, cublasChemv) +HEMV_LAUNCHER_USM(std::complex, cublasZhemv) +#undef HEMV_LAUNCHER_USM + +template +inline cl::sycl::event her(Func func, cl::sycl::queue &queue, uplo upper_lower, int64_t n, + ScalarType alpha, const DataType *x, int64_t incx, DataType *a, + int64_t lda, + const cl::sycl::vector_class &dependencies) { + throw backend_unsupported_exception(); +} + +#define HER_LAUNCHER_USM(SCALAR_TYPE, DATA_TYPE, CUBLAS_ROUTINE) \ + cl::sycl::event her(cl::sycl::queue &queue, uplo upper_lower, int64_t n, SCALAR_TYPE alpha, \ + const DATA_TYPE *x, 
int64_t incx, DATA_TYPE *a, int64_t lda, \ + const cl::sycl::vector_class &dependencies) { \ + return her(CUBLAS_ROUTINE, queue, upper_lower, n, alpha, x, incx, a, lda, dependencies); \ + } + +HER_LAUNCHER_USM(float, std::complex, cublasCher) +HER_LAUNCHER_USM(double, std::complex, cublasZher) + +#undef HER_LAUNCHER_USM + +template +inline cl::sycl::event her2(Func func, cl::sycl::queue &queue, uplo upper_lower, int64_t n, T alpha, + const T *x, int64_t incx, const T *y, int64_t incy, T *a, int64_t lda, + const cl::sycl::vector_class &dependencies) { + throw backend_unsupported_exception(); +} + +#define HER2_LAUNCHER_USM(TYPE, CUBLAS_ROUTINE) \ + cl::sycl::event her2(cl::sycl::queue &queue, uplo upper_lower, int64_t n, TYPE alpha, \ + const TYPE *x, int64_t incx, const TYPE *y, int64_t incy, TYPE *a, \ + int64_t lda, \ + const cl::sycl::vector_class &dependencies) { \ + return her2(CUBLAS_ROUTINE, queue, upper_lower, n, alpha, x, incx, y, incy, a, lda, \ + dependencies); \ + } + +HER2_LAUNCHER_USM(std::complex, cublasCher2) +HER2_LAUNCHER_USM(std::complex, cublasZher2) + +#undef HER2_LAUNCHER_USM + +template +inline cl::sycl::event hpmv(Func func, cl::sycl::queue &queue, uplo upper_lower, int64_t n, T alpha, + const T *a, const T *x, int64_t incx, T beta, T *y, int64_t incy, + const cl::sycl::vector_class &dependencies) { + throw backend_unsupported_exception(); +} + +#define HPMV_LAUNCHER_USM(TYPE, CUBLAS_ROUTINE) \ + cl::sycl::event hpmv(cl::sycl::queue &queue, uplo upper_lower, int64_t n, TYPE alpha, \ + const TYPE *a, const TYPE *x, int64_t incx, TYPE beta, TYPE *y, \ + int64_t incy, \ + const cl::sycl::vector_class &dependencies) { \ + return hpmv(CUBLAS_ROUTINE, queue, upper_lower, n, alpha, a, x, incx, beta, y, incy, \ + dependencies); \ + } + +HPMV_LAUNCHER_USM(std::complex, cublasChpmv) +HPMV_LAUNCHER_USM(std::complex, cublasZhpmv) + +#undef HPMV_LAUNCHER_USM + +template +inline cl::sycl::event hpr(Func func, cl::sycl::queue &queue, uplo upper_lower, 
int64_t n, + ScalarType alpha, const DataType *x, int64_t incx, DataType *a, + const cl::sycl::vector_class &dependencies) { + throw backend_unsupported_exception(); +} + +#define HPR_LAUNCHER_USM(SCALAR_TYPE, DATA_TYPE, CUBLAS_ROUTINE) \ + cl::sycl::event hpr(cl::sycl::queue &queue, uplo upper_lower, int64_t n, SCALAR_TYPE alpha, \ + const DATA_TYPE *x, int64_t incx, DATA_TYPE *a, \ + const cl::sycl::vector_class &dependencies) { \ + return hpr(CUBLAS_ROUTINE, queue, upper_lower, n, alpha, x, incx, a, dependencies); \ + } + +HPR_LAUNCHER_USM(float, std::complex, cublasChpr) +HPR_LAUNCHER_USM(double, std::complex, cublasZhpr) + +#undef HPR_LAUNCHER_USM + +template +inline cl::sycl::event hpr2(Func func, cl::sycl::queue &queue, uplo upper_lower, int64_t n, T alpha, + const T *x, int64_t incx, const T *y, int64_t incy, T *a, + const cl::sycl::vector_class &dependencies) { + throw backend_unsupported_exception(); +} + +#define HPR2_LAUNCHER_USM(TYPE, CUBLAS_ROUTINE) \ + cl::sycl::event hpr2(cl::sycl::queue &queue, uplo upper_lower, int64_t n, TYPE alpha, \ + const TYPE *x, int64_t incx, const TYPE *y, int64_t incy, TYPE *a, \ + const cl::sycl::vector_class &dependencies) { \ + return hpr2(CUBLAS_ROUTINE, queue, upper_lower, n, alpha, x, incx, y, incy, a, \ + dependencies); \ + } + +HPR2_LAUNCHER_USM(std::complex, cublasChpr2) +HPR2_LAUNCHER_USM(std::complex, cublasZhpr2) + +#undef HPR2_LAUNCHER_USM + +template +inline cl::sycl::event sbmv(Func func, cl::sycl::queue &queue, uplo upper_lower, int64_t n, + int64_t k, T alpha, const T *a, int64_t lda, const T *x, int64_t incx, + T beta, T *y, int64_t incy, + const cl::sycl::vector_class &dependencies) { + throw backend_unsupported_exception(); +} + +#define SBMV_LAUNCHER_USM(TYPE, CUBLAS_ROUTINE) \ + cl::sycl::event sbmv(cl::sycl::queue &queue, uplo upper_lower, int64_t n, int64_t k, \ + TYPE alpha, const TYPE *a, int64_t lda, const TYPE *x, int64_t incx, \ + TYPE beta, TYPE *y, int64_t incy, \ + const 
cl::sycl::vector_class &dependencies) { \ + return sbmv(CUBLAS_ROUTINE, queue, upper_lower, n, k, alpha, a, lda, x, incx, beta, y, \ + incy, dependencies); \ + } + +SBMV_LAUNCHER_USM(float, cublasSsbmv) +SBMV_LAUNCHER_USM(double, cublasDsbmv) + +#undef SBMV_LAUNCHER_USM + +template +inline cl::sycl::event symv(Func func, cl::sycl::queue &queue, uplo upper_lower, int64_t n, T alpha, + const T *a, int64_t lda, const T *x, int64_t incx, T beta, T *y, + int64_t incy, + const cl::sycl::vector_class &dependencies) { + throw backend_unsupported_exception(); +} + +#define SYMV_LAUNCHER_USM(TYPE, CUBLAS_ROUTINE) \ + cl::sycl::event symv(cl::sycl::queue &queue, uplo upper_lower, int64_t n, TYPE alpha, \ + const TYPE *a, int64_t lda, const TYPE *x, int64_t incx, TYPE beta, \ + TYPE *y, int64_t incy, \ + const cl::sycl::vector_class &dependencies) { \ + return symv(CUBLAS_ROUTINE, queue, upper_lower, n, alpha, a, lda, x, incx, beta, y, incy, \ + dependencies); \ + } + +SYMV_LAUNCHER_USM(float, cublasSsymv) +SYMV_LAUNCHER_USM(double, cublasDsymv) + +#undef SYMV_LAUNCHER_USM + +template +inline cl::sycl::event syr(Func func, cl::sycl::queue &queue, uplo upper_lower, int64_t n, T alpha, + const T *x, int64_t incx, T *a, int64_t lda, + const cl::sycl::vector_class &dependencies) { + throw backend_unsupported_exception(); +} + +#define SYR_LAUNCHER_USM(TYPE, CUBLAS_ROUTINE) \ + cl::sycl::event syr(cl::sycl::queue &queue, uplo upper_lower, int64_t n, TYPE alpha, \ + const TYPE *x, int64_t incx, TYPE *a, int64_t lda, \ + const cl::sycl::vector_class &dependencies) { \ + return syr(CUBLAS_ROUTINE, queue, upper_lower, n, alpha, x, incx, a, lda, dependencies); \ + } + +SYR_LAUNCHER_USM(float, cublasSsyr) +SYR_LAUNCHER_USM(double, cublasDsyr) +// Intel does not support the following two +SYR_LAUNCHER_USM(std::complex, cublasCsyr) +SYR_LAUNCHER_USM(std::complex, cublasZsyr) +#undef SYR_LAUNCHER_USM + +template +inline cl::sycl::event syr2(Func func, cl::sycl::queue &queue, uplo 
upper_lower, int64_t n, T alpha, + const T *x, int64_t incx, const T *y, int64_t incy, T *a, int64_t lda, + const cl::sycl::vector_class &dependencies) { + throw backend_unsupported_exception(); +} + +#define SYR2_LAUNCHER_USM(TYPE, CUBLAS_ROUTINE) \ + cl::sycl::event syr2(cl::sycl::queue &queue, uplo upper_lower, int64_t n, TYPE alpha, \ + const TYPE *x, int64_t incx, const TYPE *y, int64_t incy, TYPE *a, \ + int64_t lda, \ + const cl::sycl::vector_class &dependencies) { \ + return syr2(CUBLAS_ROUTINE, queue, upper_lower, n, alpha, x, incx, y, incy, a, lda, \ + dependencies); \ + } + +SYR2_LAUNCHER_USM(float, cublasSsyr2) +SYR2_LAUNCHER_USM(double, cublasDsyr2) +// Intel does not support the following two +SYR2_LAUNCHER_USM(std::complex, cublasCsyr2) +SYR2_LAUNCHER_USM(std::complex, cublasZsyr2) + +#undef SYR2_LAUNCHER_USM + +template +inline cl::sycl::event spmv(Func func, cl::sycl::queue &queue, uplo upper_lower, int64_t n, T alpha, + const T *a, const T *x, int64_t incx, T beta, T *y, int64_t incy, + const cl::sycl::vector_class &dependencies) { + throw backend_unsupported_exception(); +} + +#define SPMV_LAUNCHER_USM(TYPE, CUBLAS_ROUTINE) \ + cl::sycl::event spmv(cl::sycl::queue &queue, uplo upper_lower, int64_t n, TYPE alpha, \ + const TYPE *a, const TYPE *x, int64_t incx, TYPE beta, TYPE *y, \ + int64_t incy, \ + const cl::sycl::vector_class &dependencies) { \ + return spmv(CUBLAS_ROUTINE, queue, upper_lower, n, alpha, a, x, incx, beta, y, incy, \ + dependencies); \ + } + +SPMV_LAUNCHER_USM(float, cublasSspmv) +SPMV_LAUNCHER_USM(double, cublasDspmv) + +#undef SPMV_LAUNCHER_USM + +template +inline cl::sycl::event spr(Func func, cl::sycl::queue &queue, uplo upper_lower, int64_t n, T alpha, + const T *x, int64_t incx, T *a, + const cl::sycl::vector_class &dependencies) { + throw backend_unsupported_exception(); +} + +#define SPR_LAUNCHER_USM(TYPE, CUBLAS_ROUTINE) \ + cl::sycl::event spr(cl::sycl::queue &queue, uplo upper_lower, int64_t n, TYPE alpha, \ + const 
TYPE *x, int64_t incx, TYPE *a, \ + const cl::sycl::vector_class &dependencies) { \ + return spr(CUBLAS_ROUTINE, queue, upper_lower, n, alpha, x, incx, a, dependencies); \ + } + +SPR_LAUNCHER_USM(float, cublasSspr) +SPR_LAUNCHER_USM(double, cublasDspr) + +#undef SPR_LAUNCHER_USM + +template +inline cl::sycl::event spr2(Func func, cl::sycl::queue &queue, uplo upper_lower, int64_t n, T alpha, + const T *x, int64_t incx, const T *y, int64_t incy, T *a, + const cl::sycl::vector_class &dependencies) { + throw backend_unsupported_exception(); +} + +#define SPR2_LAUNCHER_USM(TYPE, CUBLAS_ROUTINE) \ + cl::sycl::event spr2(cl::sycl::queue &queue, uplo upper_lower, int64_t n, TYPE alpha, \ + const TYPE *x, int64_t incx, const TYPE *y, int64_t incy, TYPE *a, \ + const cl::sycl::vector_class &dependencies) { \ + return spr2(CUBLAS_ROUTINE, queue, upper_lower, n, alpha, x, incx, y, incy, a, \ + dependencies); \ + } + +SPR2_LAUNCHER_USM(float, cublasSspr2) +SPR2_LAUNCHER_USM(double, cublasDspr2) + +#undef SPR2_LAUNCHER_USM + +template +inline cl::sycl::event tbmv(Func func, cl::sycl::queue &queue, uplo upper_lower, transpose trans, + diag unit_diag, int64_t n, int64_t k, const T *a, int64_t lda, T *x, + int64_t incx, + const cl::sycl::vector_class &dependencies) { + throw backend_unsupported_exception(); +} + +#define TBMV_LAUNCHER_USM(TYPE, CUBLAS_ROUTINE) \ + cl::sycl::event tbmv(cl::sycl::queue &queue, uplo upper_lower, transpose trans, \ + diag unit_diag, int64_t n, int64_t k, const TYPE *a, int64_t lda, \ + TYPE *x, int64_t incx, \ + const cl::sycl::vector_class &dependencies) { \ + return tbmv(CUBLAS_ROUTINE, queue, upper_lower, trans, unit_diag, n, k, a, lda, x, incx, \ + dependencies); \ + } + +TBMV_LAUNCHER_USM(float, cublasStbmv) +TBMV_LAUNCHER_USM(double, cublasDtbmv) +TBMV_LAUNCHER_USM(std::complex, cublasCtbmv) +TBMV_LAUNCHER_USM(std::complex, cublasZtbmv) + +#undef TBMV_LAUNCHER_USM + +template +inline cl::sycl::event tbsv(Func func, cl::sycl::queue &queue, uplo 
upper_lower, transpose trans, + diag unit_diag, int64_t n, int64_t k, const T *a, int64_t lda, T *x, + int64_t incx, + const cl::sycl::vector_class &dependencies) { + throw backend_unsupported_exception(); +} + +#define TBSV_LAUNCHER_USM(TYPE, CUBLAS_ROUTINE) \ + cl::sycl::event tbsv(cl::sycl::queue &queue, uplo upper_lower, transpose trans, \ + diag unit_diag, int64_t n, int64_t k, const TYPE *a, int64_t lda, \ + TYPE *x, int64_t incx, \ + const cl::sycl::vector_class &dependencies) { \ + return tbsv(CUBLAS_ROUTINE, queue, upper_lower, trans, unit_diag, n, k, a, lda, x, incx, \ + dependencies); \ + } + +TBSV_LAUNCHER_USM(float, cublasStbsv) +TBSV_LAUNCHER_USM(double, cublasDtbsv) +TBSV_LAUNCHER_USM(std::complex, cublasCtbsv) +TBSV_LAUNCHER_USM(std::complex, cublasZtbsv) + +#undef TBSV_LAUNCHER_USM + +template +inline cl::sycl::event tpmv(Func func, cl::sycl::queue &queue, uplo upper_lower, transpose trans, + diag unit_diag, int64_t n, const T *a, T *x, int64_t incx, + const cl::sycl::vector_class &dependencies) { + throw backend_unsupported_exception(); +} + +#define TPMV_LAUNCHER_USM(TYPE, CUBLAS_ROUTINE) \ + cl::sycl::event tpmv(cl::sycl::queue &queue, uplo upper_lower, transpose trans, \ + diag unit_diag, int64_t n, const TYPE *a, TYPE *x, int64_t incx, \ + const cl::sycl::vector_class &dependencies) { \ + return tpmv(CUBLAS_ROUTINE, queue, upper_lower, trans, unit_diag, n, a, x, incx, \ + dependencies); \ + } + +TPMV_LAUNCHER_USM(float, cublasStpmv) +TPMV_LAUNCHER_USM(double, cublasDtpmv) +TPMV_LAUNCHER_USM(std::complex, cublasCtpmv) +TPMV_LAUNCHER_USM(std::complex, cublasZtpmv) + +#undef TPMV_LAUNCHER_USM + +template +inline cl::sycl::event tpsv(Func func, cl::sycl::queue &queue, uplo upper_lower, transpose trans, + diag unit_diag, int64_t n, const T *a, T *x, int64_t incx, + const cl::sycl::vector_class &dependencies) { + throw backend_unsupported_exception(); +} + +#define TPSV_LAUNCHER_USM(TYPE, CUBLAS_ROUTINE) \ + cl::sycl::event tpsv(cl::sycl::queue 
&queue, uplo upper_lower, transpose trans, \ + diag unit_diag, int64_t n, const TYPE *a, TYPE *x, int64_t incx, \ + const cl::sycl::vector_class &dependencies) { \ + return tpsv(CUBLAS_ROUTINE, queue, upper_lower, trans, unit_diag, n, a, x, incx, \ + dependencies); \ + } + +TPSV_LAUNCHER_USM(float, cublasStpsv) +TPSV_LAUNCHER_USM(double, cublasDtpsv) +TPSV_LAUNCHER_USM(std::complex, cublasCtpsv) +TPSV_LAUNCHER_USM(std::complex, cublasZtpsv) + +#undef TPSV_LAUNCHER_USM + +template +inline cl::sycl::event trmv(Func func, cl::sycl::queue &queue, uplo upper_lower, transpose trans, + diag unit_diag, int64_t n, const T *a, int64_t lda, T *x, int64_t incx, + const cl::sycl::vector_class &dependencies) { + throw backend_unsupported_exception(); +} + +#define TRMV_LAUNCHER_USM(TYPE, CUBLAS_ROUTINE) \ + cl::sycl::event trmv(cl::sycl::queue &queue, uplo upper_lower, transpose trans, \ + diag unit_diag, int64_t n, const TYPE *a, int64_t lda, TYPE *x, \ + int64_t incx, \ + const cl::sycl::vector_class &dependencies) { \ + return trmv(CUBLAS_ROUTINE, queue, upper_lower, trans, unit_diag, n, a, lda, x, incx, \ + dependencies); \ + } + +TRMV_LAUNCHER_USM(float, cublasStrmv) +TRMV_LAUNCHER_USM(double, cublasDtrmv) +TRMV_LAUNCHER_USM(std::complex, cublasCtrmv) +TRMV_LAUNCHER_USM(std::complex, cublasZtrmv) + +#undef TRMV_LAUNCHER_USM + +template +inline cl::sycl::event trsv(Func func, cl::sycl::queue &queue, uplo upper_lower, transpose trans, + diag unit_diag, int64_t n, const T *a, int64_t lda, T *x, int64_t incx, + const cl::sycl::vector_class &dependencies) { + throw backend_unsupported_exception(); +} + +#define TRSV_LAUNCHER_USM(TYPE, CUBLAS_ROUTINE) \ + cl::sycl::event trsv(cl::sycl::queue &queue, uplo upper_lower, transpose trans, \ + diag unit_diag, int64_t n, const TYPE *a, int64_t lda, TYPE *x, \ + int64_t incx, \ + const cl::sycl::vector_class &dependencies) { \ + return trsv(CUBLAS_ROUTINE, queue, upper_lower, trans, unit_diag, n, a, lda, x, incx, \ + dependencies); \ + } 
+ +TRSV_LAUNCHER_USM(float, cublasStrsv) +TRSV_LAUNCHER_USM(double, cublasDtrsv) +TRSV_LAUNCHER_USM(std::complex, cublasCtrsv) +TRSV_LAUNCHER_USM(std::complex, cublasZtrsv) + +#undef TRSV_LAUNCHER_USM + } // namespace cublas } // namespace onemkl diff --git a/src/blas/backends/cublas/cublas_level3.cpp b/src/blas/backends/cublas/cublas_level3.cpp index 81f7b7a91..0bb0c911b 100644 --- a/src/blas/backends/cublas/cublas_level3.cpp +++ b/src/blas/backends/cublas/cublas_level3.cpp @@ -19,10 +19,14 @@ #include #include "cublas_helper.hpp" #include "cublas_scope_handle.hpp" +#include "include/exceptions_helper.hpp" #include "onemkl/blas/detail/cublas/onemkl_blas_cublas.hpp" namespace onemkl { namespace cublas { + +// Buffer APIs + template inline void gemm(Func func, cl::sycl::queue &queue, transpose transa, transpose transb, int64_t m, int64_t n, int64_t k, T alpha, cl::sycl::buffer &a, int64_t lda, @@ -375,5 +379,220 @@ TRSM_LAUNCHER(std::complex, cublasCtrsm) TRSM_LAUNCHER(std::complex, cublasZtrsm) #undef TRSM_LAUNCHER + +// USM APIs + +template +inline cl::sycl::event gemm(Func func, cl::sycl::queue &queue, transpose transa, transpose transb, + int64_t m, int64_t n, int64_t k, T alpha, const T *a, int64_t lda, + const T *b, int64_t ldb, T beta, T *c, int64_t ldc, + const cl::sycl::vector_class &dependencies) { + throw backend_unsupported_exception(); +} + +#define GEMM_LAUNCHER_USM(TYPE, CUBLAS_ROUTINE) \ + cl::sycl::event gemm(cl::sycl::queue &queue, transpose transa, transpose transb, int64_t m, \ + int64_t n, int64_t k, TYPE alpha, const TYPE *a, int64_t lda, \ + const TYPE *b, int64_t ldb, TYPE beta, TYPE *c, int64_t ldc, \ + const cl::sycl::vector_class &dependencies) { \ + return gemm(CUBLAS_ROUTINE, queue, transa, transb, m, n, k, alpha, a, lda, b, ldb, beta, \ + c, ldc, dependencies); \ + } + +GEMM_LAUNCHER_USM(float, cublasSgemm) +GEMM_LAUNCHER_USM(double, cublasDgemm) +GEMM_LAUNCHER_USM(std::complex, cublasCgemm) +GEMM_LAUNCHER_USM(std::complex, cublasZgemm) 
+
+#undef GEMM_LAUNCHER_USM
+
+template <typename Func, typename T> // Func: cuBLAS routine type; T: element/scalar type
+inline cl::sycl::event symm(Func func, cl::sycl::queue &queue, side left_right, uplo upper_lower,
+                            int64_t m, int64_t n, T alpha, const T *a, int64_t lda, const T *b,
+                            int64_t ldb, T beta, T *c, int64_t ldc,
+                            const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+    throw backend_unsupported_exception(); // USM stub: ignores func and operands, always throws
+}
+
+#define SYMM_LAUNCHER_USM(TYPE, CUBLAS_ROUTINE) \
+    cl::sycl::event symm(cl::sycl::queue &queue, side left_right, uplo upper_lower, int64_t m, \
+                         int64_t n, TYPE alpha, const TYPE *a, int64_t lda, const TYPE *b, \
+                         int64_t ldb, TYPE beta, TYPE *c, int64_t ldc, \
+                         const cl::sycl::vector_class<cl::sycl::event> &dependencies) { \
+        return symm(CUBLAS_ROUTINE, queue, left_right, upper_lower, m, n, alpha, a, lda, b, ldb, \
+                    beta, c, ldc, dependencies); \
+    }
+
+SYMM_LAUNCHER_USM(float, cublasSsymm)
+SYMM_LAUNCHER_USM(double, cublasDsymm)
+SYMM_LAUNCHER_USM(std::complex<float>, cublasCsymm)
+SYMM_LAUNCHER_USM(std::complex<double>, cublasZsymm)
+
+#undef SYMM_LAUNCHER_USM
+
+template <typename Func, typename T> // Func: cuBLAS routine type; T: element/scalar type
+inline cl::sycl::event hemm(Func func, cl::sycl::queue &queue, side left_right, uplo upper_lower,
+                            int64_t m, int64_t n, T alpha, const T *a, int64_t lda, const T *b,
+                            int64_t ldb, T beta, T *c, int64_t ldc,
+                            const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+    throw backend_unsupported_exception(); // USM stub: ignores func and operands, always throws
+}
+
+#define HEMM_LAUNCHER_USM(TYPE, CUBLAS_ROUTINE) \
+    cl::sycl::event hemm(cl::sycl::queue &queue, side left_right, uplo upper_lower, int64_t m, \
+                         int64_t n, TYPE alpha, const TYPE *a, int64_t lda, const TYPE *b, \
+                         int64_t ldb, TYPE beta, TYPE *c, int64_t ldc, \
+                         const cl::sycl::vector_class<cl::sycl::event> &dependencies) { \
+        return hemm(CUBLAS_ROUTINE, queue, left_right, upper_lower, m, n, alpha, a, lda, b, ldb, \
+                    beta, c, ldc, dependencies); \
+    }
+HEMM_LAUNCHER_USM(std::complex<float>, cublasChemm)
+HEMM_LAUNCHER_USM(std::complex<double>, cublasZhemm)
+
+#undef HEMM_LAUNCHER_USM
+
+template <typename Func, typename T> // Func: cuBLAS routine type; T: element/scalar type
+inline cl::sycl::event syrk(Func func, cl::sycl::queue &queue, uplo upper_lower, transpose trans,
+                            int64_t n, int64_t k, T alpha, const T *a, int64_t lda, T beta, T *c,
+                            int64_t ldc,
+                            const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+    throw backend_unsupported_exception(); // USM stub: ignores func and operands, always throws
+}
+
+#define SYRK_LAUNCHER_USM(TYPE, CUBLAS_ROUTINE) \
+    cl::sycl::event syrk(cl::sycl::queue &queue, uplo upper_lower, transpose trans, int64_t n, \
+                         int64_t k, TYPE alpha, const TYPE *a, int64_t lda, TYPE beta, TYPE *c, \
+                         int64_t ldc, \
+                         const cl::sycl::vector_class<cl::sycl::event> &dependencies) { \
+        return syrk(CUBLAS_ROUTINE, queue, upper_lower, trans, n, k, alpha, a, lda, beta, c, ldc, \
+                    dependencies); \
+    }
+
+SYRK_LAUNCHER_USM(float, cublasSsyrk)
+SYRK_LAUNCHER_USM(double, cublasDsyrk)
+SYRK_LAUNCHER_USM(std::complex<float>, cublasCsyrk)
+SYRK_LAUNCHER_USM(std::complex<double>, cublasZsyrk)
+
+#undef SYRK_LAUNCHER_USM
+
+template <typename Func, typename DataType, typename ScalarType> // matrix type vs. alpha/beta type
+inline cl::sycl::event herk(Func func, cl::sycl::queue &queue, uplo upper_lower, transpose trans,
+                            int64_t n, int64_t k, ScalarType alpha, const DataType *a, int64_t lda,
+                            ScalarType beta, DataType *c, int64_t ldc,
+                            const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+    throw backend_unsupported_exception(); // USM stub: ignores func and operands, always throws
+}
+
+#define HERK_LAUNCHER_USM(DATA_TYPE, SCALAR_TYPE, CUBLAS_ROUTINE) \
+    cl::sycl::event herk(cl::sycl::queue &queue, uplo upper_lower, transpose trans, int64_t n, \
+                         int64_t k, SCALAR_TYPE alpha, const DATA_TYPE *a, int64_t lda, \
+                         SCALAR_TYPE beta, DATA_TYPE *c, int64_t ldc, \
+                         const cl::sycl::vector_class<cl::sycl::event> &dependencies) { \
+        return herk(CUBLAS_ROUTINE, queue, upper_lower, trans, n, k, alpha, a, lda, beta, c, ldc, \
+                    dependencies); \
+    }
+
+HERK_LAUNCHER_USM(std::complex<float>, float, cublasCherk)
+HERK_LAUNCHER_USM(std::complex<double>, double, cublasZherk)
+
+#undef HERK_LAUNCHER_USM
+
+template <typename Func, typename T> // Func: cuBLAS routine type; T: element/scalar type
+inline cl::sycl::event syr2k(Func func, cl::sycl::queue &queue, uplo upper_lower, transpose trans,
+                             int64_t n, int64_t k, T alpha, const T *a, int64_t lda, const T *b,
+                             int64_t ldb, T beta, T *c, int64_t ldc,
+                             const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+    throw backend_unsupported_exception(); // USM stub: ignores func and operands, always throws
+}
+
+#define SYR2K_LAUNCHER_USM(TYPE, CUBLAS_ROUTINE) \
+    cl::sycl::event syr2k(cl::sycl::queue &queue, uplo upper_lower, transpose trans, int64_t n, \
+                          int64_t k, TYPE alpha, const TYPE *a, int64_t lda, const TYPE *b, \
+                          int64_t ldb, TYPE beta, TYPE *c, int64_t ldc, \
+                          const cl::sycl::vector_class<cl::sycl::event> &dependencies) { \
+        return syr2k(CUBLAS_ROUTINE, queue, upper_lower, trans, n, k, alpha, a, lda, b, ldb, beta, \
+                     c, ldc, dependencies); \
+    }
+SYR2K_LAUNCHER_USM(float, cublasSsyr2k)
+SYR2K_LAUNCHER_USM(double, cublasDsyr2k)
+SYR2K_LAUNCHER_USM(std::complex<float>, cublasCsyr2k)
+SYR2K_LAUNCHER_USM(std::complex<double>, cublasZsyr2k)
+
+#undef SYR2K_LAUNCHER_USM
+
+template <typename Func, typename DataType, typename ScalarType> // alpha is DataType, beta is ScalarType
+inline cl::sycl::event her2k(Func func, cl::sycl::queue &queue, uplo upper_lower, transpose trans,
+                             int64_t n, int64_t k, DataType alpha, const DataType *a, int64_t lda,
+                             const DataType *b, int64_t ldb, ScalarType beta, DataType *c,
+                             int64_t ldc,
+                             const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+    throw backend_unsupported_exception(); // USM stub: ignores func and operands, always throws
+}
+
+#define HER2K_LAUNCHER_USM(DATA_TYPE, SCALAR_TYPE, CUBLAS_ROUTINE) \
+    cl::sycl::event her2k(cl::sycl::queue &queue, uplo upper_lower, transpose trans, int64_t n, \
+                          int64_t k, DATA_TYPE alpha, const DATA_TYPE *a, int64_t lda, \
+                          const DATA_TYPE *b, int64_t ldb, SCALAR_TYPE beta, DATA_TYPE *c, \
+                          int64_t ldc, \
+                          const cl::sycl::vector_class<cl::sycl::event> &dependencies) { \
+        return her2k(CUBLAS_ROUTINE, queue, upper_lower, trans, n, k, alpha, a, lda, b, ldb, beta, \
+                     c, ldc, dependencies); \
+    }
+
+HER2K_LAUNCHER_USM(std::complex<float>, float, cublasCher2k)
+HER2K_LAUNCHER_USM(std::complex<double>, double, cublasZher2k)
+
+#undef HER2K_LAUNCHER_USM
+
+// NOTE: In cublas TRMM diverted from the netlib blas and for performance
+// reason it requires the C matrix to be
+// separated from the B matrix. It is possible to use B instead of C, but this
+// will slow-down the code.
+template <typename Func, typename T> // Func: cuBLAS routine type; T: element/scalar type
+inline cl::sycl::event trmm(Func func, cl::sycl::queue &queue, side left_right, uplo upper_lower,
+                            transpose trans, diag unit_diag, int64_t m, int64_t n, T alpha,
+                            const T *a, int64_t lda, T *b, int64_t ldb,
+                            const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+    throw backend_unsupported_exception(); // USM stub: ignores func and operands, always throws
+}
+
+#define TRMM_LAUNCHER_USM(TYPE, CUBLAS_ROUTINE) \
+    cl::sycl::event trmm(cl::sycl::queue &queue, side left_right, uplo upper_lower, \
+                         transpose trans, diag unit_diag, int64_t m, int64_t n, TYPE alpha, \
+                         const TYPE *a, int64_t lda, TYPE *b, int64_t ldb, \
+                         const cl::sycl::vector_class<cl::sycl::event> &dependencies) { \
+        return trmm(CUBLAS_ROUTINE, queue, left_right, upper_lower, trans, unit_diag, m, n, alpha, \
+                    a, lda, b, ldb, dependencies); \
+    }
+TRMM_LAUNCHER_USM(float, cublasStrmm)
+TRMM_LAUNCHER_USM(double, cublasDtrmm)
+TRMM_LAUNCHER_USM(std::complex<float>, cublasCtrmm)
+TRMM_LAUNCHER_USM(std::complex<double>, cublasZtrmm)
+
+#undef TRMM_LAUNCHER_USM
+
+template <typename Func, typename T> // Func: cuBLAS routine type; T: element/scalar type
+inline cl::sycl::event trsm(Func func, cl::sycl::queue &queue, side left_right, uplo upper_lower,
+                            transpose trans, diag unit_diag, int64_t m, int64_t n, T alpha,
+                            const T *a, int64_t lda, T *b, int64_t ldb,
+                            const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+    throw backend_unsupported_exception(); // USM stub: ignores func and operands, always throws
+}
+
+#define TRSM_LAUNCHER_USM(TYPE, CUBLAS_ROUTINE) \
+    cl::sycl::event trsm(cl::sycl::queue &queue, side left_right, uplo upper_lower, \
+                         transpose trans, diag unit_diag, int64_t m, int64_t n, TYPE alpha, \
+                         const TYPE *a, int64_t lda, TYPE *b, int64_t ldb, \
+                         const cl::sycl::vector_class<cl::sycl::event> &dependencies) { \
+        return trsm(CUBLAS_ROUTINE, queue, left_right, upper_lower, trans, unit_diag, m, n, alpha, \
+                    a, lda, b, ldb, dependencies); \
+    }
+TRSM_LAUNCHER_USM(float, cublasStrsm)
+TRSM_LAUNCHER_USM(double, cublasDtrsm)
+TRSM_LAUNCHER_USM(std::complex<float>, cublasCtrsm)
+TRSM_LAUNCHER_USM(std::complex<double>, cublasZtrsm)
+
+#undef TRSM_LAUNCHER_USM
+
 } // namespace cublas
 } // namespace onemkl
diff --git
a/src/blas/backends/cublas/mkl_blas_cublas_wrappers.cpp b/src/blas/backends/cublas/mkl_blas_cublas_wrappers.cpp index 2a6ee6dab..df55993b7 100644 --- a/src/blas/backends/cublas/mkl_blas_cublas_wrappers.cpp +++ b/src/blas/backends/cublas/mkl_blas_cublas_wrappers.cpp @@ -178,14 +178,6 @@ extern "C" function_table_t mkl_blas_table = { onemkl::cublas::gemm_batch, onemkl::cublas::gemm_batch, onemkl::cublas::gemm_batch, - onemkl::cublas::gemm_batch, - onemkl::cublas::gemm_batch, - onemkl::cublas::gemm_batch, - onemkl::cublas::gemm_batch, - onemkl::cublas::trsm_batch, - onemkl::cublas::trsm_batch, - onemkl::cublas::trsm_batch, - onemkl::cublas::trsm_batch, onemkl::cublas::trsm_batch, onemkl::cublas::trsm_batch, onemkl::cublas::trsm_batch, @@ -201,4 +193,170 @@ extern "C" function_table_t mkl_blas_table = { onemkl::cublas::gemm_ext, onemkl::cublas::gemm_ext, onemkl::cublas::gemm_ext, + onemkl::cublas::asum, + onemkl::cublas::asum, + onemkl::cublas::asum, + onemkl::cublas::asum, + onemkl::cublas::axpy, + onemkl::cublas::axpy, + onemkl::cublas::axpy, + onemkl::cublas::axpy, + onemkl::cublas::axpy_batch, + onemkl::cublas::axpy_batch, + onemkl::cublas::axpy_batch, + onemkl::cublas::axpy_batch, + onemkl::cublas::copy, + onemkl::cublas::copy, + onemkl::cublas::copy, + onemkl::cublas::copy, + onemkl::cublas::dot, + onemkl::cublas::dot, + onemkl::cublas::dot, + onemkl::cublas::dotc, + onemkl::cublas::dotc, + onemkl::cublas::dotu, + onemkl::cublas::dotu, + onemkl::cublas::iamin, + onemkl::cublas::iamin, + onemkl::cublas::iamin, + onemkl::cublas::iamin, + onemkl::cublas::iamax, + onemkl::cublas::iamax, + onemkl::cublas::iamax, + onemkl::cublas::iamax, + onemkl::cublas::nrm2, + onemkl::cublas::nrm2, + onemkl::cublas::nrm2, + onemkl::cublas::nrm2, + onemkl::cublas::rot, + onemkl::cublas::rot, + onemkl::cublas::rot, + onemkl::cublas::rot, + onemkl::cublas::rotg, + onemkl::cublas::rotg, + onemkl::cublas::rotg, + onemkl::cublas::rotg, + onemkl::cublas::rotm, + onemkl::cublas::rotm, + 
onemkl::cublas::rotmg, + onemkl::cublas::rotmg, + onemkl::cublas::scal, + onemkl::cublas::scal, + onemkl::cublas::scal, + onemkl::cublas::scal, + onemkl::cublas::scal, + onemkl::cublas::scal, + onemkl::cublas::sdsdot, + onemkl::cublas::swap, + onemkl::cublas::swap, + onemkl::cublas::swap, + onemkl::cublas::swap, + onemkl::cublas::gbmv, + onemkl::cublas::gbmv, + onemkl::cublas::gbmv, + onemkl::cublas::gbmv, + onemkl::cublas::gemv, + onemkl::cublas::gemv, + onemkl::cublas::gemv, + onemkl::cublas::gemv, + onemkl::cublas::ger, + onemkl::cublas::ger, + onemkl::cublas::gerc, + onemkl::cublas::gerc, + onemkl::cublas::geru, + onemkl::cublas::geru, + onemkl::cublas::hbmv, + onemkl::cublas::hbmv, + onemkl::cublas::hemv, + onemkl::cublas::hemv, + onemkl::cublas::her, + onemkl::cublas::her, + onemkl::cublas::her2, + onemkl::cublas::her2, + onemkl::cublas::hpmv, + onemkl::cublas::hpmv, + onemkl::cublas::hpr, + onemkl::cublas::hpr, + onemkl::cublas::hpr2, + onemkl::cublas::hpr2, + onemkl::cublas::sbmv, + onemkl::cublas::sbmv, + onemkl::cublas::spmv, + onemkl::cublas::spmv, + onemkl::cublas::spr, + onemkl::cublas::spr, + onemkl::cublas::spr2, + onemkl::cublas::spr2, + onemkl::cublas::symv, + onemkl::cublas::symv, + onemkl::cublas::syr, + onemkl::cublas::syr, + onemkl::cublas::syr2, + onemkl::cublas::syr2, + onemkl::cublas::tbmv, + onemkl::cublas::tbmv, + onemkl::cublas::tbmv, + onemkl::cublas::tbmv, + onemkl::cublas::tbsv, + onemkl::cublas::tbsv, + onemkl::cublas::tbsv, + onemkl::cublas::tbsv, + onemkl::cublas::tpmv, + onemkl::cublas::tpmv, + onemkl::cublas::tpmv, + onemkl::cublas::tpmv, + onemkl::cublas::tpsv, + onemkl::cublas::tpsv, + onemkl::cublas::tpsv, + onemkl::cublas::tpsv, + onemkl::cublas::trmv, + onemkl::cublas::trmv, + onemkl::cublas::trmv, + onemkl::cublas::trmv, + onemkl::cublas::trsv, + onemkl::cublas::trsv, + onemkl::cublas::trsv, + onemkl::cublas::trsv, + onemkl::cublas::gemm, + onemkl::cublas::gemm, + onemkl::cublas::gemm, + onemkl::cublas::gemm, + 
onemkl::cublas::hemm, + onemkl::cublas::hemm, + onemkl::cublas::herk, + onemkl::cublas::herk, + onemkl::cublas::her2k, + onemkl::cublas::her2k, + onemkl::cublas::symm, + onemkl::cublas::symm, + onemkl::cublas::symm, + onemkl::cublas::symm, + onemkl::cublas::syrk, + onemkl::cublas::syrk, + onemkl::cublas::syrk, + onemkl::cublas::syrk, + onemkl::cublas::syr2k, + onemkl::cublas::syr2k, + onemkl::cublas::syr2k, + onemkl::cublas::syr2k, + onemkl::cublas::trmm, + onemkl::cublas::trmm, + onemkl::cublas::trmm, + onemkl::cublas::trmm, + onemkl::cublas::trsm, + onemkl::cublas::trsm, + onemkl::cublas::trsm, + onemkl::cublas::trsm, + onemkl::cublas::gemm_batch, + onemkl::cublas::gemm_batch, + onemkl::cublas::gemm_batch, + onemkl::cublas::gemm_batch, + onemkl::cublas::gemm_batch, + onemkl::cublas::gemm_batch, + onemkl::cublas::gemm_batch, + onemkl::cublas::gemm_batch, + onemkl::cublas::gemmt, + onemkl::cublas::gemmt, + onemkl::cublas::gemmt, + onemkl::cublas::gemmt, }; diff --git a/src/blas/backends/mklcpu/cpu_batch.cpp b/src/blas/backends/mklcpu/cpu_batch.cpp index 43b6670e4..78fc55cb5 100644 --- a/src/blas/backends/mklcpu/cpu_batch.cpp +++ b/src/blas/backends/mklcpu/cpu_batch.cpp @@ -25,398 +25,7 @@ namespace onemkl { namespace mklcpu { -void gemm_batch(cl::sycl::queue &queue, cl::sycl::buffer &transa, - cl::sycl::buffer &transb, cl::sycl::buffer &m, - cl::sycl::buffer &n, cl::sycl::buffer &k, - cl::sycl::buffer &alpha, cl::sycl::buffer &a, - cl::sycl::buffer &lda, cl::sycl::buffer &b, - cl::sycl::buffer &ldb, cl::sycl::buffer &beta, - cl::sycl::buffer &c, cl::sycl::buffer &ldc, - int64_t group_count, cl::sycl::buffer &group_size) { - queue.submit([&](cl::sycl::handler &cgh) { - auto transa_acc = transa.get_access(cgh); - auto transb_acc = transb.get_access(cgh); - auto m_acc = m.get_access(cgh); - auto n_acc = n.get_access(cgh); - auto k_acc = k.get_access(cgh); - auto alpha_acc = alpha.get_access(cgh); - auto a_acc = a.get_access(cgh); - auto lda_acc = lda.get_access(cgh); 
- auto b_acc = b.get_access(cgh); - auto ldb_acc = ldb.get_access(cgh); - auto beta_acc = beta.get_access(cgh); - auto c_acc = c.get_access(cgh); - auto ldc_acc = ldc.get_access(cgh); - auto group_size_acc = group_size.get_access(cgh); - - host_task(cgh, [=]() { - int64_t total_size = 0; - - for (int64_t i = 0; i < group_count; i++) { - total_size += group_size_acc[i]; - } - - float **a_array = (float **)::malloc(sizeof(float *) * total_size); - float **b_array = (float **)::malloc(sizeof(float *) * total_size); - float **c_array = (float **)::malloc(sizeof(float *) * total_size); - MKL_INT *m_ = (MKL_INT *)::malloc(sizeof(MKL_INT) * group_count); - MKL_INT *n_ = (MKL_INT *)::malloc(sizeof(MKL_INT) * group_count); - MKL_INT *k_ = (MKL_INT *)::malloc(sizeof(MKL_INT) * group_count); - MKL_INT *lda_ = (MKL_INT *)::malloc(sizeof(MKL_INT) * group_count); - MKL_INT *ldb_ = (MKL_INT *)::malloc(sizeof(MKL_INT) * group_count); - MKL_INT *ldc_ = (MKL_INT *)::malloc(sizeof(MKL_INT) * group_count); - MKL_INT *group_size_ = (MKL_INT *)::malloc(sizeof(MKL_INT) * group_count); - int64_t offset_a = 0, offset_b = 0, offset_c = 0, idx = 0; - char *transa_ = (char *)::malloc(sizeof(char) * group_count); - char *transb_ = (char *)::malloc(sizeof(char) * group_count); - - for (int64_t i = 0; i < group_count; i++) { - m_[i] = m_acc[i]; - n_[i] = n_acc[i]; - k_[i] = k_acc[i]; - lda_[i] = lda_acc[i]; - ldb_[i] = ldb_acc[i]; - ldc_[i] = ldc_acc[i]; - group_size_[i] = group_size_acc[i]; - transa_[i] = *fortran_char(transa_acc[i]); - transb_[i] = *fortran_char(transb_acc[i]); - - for (int64_t j = 0; j < group_size_acc[i]; j++) { - if (idx == 0) { - a_array[0] = a_acc.get_pointer(); - b_array[0] = b_acc.get_pointer(); - c_array[0] = c_acc.get_pointer(); - } - else { - a_array[idx] = a_array[idx - 1] + offset_a; - b_array[idx] = b_array[idx - 1] + offset_b; - c_array[idx] = c_array[idx - 1] + offset_c; - } - idx++; - offset_a = (transa_acc[i] == transpose::nontrans) ? 
lda_acc[i] * k_acc[i] - : lda_acc[i] * m_acc[i]; - offset_b = (transb_acc[i] == transpose::nontrans) ? ldb_acc[i] * n_acc[i] - : ldb_acc[i] * k_acc[i]; - offset_c = ldc_acc[i] * n_acc[i]; - } - } - - ::sgemm_batch(transa_, transb_, m_, n_, k_, alpha_acc.get_pointer(), - (const float **)a_array, lda_, (const float **)b_array, ldb_, - beta_acc.get_pointer(), c_array, ldc_, (MKL_INT *)&group_count, - group_size_); - - ::free(a_array); - ::free(b_array); - ::free(c_array); - ::free(m_); - ::free(n_); - ::free(k_); - ::free(lda_); - ::free(ldb_); - ::free(ldc_); - ::free(group_size_); - ::free(transa_); - ::free(transb_); - }); - }); -} - -void gemm_batch(cl::sycl::queue &queue, cl::sycl::buffer &transa, - cl::sycl::buffer &transb, cl::sycl::buffer &m, - cl::sycl::buffer &n, cl::sycl::buffer &k, - cl::sycl::buffer &alpha, cl::sycl::buffer &a, - cl::sycl::buffer &lda, cl::sycl::buffer &b, - cl::sycl::buffer &ldb, cl::sycl::buffer &beta, - cl::sycl::buffer &c, cl::sycl::buffer &ldc, - int64_t group_count, cl::sycl::buffer &group_size) { - queue.submit([&](cl::sycl::handler &cgh) { - auto transa_acc = transa.get_access(cgh); - auto transb_acc = transb.get_access(cgh); - auto m_acc = m.get_access(cgh); - auto n_acc = n.get_access(cgh); - auto k_acc = k.get_access(cgh); - auto alpha_acc = alpha.get_access(cgh); - auto a_acc = a.get_access(cgh); - auto lda_acc = lda.get_access(cgh); - auto b_acc = b.get_access(cgh); - auto ldb_acc = ldb.get_access(cgh); - auto beta_acc = beta.get_access(cgh); - auto c_acc = c.get_access(cgh); - auto ldc_acc = ldc.get_access(cgh); - auto group_size_acc = group_size.get_access(cgh); - - host_task(cgh, [=]() { - int64_t total_size = 0; - - for (int64_t i = 0; i < group_count; i++) { - total_size += group_size_acc[i]; - } - - double **a_array = (double **)::malloc(sizeof(double *) * total_size); - double **b_array = (double **)::malloc(sizeof(double *) * total_size); - double **c_array = (double **)::malloc(sizeof(double *) * total_size); - 
MKL_INT *m_ = (MKL_INT *)::malloc(sizeof(MKL_INT) * group_count); - MKL_INT *n_ = (MKL_INT *)::malloc(sizeof(MKL_INT) * group_count); - MKL_INT *k_ = (MKL_INT *)::malloc(sizeof(MKL_INT) * group_count); - MKL_INT *lda_ = (MKL_INT *)::malloc(sizeof(MKL_INT) * group_count); - MKL_INT *ldb_ = (MKL_INT *)::malloc(sizeof(MKL_INT) * group_count); - MKL_INT *ldc_ = (MKL_INT *)::malloc(sizeof(MKL_INT) * group_count); - MKL_INT *group_size_ = (MKL_INT *)::malloc(sizeof(MKL_INT) * group_count); - int64_t offset_a = 0, offset_b = 0, offset_c = 0, idx = 0; - char *transa_ = (char *)::malloc(sizeof(char) * group_count); - char *transb_ = (char *)::malloc(sizeof(char) * group_count); - - for (int64_t i = 0; i < group_count; i++) { - m_[i] = m_acc[i]; - n_[i] = n_acc[i]; - k_[i] = k_acc[i]; - lda_[i] = lda_acc[i]; - ldb_[i] = ldb_acc[i]; - ldc_[i] = ldc_acc[i]; - group_size_[i] = group_size_acc[i]; - transa_[i] = *fortran_char(transa_acc[i]); - transb_[i] = *fortran_char(transb_acc[i]); - - for (int64_t j = 0; j < group_size_acc[i]; j++) { - if (idx == 0) { - a_array[0] = a_acc.get_pointer(); - b_array[0] = b_acc.get_pointer(); - c_array[0] = c_acc.get_pointer(); - } - else { - a_array[idx] = a_array[idx - 1] + offset_a; - b_array[idx] = b_array[idx - 1] + offset_b; - c_array[idx] = c_array[idx - 1] + offset_c; - } - idx++; - offset_a = (transa_acc[i] == transpose::nontrans) ? lda_acc[i] * k_acc[i] - : lda_acc[i] * m_acc[i]; - offset_b = (transb_acc[i] == transpose::nontrans) ? 
ldb_acc[i] * n_acc[i] - : ldb_acc[i] * k_acc[i]; - offset_c = ldc_acc[i] * n_acc[i]; - } - } - - ::dgemm_batch(transa_, transb_, m_, n_, k_, alpha_acc.get_pointer(), - (const double **)a_array, lda_, (const double **)b_array, ldb_, - beta_acc.get_pointer(), c_array, ldc_, (MKL_INT *)&group_count, - group_size_); - - ::free(a_array); - ::free(b_array); - ::free(c_array); - ::free(m_); - ::free(n_); - ::free(k_); - ::free(lda_); - ::free(ldb_); - ::free(ldc_); - ::free(group_size_); - ::free(transa_); - ::free(transb_); - }); - }); -} - -void gemm_batch(cl::sycl::queue &queue, cl::sycl::buffer &transa, - cl::sycl::buffer &transb, cl::sycl::buffer &m, - cl::sycl::buffer &n, cl::sycl::buffer &k, - cl::sycl::buffer, 1> &alpha, - cl::sycl::buffer, 1> &a, cl::sycl::buffer &lda, - cl::sycl::buffer, 1> &b, cl::sycl::buffer &ldb, - cl::sycl::buffer, 1> &beta, - cl::sycl::buffer, 1> &c, cl::sycl::buffer &ldc, - int64_t group_count, cl::sycl::buffer &group_size) { - queue.submit([&](cl::sycl::handler &cgh) { - auto transa_acc = transa.get_access(cgh); - auto transb_acc = transb.get_access(cgh); - auto m_acc = m.get_access(cgh); - auto n_acc = n.get_access(cgh); - auto k_acc = k.get_access(cgh); - auto alpha_acc = alpha.get_access(cgh); - auto a_acc = a.get_access(cgh); - auto lda_acc = lda.get_access(cgh); - auto b_acc = b.get_access(cgh); - auto ldb_acc = ldb.get_access(cgh); - auto beta_acc = beta.get_access(cgh); - auto c_acc = c.get_access(cgh); - auto ldc_acc = ldc.get_access(cgh); - auto group_size_acc = group_size.get_access(cgh); - - host_task(cgh, [=]() { - int64_t total_size = 0; - - for (int64_t i = 0; i < group_count; i++) { - total_size += group_size_acc[i]; - } - - MKL_Complex8 **a_array = (MKL_Complex8 **)::malloc(sizeof(MKL_Complex8 *) * total_size); - MKL_Complex8 **b_array = (MKL_Complex8 **)::malloc(sizeof(MKL_Complex8 *) * total_size); - MKL_Complex8 **c_array = (MKL_Complex8 **)::malloc(sizeof(MKL_Complex8 *) * total_size); - MKL_INT *m_ = (MKL_INT 
*)::malloc(sizeof(MKL_INT) * group_count); - MKL_INT *n_ = (MKL_INT *)::malloc(sizeof(MKL_INT) * group_count); - MKL_INT *k_ = (MKL_INT *)::malloc(sizeof(MKL_INT) * group_count); - MKL_INT *lda_ = (MKL_INT *)::malloc(sizeof(MKL_INT) * group_count); - MKL_INT *ldb_ = (MKL_INT *)::malloc(sizeof(MKL_INT) * group_count); - MKL_INT *ldc_ = (MKL_INT *)::malloc(sizeof(MKL_INT) * group_count); - MKL_INT *group_size_ = (MKL_INT *)::malloc(sizeof(MKL_INT) * group_count); - int64_t offset_a = 0, offset_b = 0, offset_c = 0, idx = 0; - char *transa_ = (char *)::malloc(sizeof(char) * group_count); - char *transb_ = (char *)::malloc(sizeof(char) * group_count); - - for (int64_t i = 0; i < group_count; i++) { - m_[i] = m_acc[i]; - n_[i] = n_acc[i]; - k_[i] = k_acc[i]; - lda_[i] = lda_acc[i]; - ldb_[i] = ldb_acc[i]; - ldc_[i] = ldc_acc[i]; - group_size_[i] = group_size_acc[i]; - transa_[i] = *fortran_char(transa_acc[i]); - transb_[i] = *fortran_char(transb_acc[i]); - - for (int64_t j = 0; j < group_size_acc[i]; j++) { - if (idx == 0) { - a_array[0] = a_acc.get_pointer(); - b_array[0] = b_acc.get_pointer(); - c_array[0] = c_acc.get_pointer(); - } - else { - a_array[idx] = a_array[idx - 1] + offset_a; - b_array[idx] = b_array[idx - 1] + offset_b; - c_array[idx] = c_array[idx - 1] + offset_c; - } - idx++; - offset_a = (transa_acc[i] == transpose::nontrans) ? lda_acc[i] * k_acc[i] - : lda_acc[i] * m_acc[i]; - offset_b = (transb_acc[i] == transpose::nontrans) ? 
ldb_acc[i] * n_acc[i] - : ldb_acc[i] * k_acc[i]; - offset_c = ldc_acc[i] * n_acc[i]; - } - } - - ::cgemm_batch(transa_, transb_, m_, n_, k_, alpha_acc.get_pointer(), - (const MKL_Complex8 **)a_array, lda_, (const MKL_Complex8 **)b_array, - ldb_, beta_acc.get_pointer(), c_array, ldc_, (MKL_INT *)&group_count, - group_size_); - - ::free(a_array); - ::free(b_array); - ::free(c_array); - ::free(m_); - ::free(n_); - ::free(k_); - ::free(lda_); - ::free(ldb_); - ::free(ldc_); - ::free(group_size_); - ::free(transa_); - ::free(transb_); - }); - }); -} - -void gemm_batch(cl::sycl::queue &queue, cl::sycl::buffer &transa, - cl::sycl::buffer &transb, cl::sycl::buffer &m, - cl::sycl::buffer &n, cl::sycl::buffer &k, - cl::sycl::buffer, 1> &alpha, - cl::sycl::buffer, 1> &a, cl::sycl::buffer &lda, - cl::sycl::buffer, 1> &b, cl::sycl::buffer &ldb, - cl::sycl::buffer, 1> &beta, - cl::sycl::buffer, 1> &c, cl::sycl::buffer &ldc, - int64_t group_count, cl::sycl::buffer &group_size) { - queue.submit([&](cl::sycl::handler &cgh) { - auto transa_acc = transa.get_access(cgh); - auto transb_acc = transb.get_access(cgh); - auto m_acc = m.get_access(cgh); - auto n_acc = n.get_access(cgh); - auto k_acc = k.get_access(cgh); - auto alpha_acc = alpha.get_access(cgh); - auto a_acc = a.get_access(cgh); - auto lda_acc = lda.get_access(cgh); - auto b_acc = b.get_access(cgh); - auto ldb_acc = ldb.get_access(cgh); - auto beta_acc = beta.get_access(cgh); - auto c_acc = c.get_access(cgh); - auto ldc_acc = ldc.get_access(cgh); - auto group_size_acc = group_size.get_access(cgh); - - host_task(cgh, [=]() { - int64_t total_size = 0; - - for (int64_t i = 0; i < group_count; i++) { - total_size += group_size_acc[i]; - } - - MKL_Complex16 **a_array = - (MKL_Complex16 **)::malloc(sizeof(MKL_Complex16 *) * total_size); - MKL_Complex16 **b_array = - (MKL_Complex16 **)::malloc(sizeof(MKL_Complex16 *) * total_size); - MKL_Complex16 **c_array = - (MKL_Complex16 **)::malloc(sizeof(MKL_Complex16 *) * total_size); - 
MKL_INT *m_ = (MKL_INT *)::malloc(sizeof(MKL_INT) * group_count); - MKL_INT *n_ = (MKL_INT *)::malloc(sizeof(MKL_INT) * group_count); - MKL_INT *k_ = (MKL_INT *)::malloc(sizeof(MKL_INT) * group_count); - MKL_INT *lda_ = (MKL_INT *)::malloc(sizeof(MKL_INT) * group_count); - MKL_INT *ldb_ = (MKL_INT *)::malloc(sizeof(MKL_INT) * group_count); - MKL_INT *ldc_ = (MKL_INT *)::malloc(sizeof(MKL_INT) * group_count); - MKL_INT *group_size_ = (MKL_INT *)::malloc(sizeof(MKL_INT) * group_count); - int64_t offset_a = 0, offset_b = 0, offset_c = 0, idx = 0; - char *transa_ = (char *)::malloc(sizeof(char) * group_count); - char *transb_ = (char *)::malloc(sizeof(char) * group_count); - - for (int64_t i = 0; i < group_count; i++) { - m_[i] = m_acc[i]; - n_[i] = n_acc[i]; - k_[i] = k_acc[i]; - lda_[i] = lda_acc[i]; - ldb_[i] = ldb_acc[i]; - ldc_[i] = ldc_acc[i]; - group_size_[i] = group_size_acc[i]; - transa_[i] = *fortran_char(transa_acc[i]); - transb_[i] = *fortran_char(transb_acc[i]); - - for (int64_t j = 0; j < group_size_acc[i]; j++) { - if (idx == 0) { - a_array[0] = a_acc.get_pointer(); - b_array[0] = b_acc.get_pointer(); - c_array[0] = c_acc.get_pointer(); - } - else { - a_array[idx] = a_array[idx - 1] + offset_a; - b_array[idx] = b_array[idx - 1] + offset_b; - c_array[idx] = c_array[idx - 1] + offset_c; - } - idx++; - offset_a = (transa_acc[i] == transpose::nontrans) ? lda_acc[i] * k_acc[i] - : lda_acc[i] * m_acc[i]; - offset_b = (transb_acc[i] == transpose::nontrans) ? 
ldb_acc[i] * n_acc[i] - : ldb_acc[i] * k_acc[i]; - offset_c = ldc_acc[i] * n_acc[i]; - } - } - - ::zgemm_batch(transa_, transb_, m_, n_, k_, alpha_acc.get_pointer(), - (const MKL_Complex16 **)a_array, lda_, (const MKL_Complex16 **)b_array, - ldb_, beta_acc.get_pointer(), c_array, ldc_, (MKL_INT *)&group_count, - group_size_); - - ::free(a_array); - ::free(b_array); - ::free(c_array); - ::free(m_); - ::free(n_); - ::free(k_); - ::free(lda_); - ::free(ldb_); - ::free(ldc_); - ::free(group_size_); - ::free(transa_); - ::free(transb_); - }); - }); -} +// Buffer APIs void gemm_batch(cl::sycl::queue &queue, transpose transa, transpose transb, int64_t m, int64_t n, int64_t k, float alpha, cl::sycl::buffer &a, int64_t lda, @@ -435,6 +44,13 @@ void gemm_batch(cl::sycl::queue &queue, transpose transa, transpose transb, int6 float **a_array = (float **)::malloc(sizeof(float *) * batch_size); float **b_array = (float **)::malloc(sizeof(float *) * batch_size); float **c_array = (float **)::malloc(sizeof(float *) * batch_size); + if ((a_array == NULL) || (b_array == NULL) || (c_array == NULL)) { + std::cout << "Error cannot allocate input arrays\n"; + ::free(a_array); + ::free(b_array); + ::free(c_array); + return; + } for (int64_t i = 0; i < batch_size; i++) { if (i == 0) { @@ -479,6 +95,13 @@ void gemm_batch(cl::sycl::queue &queue, transpose transa, transpose transb, int6 double **a_array = (double **)::malloc(sizeof(double *) * batch_size); double **b_array = (double **)::malloc(sizeof(double *) * batch_size); double **c_array = (double **)::malloc(sizeof(double *) * batch_size); + if ((a_array == NULL) || (b_array == NULL) || (c_array == NULL)) { + std::cout << "Error cannot allocate input arrays\n"; + ::free(a_array); + ::free(b_array); + ::free(c_array); + return; + } for (int64_t i = 0; i < batch_size; i++) { if (i == 0) { @@ -524,6 +147,13 @@ void gemm_batch(cl::sycl::queue &queue, transpose transa, transpose transb, int6 MKL_Complex8 **a_array = (MKL_Complex8 
**)::malloc(sizeof(MKL_Complex8 *) * batch_size); MKL_Complex8 **b_array = (MKL_Complex8 **)::malloc(sizeof(MKL_Complex8 *) * batch_size); MKL_Complex8 **c_array = (MKL_Complex8 **)::malloc(sizeof(MKL_Complex8 *) * batch_size); + if ((a_array == NULL) || (b_array == NULL) || (c_array == NULL)) { + std::cout << "Error cannot allocate input arrays\n"; + ::free(a_array); + ::free(b_array); + ::free(c_array); + return; + } for (int64_t i = 0; i < batch_size; i++) { if (i == 0) { @@ -572,6 +202,13 @@ void gemm_batch(cl::sycl::queue &queue, transpose transa, transpose transb, int6 (MKL_Complex16 **)::malloc(sizeof(MKL_Complex16 *) * batch_size); MKL_Complex16 **c_array = (MKL_Complex16 **)::malloc(sizeof(MKL_Complex16 *) * batch_size); + if ((a_array == NULL) || (b_array == NULL) || (c_array == NULL)) { + std::cout << "Error cannot allocate input arrays\n"; + ::free(a_array); + ::free(b_array); + ::free(c_array); + return; + } for (int64_t i = 0; i < batch_size; i++) { if (i == 0) { @@ -599,92 +236,6 @@ void gemm_batch(cl::sycl::queue &queue, transpose transa, transpose transb, int6 }); } -void trsm_batch(cl::sycl::queue &queue, cl::sycl::buffer &left_right, - cl::sycl::buffer &upper_lower, cl::sycl::buffer &trans, - cl::sycl::buffer &unit_diag, cl::sycl::buffer &m, - cl::sycl::buffer &n, cl::sycl::buffer &alpha, - cl::sycl::buffer &a, cl::sycl::buffer &lda, - cl::sycl::buffer &b, cl::sycl::buffer &ldb, - int64_t group_count, cl::sycl::buffer &group_size) { - queue.submit([&](cl::sycl::handler &cgh) { - auto side_acc = left_right.get_access(cgh); - auto uplo_acc = upper_lower.get_access(cgh); - auto trans_acc = trans.get_access(cgh); - auto diag_acc = unit_diag.get_access(cgh); - auto m_acc = m.get_access(cgh); - auto n_acc = n.get_access(cgh); - auto alpha_acc = alpha.get_access(cgh); - auto a_acc = a.get_access(cgh); - auto lda_acc = lda.get_access(cgh); - auto b_acc = b.get_access(cgh); - auto ldb_acc = ldb.get_access(cgh); - auto group_size_acc = 
group_size.get_access(cgh); - host_task(cgh, [=]() { - int64_t total_size = 0; - - for (int64_t i = 0; i < group_count; i++) { - total_size += group_size_acc[i]; - } - - float **a_array = (float **)::malloc(sizeof(float *) * total_size); - float **b_array = (float **)::malloc(sizeof(float *) * total_size); - MKL_INT *m_ = (MKL_INT *)::malloc(sizeof(MKL_INT) * group_count); - MKL_INT *n_ = (MKL_INT *)::malloc(sizeof(MKL_INT) * group_count); - MKL_INT *lda_ = (MKL_INT *)::malloc(sizeof(MKL_INT) * group_count); - MKL_INT *ldb_ = (MKL_INT *)::malloc(sizeof(MKL_INT) * group_count); - MKL_INT *group_size_ = (MKL_INT *)::malloc(sizeof(MKL_INT) * group_count); - int64_t offset_a = 0, offset_b = 0, idx = 0; - char *side_ = (char *)::malloc(sizeof(char) * group_count); - char *uplo_ = (char *)::malloc(sizeof(char) * group_count); - char *trans_ = (char *)::malloc(sizeof(char) * group_count); - char *diag_ = (char *)::malloc(sizeof(char) * group_count); - - for (int64_t i = 0; i < group_count; i++) { - m_[i] = m_acc[i]; - n_[i] = n_acc[i]; - lda_[i] = lda_acc[i]; - ldb_[i] = ldb_acc[i]; - group_size_[i] = group_size_acc[i]; - trans_[i] = *fortran_char(trans_acc[i]); - side_[i] = *fortran_char(side_acc[i]); - uplo_[i] = *fortran_char(uplo_acc[i]); - diag_[i] = *fortran_char(diag_acc[i]); - - for (int64_t j = 0; j < group_size_acc[i]; j++) { - if (idx == 0) { - a_array[0] = a_acc.get_pointer(); - b_array[0] = b_acc.get_pointer(); - } - else { - a_array[idx] = a_array[idx - 1] + offset_a; - b_array[idx] = b_array[idx - 1] + offset_b; - } - idx++; - offset_a = - (side_acc[i] == side::left) ? 
lda_acc[i] * m_acc[i] : lda_acc[i] * n_acc[i]; - offset_b = ldb_acc[i] * n_acc[i]; - } - } - - ::strsm_batch(side_, uplo_, trans_, diag_, m_, n_, alpha_acc.get_pointer(), - (const float **)a_array, lda_, (float **)b_array, ldb_, - (MKL_INT *)&group_count, group_size_); - - ::free(a_array); - ::free(b_array); - ::free(m_); - ::free(n_); - ::free(lda_); - ::free(ldb_); - ::free(group_size_); - ::free(side_); - ::free(uplo_); - ::free(trans_); - ::free(diag_); - }); - }); -} - void trsm_batch(cl::sycl::queue &queue, side left_right, uplo upper_lower, transpose trans, diag unit_diag, int64_t m, int64_t n, float alpha, cl::sycl::buffer &a, int64_t lda, int64_t stride_a, cl::sycl::buffer &b, int64_t ldb, @@ -701,6 +252,12 @@ void trsm_batch(cl::sycl::queue &queue, side left_right, uplo upper_lower, trans host_task(cgh, [=]() { float **a_array = (float **)::malloc(sizeof(float *) * batch_size); float **b_array = (float **)::malloc(sizeof(float *) * batch_size); + if ((a_array == NULL) || (b_array == NULL)) { + std::cout << "Error cannot allocate input arrays\n"; + ::free(a_array); + ::free(b_array); + return; + } for (int64_t i = 0; i < batch_size; i++) { if (i == 0) { @@ -724,92 +281,6 @@ void trsm_batch(cl::sycl::queue &queue, side left_right, uplo upper_lower, trans }); } -void trsm_batch(cl::sycl::queue &queue, cl::sycl::buffer &left_right, - cl::sycl::buffer &upper_lower, cl::sycl::buffer &trans, - cl::sycl::buffer &unit_diag, cl::sycl::buffer &m, - cl::sycl::buffer &n, cl::sycl::buffer &alpha, - cl::sycl::buffer &a, cl::sycl::buffer &lda, - cl::sycl::buffer &b, cl::sycl::buffer &ldb, - int64_t group_count, cl::sycl::buffer &group_size) { - queue.submit([&](cl::sycl::handler &cgh) { - auto side_acc = left_right.get_access(cgh); - auto uplo_acc = upper_lower.get_access(cgh); - auto trans_acc = trans.get_access(cgh); - auto diag_acc = unit_diag.get_access(cgh); - auto m_acc = m.get_access(cgh); - auto n_acc = n.get_access(cgh); - auto alpha_acc = alpha.get_access(cgh); 
- auto a_acc = a.get_access(cgh); - auto lda_acc = lda.get_access(cgh); - auto b_acc = b.get_access(cgh); - auto ldb_acc = ldb.get_access(cgh); - auto group_size_acc = group_size.get_access(cgh); - host_task(cgh, [=]() { - int64_t total_size = 0; - - for (int64_t i = 0; i < group_count; i++) { - total_size += group_size_acc[i]; - } - - double **a_array = (double **)::malloc(sizeof(double *) * total_size); - double **b_array = (double **)::malloc(sizeof(double *) * total_size); - MKL_INT *m_ = (MKL_INT *)::malloc(sizeof(MKL_INT) * group_count); - MKL_INT *n_ = (MKL_INT *)::malloc(sizeof(MKL_INT) * group_count); - MKL_INT *lda_ = (MKL_INT *)::malloc(sizeof(MKL_INT) * group_count); - MKL_INT *ldb_ = (MKL_INT *)::malloc(sizeof(MKL_INT) * group_count); - MKL_INT *group_size_ = (MKL_INT *)::malloc(sizeof(MKL_INT) * group_count); - int64_t offset_a = 0, offset_b = 0, idx = 0; - char *side_ = (char *)::malloc(sizeof(char) * group_count); - char *uplo_ = (char *)::malloc(sizeof(char) * group_count); - char *trans_ = (char *)::malloc(sizeof(char) * group_count); - char *diag_ = (char *)::malloc(sizeof(char) * group_count); - - for (int64_t i = 0; i < group_count; i++) { - m_[i] = m_acc[i]; - n_[i] = n_acc[i]; - lda_[i] = lda_acc[i]; - ldb_[i] = ldb_acc[i]; - group_size_[i] = group_size_acc[i]; - trans_[i] = *fortran_char(trans_acc[i]); - side_[i] = *fortran_char(side_acc[i]); - uplo_[i] = *fortran_char(uplo_acc[i]); - diag_[i] = *fortran_char(diag_acc[i]); - - for (int64_t j = 0; j < group_size_acc[i]; j++) { - if (idx == 0) { - a_array[0] = a_acc.get_pointer(); - b_array[0] = b_acc.get_pointer(); - } - else { - a_array[idx] = a_array[idx - 1] + offset_a; - b_array[idx] = b_array[idx - 1] + offset_b; - } - idx++; - offset_a = - (side_acc[i] == side::left) ? 
lda_acc[i] * m_acc[i] : lda_acc[i] * n_acc[i]; - offset_b = ldb_acc[i] * n_acc[i]; - } - } - - ::dtrsm_batch(side_, uplo_, trans_, diag_, m_, n_, alpha_acc.get_pointer(), - (const double **)a_array, lda_, (double **)b_array, ldb_, - (MKL_INT *)&group_count, group_size_); - - ::free(a_array); - ::free(b_array); - ::free(m_); - ::free(n_); - ::free(lda_); - ::free(ldb_); - ::free(group_size_); - ::free(side_); - ::free(uplo_); - ::free(trans_); - ::free(diag_); - }); - }); -} - void trsm_batch(cl::sycl::queue &queue, side left_right, uplo upper_lower, transpose trans, diag unit_diag, int64_t m, int64_t n, double alpha, cl::sycl::buffer &a, int64_t lda, int64_t stride_a, cl::sycl::buffer &b, int64_t ldb, @@ -826,6 +297,12 @@ void trsm_batch(cl::sycl::queue &queue, side left_right, uplo upper_lower, trans host_task(cgh, [=]() { double **a_array = (double **)::malloc(sizeof(double *) * batch_size); double **b_array = (double **)::malloc(sizeof(double *) * batch_size); + if ((a_array == NULL) || (b_array == NULL)) { + std::cout << "Error cannot allocate input arrays\n"; + ::free(a_array); + ::free(b_array); + return; + } for (int64_t i = 0; i < batch_size; i++) { if (i == 0) { @@ -849,92 +326,6 @@ void trsm_batch(cl::sycl::queue &queue, side left_right, uplo upper_lower, trans }); } -void trsm_batch(cl::sycl::queue &queue, cl::sycl::buffer &left_right, - cl::sycl::buffer &upper_lower, cl::sycl::buffer &trans, - cl::sycl::buffer &unit_diag, cl::sycl::buffer &m, - cl::sycl::buffer &n, cl::sycl::buffer, 1> &alpha, - cl::sycl::buffer, 1> &a, cl::sycl::buffer &lda, - cl::sycl::buffer, 1> &b, cl::sycl::buffer &ldb, - int64_t group_count, cl::sycl::buffer &group_size) { - queue.submit([&](cl::sycl::handler &cgh) { - auto side_acc = left_right.get_access(cgh); - auto uplo_acc = upper_lower.get_access(cgh); - auto trans_acc = trans.get_access(cgh); - auto diag_acc = unit_diag.get_access(cgh); - auto m_acc = m.get_access(cgh); - auto n_acc = n.get_access(cgh); - auto alpha_acc = 
alpha.get_access(cgh); - auto a_acc = a.get_access(cgh); - auto lda_acc = lda.get_access(cgh); - auto b_acc = b.get_access(cgh); - auto ldb_acc = ldb.get_access(cgh); - auto group_size_acc = group_size.get_access(cgh); - host_task(cgh, [=]() { - int64_t total_size = 0; - - for (int64_t i = 0; i < group_count; i++) { - total_size += group_size_acc[i]; - } - - MKL_Complex8 **a_array = (MKL_Complex8 **)::malloc(sizeof(MKL_Complex8 *) * total_size); - MKL_Complex8 **b_array = (MKL_Complex8 **)::malloc(sizeof(MKL_Complex8 *) * total_size); - MKL_INT *m_ = (MKL_INT *)::malloc(sizeof(MKL_INT) * group_count); - MKL_INT *n_ = (MKL_INT *)::malloc(sizeof(MKL_INT) * group_count); - MKL_INT *lda_ = (MKL_INT *)::malloc(sizeof(MKL_INT) * group_count); - MKL_INT *ldb_ = (MKL_INT *)::malloc(sizeof(MKL_INT) * group_count); - MKL_INT *group_size_ = (MKL_INT *)::malloc(sizeof(MKL_INT) * group_count); - int64_t offset_a = 0, offset_b = 0, idx = 0; - char *side_ = (char *)::malloc(sizeof(char) * group_count); - char *uplo_ = (char *)::malloc(sizeof(char) * group_count); - char *trans_ = (char *)::malloc(sizeof(char) * group_count); - char *diag_ = (char *)::malloc(sizeof(char) * group_count); - - for (int64_t i = 0; i < group_count; i++) { - m_[i] = m_acc[i]; - n_[i] = n_acc[i]; - lda_[i] = lda_acc[i]; - ldb_[i] = ldb_acc[i]; - group_size_[i] = group_size_acc[i]; - trans_[i] = *fortran_char(trans_acc[i]); - side_[i] = *fortran_char(side_acc[i]); - uplo_[i] = *fortran_char(uplo_acc[i]); - diag_[i] = *fortran_char(diag_acc[i]); - - for (int64_t j = 0; j < group_size_acc[i]; j++) { - if (idx == 0) { - a_array[0] = a_acc.get_pointer(); - b_array[0] = b_acc.get_pointer(); - } - else { - a_array[idx] = a_array[idx - 1] + offset_a; - b_array[idx] = b_array[idx - 1] + offset_b; - } - idx++; - offset_a = - (side_acc[i] == side::left) ? 
lda_acc[i] * m_acc[i] : lda_acc[i] * n_acc[i]; - offset_b = ldb_acc[i] * n_acc[i]; - } - } - - ::ctrsm_batch(side_, uplo_, trans_, diag_, m_, n_, alpha_acc.get_pointer(), - (const MKL_Complex8 **)a_array, lda_, (MKL_Complex8 **)b_array, ldb_, - (MKL_INT *)&group_count, group_size_); - - ::free(a_array); - ::free(b_array); - ::free(m_); - ::free(n_); - ::free(lda_); - ::free(ldb_); - ::free(group_size_); - ::free(side_); - ::free(uplo_); - ::free(trans_); - ::free(diag_); - }); - }); -} - void trsm_batch(cl::sycl::queue &queue, side left_right, uplo upper_lower, transpose trans, diag unit_diag, int64_t m, int64_t n, std::complex alpha, cl::sycl::buffer, 1> &a, int64_t lda, int64_t stride_a, @@ -952,6 +343,12 @@ void trsm_batch(cl::sycl::queue &queue, side left_right, uplo upper_lower, trans host_task(cgh, [=]() { MKL_Complex8 **a_array = (MKL_Complex8 **)::malloc(sizeof(MKL_Complex8 *) * batch_size); MKL_Complex8 **b_array = (MKL_Complex8 **)::malloc(sizeof(MKL_Complex8 *) * batch_size); + if ((a_array == NULL) || (b_array == NULL)) { + std::cout << "Error cannot allocate input arrays\n"; + ::free(a_array); + ::free(b_array); + return; + } for (int64_t i = 0; i < batch_size; i++) { if (i == 0) { @@ -975,94 +372,6 @@ void trsm_batch(cl::sycl::queue &queue, side left_right, uplo upper_lower, trans }); } -void trsm_batch(cl::sycl::queue &queue, cl::sycl::buffer &left_right, - cl::sycl::buffer &upper_lower, cl::sycl::buffer &trans, - cl::sycl::buffer &unit_diag, cl::sycl::buffer &m, - cl::sycl::buffer &n, cl::sycl::buffer, 1> &alpha, - cl::sycl::buffer, 1> &a, cl::sycl::buffer &lda, - cl::sycl::buffer, 1> &b, cl::sycl::buffer &ldb, - int64_t group_count, cl::sycl::buffer &group_size) { - queue.submit([&](cl::sycl::handler &cgh) { - auto side_acc = left_right.get_access(cgh); - auto uplo_acc = upper_lower.get_access(cgh); - auto trans_acc = trans.get_access(cgh); - auto diag_acc = unit_diag.get_access(cgh); - auto m_acc = m.get_access(cgh); - auto n_acc = 
n.get_access(cgh); - auto alpha_acc = alpha.get_access(cgh); - auto a_acc = a.get_access(cgh); - auto lda_acc = lda.get_access(cgh); - auto b_acc = b.get_access(cgh); - auto ldb_acc = ldb.get_access(cgh); - auto group_size_acc = group_size.get_access(cgh); - - host_task(cgh, [=]() { - int64_t total_size = 0; - - for (int64_t i = 0; i < group_count; i++) { - total_size += group_size_acc[i]; - } - - MKL_Complex16 **a_array = - (MKL_Complex16 **)::malloc(sizeof(MKL_Complex16 *) * total_size); - MKL_Complex16 **b_array = - (MKL_Complex16 **)::malloc(sizeof(MKL_Complex16 *) * total_size); - MKL_INT *m_ = (MKL_INT *)::malloc(sizeof(MKL_INT) * group_count); - MKL_INT *n_ = (MKL_INT *)::malloc(sizeof(MKL_INT) * group_count); - MKL_INT *lda_ = (MKL_INT *)::malloc(sizeof(MKL_INT) * group_count); - MKL_INT *ldb_ = (MKL_INT *)::malloc(sizeof(MKL_INT) * group_count); - MKL_INT *group_size_ = (MKL_INT *)::malloc(sizeof(MKL_INT) * group_count); - int64_t offset_a = 0, offset_b = 0, idx = 0; - char *side_ = (char *)::malloc(sizeof(char) * group_count); - char *uplo_ = (char *)::malloc(sizeof(char) * group_count); - char *trans_ = (char *)::malloc(sizeof(char) * group_count); - char *diag_ = (char *)::malloc(sizeof(char) * group_count); - - for (int64_t i = 0; i < group_count; i++) { - m_[i] = m_acc[i]; - n_[i] = n_acc[i]; - lda_[i] = lda_acc[i]; - ldb_[i] = ldb_acc[i]; - group_size_[i] = group_size_acc[i]; - trans_[i] = *fortran_char(trans_acc[i]); - side_[i] = *fortran_char(side_acc[i]); - uplo_[i] = *fortran_char(uplo_acc[i]); - diag_[i] = *fortran_char(diag_acc[i]); - for (int64_t j = 0; j < group_size_acc[i]; j++) { - if (idx == 0) { - a_array[0] = a_acc.get_pointer(); - b_array[0] = b_acc.get_pointer(); - } - else { - a_array[idx] = a_array[idx - 1] + offset_a; - b_array[idx] = b_array[idx - 1] + offset_b; - } - idx++; - offset_a = - (side_acc[i] == side::left) ? 
lda_acc[i] * m_acc[i] : lda_acc[i] * n_acc[i]; - offset_b = ldb_acc[i] * n_acc[i]; - } - } - - ::ztrsm_batch(side_, uplo_, trans_, diag_, m_, n_, alpha_acc.get_pointer(), - (const MKL_Complex16 **)a_array, lda_, (MKL_Complex16 **)b_array, ldb_, - (MKL_INT *)&group_count, group_size_); - - ::free(a_array); - ::free(b_array); - ::free(m_); - ::free(n_); - ::free(lda_); - ::free(ldb_); - ::free(group_size_); - ::free(side_); - ::free(uplo_); - ::free(trans_); - ::free(diag_); - }); - }); -} - void trsm_batch(cl::sycl::queue &queue, side left_right, uplo upper_lower, transpose trans, diag unit_diag, int64_t m, int64_t n, std::complex alpha, cl::sycl::buffer, 1> &a, int64_t lda, int64_t stride_a, @@ -1081,6 +390,12 @@ void trsm_batch(cl::sycl::queue &queue, side left_right, uplo upper_lower, trans (MKL_Complex16 **)::malloc(sizeof(MKL_Complex16 *) * batch_size); MKL_Complex16 **b_array = (MKL_Complex16 **)::malloc(sizeof(MKL_Complex16 *) * batch_size); + if ((a_array == NULL) || (b_array == NULL)) { + std::cout << "Error cannot allocate input arrays\n"; + ::free(a_array); + ::free(b_array); + return; + } for (int64_t i = 0; i < batch_size; i++) { if (i == 0) { @@ -1104,5 +419,458 @@ void trsm_batch(cl::sycl::queue &queue, side left_right, uplo upper_lower, trans }); } +// USM APIs + +cl::sycl::event gemm_batch(cl::sycl::queue &queue, transpose *transa, transpose *transb, int64_t *m, + int64_t *n, int64_t *k, float *alpha, const float **a, int64_t *lda, + const float **b, int64_t *ldb, float *beta, float **c, int64_t *ldc, + int64_t group_count, int64_t *group_size, + const cl::sycl::vector_class &dependencies) { + auto done = queue.submit([&](cl::sycl::handler &cgh) { + int64_t num_events = dependencies.size(); + for (int64_t i = 0; i < num_events; i++) { + cgh.depends_on(dependencies[i]); + } + host_task(cgh, [=]() { + char *transa_ = (char *)::malloc(sizeof(char) * group_count); + char *transb_ = (char *)::malloc(sizeof(char) * group_count); + if ((transa_ == NULL) || 
(transb_ == NULL)) { + std::cout << "Error cannot allocate trans arrays\n"; + ::free(transa_); + ::free(transb_); + return; + } + for (int64_t i = 0; i < group_count; i++) { + transa_[i] = *fortran_char(transa[i]); + transb_[i] = *fortran_char(transb[i]); + } + ::sgemm_batch(transa_, transb_, (const MKL_INT *)m, (const MKL_INT *)n, + (const MKL_INT *)k, alpha, (const float **)a, (const MKL_INT *)lda, + (const float **)b, (const MKL_INT *)ldb, beta, c, (const MKL_INT *)ldc, + (const MKL_INT *)&group_count, (const MKL_INT *)group_size); + ::free(transa_); + ::free(transb_); + }); + }); + return done; +} + +cl::sycl::event gemm_batch(cl::sycl::queue &queue, transpose *transa, transpose *transb, int64_t *m, + int64_t *n, int64_t *k, double *alpha, const double **a, int64_t *lda, + const double **b, int64_t *ldb, double *beta, double **c, int64_t *ldc, + int64_t group_count, int64_t *group_size, + const cl::sycl::vector_class &dependencies) { + auto done = queue.submit([&](cl::sycl::handler &cgh) { + int64_t num_events = dependencies.size(); + for (int64_t i = 0; i < num_events; i++) { + cgh.depends_on(dependencies[i]); + } + host_task(cgh, [=]() { + char *transa_ = (char *)::malloc(sizeof(char) * group_count); + char *transb_ = (char *)::malloc(sizeof(char) * group_count); + if ((transa_ == NULL) || (transb_ == NULL)) { + std::cout << "Error cannot allocate trans arrays\n"; + ::free(transa_); + ::free(transb_); + return; + } + for (int64_t i = 0; i < group_count; i++) { + transa_[i] = *fortran_char(transa[i]); + transb_[i] = *fortran_char(transb[i]); + } + ::dgemm_batch(transa_, transb_, (const MKL_INT *)m, (const MKL_INT *)n, + (const MKL_INT *)k, alpha, (const double **)a, (const MKL_INT *)lda, + (const double **)b, (const MKL_INT *)ldb, beta, c, (const MKL_INT *)ldc, + (const MKL_INT *)&group_count, (const MKL_INT *)group_size); + ::free(transa_); + ::free(transb_); + }); + }); + return done; +} + +cl::sycl::event gemm_batch(cl::sycl::queue &queue, transpose 
*transa, transpose *transb, int64_t *m, + int64_t *n, int64_t *k, std::complex *alpha, + const std::complex **a, int64_t *lda, + const std::complex **b, int64_t *ldb, std::complex *beta, + std::complex **c, int64_t *ldc, int64_t group_count, + int64_t *group_size, + const cl::sycl::vector_class &dependencies) { + auto done = queue.submit([&](cl::sycl::handler &cgh) { + int64_t num_events = dependencies.size(); + for (int64_t i = 0; i < num_events; i++) { + cgh.depends_on(dependencies[i]); + } + host_task(cgh, [=]() { + char *transa_ = (char *)::malloc(sizeof(char) * group_count); + char *transb_ = (char *)::malloc(sizeof(char) * group_count); + if ((transa_ == NULL) || (transb_ == NULL)) { + std::cout << "Error cannot allocate trans arrays\n"; + ::free(transa_); + ::free(transb_); + return; + } + for (int64_t i = 0; i < group_count; i++) { + transa_[i] = *fortran_char(transa[i]); + transb_[i] = *fortran_char(transb[i]); + } + ::cgemm_batch(transa_, transb_, (const MKL_INT *)m, (const MKL_INT *)n, + (const MKL_INT *)k, alpha, (const std::complex **)a, + (const MKL_INT *)lda, (const std::complex **)b, + (const MKL_INT *)ldb, beta, c, (const MKL_INT *)ldc, + (const MKL_INT *)&group_count, (const MKL_INT *)group_size); + ::free(transa_); + ::free(transb_); + }); + }); + return done; +} + +cl::sycl::event gemm_batch(cl::sycl::queue &queue, transpose *transa, transpose *transb, int64_t *m, + int64_t *n, int64_t *k, std::complex *alpha, + const std::complex **a, int64_t *lda, + const std::complex **b, int64_t *ldb, std::complex *beta, + std::complex **c, int64_t *ldc, int64_t group_count, + int64_t *group_size, + const cl::sycl::vector_class &dependencies) { + auto done = queue.submit([&](cl::sycl::handler &cgh) { + int64_t num_events = dependencies.size(); + for (int64_t i = 0; i < num_events; i++) { + cgh.depends_on(dependencies[i]); + } + host_task(cgh, [=]() { + char *transa_ = (char *)::malloc(sizeof(char) * group_count); + char *transb_ = (char 
*)::malloc(sizeof(char) * group_count); + if ((transa_ == NULL) || (transb_ == NULL)) { + std::cout << "Error cannot allocate trans arrays\n"; + ::free(transa_); + ::free(transb_); + return; + } + for (int64_t i = 0; i < group_count; i++) { + transa_[i] = *fortran_char(transa[i]); + transb_[i] = *fortran_char(transb[i]); + } + ::zgemm_batch(transa_, transb_, (const MKL_INT *)m, (const MKL_INT *)n, + (const MKL_INT *)k, alpha, (const std::complex **)a, + (const MKL_INT *)lda, (const std::complex **)b, + (const MKL_INT *)ldb, beta, c, (const MKL_INT *)ldc, + (const MKL_INT *)&group_count, (const MKL_INT *)group_size); + ::free(transa_); + ::free(transb_); + }); + }); + return done; +} + +cl::sycl::event gemm_batch(cl::sycl::queue &queue, transpose transa, transpose transb, int64_t m, + int64_t n, int64_t k, float alpha, const float *a, int64_t lda, + int64_t stride_a, const float *b, int64_t ldb, int64_t stride_b, + float beta, float *c, int64_t ldc, int64_t stride_c, int64_t batch_size, + const cl::sycl::vector_class &dependencies) { + auto done = queue.submit([&](cl::sycl::handler &cgh) { + int64_t num_events = dependencies.size(); + for (int64_t i = 0; i < num_events; i++) { + cgh.depends_on(dependencies[i]); + } + const char transa_ = *fortran_char(transa); + const char transb_ = *fortran_char(transb); + MKL_INT one = 1; + host_task(cgh, [=]() { + float **a_array = (float **)::malloc(sizeof(float *) * batch_size); + float **b_array = (float **)::malloc(sizeof(float *) * batch_size); + float **c_array = (float **)::malloc(sizeof(float *) * batch_size); + if ((a_array == NULL) || (b_array == NULL) || (c_array == NULL)) { + std::cout << "Error cannot allocate input arrays\n"; + ::free(a_array); + ::free(b_array); + ::free(c_array); + return; + } + for (int64_t i = 0; i < batch_size; i++) { + if (i == 0) { + a_array[0] = (float *)a; + b_array[0] = (float *)b; + c_array[0] = (float *)c; + } + else { + a_array[i] = a_array[i - 1] + stride_a; + b_array[i] = b_array[i - 
1] + stride_b; + c_array[i] = c_array[i - 1] + stride_c; + } + } + ::sgemm_batch(&transa_, &transb_, (const MKL_INT *)&m, (const MKL_INT *)&n, + (const MKL_INT *)&k, &alpha, (const float **)a_array, + (const MKL_INT *)&lda, (const float **)b_array, (const MKL_INT *)&ldb, + &beta, c_array, (const MKL_INT *)&ldc, (const MKL_INT *)&one, + (const MKL_INT *)&batch_size); + + ::free(a_array); + ::free(b_array); + ::free(c_array); + }); + }); + return done; +} + +cl::sycl::event gemm_batch(cl::sycl::queue &queue, transpose transa, transpose transb, int64_t m, + int64_t n, int64_t k, double alpha, const double *a, int64_t lda, + int64_t stride_a, const double *b, int64_t ldb, int64_t stride_b, + double beta, double *c, int64_t ldc, int64_t stride_c, + int64_t batch_size, + const cl::sycl::vector_class &dependencies) { + auto done = queue.submit([&](cl::sycl::handler &cgh) { + int64_t num_events = dependencies.size(); + for (int64_t i = 0; i < num_events; i++) { + cgh.depends_on(dependencies[i]); + } + const char transa_ = *fortran_char(transa); + const char transb_ = *fortran_char(transb); + MKL_INT one = 1; + host_task(cgh, [=]() { + double **a_array = (double **)::malloc(sizeof(double *) * batch_size); + double **b_array = (double **)::malloc(sizeof(double *) * batch_size); + double **c_array = (double **)::malloc(sizeof(double *) * batch_size); + if ((a_array == NULL) || (b_array == NULL) || (c_array == NULL)) { + std::cout << "Error cannot allocate input arrays\n"; + ::free(a_array); + ::free(b_array); + ::free(c_array); + return; + } + for (int64_t i = 0; i < batch_size; i++) { + if (i == 0) { + a_array[0] = (double *)a; + b_array[0] = (double *)b; + c_array[0] = (double *)c; + } + else { + a_array[i] = a_array[i - 1] + stride_a; + b_array[i] = b_array[i - 1] + stride_b; + c_array[i] = c_array[i - 1] + stride_c; + } + } + ::dgemm_batch(&transa_, &transb_, (const MKL_INT *)&m, (const MKL_INT *)&n, + (const MKL_INT *)&k, &alpha, (const double **)a_array, + (const 
MKL_INT *)&lda, (const double **)b_array, (const MKL_INT *)&ldb, + &beta, c_array, (const MKL_INT *)&ldc, (const MKL_INT *)&one, + (const MKL_INT *)&batch_size); + + ::free(a_array); + ::free(b_array); + ::free(c_array); + }); + }); + return done; +} + +cl::sycl::event gemm_batch(cl::sycl::queue &queue, transpose transa, transpose transb, int64_t m, + int64_t n, int64_t k, std::complex alpha, + const std::complex *a, int64_t lda, int64_t stride_a, + const std::complex *b, int64_t ldb, int64_t stride_b, + std::complex beta, std::complex *c, int64_t ldc, + int64_t stride_c, int64_t batch_size, + const cl::sycl::vector_class &dependencies) { + auto done = queue.submit([&](cl::sycl::handler &cgh) { + int64_t num_events = dependencies.size(); + for (int64_t i = 0; i < num_events; i++) { + cgh.depends_on(dependencies[i]); + } + const char transa_ = *fortran_char(transa); + const char transb_ = *fortran_char(transb); + MKL_INT one = 1; + host_task(cgh, [=]() { + std::complex **a_array = + (std::complex **)::malloc(sizeof(std::complex *) * batch_size); + std::complex **b_array = + (std::complex **)::malloc(sizeof(std::complex *) * batch_size); + std::complex **c_array = + (std::complex **)::malloc(sizeof(std::complex *) * batch_size); + if ((a_array == NULL) || (b_array == NULL) || (c_array == NULL)) { + std::cout << "Error cannot allocate input arrays\n"; + ::free(a_array); + ::free(b_array); + ::free(c_array); + return; + } + for (int64_t i = 0; i < batch_size; i++) { + if (i == 0) { + a_array[0] = (std::complex *)a; + b_array[0] = (std::complex *)b; + c_array[0] = (std::complex *)c; + } + else { + a_array[i] = a_array[i - 1] + stride_a; + b_array[i] = b_array[i - 1] + stride_b; + c_array[i] = c_array[i - 1] + stride_c; + } + } + ::cgemm_batch(&transa_, &transb_, (const MKL_INT *)&m, (const MKL_INT *)&n, + (const MKL_INT *)&k, &alpha, (const std::complex **)a_array, + (const MKL_INT *)&lda, (const std::complex **)b_array, + (const MKL_INT *)&ldb, &beta, c_array, (const 
MKL_INT *)&ldc, + (const MKL_INT *)&one, (const MKL_INT *)&batch_size); + + ::free(a_array); + ::free(b_array); + ::free(c_array); + }); + }); + return done; +} + +cl::sycl::event gemm_batch(cl::sycl::queue &queue, transpose transa, transpose transb, int64_t m, + int64_t n, int64_t k, std::complex alpha, + const std::complex *a, int64_t lda, int64_t stride_a, + const std::complex *b, int64_t ldb, int64_t stride_b, + std::complex beta, std::complex *c, int64_t ldc, + int64_t stride_c, int64_t batch_size, + const cl::sycl::vector_class &dependencies) { + auto done = queue.submit([&](cl::sycl::handler &cgh) { + int64_t num_events = dependencies.size(); + for (int64_t i = 0; i < num_events; i++) { + cgh.depends_on(dependencies[i]); + } + const char transa_ = *fortran_char(transa); + const char transb_ = *fortran_char(transb); + MKL_INT one = 1; + host_task(cgh, [=]() { + std::complex **a_array = + (std::complex **)::malloc(sizeof(std::complex *) * batch_size); + std::complex **b_array = + (std::complex **)::malloc(sizeof(std::complex *) * batch_size); + std::complex **c_array = + (std::complex **)::malloc(sizeof(std::complex *) * batch_size); + if ((a_array == NULL) || (b_array == NULL) || (c_array == NULL)) { + std::cout << "Error cannot allocate input arrays\n"; + ::free(a_array); + ::free(b_array); + ::free(c_array); + return; + } + for (int64_t i = 0; i < batch_size; i++) { + if (i == 0) { + a_array[0] = (std::complex *)a; + b_array[0] = (std::complex *)b; + c_array[0] = (std::complex *)c; + } + else { + a_array[i] = a_array[i - 1] + stride_a; + b_array[i] = b_array[i - 1] + stride_b; + c_array[i] = c_array[i - 1] + stride_c; + } + } + ::zgemm_batch(&transa_, &transb_, (const MKL_INT *)&m, (const MKL_INT *)&n, + (const MKL_INT *)&k, &alpha, (const std::complex **)a_array, + (const MKL_INT *)&lda, (const std::complex **)b_array, + (const MKL_INT *)&ldb, &beta, c_array, (const MKL_INT *)&ldc, + (const MKL_INT *)&one, (const MKL_INT *)&batch_size); + + 
::free(a_array); + ::free(b_array); + ::free(c_array); + }); + }); + return done; +} + +cl::sycl::event axpy_batch(cl::sycl::queue &queue, int64_t *n, float *alpha, const float **x, + int64_t *incx, float **y, int64_t *incy, int64_t group_count, + int64_t *group_size, + const cl::sycl::vector_class &dependencies) { + auto done = queue.submit([&](cl::sycl::handler &cgh) { + int64_t num_events = dependencies.size(); + for (int64_t i = 0; i < num_events; i++) { + cgh.depends_on(dependencies[i]); + } + host_task(cgh, [=]() { + int64_t offset = 0; + for (int64_t i = 0; i < group_count; i++) { + for (int64_t j = 0; j < group_size[i]; j++) { + ::saxpy((const MKL_INT *)(n + i), (const float *)(alpha + i), x[offset + j], + (const MKL_INT *)(incx + i), y[offset + j], + (const MKL_INT *)(incy + i)); + } + offset += group_size[i]; + } + }); + }); + return done; +} + +cl::sycl::event axpy_batch(cl::sycl::queue &queue, int64_t *n, double *alpha, const double **x, + int64_t *incx, double **y, int64_t *incy, int64_t group_count, + int64_t *group_size, + const cl::sycl::vector_class &dependencies) { + auto done = queue.submit([&](cl::sycl::handler &cgh) { + int64_t num_events = dependencies.size(); + for (int64_t i = 0; i < num_events; i++) { + cgh.depends_on(dependencies[i]); + } + host_task(cgh, [=]() { + int64_t offset = 0; + for (int64_t i = 0; i < group_count; i++) { + for (int64_t j = 0; j < group_size[i]; j++) { + ::daxpy((const MKL_INT *)(n + i), (const double *)(alpha + i), x[offset + j], + (const MKL_INT *)(incx + i), y[offset + j], + (const MKL_INT *)(incy + i)); + } + offset += group_size[i]; + } + }); + }); + return done; +} + +cl::sycl::event axpy_batch(cl::sycl::queue &queue, int64_t *n, std::complex *alpha, + const std::complex **x, int64_t *incx, std::complex **y, + int64_t *incy, int64_t group_count, int64_t *group_size, + const cl::sycl::vector_class &dependencies) { + auto done = queue.submit([&](cl::sycl::handler &cgh) { + int64_t num_events = 
dependencies.size(); + for (int64_t i = 0; i < num_events; i++) { + cgh.depends_on(dependencies[i]); + } + host_task(cgh, [=]() { + int64_t offset = 0; + for (int64_t i = 0; i < group_count; i++) { + for (int64_t j = 0; j < group_size[i]; j++) { + MKL_Complex8 alpha_ = { alpha[i].real(), alpha[i].imag() }; + ::caxpy((const MKL_INT *)(n + i), (const MKL_Complex8 *)&alpha_, x[offset + j], + (const MKL_INT *)(incx + i), y[offset + j], + (const MKL_INT *)(incy + i)); + } + offset += group_size[i]; + } + }); + }); + return done; +} + +cl::sycl::event axpy_batch(cl::sycl::queue &queue, int64_t *n, std::complex *alpha, + const std::complex **x, int64_t *incx, std::complex **y, + int64_t *incy, int64_t group_count, int64_t *group_size, + const cl::sycl::vector_class &dependencies) { + auto done = queue.submit([&](cl::sycl::handler &cgh) { + int64_t num_events = dependencies.size(); + for (int64_t i = 0; i < num_events; i++) { + cgh.depends_on(dependencies[i]); + } + host_task(cgh, [=]() { + int64_t offset = 0; + for (int64_t i = 0; i < group_count; i++) { + for (int64_t j = 0; j < group_size[i]; j++) { + MKL_Complex16 alpha_ = { alpha[i].real(), alpha[i].imag() }; + ::zaxpy((const MKL_INT *)(n + i), (const MKL_Complex16 *)&alpha_, x[offset + j], + (const MKL_INT *)(incx + i), y[offset + j], + (const MKL_INT *)(incy + i)); + } + offset += group_size[i]; + } + }); + }); + return done; +} + } // namespace mklcpu } // namespace onemkl diff --git a/src/blas/backends/mklcpu/cpu_extensions.cpp b/src/blas/backends/mklcpu/cpu_extensions.cpp index dc0d557bb..378b8abc8 100644 --- a/src/blas/backends/mklcpu/cpu_extensions.cpp +++ b/src/blas/backends/mklcpu/cpu_extensions.cpp @@ -79,6 +79,8 @@ static inline void copy_mat(T_src &src, int64_t row, int64_t col, int64_t ld, of } } +// Buffer APIs + void gemm(cl::sycl::queue &queue, transpose transa, transpose transb, int64_t m, int64_t n, int64_t k, half alpha, cl::sycl::buffer &a, int64_t lda, cl::sycl::buffer &b, int64_t ldb, half beta, 
cl::sycl::buffer &c, @@ -309,5 +311,105 @@ void gemmt(cl::sycl::queue &queue, uplo upper_lower, transpose transa, transpose }); } +// USM APIs + +cl::sycl::event gemmt(cl::sycl::queue &queue, uplo upper_lower, transpose transa, transpose transb, + int64_t n, int64_t k, float alpha, const float *a, int64_t lda, + const float *b, int64_t ldb, float beta, float *c, int64_t ldc, + const cl::sycl::vector_class &dependencies) { + auto done = queue.submit([&](cl::sycl::handler &cgh) { + int64_t num_events = dependencies.size(); + for (int64_t i = 0; i < num_events; i++) { + cgh.depends_on(dependencies[i]); + } + const char upper_lower_ = *fortran_char(upper_lower); + const char transa_ = *fortran_char(transa); + const char transb_ = *fortran_char(transb); + host_task(cgh, [=]() { + ::sgemmt((const char *)&upper_lower_, (const char *)&transa_, (const char *)&transb_, + (const MKL_INT *)&n, (const MKL_INT *)&k, (const float *)&alpha, a, + (const MKL_INT *)&lda, b, (const MKL_INT *)&ldb, (const float *)&beta, c, + (const MKL_INT *)&ldc); + }); + }); + return done; +} + +cl::sycl::event gemmt(cl::sycl::queue &queue, uplo upper_lower, transpose transa, transpose transb, + int64_t n, int64_t k, double alpha, const double *a, int64_t lda, + const double *b, int64_t ldb, double beta, double *c, int64_t ldc, + const cl::sycl::vector_class &dependencies) { + auto done = queue.submit([&](cl::sycl::handler &cgh) { + int64_t num_events = dependencies.size(); + for (int64_t i = 0; i < num_events; i++) { + cgh.depends_on(dependencies[i]); + } + const char upper_lower_ = *fortran_char(upper_lower); + const char transa_ = *fortran_char(transa); + const char transb_ = *fortran_char(transb); + host_task(cgh, [=]() { + ::dgemmt((const char *)&upper_lower_, (const char *)&transa_, (const char *)&transb_, + (const MKL_INT *)&n, (const MKL_INT *)&k, (const double *)&alpha, a, + (const MKL_INT *)&lda, b, (const MKL_INT *)&ldb, (const double *)&beta, c, + (const MKL_INT *)&ldc); + }); + }); + 
return done; +} + +cl::sycl::event gemmt(cl::sycl::queue &queue, uplo upper_lower, transpose transa, transpose transb, + int64_t n, int64_t k, std::complex alpha, const std::complex *a, + int64_t lda, const std::complex *b, int64_t ldb, + std::complex beta, std::complex *c, int64_t ldc, + const cl::sycl::vector_class &dependencies) { + auto done = queue.submit([&](cl::sycl::handler &cgh) { + int64_t num_events = dependencies.size(); + for (int64_t i = 0; i < num_events; i++) { + cgh.depends_on(dependencies[i]); + } + const char upper_lower_ = *fortran_char(upper_lower); + const char transa_ = *fortran_char(transa); + const char transb_ = *fortran_char(transb); + float alpha_real = alpha.real(), alpha_imag = alpha.imag(); + float beta_real = beta.real(), beta_imag = beta.imag(); + host_task(cgh, [=]() { + MKL_Complex8 alpha_ = { alpha_real, alpha_imag }; + MKL_Complex8 beta_ = { beta_real, beta_imag }; + ::cgemmt((const char *)&upper_lower_, (const char *)&transa_, (const char *)&transb_, + (const MKL_INT *)&n, (const MKL_INT *)&k, (const MKL_Complex8 *)&alpha_, a, + (const MKL_INT *)&lda, b, (const MKL_INT *)&ldb, (const MKL_Complex8 *)&beta_, + c, (const MKL_INT *)&ldc); + }); + }); + return done; +} + +cl::sycl::event gemmt(cl::sycl::queue &queue, uplo upper_lower, transpose transa, transpose transb, + int64_t n, int64_t k, std::complex alpha, + const std::complex *a, int64_t lda, const std::complex *b, + int64_t ldb, std::complex beta, std::complex *c, int64_t ldc, + const cl::sycl::vector_class &dependencies) { + auto done = queue.submit([&](cl::sycl::handler &cgh) { + int64_t num_events = dependencies.size(); + for (int64_t i = 0; i < num_events; i++) { + cgh.depends_on(dependencies[i]); + } + const char upper_lower_ = *fortran_char(upper_lower); + const char transa_ = *fortran_char(transa); + const char transb_ = *fortran_char(transb); + double alpha_real = alpha.real(), alpha_imag = alpha.imag(); + double beta_real = beta.real(), beta_imag = beta.imag(); + 
host_task(cgh, [=]() { + MKL_Complex16 alpha_ = { alpha_real, alpha_imag }; + MKL_Complex16 beta_ = { beta_real, beta_imag }; + ::zgemmt((const char *)&upper_lower_, (const char *)&transa_, (const char *)&transb_, + (const MKL_INT *)&n, (const MKL_INT *)&k, (const MKL_Complex16 *)&alpha_, a, + (const MKL_INT *)&lda, b, (const MKL_INT *)&ldb, (const MKL_Complex16 *)&beta_, + c, (const MKL_INT *)&ldc); + }); + }); + return done; +} + } // namespace mklcpu } // namespace onemkl diff --git a/src/blas/backends/mklcpu/cpu_level1.cpp b/src/blas/backends/mklcpu/cpu_level1.cpp index a20f6d2ec..205e6601d 100644 --- a/src/blas/backends/mklcpu/cpu_level1.cpp +++ b/src/blas/backends/mklcpu/cpu_level1.cpp @@ -25,6 +25,8 @@ namespace onemkl { namespace mklcpu { +// Buffer APIs + void asum(cl::sycl::queue &queue, int64_t n, cl::sycl::buffer &x, int64_t incx, cl::sycl::buffer &result) { queue.submit([&](cl::sycl::handler &cgh) { @@ -712,5 +714,823 @@ void swap(cl::sycl::queue &queue, int64_t n, cl::sycl::buffer &dependencies) { + auto done = queue.submit([&](cl::sycl::handler &cgh) { + int64_t num_events = dependencies.size(); + for (int64_t i = 0; i < num_events; i++) { + cgh.depends_on(dependencies[i]); + } + host_task(cgh, [=]() { + result[0] = ::sasum((const MKL_INT *)&n, x, (const MKL_INT *)&incx); + }); + }); + return done; +} + +cl::sycl::event asum(cl::sycl::queue &queue, int64_t n, const double *x, int64_t incx, + double *result, const cl::sycl::vector_class &dependencies) { + auto done = queue.submit([&](cl::sycl::handler &cgh) { + int64_t num_events = dependencies.size(); + for (int64_t i = 0; i < num_events; i++) { + cgh.depends_on(dependencies[i]); + } + host_task(cgh, [=]() { + result[0] = ::dasum((const MKL_INT *)&n, x, (const MKL_INT *)&incx); + }); + }); + return done; +} + +cl::sycl::event asum(cl::sycl::queue &queue, int64_t n, const std::complex *x, int64_t incx, + float *result, const cl::sycl::vector_class &dependencies) { + auto done = 
queue.submit([&](cl::sycl::handler &cgh) { + int64_t num_events = dependencies.size(); + for (int64_t i = 0; i < num_events; i++) { + cgh.depends_on(dependencies[i]); + } + host_task(cgh, [=]() { + result[0] = ::scasum((const MKL_INT *)&n, x, (const MKL_INT *)&incx); + }); + }); + return done; +} + +cl::sycl::event asum(cl::sycl::queue &queue, int64_t n, const std::complex *x, int64_t incx, + double *result, const cl::sycl::vector_class &dependencies) { + auto done = queue.submit([&](cl::sycl::handler &cgh) { + int64_t num_events = dependencies.size(); + for (int64_t i = 0; i < num_events; i++) { + cgh.depends_on(dependencies[i]); + } + host_task(cgh, [=]() { + result[0] = ::dzasum((const MKL_INT *)&n, x, (const MKL_INT *)&incx); + }); + }); + return done; +} + +cl::sycl::event axpy(cl::sycl::queue &queue, int64_t n, float alpha, const float *x, int64_t incx, + float *y, int64_t incy, + const cl::sycl::vector_class &dependencies) { + auto done = queue.submit([&](cl::sycl::handler &cgh) { + int64_t num_events = dependencies.size(); + for (int64_t i = 0; i < num_events; i++) { + cgh.depends_on(dependencies[i]); + } + host_task(cgh, [=]() { + ::saxpy((const MKL_INT *)&n, (const float *)&alpha, x, (const MKL_INT *)&incx, y, + (const MKL_INT *)&incy); + }); + }); + return done; +} + +cl::sycl::event axpy(cl::sycl::queue &queue, int64_t n, double alpha, const double *x, int64_t incx, + double *y, int64_t incy, + const cl::sycl::vector_class &dependencies) { + auto done = queue.submit([&](cl::sycl::handler &cgh) { + int64_t num_events = dependencies.size(); + for (int64_t i = 0; i < num_events; i++) { + cgh.depends_on(dependencies[i]); + } + host_task(cgh, [=]() { + ::daxpy((const MKL_INT *)&n, (const double *)&alpha, x, (const MKL_INT *)&incx, y, + (const MKL_INT *)&incy); + }); + }); + return done; +} + +cl::sycl::event axpy(cl::sycl::queue &queue, int64_t n, std::complex alpha, + const std::complex *x, int64_t incx, std::complex *y, + int64_t incy, const 
cl::sycl::vector_class &dependencies) { + auto done = queue.submit([&](cl::sycl::handler &cgh) { + int64_t num_events = dependencies.size(); + for (int64_t i = 0; i < num_events; i++) { + cgh.depends_on(dependencies[i]); + } + float alpha_real = alpha.real(), alpha_imag = alpha.imag(); + host_task(cgh, [=]() { + MKL_Complex8 alpha_ = { alpha_real, alpha_imag }; + ::caxpy((const MKL_INT *)&n, (const MKL_Complex8 *)&alpha_, x, (const MKL_INT *)&incx, + y, (const MKL_INT *)&incy); + }); + }); + return done; +} + +cl::sycl::event axpy(cl::sycl::queue &queue, int64_t n, std::complex alpha, + const std::complex *x, int64_t incx, std::complex *y, + int64_t incy, const cl::sycl::vector_class &dependencies) { + auto done = queue.submit([&](cl::sycl::handler &cgh) { + int64_t num_events = dependencies.size(); + for (int64_t i = 0; i < num_events; i++) { + cgh.depends_on(dependencies[i]); + } + double alpha_real = alpha.real(), alpha_imag = alpha.imag(); + host_task(cgh, [=]() { + MKL_Complex16 alpha_ = { alpha_real, alpha_imag }; + ::zaxpy((const MKL_INT *)&n, (const MKL_Complex16 *)&alpha_, x, (const MKL_INT *)&incx, + y, (const MKL_INT *)&incy); + }); + }); + return done; +} + +cl::sycl::event copy(cl::sycl::queue &queue, int64_t n, const float *x, int64_t incx, float *y, + int64_t incy, const cl::sycl::vector_class &dependencies) { + auto done = queue.submit([&](cl::sycl::handler &cgh) { + int64_t num_events = dependencies.size(); + for (int64_t i = 0; i < num_events; i++) { + cgh.depends_on(dependencies[i]); + } + host_task(cgh, [=]() { + ::scopy((const MKL_INT *)&n, x, (const MKL_INT *)&incx, y, (const MKL_INT *)&incy); + }); + }); + return done; +} + +cl::sycl::event copy(cl::sycl::queue &queue, int64_t n, const double *x, int64_t incx, double *y, + int64_t incy, const cl::sycl::vector_class &dependencies) { + auto done = queue.submit([&](cl::sycl::handler &cgh) { + int64_t num_events = dependencies.size(); + for (int64_t i = 0; i < num_events; i++) { + 
cgh.depends_on(dependencies[i]); + } + host_task(cgh, [=]() { + ::dcopy((const MKL_INT *)&n, x, (const MKL_INT *)&incx, y, (const MKL_INT *)&incy); + }); + }); + return done; +} + +cl::sycl::event copy(cl::sycl::queue &queue, int64_t n, const std::complex *x, int64_t incx, + std::complex *y, int64_t incy, + const cl::sycl::vector_class &dependencies) { + auto done = queue.submit([&](cl::sycl::handler &cgh) { + int64_t num_events = dependencies.size(); + for (int64_t i = 0; i < num_events; i++) { + cgh.depends_on(dependencies[i]); + } + host_task(cgh, [=]() { + ::ccopy((const MKL_INT *)&n, x, (const MKL_INT *)&incx, y, (const MKL_INT *)&incy); + }); + }); + return done; +} + +cl::sycl::event copy(cl::sycl::queue &queue, int64_t n, const std::complex *x, int64_t incx, + std::complex *y, int64_t incy, + const cl::sycl::vector_class &dependencies) { + auto done = queue.submit([&](cl::sycl::handler &cgh) { + int64_t num_events = dependencies.size(); + for (int64_t i = 0; i < num_events; i++) { + cgh.depends_on(dependencies[i]); + } + host_task(cgh, [=]() { + ::zcopy((const MKL_INT *)&n, x, (const MKL_INT *)&incx, y, (const MKL_INT *)&incy); + }); + }); + return done; +} + +cl::sycl::event dot(cl::sycl::queue &queue, int64_t n, const float *x, int64_t incx, const float *y, + int64_t incy, float *result, + const cl::sycl::vector_class &dependencies) { + auto done = queue.submit([&](cl::sycl::handler &cgh) { + int64_t num_events = dependencies.size(); + for (int64_t i = 0; i < num_events; i++) { + cgh.depends_on(dependencies[i]); + } + host_task(cgh, [=]() { + result[0] = + ::sdot((const MKL_INT *)&n, x, (const MKL_INT *)&incx, y, (const MKL_INT *)&incy); + }); + }); + return done; +} + +cl::sycl::event dot(cl::sycl::queue &queue, int64_t n, const double *x, int64_t incx, + const double *y, int64_t incy, double *result, + const cl::sycl::vector_class &dependencies) { + auto done = queue.submit([&](cl::sycl::handler &cgh) { + int64_t num_events = dependencies.size(); + for 
(int64_t i = 0; i < num_events; i++) { + cgh.depends_on(dependencies[i]); + } + host_task(cgh, [=]() { + result[0] = + ::ddot((const MKL_INT *)&n, x, (const MKL_INT *)&incx, y, (const MKL_INT *)&incy); + }); + }); + return done; +} + +cl::sycl::event dot(cl::sycl::queue &queue, int64_t n, const float *x, int64_t incx, const float *y, + int64_t incy, double *result, + const cl::sycl::vector_class &dependencies) { + auto done = queue.submit([&](cl::sycl::handler &cgh) { + int64_t num_events = dependencies.size(); + for (int64_t i = 0; i < num_events; i++) { + cgh.depends_on(dependencies[i]); + } + host_task(cgh, [=]() { + result[0] = + ::dsdot((const MKL_INT *)&n, x, (const MKL_INT *)&incx, y, (const MKL_INT *)&incy); + }); + }); + return done; +} + +cl::sycl::event dotc(cl::sycl::queue &queue, int64_t n, const std::complex *x, int64_t incx, + const std::complex *y, int64_t incy, std::complex *result, + const cl::sycl::vector_class &dependencies) { + auto done = queue.submit([&](cl::sycl::handler &cgh) { + int64_t num_events = dependencies.size(); + for (int64_t i = 0; i < num_events; i++) { + cgh.depends_on(dependencies[i]); + } + host_task(cgh, [=]() { + ::cdotc(result, (const MKL_INT *)&n, x, (const MKL_INT *)&incx, y, + (const MKL_INT *)&incy); + }); + }); + return done; +} + +cl::sycl::event dotc(cl::sycl::queue &queue, int64_t n, const std::complex *x, int64_t incx, + const std::complex *y, int64_t incy, std::complex *result, + const cl::sycl::vector_class &dependencies) { + auto done = queue.submit([&](cl::sycl::handler &cgh) { + int64_t num_events = dependencies.size(); + for (int64_t i = 0; i < num_events; i++) { + cgh.depends_on(dependencies[i]); + } + host_task(cgh, [=]() { + ::zdotc(result, (const MKL_INT *)&n, x, (const MKL_INT *)&incx, y, + (const MKL_INT *)&incy); + }); + }); + return done; +} + +cl::sycl::event dotu(cl::sycl::queue &queue, int64_t n, const std::complex *x, int64_t incx, + const std::complex *y, int64_t incy, std::complex *result, + 
const cl::sycl::vector_class &dependencies) { + auto done = queue.submit([&](cl::sycl::handler &cgh) { + int64_t num_events = dependencies.size(); + for (int64_t i = 0; i < num_events; i++) { + cgh.depends_on(dependencies[i]); + } + host_task(cgh, [=]() { + ::cdotu(result, (const MKL_INT *)&n, x, (const MKL_INT *)&incx, y, + (const MKL_INT *)&incy); + }); + }); + return done; +} + +cl::sycl::event dotu(cl::sycl::queue &queue, int64_t n, const std::complex *x, int64_t incx, + const std::complex *y, int64_t incy, std::complex *result, + const cl::sycl::vector_class &dependencies) { + auto done = queue.submit([&](cl::sycl::handler &cgh) { + int64_t num_events = dependencies.size(); + for (int64_t i = 0; i < num_events; i++) { + cgh.depends_on(dependencies[i]); + } + host_task(cgh, [=]() { + ::zdotu(result, (const MKL_INT *)&n, x, (const MKL_INT *)&incx, y, + (const MKL_INT *)&incy); + }); + }); + return done; +} + +cl::sycl::event iamin(cl::sycl::queue &queue, int64_t n, const float *x, int64_t incx, + int64_t *result, + const cl::sycl::vector_class &dependencies) { + auto done = queue.submit([&](cl::sycl::handler &cgh) { + int64_t num_events = dependencies.size(); + for (int64_t i = 0; i < num_events; i++) { + cgh.depends_on(dependencies[i]); + } + host_task(cgh, [=]() { + result[0] = ::cblas_isamin((MKL_INT)n, x, (MKL_INT)incx); + }); + }); + return done; +} + +cl::sycl::event iamin(cl::sycl::queue &queue, int64_t n, const double *x, int64_t incx, + int64_t *result, + const cl::sycl::vector_class &dependencies) { + auto done = queue.submit([&](cl::sycl::handler &cgh) { + int64_t num_events = dependencies.size(); + for (int64_t i = 0; i < num_events; i++) { + cgh.depends_on(dependencies[i]); + } + host_task(cgh, [=]() { + result[0] = ::cblas_idamin((const MKL_INT)n, x, (const MKL_INT)incx); + }); + }); + return done; +} + +cl::sycl::event iamin(cl::sycl::queue &queue, int64_t n, const std::complex *x, int64_t incx, + int64_t *result, + const cl::sycl::vector_class 
&dependencies) { + auto done = queue.submit([&](cl::sycl::handler &cgh) { + int64_t num_events = dependencies.size(); + for (int64_t i = 0; i < num_events; i++) { + cgh.depends_on(dependencies[i]); + } + host_task(cgh, [=]() { + result[0] = ::cblas_icamin((MKL_INT)n, x, (MKL_INT)incx); + }); + }); + return done; +} + +cl::sycl::event iamin(cl::sycl::queue &queue, int64_t n, const std::complex *x, + int64_t incx, int64_t *result, + const cl::sycl::vector_class &dependencies) { + auto done = queue.submit([&](cl::sycl::handler &cgh) { + int64_t num_events = dependencies.size(); + for (int64_t i = 0; i < num_events; i++) { + cgh.depends_on(dependencies[i]); + } + host_task(cgh, [=]() { + result[0] = ::cblas_izamin((MKL_INT)n, x, (MKL_INT)incx); + }); + }); + return done; +} + +cl::sycl::event iamax(cl::sycl::queue &queue, int64_t n, const float *x, int64_t incx, + int64_t *result, + const cl::sycl::vector_class &dependencies) { + auto done = queue.submit([&](cl::sycl::handler &cgh) { + int64_t num_events = dependencies.size(); + for (int64_t i = 0; i < num_events; i++) { + cgh.depends_on(dependencies[i]); + } + host_task(cgh, [=]() { + result[0] = ::cblas_isamax((MKL_INT)n, x, (MKL_INT)incx); + }); + }); + return done; +} + +cl::sycl::event iamax(cl::sycl::queue &queue, int64_t n, const double *x, int64_t incx, + int64_t *result, + const cl::sycl::vector_class &dependencies) { + auto done = queue.submit([&](cl::sycl::handler &cgh) { + int64_t num_events = dependencies.size(); + for (int64_t i = 0; i < num_events; i++) { + cgh.depends_on(dependencies[i]); + } + host_task(cgh, [=]() { + result[0] = ::cblas_idamax((MKL_INT)n, x, (MKL_INT)incx); + }); + }); + return done; +} + +cl::sycl::event iamax(cl::sycl::queue &queue, int64_t n, const std::complex *x, int64_t incx, + int64_t *result, + const cl::sycl::vector_class &dependencies) { + auto done = queue.submit([&](cl::sycl::handler &cgh) { + int64_t num_events = dependencies.size(); + for (int64_t i = 0; i < num_events; 
i++) { + cgh.depends_on(dependencies[i]); + } + host_task(cgh, [=]() { + result[0] = ::cblas_icamax((MKL_INT)n, x, (MKL_INT)incx); + }); + }); + return done; +} + +cl::sycl::event iamax(cl::sycl::queue &queue, int64_t n, const std::complex *x, + int64_t incx, int64_t *result, + const cl::sycl::vector_class &dependencies) { + auto done = queue.submit([&](cl::sycl::handler &cgh) { + int64_t num_events = dependencies.size(); + for (int64_t i = 0; i < num_events; i++) { + cgh.depends_on(dependencies[i]); + } + host_task(cgh, [=]() { + result[0] = ::cblas_izamax((MKL_INT)n, x, (MKL_INT)incx); + }); + }); + return done; +} + +cl::sycl::event nrm2(cl::sycl::queue &queue, int64_t n, const float *x, int64_t incx, float *result, + const cl::sycl::vector_class &dependencies) { + auto done = queue.submit([&](cl::sycl::handler &cgh) { + int64_t num_events = dependencies.size(); + for (int64_t i = 0; i < num_events; i++) { + cgh.depends_on(dependencies[i]); + } + host_task(cgh, [=]() { + result[0] = ::snrm2((const MKL_INT *)&n, x, (const MKL_INT *)&incx); + }); + }); + return done; +} + +cl::sycl::event nrm2(cl::sycl::queue &queue, int64_t n, const double *x, int64_t incx, + double *result, const cl::sycl::vector_class &dependencies) { + auto done = queue.submit([&](cl::sycl::handler &cgh) { + int64_t num_events = dependencies.size(); + for (int64_t i = 0; i < num_events; i++) { + cgh.depends_on(dependencies[i]); + } + host_task(cgh, [=]() { + result[0] = ::dnrm2((const MKL_INT *)&n, x, (const MKL_INT *)&incx); + }); + }); + return done; +} + +cl::sycl::event nrm2(cl::sycl::queue &queue, int64_t n, const std::complex *x, int64_t incx, + float *result, const cl::sycl::vector_class &dependencies) { + auto done = queue.submit([&](cl::sycl::handler &cgh) { + int64_t num_events = dependencies.size(); + for (int64_t i = 0; i < num_events; i++) { + cgh.depends_on(dependencies[i]); + } + host_task(cgh, [=]() { + result[0] = ::scnrm2((const MKL_INT *)&n, x, (const MKL_INT *)&incx); + }); 
+ }); + return done; +} + +cl::sycl::event nrm2(cl::sycl::queue &queue, int64_t n, const std::complex *x, int64_t incx, + double *result, const cl::sycl::vector_class &dependencies) { + auto done = queue.submit([&](cl::sycl::handler &cgh) { + int64_t num_events = dependencies.size(); + for (int64_t i = 0; i < num_events; i++) { + cgh.depends_on(dependencies[i]); + } + host_task(cgh, [=]() { + result[0] = ::dznrm2((const MKL_INT *)&n, x, (const MKL_INT *)&incx); + }); + }); + return done; +} + +cl::sycl::event rot(cl::sycl::queue &queue, int64_t n, float *x, int64_t incx, float *y, + int64_t incy, float c, float s, + const cl::sycl::vector_class &dependencies) { + auto done = queue.submit([&](cl::sycl::handler &cgh) { + int64_t num_events = dependencies.size(); + for (int64_t i = 0; i < num_events; i++) { + cgh.depends_on(dependencies[i]); + } + host_task(cgh, [=]() { + ::srot((const MKL_INT *)&n, x, (const MKL_INT *)&incx, y, (const MKL_INT *)&incy, &c, + &s); + }); + }); + return done; +} + +cl::sycl::event rot(cl::sycl::queue &queue, int64_t n, double *x, int64_t incx, double *y, + int64_t incy, double c, double s, + const cl::sycl::vector_class &dependencies) { + auto done = queue.submit([&](cl::sycl::handler &cgh) { + int64_t num_events = dependencies.size(); + for (int64_t i = 0; i < num_events; i++) { + cgh.depends_on(dependencies[i]); + } + host_task(cgh, [=]() { + ::drot((const MKL_INT *)&n, x, (const MKL_INT *)&incx, y, (const MKL_INT *)&incy, &c, + &s); + }); + }); + return done; +} + +cl::sycl::event rot(cl::sycl::queue &queue, int64_t n, std::complex *x, int64_t incx, + std::complex *y, int64_t incy, float c, float s, + const cl::sycl::vector_class &dependencies) { + auto done = queue.submit([&](cl::sycl::handler &cgh) { + int64_t num_events = dependencies.size(); + for (int64_t i = 0; i < num_events; i++) { + cgh.depends_on(dependencies[i]); + } + host_task(cgh, [=]() { + ::csrot((const MKL_INT *)&n, x, (const MKL_INT *)&incx, y, (const MKL_INT 
*)&incy, &c, + &s); + }); + }); + return done; +} + +cl::sycl::event rot(cl::sycl::queue &queue, int64_t n, std::complex *x, int64_t incx, + std::complex *y, int64_t incy, double c, double s, + const cl::sycl::vector_class &dependencies) { + auto done = queue.submit([&](cl::sycl::handler &cgh) { + int64_t num_events = dependencies.size(); + for (int64_t i = 0; i < num_events; i++) { + cgh.depends_on(dependencies[i]); + } + host_task(cgh, [=]() { + ::zdrot((const MKL_INT *)&n, x, (const MKL_INT *)&incx, y, (const MKL_INT *)&incy, &c, + &s); + }); + }); + return done; +} + +cl::sycl::event rotg(cl::sycl::queue &queue, float *a, float *b, float *c, float *s, + const cl::sycl::vector_class &dependencies) { + auto done = queue.submit([&](cl::sycl::handler &cgh) { + int64_t num_events = dependencies.size(); + for (int64_t i = 0; i < num_events; i++) { + cgh.depends_on(dependencies[i]); + } + host_task(cgh, [=]() { + ::srotg(a, b, c, s); + }); + }); + return done; +} + +cl::sycl::event rotg(cl::sycl::queue &queue, double *a, double *b, double *c, double *s, + const cl::sycl::vector_class &dependencies) { + auto done = queue.submit([&](cl::sycl::handler &cgh) { + int64_t num_events = dependencies.size(); + for (int64_t i = 0; i < num_events; i++) { + cgh.depends_on(dependencies[i]); + } + host_task(cgh, [=]() { + ::drotg(a, b, c, s); + }); + }); + return done; +} + +cl::sycl::event rotg(cl::sycl::queue &queue, std::complex *a, std::complex *b, + float *c, std::complex *s, + const cl::sycl::vector_class &dependencies) { + auto done = queue.submit([&](cl::sycl::handler &cgh) { + int64_t num_events = dependencies.size(); + for (int64_t i = 0; i < num_events; i++) { + cgh.depends_on(dependencies[i]); + } + host_task(cgh, [=]() { + ::crotg(a, b, c, s); + }); + }); + return done; +} + +cl::sycl::event rotg(cl::sycl::queue &queue, std::complex *a, std::complex *b, + double *c, std::complex *s, + const cl::sycl::vector_class &dependencies) { + auto done = 
queue.submit([&](cl::sycl::handler &cgh) { + int64_t num_events = dependencies.size(); + for (int64_t i = 0; i < num_events; i++) { + cgh.depends_on(dependencies[i]); + } + host_task(cgh, [=]() { + ::zrotg(a, b, c, s); + }); + }); + return done; +} + +cl::sycl::event rotm(cl::sycl::queue &queue, int64_t n, float *x, int64_t incx, float *y, + int64_t incy, float *param, + const cl::sycl::vector_class &dependencies) { + auto done = queue.submit([&](cl::sycl::handler &cgh) { + int64_t num_events = dependencies.size(); + for (int64_t i = 0; i < num_events; i++) { + cgh.depends_on(dependencies[i]); + } + host_task(cgh, [=]() { + ::srotm((const MKL_INT *)&n, x, (const MKL_INT *)&incx, y, (const MKL_INT *)&incy, + param); + }); + }); + return done; +} + +cl::sycl::event rotm(cl::sycl::queue &queue, int64_t n, double *x, int64_t incx, double *y, + int64_t incy, double *param, + const cl::sycl::vector_class &dependencies) { + auto done = queue.submit([&](cl::sycl::handler &cgh) { + int64_t num_events = dependencies.size(); + for (int64_t i = 0; i < num_events; i++) { + cgh.depends_on(dependencies[i]); + } + host_task(cgh, [=]() { + ::drotm((const MKL_INT *)&n, x, (const MKL_INT *)&incx, y, (const MKL_INT *)&incy, + param); + }); + }); + return done; +} + +cl::sycl::event rotmg(cl::sycl::queue &queue, float *d1, float *d2, float *x1, float y1, + float *param, const cl::sycl::vector_class &dependencies) { + auto done = queue.submit([&](cl::sycl::handler &cgh) { + int64_t num_events = dependencies.size(); + for (int64_t i = 0; i < num_events; i++) { + cgh.depends_on(dependencies[i]); + } + host_task(cgh, [=]() { + ::srotmg(d1, d2, x1, (float *)&y1, param); + }); + }); + return done; +} + +cl::sycl::event rotmg(cl::sycl::queue &queue, double *d1, double *d2, double *x1, double y1, + double *param, const cl::sycl::vector_class &dependencies) { + auto done = queue.submit([&](cl::sycl::handler &cgh) { + int64_t num_events = dependencies.size(); + for (int64_t i = 0; i < 
num_events; i++) { + cgh.depends_on(dependencies[i]); + } + host_task(cgh, [=]() { + ::drotmg(d1, d2, x1, (double *)&y1, param); + }); + }); + return done; +} + +cl::sycl::event scal(cl::sycl::queue &queue, int64_t n, float alpha, float *x, int64_t incx, + const cl::sycl::vector_class &dependencies) { + auto done = queue.submit([&](cl::sycl::handler &cgh) { + int64_t num_events = dependencies.size(); + for (int64_t i = 0; i < num_events; i++) { + cgh.depends_on(dependencies[i]); + } + host_task(cgh, [=]() { + ::sscal((const MKL_INT *)&n, (const float *)&alpha, x, (const MKL_INT *)&incx); + }); + }); + return done; +} + +cl::sycl::event scal(cl::sycl::queue &queue, int64_t n, double alpha, double *x, int64_t incx, + const cl::sycl::vector_class &dependencies) { + auto done = queue.submit([&](cl::sycl::handler &cgh) { + int64_t num_events = dependencies.size(); + for (int64_t i = 0; i < num_events; i++) { + cgh.depends_on(dependencies[i]); + } + host_task(cgh, [=]() { + ::dscal((const MKL_INT *)&n, (const double *)&alpha, x, (const MKL_INT *)&incx); + }); + }); + return done; +} + +cl::sycl::event scal(cl::sycl::queue &queue, int64_t n, std::complex alpha, + std::complex *x, int64_t incx, + const cl::sycl::vector_class &dependencies) { + auto done = queue.submit([&](cl::sycl::handler &cgh) { + int64_t num_events = dependencies.size(); + for (int64_t i = 0; i < num_events; i++) { + cgh.depends_on(dependencies[i]); + } + float alpha_real = alpha.real(), alpha_imag = alpha.imag(); + host_task(cgh, [=]() { + MKL_Complex8 alpha_ = { alpha_real, alpha_imag }; + ::cscal((const MKL_INT *)&n, (const MKL_Complex8 *)&alpha_, x, (const MKL_INT *)&incx); + }); + }); + return done; +} + +cl::sycl::event scal(cl::sycl::queue &queue, int64_t n, float alpha, std::complex *x, + int64_t incx, const cl::sycl::vector_class &dependencies) { + auto done = queue.submit([&](cl::sycl::handler &cgh) { + int64_t num_events = dependencies.size(); + for (int64_t i = 0; i < num_events; i++) { + 
cgh.depends_on(dependencies[i]); + } + host_task(cgh, [=]() { + ::csscal((const MKL_INT *)&n, (const float *)&alpha, x, (const MKL_INT *)&incx); + }); + }); + return done; +} + +cl::sycl::event scal(cl::sycl::queue &queue, int64_t n, std::complex alpha, + std::complex *x, int64_t incx, + const cl::sycl::vector_class &dependencies) { + auto done = queue.submit([&](cl::sycl::handler &cgh) { + int64_t num_events = dependencies.size(); + for (int64_t i = 0; i < num_events; i++) { + cgh.depends_on(dependencies[i]); + } + double alpha_real = alpha.real(), alpha_imag = alpha.imag(); + host_task(cgh, [=]() { + MKL_Complex16 alpha_ = { alpha_real, alpha_imag }; + ::zscal((const MKL_INT *)&n, (const MKL_Complex16 *)&alpha_, x, (const MKL_INT *)&incx); + }); + }); + return done; +} + +cl::sycl::event scal(cl::sycl::queue &queue, int64_t n, double alpha, std::complex *x, + int64_t incx, const cl::sycl::vector_class &dependencies) { + auto done = queue.submit([&](cl::sycl::handler &cgh) { + int64_t num_events = dependencies.size(); + for (int64_t i = 0; i < num_events; i++) { + cgh.depends_on(dependencies[i]); + } + host_task(cgh, [=]() { + ::zdscal((const MKL_INT *)&n, (const double *)&alpha, x, (const MKL_INT *)&incx); + }); + }); + return done; +} + +cl::sycl::event sdsdot(cl::sycl::queue &queue, int64_t n, float sb, const float *x, int64_t incx, + const float *y, int64_t incy, float *result, + const cl::sycl::vector_class &dependencies) { + auto done = queue.submit([&](cl::sycl::handler &cgh) { + int64_t num_events = dependencies.size(); + for (int64_t i = 0; i < num_events; i++) { + cgh.depends_on(dependencies[i]); + } + host_task(cgh, [=]() { + result[0] = ::sdsdot((const MKL_INT *)&n, (const float *)&sb, x, (const MKL_INT *)&incx, + y, (const MKL_INT *)&incy); + }); + }); + return done; +} + +cl::sycl::event swap(cl::sycl::queue &queue, int64_t n, float *x, int64_t incx, float *y, + int64_t incy, const cl::sycl::vector_class &dependencies) { + auto done = 
queue.submit([&](cl::sycl::handler &cgh) { + int64_t num_events = dependencies.size(); + for (int64_t i = 0; i < num_events; i++) { + cgh.depends_on(dependencies[i]); + } + host_task(cgh, [=]() { + ::sswap((const MKL_INT *)&n, x, (const MKL_INT *)&incx, y, (const MKL_INT *)&incy); + }); + }); + return done; +} + +cl::sycl::event swap(cl::sycl::queue &queue, int64_t n, double *x, int64_t incx, double *y, + int64_t incy, const cl::sycl::vector_class &dependencies) { + auto done = queue.submit([&](cl::sycl::handler &cgh) { + int64_t num_events = dependencies.size(); + for (int64_t i = 0; i < num_events; i++) { + cgh.depends_on(dependencies[i]); + } + host_task(cgh, [=]() { + ::dswap((const MKL_INT *)&n, x, (const MKL_INT *)&incx, y, (const MKL_INT *)&incy); + }); + }); + return done; +} + +cl::sycl::event swap(cl::sycl::queue &queue, int64_t n, std::complex *x, int64_t incx, + std::complex *y, int64_t incy, + const cl::sycl::vector_class &dependencies) { + auto done = queue.submit([&](cl::sycl::handler &cgh) { + int64_t num_events = dependencies.size(); + for (int64_t i = 0; i < num_events; i++) { + cgh.depends_on(dependencies[i]); + } + host_task(cgh, [=]() { + ::cswap((const MKL_INT *)&n, x, (const MKL_INT *)&incx, y, (const MKL_INT *)&incy); + }); + }); + return done; +} + +cl::sycl::event swap(cl::sycl::queue &queue, int64_t n, std::complex *x, int64_t incx, + std::complex *y, int64_t incy, + const cl::sycl::vector_class &dependencies) { + auto done = queue.submit([&](cl::sycl::handler &cgh) { + int64_t num_events = dependencies.size(); + for (int64_t i = 0; i < num_events; i++) { + cgh.depends_on(dependencies[i]); + } + host_task(cgh, [=]() { + ::zswap((const MKL_INT *)&n, x, (const MKL_INT *)&incx, y, (const MKL_INT *)&incy); + }); + }); + return done; +} + } // namespace mklcpu } // namespace onemkl diff --git a/src/blas/backends/mklcpu/cpu_level2.cpp b/src/blas/backends/mklcpu/cpu_level2.cpp index 0a1197c4e..dc81936b5 100644 --- 
a/src/blas/backends/mklcpu/cpu_level2.cpp +++ b/src/blas/backends/mklcpu/cpu_level2.cpp @@ -25,6 +25,8 @@ namespace onemkl { namespace mklcpu { +// Buffer APIs + void gbmv(cl::sycl::queue &queue, transpose trans, int64_t m, int64_t n, int64_t kl, int64_t ku, float alpha, cl::sycl::buffer &a, int64_t lda, cl::sycl::buffer &x, int64_t incx, float beta, cl::sycl::buffer &y, int64_t incy) { @@ -1175,5 +1177,1302 @@ void trsv(cl::sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_d }); } +// USM APIs + +cl::sycl::event gbmv(cl::sycl::queue &queue, transpose trans, int64_t m, int64_t n, int64_t kl, + int64_t ku, float alpha, const float *a, int64_t lda, const float *x, + int64_t incx, float beta, float *y, int64_t incy, + const cl::sycl::vector_class &dependencies) { + auto done = queue.submit([&](cl::sycl::handler &cgh) { + int64_t num_events = dependencies.size(); + for (int64_t i = 0; i < num_events; i++) { + cgh.depends_on(dependencies[i]); + } + const char trans_ = *fortran_char(trans); + host_task(cgh, [=]() { + ::sgbmv((const char *)&trans_, (const MKL_INT *)&m, (const MKL_INT *)&n, + (const MKL_INT *)&kl, (const MKL_INT *)&ku, (const float *)&alpha, a, + (const MKL_INT *)&lda, x, (const MKL_INT *)&incx, (const float *)&beta, y, + (const MKL_INT *)&incy); + }); + }); + return done; +} + +cl::sycl::event gbmv(cl::sycl::queue &queue, transpose trans, int64_t m, int64_t n, int64_t kl, + int64_t ku, double alpha, const double *a, int64_t lda, const double *x, + int64_t incx, double beta, double *y, int64_t incy, + const cl::sycl::vector_class &dependencies) { + auto done = queue.submit([&](cl::sycl::handler &cgh) { + int64_t num_events = dependencies.size(); + for (int64_t i = 0; i < num_events; i++) { + cgh.depends_on(dependencies[i]); + } + const char trans_ = *fortran_char(trans); + host_task(cgh, [=]() { + ::dgbmv((const char *)&trans_, (const MKL_INT *)&m, (const MKL_INT *)&n, + (const MKL_INT *)&kl, (const MKL_INT *)&ku, (const double *)&alpha, a, 
+ (const MKL_INT *)&lda, x, (const MKL_INT *)&incx, (const double *)&beta, y, + (const MKL_INT *)&incy); + }); + }); + return done; +} + +cl::sycl::event gbmv(cl::sycl::queue &queue, transpose trans, int64_t m, int64_t n, int64_t kl, + int64_t ku, std::complex alpha, const std::complex *a, + int64_t lda, const std::complex *x, int64_t incx, + std::complex beta, std::complex *y, int64_t incy, + const cl::sycl::vector_class &dependencies) { + auto done = queue.submit([&](cl::sycl::handler &cgh) { + int64_t num_events = dependencies.size(); + for (int64_t i = 0; i < num_events; i++) { + cgh.depends_on(dependencies[i]); + } + const char trans_ = *fortran_char(trans); + float alpha_real = alpha.real(), alpha_imag = alpha.imag(); + float beta_real = beta.real(), beta_imag = beta.imag(); + host_task(cgh, [=]() { + MKL_Complex8 alpha_ = { alpha_real, alpha_imag }; + MKL_Complex8 beta_ = { beta_real, beta_imag }; + ::cgbmv((const char *)&trans_, (const MKL_INT *)&m, (const MKL_INT *)&n, + (const MKL_INT *)&kl, (const MKL_INT *)&ku, (const MKL_Complex8 *)&alpha_, a, + (const MKL_INT *)&lda, x, (const MKL_INT *)&incx, (const MKL_Complex8 *)&beta_, + y, (const MKL_INT *)&incy); + }); + }); + return done; +} + +cl::sycl::event gbmv(cl::sycl::queue &queue, transpose trans, int64_t m, int64_t n, int64_t kl, + int64_t ku, std::complex alpha, const std::complex *a, + int64_t lda, const std::complex *x, int64_t incx, + std::complex beta, std::complex *y, int64_t incy, + const cl::sycl::vector_class &dependencies) { + auto done = queue.submit([&](cl::sycl::handler &cgh) { + int64_t num_events = dependencies.size(); + for (int64_t i = 0; i < num_events; i++) { + cgh.depends_on(dependencies[i]); + } + const char trans_ = *fortran_char(trans); + double alpha_real = alpha.real(), alpha_imag = alpha.imag(); + double beta_real = beta.real(), beta_imag = beta.imag(); + host_task(cgh, [=]() { + MKL_Complex16 alpha_ = { alpha_real, alpha_imag }; + MKL_Complex16 beta_ = { beta_real, beta_imag 
}; + ::zgbmv((const char *)&trans_, (const MKL_INT *)&m, (const MKL_INT *)&n, + (const MKL_INT *)&kl, (const MKL_INT *)&ku, (const MKL_Complex16 *)&alpha_, a, + (const MKL_INT *)&lda, x, (const MKL_INT *)&incx, (const MKL_Complex16 *)&beta_, + y, (const MKL_INT *)&incy); + }); + }); + return done; +} + +cl::sycl::event gemv(cl::sycl::queue &queue, transpose trans, int64_t m, int64_t n, float alpha, + const float *a, int64_t lda, const float *x, int64_t incx, float beta, + float *y, int64_t incy, + const cl::sycl::vector_class &dependencies) { + auto done = queue.submit([&](cl::sycl::handler &cgh) { + int64_t num_events = dependencies.size(); + for (int64_t i = 0; i < num_events; i++) { + cgh.depends_on(dependencies[i]); + } + const char trans_ = *fortran_char(trans); + host_task(cgh, [=]() { + ::sgemv((const char *)&trans_, (const MKL_INT *)&m, (const MKL_INT *)&n, + (const float *)&alpha, a, (const MKL_INT *)&lda, x, (const MKL_INT *)&incx, + (const float *)&beta, y, (const MKL_INT *)&incy); + }); + }); + return done; +} + +cl::sycl::event gemv(cl::sycl::queue &queue, transpose trans, int64_t m, int64_t n, double alpha, + const double *a, int64_t lda, const double *x, int64_t incx, double beta, + double *y, int64_t incy, + const cl::sycl::vector_class &dependencies) { + auto done = queue.submit([&](cl::sycl::handler &cgh) { + int64_t num_events = dependencies.size(); + for (int64_t i = 0; i < num_events; i++) { + cgh.depends_on(dependencies[i]); + } + const char trans_ = *fortran_char(trans); + host_task(cgh, [=]() { + ::dgemv((const char *)&trans_, (const MKL_INT *)&m, (const MKL_INT *)&n, + (const double *)&alpha, a, (const MKL_INT *)&lda, x, (const MKL_INT *)&incx, + (const double *)&beta, y, (const MKL_INT *)&incy); + }); + }); + return done; +} + +cl::sycl::event gemv(cl::sycl::queue &queue, transpose trans, int64_t m, int64_t n, + std::complex alpha, const std::complex *a, int64_t lda, + const std::complex *x, int64_t incx, std::complex beta, + std::complex 
*y, int64_t incy, + const cl::sycl::vector_class &dependencies) { + auto done = queue.submit([&](cl::sycl::handler &cgh) { + int64_t num_events = dependencies.size(); + for (int64_t i = 0; i < num_events; i++) { + cgh.depends_on(dependencies[i]); + } + const char trans_ = *fortran_char(trans); + float alpha_real = alpha.real(), alpha_imag = alpha.imag(); + float beta_real = beta.real(), beta_imag = beta.imag(); + host_task(cgh, [=]() { + MKL_Complex8 alpha_ = { alpha_real, alpha_imag }; + MKL_Complex8 beta_ = { beta_real, beta_imag }; + ::cgemv((const char *)&trans_, (const MKL_INT *)&m, (const MKL_INT *)&n, + (const MKL_Complex8 *)&alpha_, a, (const MKL_INT *)&lda, x, + (const MKL_INT *)&incx, (const MKL_Complex8 *)&beta_, y, + (const MKL_INT *)&incy); + }); + }); + return done; +} + +cl::sycl::event gemv(cl::sycl::queue &queue, transpose trans, int64_t m, int64_t n, + std::complex alpha, const std::complex *a, int64_t lda, + const std::complex *x, int64_t incx, std::complex beta, + std::complex *y, int64_t incy, + const cl::sycl::vector_class &dependencies) { + auto done = queue.submit([&](cl::sycl::handler &cgh) { + int64_t num_events = dependencies.size(); + for (int64_t i = 0; i < num_events; i++) { + cgh.depends_on(dependencies[i]); + } + const char trans_ = *fortran_char(trans); + double alpha_real = alpha.real(), alpha_imag = alpha.imag(); + double beta_real = beta.real(), beta_imag = beta.imag(); + host_task(cgh, [=]() { + MKL_Complex16 alpha_ = { alpha_real, alpha_imag }; + MKL_Complex16 beta_ = { beta_real, beta_imag }; + ::zgemv((const char *)&trans_, (const MKL_INT *)&m, (const MKL_INT *)&n, + (const MKL_Complex16 *)&alpha_, a, (const MKL_INT *)&lda, x, + (const MKL_INT *)&incx, (const MKL_Complex16 *)&beta_, y, + (const MKL_INT *)&incy); + }); + }); + return done; +} + +cl::sycl::event ger(cl::sycl::queue &queue, int64_t m, int64_t n, float alpha, const float *x, + int64_t incx, const float *y, int64_t incy, float *a, int64_t lda, + const 
cl::sycl::vector_class &dependencies) { + auto done = queue.submit([&](cl::sycl::handler &cgh) { + int64_t num_events = dependencies.size(); + for (int64_t i = 0; i < num_events; i++) { + cgh.depends_on(dependencies[i]); + } + host_task(cgh, [=]() { + ::sger((const MKL_INT *)&m, (const MKL_INT *)&n, (const float *)&alpha, x, + (const MKL_INT *)&incx, y, (const MKL_INT *)&incy, a, (const MKL_INT *)&lda); + }); + }); + return done; +} + +cl::sycl::event ger(cl::sycl::queue &queue, int64_t m, int64_t n, double alpha, const double *x, + int64_t incx, const double *y, int64_t incy, double *a, int64_t lda, + const cl::sycl::vector_class &dependencies) { + auto done = queue.submit([&](cl::sycl::handler &cgh) { + int64_t num_events = dependencies.size(); + for (int64_t i = 0; i < num_events; i++) { + cgh.depends_on(dependencies[i]); + } + host_task(cgh, [=]() { + ::dger((const MKL_INT *)&m, (const MKL_INT *)&n, (const double *)&alpha, x, + (const MKL_INT *)&incx, y, (const MKL_INT *)&incy, a, (const MKL_INT *)&lda); + }); + }); + return done; +} + +cl::sycl::event gerc(cl::sycl::queue &queue, int64_t m, int64_t n, std::complex alpha, + const std::complex *x, int64_t incx, const std::complex *y, + int64_t incy, std::complex *a, int64_t lda, + const cl::sycl::vector_class &dependencies) { + auto done = queue.submit([&](cl::sycl::handler &cgh) { + int64_t num_events = dependencies.size(); + for (int64_t i = 0; i < num_events; i++) { + cgh.depends_on(dependencies[i]); + } + float alpha_real = alpha.real(), alpha_imag = alpha.imag(); + host_task(cgh, [=]() { + MKL_Complex8 alpha_ = { alpha_real, alpha_imag }; + ::cgerc((const MKL_INT *)&m, (const MKL_INT *)&n, (const MKL_Complex8 *)&alpha_, x, + (const MKL_INT *)&incx, y, (const MKL_INT *)&incy, a, (const MKL_INT *)&lda); + }); + }); + return done; +} + +cl::sycl::event gerc(cl::sycl::queue &queue, int64_t m, int64_t n, std::complex alpha, + const std::complex *x, int64_t incx, const std::complex *y, + int64_t incy, 
std::complex *a, int64_t lda, + const cl::sycl::vector_class &dependencies) { + auto done = queue.submit([&](cl::sycl::handler &cgh) { + int64_t num_events = dependencies.size(); + for (int64_t i = 0; i < num_events; i++) { + cgh.depends_on(dependencies[i]); + } + double alpha_real = alpha.real(), alpha_imag = alpha.imag(); + host_task(cgh, [=]() { + MKL_Complex16 alpha_ = { alpha_real, alpha_imag }; + ::zgerc((const MKL_INT *)&m, (const MKL_INT *)&n, (const MKL_Complex16 *)&alpha_, x, + (const MKL_INT *)&incx, y, (const MKL_INT *)&incy, a, (const MKL_INT *)&lda); + }); + }); + return done; +} + +cl::sycl::event geru(cl::sycl::queue &queue, int64_t m, int64_t n, std::complex alpha, + const std::complex *x, int64_t incx, const std::complex *y, + int64_t incy, std::complex *a, int64_t lda, + const cl::sycl::vector_class &dependencies) { + auto done = queue.submit([&](cl::sycl::handler &cgh) { + int64_t num_events = dependencies.size(); + for (int64_t i = 0; i < num_events; i++) { + cgh.depends_on(dependencies[i]); + } + float alpha_real = alpha.real(), alpha_imag = alpha.imag(); + host_task(cgh, [=]() { + MKL_Complex8 alpha_ = { alpha_real, alpha_imag }; + ::cgeru((const MKL_INT *)&m, (const MKL_INT *)&n, (const MKL_Complex8 *)&alpha_, x, + (const MKL_INT *)&incx, y, (const MKL_INT *)&incy, a, (const MKL_INT *)&lda); + }); + }); + return done; +} + +cl::sycl::event geru(cl::sycl::queue &queue, int64_t m, int64_t n, std::complex alpha, + const std::complex *x, int64_t incx, const std::complex *y, + int64_t incy, std::complex *a, int64_t lda, + const cl::sycl::vector_class &dependencies) { + auto done = queue.submit([&](cl::sycl::handler &cgh) { + int64_t num_events = dependencies.size(); + for (int64_t i = 0; i < num_events; i++) { + cgh.depends_on(dependencies[i]); + } + double alpha_real = alpha.real(), alpha_imag = alpha.imag(); + host_task(cgh, [=]() { + MKL_Complex16 alpha_ = { alpha_real, alpha_imag }; + ::zgeru((const MKL_INT *)&m, (const MKL_INT *)&n, (const 
MKL_Complex16 *)&alpha_, x, + (const MKL_INT *)&incx, y, (const MKL_INT *)&incy, a, (const MKL_INT *)&lda); + }); + }); + return done; +} + +cl::sycl::event hbmv(cl::sycl::queue &queue, uplo upper_lower, int64_t n, int64_t k, + std::complex alpha, const std::complex *a, int64_t lda, + const std::complex *x, int64_t incx, std::complex beta, + std::complex *y, int64_t incy, + const cl::sycl::vector_class &dependencies) { + auto done = queue.submit([&](cl::sycl::handler &cgh) { + int64_t num_events = dependencies.size(); + for (int64_t i = 0; i < num_events; i++) { + cgh.depends_on(dependencies[i]); + } + const char upper_lower_ = *fortran_char(upper_lower); + float alpha_real = alpha.real(), alpha_imag = alpha.imag(); + float beta_real = beta.real(), beta_imag = beta.imag(); + host_task(cgh, [=]() { + MKL_Complex8 alpha_ = { alpha_real, alpha_imag }; + MKL_Complex8 beta_ = { beta_real, beta_imag }; + ::chbmv((const char *)&upper_lower_, (const MKL_INT *)&n, (const MKL_INT *)&k, + (const MKL_Complex8 *)&alpha_, a, (const MKL_INT *)&lda, x, + (const MKL_INT *)&incx, (const MKL_Complex8 *)&beta_, y, + (const MKL_INT *)&incy); + }); + }); + return done; +} + +cl::sycl::event hbmv(cl::sycl::queue &queue, uplo upper_lower, int64_t n, int64_t k, + std::complex alpha, const std::complex *a, int64_t lda, + const std::complex *x, int64_t incx, std::complex beta, + std::complex *y, int64_t incy, + const cl::sycl::vector_class &dependencies) { + auto done = queue.submit([&](cl::sycl::handler &cgh) { + int64_t num_events = dependencies.size(); + for (int64_t i = 0; i < num_events; i++) { + cgh.depends_on(dependencies[i]); + } + const char upper_lower_ = *fortran_char(upper_lower); + double alpha_real = alpha.real(), alpha_imag = alpha.imag(); + double beta_real = beta.real(), beta_imag = beta.imag(); + host_task(cgh, [=]() { + MKL_Complex16 alpha_ = { alpha_real, alpha_imag }; + MKL_Complex16 beta_ = { beta_real, beta_imag }; + ::zhbmv((const char *)&upper_lower_, (const MKL_INT 
*)&n, (const MKL_INT *)&k, + (const MKL_Complex16 *)&alpha_, a, (const MKL_INT *)&lda, x, + (const MKL_INT *)&incx, (const MKL_Complex16 *)&beta_, y, + (const MKL_INT *)&incy); + }); + }); + return done; +} + +cl::sycl::event hemv(cl::sycl::queue &queue, uplo upper_lower, int64_t n, std::complex alpha, + const std::complex *a, int64_t lda, const std::complex *x, + int64_t incx, std::complex beta, std::complex *y, int64_t incy, + const cl::sycl::vector_class &dependencies) { + auto done = queue.submit([&](cl::sycl::handler &cgh) { + int64_t num_events = dependencies.size(); + for (int64_t i = 0; i < num_events; i++) { + cgh.depends_on(dependencies[i]); + } + const char upper_lower_ = *fortran_char(upper_lower); + float alpha_real = alpha.real(), alpha_imag = alpha.imag(); + float beta_real = beta.real(), beta_imag = beta.imag(); + host_task(cgh, [=]() { + MKL_Complex8 alpha_ = { alpha_real, alpha_imag }; + MKL_Complex8 beta_ = { beta_real, beta_imag }; + ::chemv((const char *)&upper_lower_, (const MKL_INT *)&n, (const MKL_Complex8 *)&alpha_, + a, (const MKL_INT *)&lda, x, (const MKL_INT *)&incx, + (const MKL_Complex8 *)&beta_, y, (const MKL_INT *)&incy); + }); + }); + return done; +} + +cl::sycl::event hemv(cl::sycl::queue &queue, uplo upper_lower, int64_t n, + std::complex alpha, const std::complex *a, int64_t lda, + const std::complex *x, int64_t incx, std::complex beta, + std::complex *y, int64_t incy, + const cl::sycl::vector_class &dependencies) { + auto done = queue.submit([&](cl::sycl::handler &cgh) { + int64_t num_events = dependencies.size(); + for (int64_t i = 0; i < num_events; i++) { + cgh.depends_on(dependencies[i]); + } + const char upper_lower_ = *fortran_char(upper_lower); + double alpha_real = alpha.real(), alpha_imag = alpha.imag(); + double beta_real = beta.real(), beta_imag = beta.imag(); + host_task(cgh, [=]() { + MKL_Complex16 alpha_ = { alpha_real, alpha_imag }; + MKL_Complex16 beta_ = { beta_real, beta_imag }; + ::zhemv((const char 
*)&upper_lower_, (const MKL_INT *)&n, + (const MKL_Complex16 *)&alpha_, a, (const MKL_INT *)&lda, x, + (const MKL_INT *)&incx, (const MKL_Complex16 *)&beta_, y, + (const MKL_INT *)&incy); + }); + }); + return done; +} + +cl::sycl::event her(cl::sycl::queue &queue, uplo upper_lower, int64_t n, float alpha, + const std::complex *x, int64_t incx, std::complex *a, int64_t lda, + const cl::sycl::vector_class &dependencies) { + auto done = queue.submit([&](cl::sycl::handler &cgh) { + int64_t num_events = dependencies.size(); + for (int64_t i = 0; i < num_events; i++) { + cgh.depends_on(dependencies[i]); + } + const char upper_lower_ = *fortran_char(upper_lower); + host_task(cgh, [=]() { + ::cher((const char *)&upper_lower_, (const MKL_INT *)&n, (const float *)&alpha, x, + (const MKL_INT *)&incx, a, (const MKL_INT *)&lda); + }); + }); + return done; +} + +cl::sycl::event her(cl::sycl::queue &queue, uplo upper_lower, int64_t n, double alpha, + const std::complex *x, int64_t incx, std::complex *a, + int64_t lda, const cl::sycl::vector_class &dependencies) { + auto done = queue.submit([&](cl::sycl::handler &cgh) { + int64_t num_events = dependencies.size(); + for (int64_t i = 0; i < num_events; i++) { + cgh.depends_on(dependencies[i]); + } + const char upper_lower_ = *fortran_char(upper_lower); + host_task(cgh, [=]() { + ::zher((const char *)&upper_lower_, (const MKL_INT *)&n, (const double *)&alpha, x, + (const MKL_INT *)&incx, a, (const MKL_INT *)&lda); + }); + }); + return done; +} + +cl::sycl::event her2(cl::sycl::queue &queue, uplo upper_lower, int64_t n, std::complex alpha, + const std::complex *x, int64_t incx, const std::complex *y, + int64_t incy, std::complex *a, int64_t lda, + const cl::sycl::vector_class &dependencies) { + auto done = queue.submit([&](cl::sycl::handler &cgh) { + int64_t num_events = dependencies.size(); + for (int64_t i = 0; i < num_events; i++) { + cgh.depends_on(dependencies[i]); + } + const char upper_lower_ = *fortran_char(upper_lower); + 
float alpha_real = alpha.real(), alpha_imag = alpha.imag(); + host_task(cgh, [=]() { + MKL_Complex8 alpha_ = { alpha_real, alpha_imag }; + ::cher2((const char *)&upper_lower_, (const MKL_INT *)&n, (const MKL_Complex8 *)&alpha_, + x, (const MKL_INT *)&incx, y, (const MKL_INT *)&incy, a, (const MKL_INT *)&lda); + }); + }); + return done; +} + +cl::sycl::event her2(cl::sycl::queue &queue, uplo upper_lower, int64_t n, + std::complex alpha, const std::complex *x, int64_t incx, + const std::complex *y, int64_t incy, std::complex *a, + int64_t lda, const cl::sycl::vector_class &dependencies) { + auto done = queue.submit([&](cl::sycl::handler &cgh) { + int64_t num_events = dependencies.size(); + for (int64_t i = 0; i < num_events; i++) { + cgh.depends_on(dependencies[i]); + } + const char upper_lower_ = *fortran_char(upper_lower); + double alpha_real = alpha.real(), alpha_imag = alpha.imag(); + host_task(cgh, [=]() { + MKL_Complex16 alpha_ = { alpha_real, alpha_imag }; + ::zher2((const char *)&upper_lower_, (const MKL_INT *)&n, + (const MKL_Complex16 *)&alpha_, x, (const MKL_INT *)&incx, y, + (const MKL_INT *)&incy, a, (const MKL_INT *)&lda); + }); + }); + return done; +} + +cl::sycl::event hpmv(cl::sycl::queue &queue, uplo upper_lower, int64_t n, std::complex alpha, + const std::complex *ap, const std::complex *x, int64_t incx, + std::complex beta, std::complex *y, int64_t incy, + const cl::sycl::vector_class &dependencies) { + auto done = queue.submit([&](cl::sycl::handler &cgh) { + int64_t num_events = dependencies.size(); + for (int64_t i = 0; i < num_events; i++) { + cgh.depends_on(dependencies[i]); + } + const char upper_lower_ = *fortran_char(upper_lower); + float alpha_real = alpha.real(), alpha_imag = alpha.imag(); + float beta_real = beta.real(), beta_imag = beta.imag(); + host_task(cgh, [=]() { + MKL_Complex8 alpha_ = { alpha_real, alpha_imag }; + MKL_Complex8 beta_ = { beta_real, beta_imag }; + ::chpmv((const char *)&upper_lower_, (const MKL_INT *)&n, (const 
MKL_Complex8 *)&alpha_, + ap, x, (const MKL_INT *)&incx, (const MKL_Complex8 *)&beta_, y, + (const MKL_INT *)&incy); + }); + }); + return done; +} + +cl::sycl::event hpmv(cl::sycl::queue &queue, uplo upper_lower, int64_t n, + std::complex alpha, const std::complex *ap, + const std::complex *x, int64_t incx, std::complex beta, + std::complex *y, int64_t incy, + const cl::sycl::vector_class &dependencies) { + auto done = queue.submit([&](cl::sycl::handler &cgh) { + int64_t num_events = dependencies.size(); + for (int64_t i = 0; i < num_events; i++) { + cgh.depends_on(dependencies[i]); + } + const char upper_lower_ = *fortran_char(upper_lower); + double alpha_real = alpha.real(), alpha_imag = alpha.imag(); + double beta_real = beta.real(), beta_imag = beta.imag(); + host_task(cgh, [=]() { + MKL_Complex16 alpha_ = { alpha_real, alpha_imag }; + MKL_Complex16 beta_ = { beta_real, beta_imag }; + ::zhpmv((const char *)&upper_lower_, (const MKL_INT *)&n, + (const MKL_Complex16 *)&alpha_, ap, x, (const MKL_INT *)&incx, + (const MKL_Complex16 *)&beta_, y, (const MKL_INT *)&incy); + }); + }); + return done; +} + +cl::sycl::event hpr(cl::sycl::queue &queue, uplo upper_lower, int64_t n, float alpha, + const std::complex *x, int64_t incx, std::complex *ap, + const cl::sycl::vector_class &dependencies) { + auto done = queue.submit([&](cl::sycl::handler &cgh) { + int64_t num_events = dependencies.size(); + for (int64_t i = 0; i < num_events; i++) { + cgh.depends_on(dependencies[i]); + } + const char upper_lower_ = *fortran_char(upper_lower); + host_task(cgh, [=]() { + ::chpr((const char *)&upper_lower_, (const MKL_INT *)&n, (const float *)&alpha, x, + (const MKL_INT *)&incx, ap); + }); + }); + return done; +} + +cl::sycl::event hpr(cl::sycl::queue &queue, uplo upper_lower, int64_t n, double alpha, + const std::complex *x, int64_t incx, std::complex *ap, + const cl::sycl::vector_class &dependencies) { + auto done = queue.submit([&](cl::sycl::handler &cgh) { + int64_t num_events = 
dependencies.size(); + for (int64_t i = 0; i < num_events; i++) { + cgh.depends_on(dependencies[i]); + } + const char upper_lower_ = *fortran_char(upper_lower); + host_task(cgh, [=]() { + ::zhpr((const char *)&upper_lower_, (const MKL_INT *)&n, (const double *)&alpha, x, + (const MKL_INT *)&incx, ap); + }); + }); + return done; +} + +cl::sycl::event hpr2(cl::sycl::queue &queue, uplo upper_lower, int64_t n, std::complex alpha, + const std::complex *x, int64_t incx, const std::complex *y, + int64_t incy, std::complex *ap, + const cl::sycl::vector_class &dependencies) { + auto done = queue.submit([&](cl::sycl::handler &cgh) { + int64_t num_events = dependencies.size(); + for (int64_t i = 0; i < num_events; i++) { + cgh.depends_on(dependencies[i]); + } + const char upper_lower_ = *fortran_char(upper_lower); + float alpha_real = alpha.real(), alpha_imag = alpha.imag(); + host_task(cgh, [=]() { + MKL_Complex8 alpha_ = { alpha_real, alpha_imag }; + ::chpr2((const char *)&upper_lower_, (const MKL_INT *)&n, (const MKL_Complex8 *)&alpha_, + x, (const MKL_INT *)&incx, y, (const MKL_INT *)&incy, ap); + }); + }); + return done; +} + +cl::sycl::event hpr2(cl::sycl::queue &queue, uplo upper_lower, int64_t n, + std::complex alpha, const std::complex *x, int64_t incx, + const std::complex *y, int64_t incy, std::complex *ap, + const cl::sycl::vector_class &dependencies) { + auto done = queue.submit([&](cl::sycl::handler &cgh) { + int64_t num_events = dependencies.size(); + for (int64_t i = 0; i < num_events; i++) { + cgh.depends_on(dependencies[i]); + } + const char upper_lower_ = *fortran_char(upper_lower); + double alpha_real = alpha.real(), alpha_imag = alpha.imag(); + host_task(cgh, [=]() { + MKL_Complex16 alpha_ = { alpha_real, alpha_imag }; + ::zhpr2((const char *)&upper_lower_, (const MKL_INT *)&n, + (const MKL_Complex16 *)&alpha_, x, (const MKL_INT *)&incx, y, + (const MKL_INT *)&incy, ap); + }); + }); + return done; +} + +cl::sycl::event sbmv(cl::sycl::queue &queue, uplo 
upper_lower, int64_t n, int64_t k, float alpha, + const float *a, int64_t lda, const float *x, int64_t incx, float beta, + float *y, int64_t incy, + const cl::sycl::vector_class &dependencies) { + auto done = queue.submit([&](cl::sycl::handler &cgh) { + int64_t num_events = dependencies.size(); + for (int64_t i = 0; i < num_events; i++) { + cgh.depends_on(dependencies[i]); + } + const char upper_lower_ = *fortran_char(upper_lower); + host_task(cgh, [=]() { + ::ssbmv((const char *)&upper_lower_, (const MKL_INT *)&n, (const MKL_INT *)&k, + (const float *)&alpha, a, (const MKL_INT *)&lda, x, (const MKL_INT *)&incx, + (const float *)&beta, y, (const MKL_INT *)&incy); + }); + }); + return done; +} + +cl::sycl::event sbmv(cl::sycl::queue &queue, uplo upper_lower, int64_t n, int64_t k, double alpha, + const double *a, int64_t lda, const double *x, int64_t incx, double beta, + double *y, int64_t incy, + const cl::sycl::vector_class &dependencies) { + auto done = queue.submit([&](cl::sycl::handler &cgh) { + int64_t num_events = dependencies.size(); + for (int64_t i = 0; i < num_events; i++) { + cgh.depends_on(dependencies[i]); + } + const char upper_lower_ = *fortran_char(upper_lower); + host_task(cgh, [=]() { + ::dsbmv((const char *)&upper_lower_, (const MKL_INT *)&n, (const MKL_INT *)&k, + (const double *)&alpha, a, (const MKL_INT *)&lda, x, (const MKL_INT *)&incx, + (const double *)&beta, y, (const MKL_INT *)&incy); + }); + }); + return done; +} + +cl::sycl::event spmv(cl::sycl::queue &queue, uplo upper_lower, int64_t n, float alpha, + const float *ap, const float *x, int64_t incx, float beta, float *y, + int64_t incy, const cl::sycl::vector_class &dependencies) { + auto done = queue.submit([&](cl::sycl::handler &cgh) { + int64_t num_events = dependencies.size(); + for (int64_t i = 0; i < num_events; i++) { + cgh.depends_on(dependencies[i]); + } + const char upper_lower_ = *fortran_char(upper_lower); + host_task(cgh, [=]() { + ::sspmv((const char *)&upper_lower_, (const 
MKL_INT *)&n, (const float *)&alpha, ap, x, + (const MKL_INT *)&incx, (const float *)&beta, y, (const MKL_INT *)&incy); + }); + }); + return done; +} + +cl::sycl::event spmv(cl::sycl::queue &queue, uplo upper_lower, int64_t n, double alpha, + const double *ap, const double *x, int64_t incx, double beta, double *y, + int64_t incy, const cl::sycl::vector_class &dependencies) { + auto done = queue.submit([&](cl::sycl::handler &cgh) { + int64_t num_events = dependencies.size(); + for (int64_t i = 0; i < num_events; i++) { + cgh.depends_on(dependencies[i]); + } + const char upper_lower_ = *fortran_char(upper_lower); + host_task(cgh, [=]() { + ::dspmv((const char *)&upper_lower_, (const MKL_INT *)&n, (const double *)&alpha, ap, x, + (const MKL_INT *)&incx, (const double *)&beta, y, (const MKL_INT *)&incy); + }); + }); + return done; +} + +cl::sycl::event spr(cl::sycl::queue &queue, uplo upper_lower, int64_t n, float alpha, + const float *x, int64_t incx, float *ap, + const cl::sycl::vector_class &dependencies) { + auto done = queue.submit([&](cl::sycl::handler &cgh) { + int64_t num_events = dependencies.size(); + for (int64_t i = 0; i < num_events; i++) { + cgh.depends_on(dependencies[i]); + } + const char upper_lower_ = *fortran_char(upper_lower); + host_task(cgh, [=]() { + ::sspr((const char *)&upper_lower_, (const MKL_INT *)&n, (const float *)&alpha, x, + (const MKL_INT *)&incx, ap); + }); + }); + return done; +} + +cl::sycl::event spr(cl::sycl::queue &queue, uplo upper_lower, int64_t n, double alpha, + const double *x, int64_t incx, double *ap, + const cl::sycl::vector_class &dependencies) { + auto done = queue.submit([&](cl::sycl::handler &cgh) { + int64_t num_events = dependencies.size(); + for (int64_t i = 0; i < num_events; i++) { + cgh.depends_on(dependencies[i]); + } + const char upper_lower_ = *fortran_char(upper_lower); + host_task(cgh, [=]() { + ::dspr((const char *)&upper_lower_, (const MKL_INT *)&n, (const double *)&alpha, x, + (const MKL_INT *)&incx, ap); 
+ }); + }); + return done; +} + +cl::sycl::event spr2(cl::sycl::queue &queue, uplo upper_lower, int64_t n, float alpha, + const float *x, int64_t incx, const float *y, int64_t incy, float *ap, + const cl::sycl::vector_class &dependencies) { + auto done = queue.submit([&](cl::sycl::handler &cgh) { + int64_t num_events = dependencies.size(); + for (int64_t i = 0; i < num_events; i++) { + cgh.depends_on(dependencies[i]); + } + const char upper_lower_ = *fortran_char(upper_lower); + host_task(cgh, [=]() { + ::sspr2((const char *)&upper_lower_, (const MKL_INT *)&n, (const float *)&alpha, x, + (const MKL_INT *)&incx, y, (const MKL_INT *)&incy, ap); + }); + }); + return done; +} + +cl::sycl::event spr2(cl::sycl::queue &queue, uplo upper_lower, int64_t n, double alpha, + const double *x, int64_t incx, const double *y, int64_t incy, double *ap, + const cl::sycl::vector_class &dependencies) { + auto done = queue.submit([&](cl::sycl::handler &cgh) { + int64_t num_events = dependencies.size(); + for (int64_t i = 0; i < num_events; i++) { + cgh.depends_on(dependencies[i]); + } + const char upper_lower_ = *fortran_char(upper_lower); + host_task(cgh, [=]() { + ::dspr2((const char *)&upper_lower_, (const MKL_INT *)&n, (const double *)&alpha, x, + (const MKL_INT *)&incx, y, (const MKL_INT *)&incy, ap); + }); + }); + return done; +} + +cl::sycl::event symv(cl::sycl::queue &queue, uplo upper_lower, int64_t n, float alpha, + const float *a, int64_t lda, const float *x, int64_t incx, float beta, + float *y, int64_t incy, + const cl::sycl::vector_class &dependencies) { + auto done = queue.submit([&](cl::sycl::handler &cgh) { + int64_t num_events = dependencies.size(); + for (int64_t i = 0; i < num_events; i++) { + cgh.depends_on(dependencies[i]); + } + const char upper_lower_ = *fortran_char(upper_lower); + host_task(cgh, [=]() { + ::ssymv((const char *)&upper_lower_, (const MKL_INT *)&n, (const float *)&alpha, a, + (const MKL_INT *)&lda, x, (const MKL_INT *)&incx, (const float *)&beta, 
y, + (const MKL_INT *)&incy); + }); + }); + return done; +} + +cl::sycl::event symv(cl::sycl::queue &queue, uplo upper_lower, int64_t n, double alpha, + const double *a, int64_t lda, const double *x, int64_t incx, double beta, + double *y, int64_t incy, + const cl::sycl::vector_class &dependencies) { + auto done = queue.submit([&](cl::sycl::handler &cgh) { + int64_t num_events = dependencies.size(); + for (int64_t i = 0; i < num_events; i++) { + cgh.depends_on(dependencies[i]); + } + const char upper_lower_ = *fortran_char(upper_lower); + host_task(cgh, [=]() { + ::dsymv((const char *)&upper_lower_, (const MKL_INT *)&n, (const double *)&alpha, a, + (const MKL_INT *)&lda, x, (const MKL_INT *)&incx, (const double *)&beta, y, + (const MKL_INT *)&incy); + }); + }); + return done; +} + +cl::sycl::event syr(cl::sycl::queue &queue, uplo upper_lower, int64_t n, float alpha, + const float *x, int64_t incx, float *a, int64_t lda, + const cl::sycl::vector_class &dependencies) { + auto done = queue.submit([&](cl::sycl::handler &cgh) { + int64_t num_events = dependencies.size(); + for (int64_t i = 0; i < num_events; i++) { + cgh.depends_on(dependencies[i]); + } + const char upper_lower_ = *fortran_char(upper_lower); + host_task(cgh, [=]() { + ::ssyr((const char *)&upper_lower_, (const MKL_INT *)&n, (const float *)&alpha, x, + (const MKL_INT *)&incx, a, (const MKL_INT *)&lda); + }); + }); + return done; +} + +cl::sycl::event syr(cl::sycl::queue &queue, uplo upper_lower, int64_t n, double alpha, + const double *x, int64_t incx, double *a, int64_t lda, + const cl::sycl::vector_class &dependencies) { + auto done = queue.submit([&](cl::sycl::handler &cgh) { + int64_t num_events = dependencies.size(); + for (int64_t i = 0; i < num_events; i++) { + cgh.depends_on(dependencies[i]); + } + const char upper_lower_ = *fortran_char(upper_lower); + host_task(cgh, [=]() { + ::dsyr((const char *)&upper_lower_, (const MKL_INT *)&n, (const double *)&alpha, x, + (const MKL_INT *)&incx, a, (const 
MKL_INT *)&lda); + }); + }); + return done; +} + +cl::sycl::event syr2(cl::sycl::queue &queue, uplo upper_lower, int64_t n, float alpha, + const float *x, int64_t incx, const float *y, int64_t incy, float *a, + int64_t lda, const cl::sycl::vector_class &dependencies) { + auto done = queue.submit([&](cl::sycl::handler &cgh) { + int64_t num_events = dependencies.size(); + for (int64_t i = 0; i < num_events; i++) { + cgh.depends_on(dependencies[i]); + } + const char upper_lower_ = *fortran_char(upper_lower); + host_task(cgh, [=]() { + ::ssyr2((const char *)&upper_lower_, (const MKL_INT *)&n, (const float *)&alpha, x, + (const MKL_INT *)&incx, y, (const MKL_INT *)&incy, a, (const MKL_INT *)&lda); + }); + }); + return done; +} + +cl::sycl::event syr2(cl::sycl::queue &queue, uplo upper_lower, int64_t n, double alpha, + const double *x, int64_t incx, const double *y, int64_t incy, double *a, + int64_t lda, const cl::sycl::vector_class &dependencies) { + auto done = queue.submit([&](cl::sycl::handler &cgh) { + int64_t num_events = dependencies.size(); + for (int64_t i = 0; i < num_events; i++) { + cgh.depends_on(dependencies[i]); + } + const char upper_lower_ = *fortran_char(upper_lower); + host_task(cgh, [=]() { + ::dsyr2((const char *)&upper_lower_, (const MKL_INT *)&n, (const double *)&alpha, x, + (const MKL_INT *)&incx, y, (const MKL_INT *)&incy, a, (const MKL_INT *)&lda); + }); + }); + return done; +} + +cl::sycl::event tbmv(cl::sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag, + int64_t n, int64_t k, const float *a, int64_t lda, float *x, int64_t incx, + const cl::sycl::vector_class &dependencies) { + auto done = queue.submit([&](cl::sycl::handler &cgh) { + int64_t num_events = dependencies.size(); + for (int64_t i = 0; i < num_events; i++) { + cgh.depends_on(dependencies[i]); + } + const char upper_lower_ = *fortran_char(upper_lower); + const char trans_ = *fortran_char(trans); + const char unit_diag_ = *fortran_char(unit_diag); + host_task(cgh, 
[=]() { + ::stbmv((const char *)&upper_lower_, (const char *)&trans_, (const char *)&unit_diag_, + (const MKL_INT *)&n, (const MKL_INT *)&k, a, (const MKL_INT *)&lda, x, + (const MKL_INT *)&incx); + }); + }); + return done; +} + +cl::sycl::event tbmv(cl::sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag, + int64_t n, int64_t k, const double *a, int64_t lda, double *x, int64_t incx, + const cl::sycl::vector_class &dependencies) { + auto done = queue.submit([&](cl::sycl::handler &cgh) { + int64_t num_events = dependencies.size(); + for (int64_t i = 0; i < num_events; i++) { + cgh.depends_on(dependencies[i]); + } + const char upper_lower_ = *fortran_char(upper_lower); + const char trans_ = *fortran_char(trans); + const char unit_diag_ = *fortran_char(unit_diag); + host_task(cgh, [=]() { + ::dtbmv((const char *)&upper_lower_, (const char *)&trans_, (const char *)&unit_diag_, + (const MKL_INT *)&n, (const MKL_INT *)&k, a, (const MKL_INT *)&lda, x, + (const MKL_INT *)&incx); + }); + }); + return done; +} + +cl::sycl::event tbmv(cl::sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag, + int64_t n, int64_t k, const std::complex *a, int64_t lda, + std::complex *x, int64_t incx, + const cl::sycl::vector_class &dependencies) { + auto done = queue.submit([&](cl::sycl::handler &cgh) { + int64_t num_events = dependencies.size(); + for (int64_t i = 0; i < num_events; i++) { + cgh.depends_on(dependencies[i]); + } + const char upper_lower_ = *fortran_char(upper_lower); + const char trans_ = *fortran_char(trans); + const char unit_diag_ = *fortran_char(unit_diag); + host_task(cgh, [=]() { + ::ctbmv((const char *)&upper_lower_, (const char *)&trans_, (const char *)&unit_diag_, + (const MKL_INT *)&n, (const MKL_INT *)&k, a, (const MKL_INT *)&lda, x, + (const MKL_INT *)&incx); + }); + }); + return done; +} + +cl::sycl::event tbmv(cl::sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag, + int64_t n, int64_t k, const std::complex 
*a, int64_t lda, + std::complex *x, int64_t incx, + const cl::sycl::vector_class &dependencies) { + auto done = queue.submit([&](cl::sycl::handler &cgh) { + int64_t num_events = dependencies.size(); + for (int64_t i = 0; i < num_events; i++) { + cgh.depends_on(dependencies[i]); + } + const char upper_lower_ = *fortran_char(upper_lower); + const char trans_ = *fortran_char(trans); + const char unit_diag_ = *fortran_char(unit_diag); + host_task(cgh, [=]() { + ::ztbmv((const char *)&upper_lower_, (const char *)&trans_, (const char *)&unit_diag_, + (const MKL_INT *)&n, (const MKL_INT *)&k, a, (const MKL_INT *)&lda, x, + (const MKL_INT *)&incx); + }); + }); + return done; +} + +cl::sycl::event tbsv(cl::sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag, + int64_t n, int64_t k, const float *a, int64_t lda, float *x, int64_t incx, + const cl::sycl::vector_class &dependencies) { + auto done = queue.submit([&](cl::sycl::handler &cgh) { + int64_t num_events = dependencies.size(); + for (int64_t i = 0; i < num_events; i++) { + cgh.depends_on(dependencies[i]); + } + const char upper_lower_ = *fortran_char(upper_lower); + const char trans_ = *fortran_char(trans); + const char unit_diag_ = *fortran_char(unit_diag); + host_task(cgh, [=]() { + ::stbsv((const char *)&upper_lower_, (const char *)&trans_, (const char *)&unit_diag_, + (const MKL_INT *)&n, (const MKL_INT *)&k, a, (const MKL_INT *)&lda, x, + (const MKL_INT *)&incx); + }); + }); + return done; +} + +cl::sycl::event tbsv(cl::sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag, + int64_t n, int64_t k, const double *a, int64_t lda, double *x, int64_t incx, + const cl::sycl::vector_class &dependencies) { + auto done = queue.submit([&](cl::sycl::handler &cgh) { + int64_t num_events = dependencies.size(); + for (int64_t i = 0; i < num_events; i++) { + cgh.depends_on(dependencies[i]); + } + const char upper_lower_ = *fortran_char(upper_lower); + const char trans_ = *fortran_char(trans); + 
const char unit_diag_ = *fortran_char(unit_diag); + host_task(cgh, [=]() { + ::dtbsv((const char *)&upper_lower_, (const char *)&trans_, (const char *)&unit_diag_, + (const MKL_INT *)&n, (const MKL_INT *)&k, a, (const MKL_INT *)&lda, x, + (const MKL_INT *)&incx); + }); + }); + return done; +} + +cl::sycl::event tbsv(cl::sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag, + int64_t n, int64_t k, const std::complex *a, int64_t lda, + std::complex *x, int64_t incx, + const cl::sycl::vector_class &dependencies) { + auto done = queue.submit([&](cl::sycl::handler &cgh) { + int64_t num_events = dependencies.size(); + for (int64_t i = 0; i < num_events; i++) { + cgh.depends_on(dependencies[i]); + } + const char upper_lower_ = *fortran_char(upper_lower); + const char trans_ = *fortran_char(trans); + const char unit_diag_ = *fortran_char(unit_diag); + host_task(cgh, [=]() { + ::ctbsv((const char *)&upper_lower_, (const char *)&trans_, (const char *)&unit_diag_, + (const MKL_INT *)&n, (const MKL_INT *)&k, a, (const MKL_INT *)&lda, x, + (const MKL_INT *)&incx); + }); + }); + return done; +} + +cl::sycl::event tbsv(cl::sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag, + int64_t n, int64_t k, const std::complex *a, int64_t lda, + std::complex *x, int64_t incx, + const cl::sycl::vector_class &dependencies) { + auto done = queue.submit([&](cl::sycl::handler &cgh) { + int64_t num_events = dependencies.size(); + for (int64_t i = 0; i < num_events; i++) { + cgh.depends_on(dependencies[i]); + } + const char upper_lower_ = *fortran_char(upper_lower); + const char trans_ = *fortran_char(trans); + const char unit_diag_ = *fortran_char(unit_diag); + host_task(cgh, [=]() { + ::ztbsv((const char *)&upper_lower_, (const char *)&trans_, (const char *)&unit_diag_, + (const MKL_INT *)&n, (const MKL_INT *)&k, a, (const MKL_INT *)&lda, x, + (const MKL_INT *)&incx); + }); + }); + return done; +} + +cl::sycl::event tpmv(cl::sycl::queue &queue, uplo 
upper_lower, transpose trans, diag unit_diag, + int64_t n, const float *ap, float *x, int64_t incx, + const cl::sycl::vector_class &dependencies) { + auto done = queue.submit([&](cl::sycl::handler &cgh) { + int64_t num_events = dependencies.size(); + for (int64_t i = 0; i < num_events; i++) { + cgh.depends_on(dependencies[i]); + } + const char upper_lower_ = *fortran_char(upper_lower); + const char trans_ = *fortran_char(trans); + const char unit_diag_ = *fortran_char(unit_diag); + host_task(cgh, [=]() { + ::stpmv((const char *)&upper_lower_, (const char *)&trans_, (const char *)&unit_diag_, + (const MKL_INT *)&n, ap, x, (const MKL_INT *)&incx); + }); + }); + return done; +} + +cl::sycl::event tpmv(cl::sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag, + int64_t n, const double *ap, double *x, int64_t incx, + const cl::sycl::vector_class &dependencies) { + auto done = queue.submit([&](cl::sycl::handler &cgh) { + int64_t num_events = dependencies.size(); + for (int64_t i = 0; i < num_events; i++) { + cgh.depends_on(dependencies[i]); + } + const char upper_lower_ = *fortran_char(upper_lower); + const char trans_ = *fortran_char(trans); + const char unit_diag_ = *fortran_char(unit_diag); + host_task(cgh, [=]() { + ::dtpmv((const char *)&upper_lower_, (const char *)&trans_, (const char *)&unit_diag_, + (const MKL_INT *)&n, ap, x, (const MKL_INT *)&incx); + }); + }); + return done; +} + +cl::sycl::event tpmv(cl::sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag, + int64_t n, const std::complex *ap, std::complex *x, int64_t incx, + const cl::sycl::vector_class &dependencies) { + auto done = queue.submit([&](cl::sycl::handler &cgh) { + int64_t num_events = dependencies.size(); + for (int64_t i = 0; i < num_events; i++) { + cgh.depends_on(dependencies[i]); + } + const char upper_lower_ = *fortran_char(upper_lower); + const char trans_ = *fortran_char(trans); + const char unit_diag_ = *fortran_char(unit_diag); + host_task(cgh, [=]() { 
+ ::ctpmv((const char *)&upper_lower_, (const char *)&trans_, (const char *)&unit_diag_, + (const MKL_INT *)&n, ap, x, (const MKL_INT *)&incx); + }); + }); + return done; +} + +cl::sycl::event tpmv(cl::sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag, + int64_t n, const std::complex *ap, std::complex *x, + int64_t incx, const cl::sycl::vector_class &dependencies) { + auto done = queue.submit([&](cl::sycl::handler &cgh) { + int64_t num_events = dependencies.size(); + for (int64_t i = 0; i < num_events; i++) { + cgh.depends_on(dependencies[i]); + } + const char upper_lower_ = *fortran_char(upper_lower); + const char trans_ = *fortran_char(trans); + const char unit_diag_ = *fortran_char(unit_diag); + host_task(cgh, [=]() { + ::ztpmv((const char *)&upper_lower_, (const char *)&trans_, (const char *)&unit_diag_, + (const MKL_INT *)&n, ap, x, (const MKL_INT *)&incx); + }); + }); + return done; +} + +cl::sycl::event tpsv(cl::sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag, + int64_t n, const float *ap, float *x, int64_t incx, + const cl::sycl::vector_class &dependencies) { + auto done = queue.submit([&](cl::sycl::handler &cgh) { + int64_t num_events = dependencies.size(); + for (int64_t i = 0; i < num_events; i++) { + cgh.depends_on(dependencies[i]); + } + const char upper_lower_ = *fortran_char(upper_lower); + const char trans_ = *fortran_char(trans); + const char unit_diag_ = *fortran_char(unit_diag); + host_task(cgh, [=]() { + ::stpsv((const char *)&upper_lower_, (const char *)&trans_, (const char *)&unit_diag_, + (const MKL_INT *)&n, ap, x, (const MKL_INT *)&incx); + }); + }); + return done; +} + +cl::sycl::event tpsv(cl::sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag, + int64_t n, const double *ap, double *x, int64_t incx, + const cl::sycl::vector_class &dependencies) { + auto done = queue.submit([&](cl::sycl::handler &cgh) { + int64_t num_events = dependencies.size(); + for (int64_t i = 0; i < 
num_events; i++) { + cgh.depends_on(dependencies[i]); + } + const char upper_lower_ = *fortran_char(upper_lower); + const char trans_ = *fortran_char(trans); + const char unit_diag_ = *fortran_char(unit_diag); + host_task(cgh, [=]() { + ::dtpsv((const char *)&upper_lower_, (const char *)&trans_, (const char *)&unit_diag_, + (const MKL_INT *)&n, ap, x, (const MKL_INT *)&incx); + }); + }); + return done; +} + +cl::sycl::event tpsv(cl::sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag, + int64_t n, const std::complex *ap, std::complex *x, int64_t incx, + const cl::sycl::vector_class &dependencies) { + auto done = queue.submit([&](cl::sycl::handler &cgh) { + int64_t num_events = dependencies.size(); + for (int64_t i = 0; i < num_events; i++) { + cgh.depends_on(dependencies[i]); + } + const char upper_lower_ = *fortran_char(upper_lower); + const char trans_ = *fortran_char(trans); + const char unit_diag_ = *fortran_char(unit_diag); + host_task(cgh, [=]() { + ::ctpsv((const char *)&upper_lower_, (const char *)&trans_, (const char *)&unit_diag_, + (const MKL_INT *)&n, ap, x, (const MKL_INT *)&incx); + }); + }); + return done; +} + +cl::sycl::event tpsv(cl::sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag, + int64_t n, const std::complex *ap, std::complex *x, + int64_t incx, const cl::sycl::vector_class &dependencies) { + auto done = queue.submit([&](cl::sycl::handler &cgh) { + int64_t num_events = dependencies.size(); + for (int64_t i = 0; i < num_events; i++) { + cgh.depends_on(dependencies[i]); + } + const char upper_lower_ = *fortran_char(upper_lower); + const char trans_ = *fortran_char(trans); + const char unit_diag_ = *fortran_char(unit_diag); + host_task(cgh, [=]() { + ::ztpsv((const char *)&upper_lower_, (const char *)&trans_, (const char *)&unit_diag_, + (const MKL_INT *)&n, ap, x, (const MKL_INT *)&incx); + }); + }); + return done; +} + +cl::sycl::event trmv(cl::sycl::queue &queue, uplo upper_lower, transpose transa, diag 
unit_diag, + int64_t n, const float *a, int64_t lda, float *b, int64_t incx, + const cl::sycl::vector_class &dependencies) { + auto done = queue.submit([&](cl::sycl::handler &cgh) { + int64_t num_events = dependencies.size(); + for (int64_t i = 0; i < num_events; i++) { + cgh.depends_on(dependencies[i]); + } + const char upper_lower_ = *fortran_char(upper_lower); + const char transa_ = *fortran_char(transa); + const char unit_diag_ = *fortran_char(unit_diag); + host_task(cgh, [=]() { + ::strmv((const char *)&upper_lower_, (const char *)&transa_, (const char *)&unit_diag_, + (const MKL_INT *)&n, a, (const MKL_INT *)&lda, b, (const MKL_INT *)&incx); + }); + }); + return done; +} + +cl::sycl::event trmv(cl::sycl::queue &queue, uplo upper_lower, transpose transa, diag unit_diag, + int64_t n, const double *a, int64_t lda, double *b, int64_t incx, + const cl::sycl::vector_class &dependencies) { + auto done = queue.submit([&](cl::sycl::handler &cgh) { + int64_t num_events = dependencies.size(); + for (int64_t i = 0; i < num_events; i++) { + cgh.depends_on(dependencies[i]); + } + const char upper_lower_ = *fortran_char(upper_lower); + const char transa_ = *fortran_char(transa); + const char unit_diag_ = *fortran_char(unit_diag); + host_task(cgh, [=]() { + ::dtrmv((const char *)&upper_lower_, (const char *)&transa_, (const char *)&unit_diag_, + (const MKL_INT *)&n, a, (const MKL_INT *)&lda, b, (const MKL_INT *)&incx); + }); + }); + return done; +} + +cl::sycl::event trmv(cl::sycl::queue &queue, uplo upper_lower, transpose transa, diag unit_diag, + int64_t n, const std::complex *a, int64_t lda, std::complex *b, + int64_t incx, const cl::sycl::vector_class &dependencies) { + auto done = queue.submit([&](cl::sycl::handler &cgh) { + int64_t num_events = dependencies.size(); + for (int64_t i = 0; i < num_events; i++) { + cgh.depends_on(dependencies[i]); + } + const char upper_lower_ = *fortran_char(upper_lower); + const char transa_ = *fortran_char(transa); + const char 
unit_diag_ = *fortran_char(unit_diag); + host_task(cgh, [=]() { + ::ctrmv((const char *)&upper_lower_, (const char *)&transa_, (const char *)&unit_diag_, + (const MKL_INT *)&n, a, (const MKL_INT *)&lda, b, (const MKL_INT *)&incx); + }); + }); + return done; +} + +cl::sycl::event trmv(cl::sycl::queue &queue, uplo upper_lower, transpose transa, diag unit_diag, + int64_t n, const std::complex *a, int64_t lda, std::complex *b, + int64_t incx, const cl::sycl::vector_class &dependencies) { + auto done = queue.submit([&](cl::sycl::handler &cgh) { + int64_t num_events = dependencies.size(); + for (int64_t i = 0; i < num_events; i++) { + cgh.depends_on(dependencies[i]); + } + const char upper_lower_ = *fortran_char(upper_lower); + const char transa_ = *fortran_char(transa); + const char unit_diag_ = *fortran_char(unit_diag); + host_task(cgh, [=]() { + ::ztrmv((const char *)&upper_lower_, (const char *)&transa_, (const char *)&unit_diag_, + (const MKL_INT *)&n, a, (const MKL_INT *)&lda, b, (const MKL_INT *)&incx); + }); + }); + return done; +} + +cl::sycl::event trsv(cl::sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag, + int64_t n, const float *a, int64_t lda, float *x, int64_t incx, + const cl::sycl::vector_class &dependencies) { + auto done = queue.submit([&](cl::sycl::handler &cgh) { + int64_t num_events = dependencies.size(); + for (int64_t i = 0; i < num_events; i++) { + cgh.depends_on(dependencies[i]); + } + const char upper_lower_ = *fortran_char(upper_lower); + const char trans_ = *fortran_char(trans); + const char unit_diag_ = *fortran_char(unit_diag); + host_task(cgh, [=]() { + ::strsv((const char *)&upper_lower_, (const char *)&trans_, (const char *)&unit_diag_, + (const MKL_INT *)&n, a, (const MKL_INT *)&lda, x, (const MKL_INT *)&incx); + }); + }); + return done; +} + +cl::sycl::event trsv(cl::sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag, + int64_t n, const double *a, int64_t lda, double *x, int64_t incx, + const 
cl::sycl::vector_class &dependencies) { + auto done = queue.submit([&](cl::sycl::handler &cgh) { + int64_t num_events = dependencies.size(); + for (int64_t i = 0; i < num_events; i++) { + cgh.depends_on(dependencies[i]); + } + const char upper_lower_ = *fortran_char(upper_lower); + const char trans_ = *fortran_char(trans); + const char unit_diag_ = *fortran_char(unit_diag); + host_task(cgh, [=]() { + ::dtrsv((const char *)&upper_lower_, (const char *)&trans_, (const char *)&unit_diag_, + (const MKL_INT *)&n, a, (const MKL_INT *)&lda, x, (const MKL_INT *)&incx); + }); + }); + return done; +} + +cl::sycl::event trsv(cl::sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag, + int64_t n, const std::complex *a, int64_t lda, std::complex *x, + int64_t incx, const cl::sycl::vector_class &dependencies) { + auto done = queue.submit([&](cl::sycl::handler &cgh) { + int64_t num_events = dependencies.size(); + for (int64_t i = 0; i < num_events; i++) { + cgh.depends_on(dependencies[i]); + } + const char upper_lower_ = *fortran_char(upper_lower); + const char trans_ = *fortran_char(trans); + const char unit_diag_ = *fortran_char(unit_diag); + host_task(cgh, [=]() { + ::ctrsv((const char *)&upper_lower_, (const char *)&trans_, (const char *)&unit_diag_, + (const MKL_INT *)&n, a, (const MKL_INT *)&lda, x, (const MKL_INT *)&incx); + }); + }); + return done; +} + +cl::sycl::event trsv(cl::sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag, + int64_t n, const std::complex *a, int64_t lda, std::complex *x, + int64_t incx, const cl::sycl::vector_class &dependencies) { + auto done = queue.submit([&](cl::sycl::handler &cgh) { + int64_t num_events = dependencies.size(); + for (int64_t i = 0; i < num_events; i++) { + cgh.depends_on(dependencies[i]); + } + const char upper_lower_ = *fortran_char(upper_lower); + const char trans_ = *fortran_char(trans); + const char unit_diag_ = *fortran_char(unit_diag); + host_task(cgh, [=]() { + ::ztrsv((const char 
*)&upper_lower_, (const char *)&trans_, (const char *)&unit_diag_, + (const MKL_INT *)&n, a, (const MKL_INT *)&lda, x, (const MKL_INT *)&incx); + }); + }); + return done; +} + } // namespace mklcpu } // namespace onemkl diff --git a/src/blas/backends/mklcpu/cpu_level3.cpp b/src/blas/backends/mklcpu/cpu_level3.cpp index d45eec31e..e3cb2b80e 100644 --- a/src/blas/backends/mklcpu/cpu_level3.cpp +++ b/src/blas/backends/mklcpu/cpu_level3.cpp @@ -25,6 +25,8 @@ namespace onemkl { namespace mklcpu { +// Buffer APIs + void gemm(cl::sycl::queue &queue, transpose transa, transpose transb, int64_t m, int64_t n, int64_t k, float alpha, cl::sycl::buffer &a, int64_t lda, cl::sycl::buffer &b, int64_t ldb, float beta, cl::sycl::buffer &c, @@ -641,5 +643,695 @@ void trsm(cl::sycl::queue &queue, side left_right, uplo upper_lower, transpose t }); } +// USM APIs + +cl::sycl::event gemm(cl::sycl::queue &queue, transpose transa, transpose transb, int64_t m, + int64_t n, int64_t k, float alpha, const float *a, int64_t lda, const float *b, + int64_t ldb, float beta, float *c, int64_t ldc, + const cl::sycl::vector_class &dependencies) { + auto done = queue.submit([&](cl::sycl::handler &cgh) { + int64_t num_events = dependencies.size(); + for (int64_t i = 0; i < num_events; i++) { + cgh.depends_on(dependencies[i]); + } + const char transa_ = *fortran_char(transa); + const char transb_ = *fortran_char(transb); + host_task(cgh, [=]() { + ::sgemm((const char *)&transa_, (const char *)&transb_, (const MKL_INT *)&m, + (const MKL_INT *)&n, (const MKL_INT *)&k, (const float *)&alpha, a, + (const MKL_INT *)&lda, b, (const MKL_INT *)&ldb, (const float *)&beta, c, + (const MKL_INT *)&ldc); + }); + }); + return done; +} + +cl::sycl::event gemm(cl::sycl::queue &queue, transpose transa, transpose transb, int64_t m, + int64_t n, int64_t k, double alpha, const double *a, int64_t lda, + const double *b, int64_t ldb, double beta, double *c, int64_t ldc, + const cl::sycl::vector_class &dependencies) { + auto 
done = queue.submit([&](cl::sycl::handler &cgh) { + int64_t num_events = dependencies.size(); + for (int64_t i = 0; i < num_events; i++) { + cgh.depends_on(dependencies[i]); + } + const char transa_ = *fortran_char(transa); + const char transb_ = *fortran_char(transb); + host_task(cgh, [=]() { + ::dgemm((const char *)&transa_, (const char *)&transb_, (const MKL_INT *)&m, + (const MKL_INT *)&n, (const MKL_INT *)&k, (const double *)&alpha, a, + (const MKL_INT *)&lda, b, (const MKL_INT *)&ldb, (const double *)&beta, c, + (const MKL_INT *)&ldc); + }); + }); + return done; +} + +cl::sycl::event gemm(cl::sycl::queue &queue, transpose transa, transpose transb, int64_t m, + int64_t n, int64_t k, std::complex alpha, const std::complex *a, + int64_t lda, const std::complex *b, int64_t ldb, + std::complex beta, std::complex *c, int64_t ldc, + const cl::sycl::vector_class &dependencies) { + auto done = queue.submit([&](cl::sycl::handler &cgh) { + int64_t num_events = dependencies.size(); + for (int64_t i = 0; i < num_events; i++) { + cgh.depends_on(dependencies[i]); + } + const char transa_ = *fortran_char(transa); + const char transb_ = *fortran_char(transb); + float alpha_real = alpha.real(), alpha_imag = alpha.imag(); + float beta_real = beta.real(), beta_imag = beta.imag(); + host_task(cgh, [=]() { + MKL_Complex8 alpha_ = { alpha_real, alpha_imag }; + MKL_Complex8 beta_ = { beta_real, beta_imag }; + ::cgemm((const char *)&transa_, (const char *)&transb_, (const MKL_INT *)&m, + (const MKL_INT *)&n, (const MKL_INT *)&k, (const MKL_Complex8 *)&alpha_, a, + (const MKL_INT *)&lda, b, (const MKL_INT *)&ldb, (const MKL_Complex8 *)&beta_, + c, (const MKL_INT *)&ldc); + }); + }); + return done; +} + +cl::sycl::event gemm(cl::sycl::queue &queue, transpose transa, transpose transb, int64_t m, + int64_t n, int64_t k, std::complex alpha, + const std::complex *a, int64_t lda, const std::complex *b, + int64_t ldb, std::complex beta, std::complex *c, int64_t ldc, + const 
cl::sycl::vector_class &dependencies) { + auto done = queue.submit([&](cl::sycl::handler &cgh) { + int64_t num_events = dependencies.size(); + for (int64_t i = 0; i < num_events; i++) { + cgh.depends_on(dependencies[i]); + } + const char transa_ = *fortran_char(transa); + const char transb_ = *fortran_char(transb); + double alpha_real = alpha.real(), alpha_imag = alpha.imag(); + double beta_real = beta.real(), beta_imag = beta.imag(); + host_task(cgh, [=]() { + MKL_Complex16 alpha_ = { alpha_real, alpha_imag }; + MKL_Complex16 beta_ = { beta_real, beta_imag }; + ::zgemm((const char *)&transa_, (const char *)&transb_, (const MKL_INT *)&m, + (const MKL_INT *)&n, (const MKL_INT *)&k, (const MKL_Complex16 *)&alpha_, a, + (const MKL_INT *)&lda, b, (const MKL_INT *)&ldb, (const MKL_Complex16 *)&beta_, + c, (const MKL_INT *)&ldc); + }); + }); + return done; +} + +cl::sycl::event hemm(cl::sycl::queue &queue, side left_right, uplo upper_lower, int64_t m, + int64_t n, std::complex alpha, const std::complex *a, + int64_t lda, const std::complex *b, int64_t ldb, + std::complex beta, std::complex *c, int64_t ldc, + const cl::sycl::vector_class &dependencies) { + auto done = queue.submit([&](cl::sycl::handler &cgh) { + int64_t num_events = dependencies.size(); + for (int64_t i = 0; i < num_events; i++) { + cgh.depends_on(dependencies[i]); + } + const char left_right_ = *fortran_char(left_right); + const char upper_lower_ = *fortran_char(upper_lower); + float alpha_real = alpha.real(), alpha_imag = alpha.imag(); + float beta_real = beta.real(), beta_imag = beta.imag(); + host_task(cgh, [=]() { + MKL_Complex8 alpha_ = { alpha_real, alpha_imag }; + MKL_Complex8 beta_ = { beta_real, beta_imag }; + ::chemm((const char *)&left_right_, (const char *)&upper_lower_, (const MKL_INT *)&m, + (const MKL_INT *)&n, (const MKL_Complex8 *)&alpha_, a, (const MKL_INT *)&lda, b, + (const MKL_INT *)&ldb, (const MKL_Complex8 *)&beta_, c, (const MKL_INT *)&ldc); + }); + }); + return done; +} + 
+cl::sycl::event hemm(cl::sycl::queue &queue, side left_right, uplo upper_lower, int64_t m, + int64_t n, std::complex alpha, const std::complex *a, + int64_t lda, const std::complex *b, int64_t ldb, + std::complex beta, std::complex *c, int64_t ldc, + const cl::sycl::vector_class &dependencies) { + auto done = queue.submit([&](cl::sycl::handler &cgh) { + int64_t num_events = dependencies.size(); + for (int64_t i = 0; i < num_events; i++) { + cgh.depends_on(dependencies[i]); + } + const char left_right_ = *fortran_char(left_right); + const char upper_lower_ = *fortran_char(upper_lower); + double alpha_real = alpha.real(), alpha_imag = alpha.imag(); + double beta_real = beta.real(), beta_imag = beta.imag(); + host_task(cgh, [=]() { + MKL_Complex16 alpha_ = { alpha_real, alpha_imag }; + MKL_Complex16 beta_ = { beta_real, beta_imag }; + ::zhemm((const char *)&left_right_, (const char *)&upper_lower_, (const MKL_INT *)&m, + (const MKL_INT *)&n, (const MKL_Complex16 *)&alpha_, a, (const MKL_INT *)&lda, + b, (const MKL_INT *)&ldb, (const MKL_Complex16 *)&beta_, c, + (const MKL_INT *)&ldc); + }); + }); + return done; +} + +cl::sycl::event herk(cl::sycl::queue &queue, uplo upper_lower, transpose trans, int64_t n, + int64_t k, float alpha, const std::complex *a, int64_t lda, float beta, + std::complex *c, int64_t ldc, + const cl::sycl::vector_class &dependencies) { + auto done = queue.submit([&](cl::sycl::handler &cgh) { + int64_t num_events = dependencies.size(); + for (int64_t i = 0; i < num_events; i++) { + cgh.depends_on(dependencies[i]); + } + const char upper_lower_ = *fortran_char(upper_lower); + const char trans_ = *fortran_char(trans); + host_task(cgh, [=]() { + ::cherk((const char *)&upper_lower_, (const char *)&trans_, (const MKL_INT *)&n, + (const MKL_INT *)&k, (const float *)&alpha, a, (const MKL_INT *)&lda, + (const float *)&beta, c, (const MKL_INT *)&ldc); + }); + }); + return done; +} + +cl::sycl::event herk(cl::sycl::queue &queue, uplo upper_lower, transpose 
trans, int64_t n, + int64_t k, double alpha, const std::complex *a, int64_t lda, + double beta, std::complex *c, int64_t ldc, + const cl::sycl::vector_class &dependencies) { + auto done = queue.submit([&](cl::sycl::handler &cgh) { + int64_t num_events = dependencies.size(); + for (int64_t i = 0; i < num_events; i++) { + cgh.depends_on(dependencies[i]); + } + const char upper_lower_ = *fortran_char(upper_lower); + const char trans_ = *fortran_char(trans); + host_task(cgh, [=]() { + ::zherk((const char *)&upper_lower_, (const char *)&trans_, (const MKL_INT *)&n, + (const MKL_INT *)&k, (const double *)&alpha, a, (const MKL_INT *)&lda, + (const double *)&beta, c, (const MKL_INT *)&ldc); + }); + }); + return done; +} + +cl::sycl::event her2k(cl::sycl::queue &queue, uplo upper_lower, transpose trans, int64_t n, + int64_t k, std::complex alpha, const std::complex *a, + int64_t lda, const std::complex *b, int64_t ldb, float beta, + std::complex *c, int64_t ldc, + const cl::sycl::vector_class &dependencies) { + auto done = queue.submit([&](cl::sycl::handler &cgh) { + int64_t num_events = dependencies.size(); + for (int64_t i = 0; i < num_events; i++) { + cgh.depends_on(dependencies[i]); + } + const char upper_lower_ = *fortran_char(upper_lower); + const char trans_ = *fortran_char(trans); + float alpha_real = alpha.real(), alpha_imag = alpha.imag(); + host_task(cgh, [=]() { + MKL_Complex8 alpha_ = { alpha_real, alpha_imag }; + ::cher2k((const char *)&upper_lower_, (const char *)&trans_, (const MKL_INT *)&n, + (const MKL_INT *)&k, (const MKL_Complex8 *)&alpha_, a, (const MKL_INT *)&lda, + b, (const MKL_INT *)&ldb, (const float *)&beta, c, (const MKL_INT *)&ldc); + }); + }); + return done; +} + +cl::sycl::event her2k(cl::sycl::queue &queue, uplo upper_lower, transpose trans, int64_t n, + int64_t k, std::complex alpha, const std::complex *a, + int64_t lda, const std::complex *b, int64_t ldb, double beta, + std::complex *c, int64_t ldc, + const cl::sycl::vector_class 
&dependencies) { + auto done = queue.submit([&](cl::sycl::handler &cgh) { + int64_t num_events = dependencies.size(); + for (int64_t i = 0; i < num_events; i++) { + cgh.depends_on(dependencies[i]); + } + const char upper_lower_ = *fortran_char(upper_lower); + const char trans_ = *fortran_char(trans); + double alpha_real = alpha.real(), alpha_imag = alpha.imag(); + host_task(cgh, [=]() { + MKL_Complex16 alpha_ = { alpha_real, alpha_imag }; + ::zher2k((const char *)&upper_lower_, (const char *)&trans_, (const MKL_INT *)&n, + (const MKL_INT *)&k, (const MKL_Complex16 *)&alpha_, a, (const MKL_INT *)&lda, + b, (const MKL_INT *)&ldb, (const double *)&beta, c, (const MKL_INT *)&ldc); + }); + }); + return done; +} + +cl::sycl::event symm(cl::sycl::queue &queue, side left_right, uplo upper_lower, int64_t m, + int64_t n, float alpha, const float *a, int64_t lda, const float *b, + int64_t ldb, float beta, float *c, int64_t ldc, + const cl::sycl::vector_class &dependencies) { + auto done = queue.submit([&](cl::sycl::handler &cgh) { + int64_t num_events = dependencies.size(); + for (int64_t i = 0; i < num_events; i++) { + cgh.depends_on(dependencies[i]); + } + const char left_right_ = *fortran_char(left_right); + const char upper_lower_ = *fortran_char(upper_lower); + host_task(cgh, [=]() { + ::ssymm((const char *)&left_right_, (const char *)&upper_lower_, (const MKL_INT *)&m, + (const MKL_INT *)&n, (const float *)&alpha, a, (const MKL_INT *)&lda, b, + (const MKL_INT *)&ldb, (const float *)&beta, c, (const MKL_INT *)&ldc); + }); + }); + return done; +} + +cl::sycl::event symm(cl::sycl::queue &queue, side left_right, uplo upper_lower, int64_t m, + int64_t n, double alpha, const double *a, int64_t lda, const double *b, + int64_t ldb, double beta, double *c, int64_t ldc, + const cl::sycl::vector_class &dependencies) { + auto done = queue.submit([&](cl::sycl::handler &cgh) { + int64_t num_events = dependencies.size(); + for (int64_t i = 0; i < num_events; i++) { + 
cgh.depends_on(dependencies[i]); + } + const char left_right_ = *fortran_char(left_right); + const char upper_lower_ = *fortran_char(upper_lower); + host_task(cgh, [=]() { + ::dsymm((const char *)&left_right_, (const char *)&upper_lower_, (const MKL_INT *)&m, + (const MKL_INT *)&n, (const double *)&alpha, a, (const MKL_INT *)&lda, b, + (const MKL_INT *)&ldb, (const double *)&beta, c, (const MKL_INT *)&ldc); + }); + }); + return done; +} + +cl::sycl::event symm(cl::sycl::queue &queue, side left_right, uplo upper_lower, int64_t m, + int64_t n, std::complex alpha, const std::complex *a, + int64_t lda, const std::complex *b, int64_t ldb, + std::complex beta, std::complex *c, int64_t ldc, + const cl::sycl::vector_class &dependencies) { + auto done = queue.submit([&](cl::sycl::handler &cgh) { + int64_t num_events = dependencies.size(); + for (int64_t i = 0; i < num_events; i++) { + cgh.depends_on(dependencies[i]); + } + const char left_right_ = *fortran_char(left_right); + const char upper_lower_ = *fortran_char(upper_lower); + float alpha_real = alpha.real(), alpha_imag = alpha.imag(); + float beta_real = beta.real(), beta_imag = beta.imag(); + host_task(cgh, [=]() { + MKL_Complex8 alpha_ = { alpha_real, alpha_imag }; + MKL_Complex8 beta_ = { beta_real, beta_imag }; + ::csymm((const char *)&left_right_, (const char *)&upper_lower_, (const MKL_INT *)&m, + (const MKL_INT *)&n, (const MKL_Complex8 *)&alpha_, a, (const MKL_INT *)&lda, b, + (const MKL_INT *)&ldb, (const MKL_Complex8 *)&beta_, c, (const MKL_INT *)&ldc); + }); + }); + return done; +} + +cl::sycl::event symm(cl::sycl::queue &queue, side left_right, uplo upper_lower, int64_t m, + int64_t n, std::complex alpha, const std::complex *a, + int64_t lda, const std::complex *b, int64_t ldb, + std::complex beta, std::complex *c, int64_t ldc, + const cl::sycl::vector_class &dependencies) { + auto done = queue.submit([&](cl::sycl::handler &cgh) { + int64_t num_events = dependencies.size(); + for (int64_t i = 0; i < 
num_events; i++) { + cgh.depends_on(dependencies[i]); + } + const char left_right_ = *fortran_char(left_right); + const char upper_lower_ = *fortran_char(upper_lower); + double alpha_real = alpha.real(), alpha_imag = alpha.imag(); + double beta_real = beta.real(), beta_imag = beta.imag(); + host_task(cgh, [=]() { + MKL_Complex16 alpha_ = { alpha_real, alpha_imag }; + MKL_Complex16 beta_ = { beta_real, beta_imag }; + ::zsymm((const char *)&left_right_, (const char *)&upper_lower_, (const MKL_INT *)&m, + (const MKL_INT *)&n, (const MKL_Complex16 *)&alpha_, a, (const MKL_INT *)&lda, + b, (const MKL_INT *)&ldb, (const MKL_Complex16 *)&beta_, c, + (const MKL_INT *)&ldc); + }); + }); + return done; +} + +cl::sycl::event syrk(cl::sycl::queue &queue, uplo upper_lower, transpose trans, int64_t n, + int64_t k, float alpha, const float *a, int64_t lda, float beta, float *c, + int64_t ldc, const cl::sycl::vector_class &dependencies) { + auto done = queue.submit([&](cl::sycl::handler &cgh) { + int64_t num_events = dependencies.size(); + for (int64_t i = 0; i < num_events; i++) { + cgh.depends_on(dependencies[i]); + } + const char upper_lower_ = *fortran_char(upper_lower); + const char trans_ = *fortran_char(trans); + host_task(cgh, [=]() { + ::ssyrk((const char *)&upper_lower_, (const char *)&trans_, (const MKL_INT *)&n, + (const MKL_INT *)&k, (const float *)&alpha, a, (const MKL_INT *)&lda, + (const float *)&beta, c, (const MKL_INT *)&ldc); + }); + }); + return done; +} + +cl::sycl::event syrk(cl::sycl::queue &queue, uplo upper_lower, transpose trans, int64_t n, + int64_t k, double alpha, const double *a, int64_t lda, double beta, double *c, + int64_t ldc, const cl::sycl::vector_class &dependencies) { + auto done = queue.submit([&](cl::sycl::handler &cgh) { + int64_t num_events = dependencies.size(); + for (int64_t i = 0; i < num_events; i++) { + cgh.depends_on(dependencies[i]); + } + const char upper_lower_ = *fortran_char(upper_lower); + const char trans_ = 
*fortran_char(trans); + host_task(cgh, [=]() { + ::dsyrk((const char *)&upper_lower_, (const char *)&trans_, (const MKL_INT *)&n, + (const MKL_INT *)&k, (const double *)&alpha, a, (const MKL_INT *)&lda, + (const double *)&beta, c, (const MKL_INT *)&ldc); + }); + }); + return done; +} + +cl::sycl::event syrk(cl::sycl::queue &queue, uplo upper_lower, transpose trans, int64_t n, + int64_t k, std::complex alpha, const std::complex *a, + int64_t lda, std::complex beta, std::complex *c, int64_t ldc, + const cl::sycl::vector_class &dependencies) { + auto done = queue.submit([&](cl::sycl::handler &cgh) { + int64_t num_events = dependencies.size(); + for (int64_t i = 0; i < num_events; i++) { + cgh.depends_on(dependencies[i]); + } + const char upper_lower_ = *fortran_char(upper_lower); + const char trans_ = *fortran_char(trans); + float alpha_real = alpha.real(), alpha_imag = alpha.imag(); + float beta_real = beta.real(), beta_imag = beta.imag(); + host_task(cgh, [=]() { + MKL_Complex8 alpha_ = { alpha_real, alpha_imag }; + MKL_Complex8 beta_ = { beta_real, beta_imag }; + ::csyrk((const char *)&upper_lower_, (const char *)&trans_, (const MKL_INT *)&n, + (const MKL_INT *)&k, (const MKL_Complex8 *)&alpha_, a, (const MKL_INT *)&lda, + (const MKL_Complex8 *)&beta_, c, (const MKL_INT *)&ldc); + }); + }); + return done; +} + +cl::sycl::event syrk(cl::sycl::queue &queue, uplo upper_lower, transpose trans, int64_t n, + int64_t k, std::complex alpha, const std::complex *a, + int64_t lda, std::complex beta, std::complex *c, int64_t ldc, + const cl::sycl::vector_class &dependencies) { + auto done = queue.submit([&](cl::sycl::handler &cgh) { + int64_t num_events = dependencies.size(); + for (int64_t i = 0; i < num_events; i++) { + cgh.depends_on(dependencies[i]); + } + const char upper_lower_ = *fortran_char(upper_lower); + const char trans_ = *fortran_char(trans); + double alpha_real = alpha.real(), alpha_imag = alpha.imag(); + double beta_real = beta.real(), beta_imag = beta.imag(); 
+ host_task(cgh, [=]() { + MKL_Complex16 alpha_ = { alpha_real, alpha_imag }; + MKL_Complex16 beta_ = { beta_real, beta_imag }; + ::zsyrk((const char *)&upper_lower_, (const char *)&trans_, (const MKL_INT *)&n, + (const MKL_INT *)&k, (const MKL_Complex16 *)&alpha_, a, (const MKL_INT *)&lda, + (const MKL_Complex16 *)&beta_, c, (const MKL_INT *)&ldc); + }); + }); + return done; +} + +cl::sycl::event syr2k(cl::sycl::queue &queue, uplo upper_lower, transpose trans, int64_t n, + int64_t k, float alpha, const float *a, int64_t lda, const float *b, + int64_t ldb, float beta, float *c, int64_t ldc, + const cl::sycl::vector_class &dependencies) { + auto done = queue.submit([&](cl::sycl::handler &cgh) { + int64_t num_events = dependencies.size(); + for (int64_t i = 0; i < num_events; i++) { + cgh.depends_on(dependencies[i]); + } + const char upper_lower_ = *fortran_char(upper_lower); + const char trans_ = *fortran_char(trans); + host_task(cgh, [=]() { + ::ssyr2k((const char *)&upper_lower_, (const char *)&trans_, (const MKL_INT *)&n, + (const MKL_INT *)&k, (const float *)&alpha, a, (const MKL_INT *)&lda, b, + (const MKL_INT *)&ldb, (const float *)&beta, c, (const MKL_INT *)&ldc); + }); + }); + return done; +} + +cl::sycl::event syr2k(cl::sycl::queue &queue, uplo upper_lower, transpose trans, int64_t n, + int64_t k, double alpha, const double *a, int64_t lda, const double *b, + int64_t ldb, double beta, double *c, int64_t ldc, + const cl::sycl::vector_class &dependencies) { + auto done = queue.submit([&](cl::sycl::handler &cgh) { + int64_t num_events = dependencies.size(); + for (int64_t i = 0; i < num_events; i++) { + cgh.depends_on(dependencies[i]); + } + const char upper_lower_ = *fortran_char(upper_lower); + const char trans_ = *fortran_char(trans); + host_task(cgh, [=]() { + ::dsyr2k((const char *)&upper_lower_, (const char *)&trans_, (const MKL_INT *)&n, + (const MKL_INT *)&k, (const double *)&alpha, a, (const MKL_INT *)&lda, b, + (const MKL_INT *)&ldb, (const double 
*)&beta, c, (const MKL_INT *)&ldc); + }); + }); + return done; +} + +cl::sycl::event syr2k(cl::sycl::queue &queue, uplo upper_lower, transpose trans, int64_t n, + int64_t k, std::complex alpha, const std::complex *a, + int64_t lda, const std::complex *b, int64_t ldb, + std::complex beta, std::complex *c, int64_t ldc, + const cl::sycl::vector_class &dependencies) { + auto done = queue.submit([&](cl::sycl::handler &cgh) { + int64_t num_events = dependencies.size(); + for (int64_t i = 0; i < num_events; i++) { + cgh.depends_on(dependencies[i]); + } + const char upper_lower_ = *fortran_char(upper_lower); + const char trans_ = *fortran_char(trans); + float alpha_real = alpha.real(), alpha_imag = alpha.imag(); + float beta_real = beta.real(), beta_imag = beta.imag(); + host_task(cgh, [=]() { + MKL_Complex8 alpha_ = { alpha_real, alpha_imag }; + MKL_Complex8 beta_ = { beta_real, beta_imag }; + ::csyr2k((const char *)&upper_lower_, (const char *)&trans_, (const MKL_INT *)&n, + (const MKL_INT *)&k, (const MKL_Complex8 *)&alpha_, a, (const MKL_INT *)&lda, + b, (const MKL_INT *)&ldb, (const MKL_Complex8 *)&beta_, c, + (const MKL_INT *)&ldc); + }); + }); + return done; +} + +cl::sycl::event syr2k(cl::sycl::queue &queue, uplo upper_lower, transpose trans, int64_t n, + int64_t k, std::complex alpha, const std::complex *a, + int64_t lda, const std::complex *b, int64_t ldb, + std::complex beta, std::complex *c, int64_t ldc, + const cl::sycl::vector_class &dependencies) { + auto done = queue.submit([&](cl::sycl::handler &cgh) { + int64_t num_events = dependencies.size(); + for (int64_t i = 0; i < num_events; i++) { + cgh.depends_on(dependencies[i]); + } + const char upper_lower_ = *fortran_char(upper_lower); + const char trans_ = *fortran_char(trans); + double alpha_real = alpha.real(), alpha_imag = alpha.imag(); + double beta_real = beta.real(), beta_imag = beta.imag(); + host_task(cgh, [=]() { + MKL_Complex16 alpha_ = { alpha_real, alpha_imag }; + MKL_Complex16 beta_ = { 
beta_real, beta_imag }; + ::zsyr2k((const char *)&upper_lower_, (const char *)&trans_, (const MKL_INT *)&n, + (const MKL_INT *)&k, (const MKL_Complex16 *)&alpha_, a, (const MKL_INT *)&lda, + b, (const MKL_INT *)&ldb, (const MKL_Complex16 *)&beta_, c, + (const MKL_INT *)&ldc); + }); + }); + return done; +} + +cl::sycl::event trmm(cl::sycl::queue &queue, side left_right, uplo upper_lower, transpose transa, + diag unit_diag, int64_t m, int64_t n, float alpha, const float *a, int64_t lda, + float *b, int64_t ldb, + const cl::sycl::vector_class &dependencies) { + auto done = queue.submit([&](cl::sycl::handler &cgh) { + int64_t num_events = dependencies.size(); + for (int64_t i = 0; i < num_events; i++) { + cgh.depends_on(dependencies[i]); + } + const char left_right_ = *fortran_char(left_right); + const char upper_lower_ = *fortran_char(upper_lower); + const char transa_ = *fortran_char(transa); + const char unit_diag_ = *fortran_char(unit_diag); + host_task(cgh, [=]() { + ::strmm((const char *)&left_right_, (const char *)&upper_lower_, (const char *)&transa_, + (const char *)&unit_diag_, (const MKL_INT *)&m, (const MKL_INT *)&n, + (const float *)&alpha, a, (const MKL_INT *)&lda, b, (const MKL_INT *)&ldb); + }); + }); + return done; +} + +cl::sycl::event trmm(cl::sycl::queue &queue, side left_right, uplo upper_lower, transpose transa, + diag unit_diag, int64_t m, int64_t n, double alpha, const double *a, + int64_t lda, double *b, int64_t ldb, + const cl::sycl::vector_class &dependencies) { + auto done = queue.submit([&](cl::sycl::handler &cgh) { + int64_t num_events = dependencies.size(); + for (int64_t i = 0; i < num_events; i++) { + cgh.depends_on(dependencies[i]); + } + const char left_right_ = *fortran_char(left_right); + const char upper_lower_ = *fortran_char(upper_lower); + const char transa_ = *fortran_char(transa); + const char unit_diag_ = *fortran_char(unit_diag); + host_task(cgh, [=]() { + ::dtrmm((const char *)&left_right_, (const char *)&upper_lower_, 
(const char *)&transa_, + (const char *)&unit_diag_, (const MKL_INT *)&m, (const MKL_INT *)&n, + (const double *)&alpha, a, (const MKL_INT *)&lda, b, (const MKL_INT *)&ldb); + }); + }); + return done; +} + +cl::sycl::event trmm(cl::sycl::queue &queue, side left_right, uplo upper_lower, transpose transa, + diag unit_diag, int64_t m, int64_t n, std::complex alpha, + const std::complex *a, int64_t lda, std::complex *b, int64_t ldb, + const cl::sycl::vector_class &dependencies) { + auto done = queue.submit([&](cl::sycl::handler &cgh) { + int64_t num_events = dependencies.size(); + for (int64_t i = 0; i < num_events; i++) { + cgh.depends_on(dependencies[i]); + } + const char left_right_ = *fortran_char(left_right); + const char upper_lower_ = *fortran_char(upper_lower); + const char transa_ = *fortran_char(transa); + const char unit_diag_ = *fortran_char(unit_diag); + float alpha_real = alpha.real(), alpha_imag = alpha.imag(); + host_task(cgh, [=]() { + MKL_Complex8 alpha_ = { alpha_real, alpha_imag }; + ::ctrmm((const char *)&left_right_, (const char *)&upper_lower_, (const char *)&transa_, + (const char *)&unit_diag_, (const MKL_INT *)&m, (const MKL_INT *)&n, + (const MKL_Complex8 *)&alpha_, a, (const MKL_INT *)&lda, b, + (const MKL_INT *)&ldb); + }); + }); + return done; +} + +cl::sycl::event trmm(cl::sycl::queue &queue, side left_right, uplo upper_lower, transpose transa, + diag unit_diag, int64_t m, int64_t n, std::complex alpha, + const std::complex *a, int64_t lda, std::complex *b, + int64_t ldb, const cl::sycl::vector_class &dependencies) { + auto done = queue.submit([&](cl::sycl::handler &cgh) { + int64_t num_events = dependencies.size(); + for (int64_t i = 0; i < num_events; i++) { + cgh.depends_on(dependencies[i]); + } + const char left_right_ = *fortran_char(left_right); + const char upper_lower_ = *fortran_char(upper_lower); + const char transa_ = *fortran_char(transa); + const char unit_diag_ = *fortran_char(unit_diag); + double alpha_real = alpha.real(), 
alpha_imag = alpha.imag(); + host_task(cgh, [=]() { + MKL_Complex16 alpha_ = { alpha_real, alpha_imag }; + ::ztrmm((const char *)&left_right_, (const char *)&upper_lower_, (const char *)&transa_, + (const char *)&unit_diag_, (const MKL_INT *)&m, (const MKL_INT *)&n, + (const MKL_Complex16 *)&alpha_, a, (const MKL_INT *)&lda, b, + (const MKL_INT *)&ldb); + }); + }); + return done; +} + +cl::sycl::event trsm(cl::sycl::queue &queue, side left_right, uplo upper_lower, transpose transa, + diag unit_diag, int64_t m, int64_t n, float alpha, const float *a, int64_t lda, + float *b, int64_t ldb, + const cl::sycl::vector_class &dependencies) { + auto done = queue.submit([&](cl::sycl::handler &cgh) { + int64_t num_events = dependencies.size(); + for (int64_t i = 0; i < num_events; i++) { + cgh.depends_on(dependencies[i]); + } + const char left_right_ = *fortran_char(left_right); + const char upper_lower_ = *fortran_char(upper_lower); + const char transa_ = *fortran_char(transa); + const char unit_diag_ = *fortran_char(unit_diag); + host_task(cgh, [=]() { + ::strsm((const char *)&left_right_, (const char *)&upper_lower_, (const char *)&transa_, + (const char *)&unit_diag_, (const MKL_INT *)&m, (const MKL_INT *)&n, + (const float *)&alpha, a, (const MKL_INT *)&lda, b, (const MKL_INT *)&ldb); + }); + }); + return done; +} + +cl::sycl::event trsm(cl::sycl::queue &queue, side left_right, uplo upper_lower, transpose transa, + diag unit_diag, int64_t m, int64_t n, double alpha, const double *a, + int64_t lda, double *b, int64_t ldb, + const cl::sycl::vector_class &dependencies) { + auto done = queue.submit([&](cl::sycl::handler &cgh) { + int64_t num_events = dependencies.size(); + for (int64_t i = 0; i < num_events; i++) { + cgh.depends_on(dependencies[i]); + } + const char left_right_ = *fortran_char(left_right); + const char upper_lower_ = *fortran_char(upper_lower); + const char transa_ = *fortran_char(transa); + const char unit_diag_ = *fortran_char(unit_diag); + host_task(cgh, 
[=]() { + ::dtrsm((const char *)&left_right_, (const char *)&upper_lower_, (const char *)&transa_, + (const char *)&unit_diag_, (const MKL_INT *)&m, (const MKL_INT *)&n, + (const double *)&alpha, a, (const MKL_INT *)&lda, b, (const MKL_INT *)&ldb); + }); + }); + return done; +} + +cl::sycl::event trsm(cl::sycl::queue &queue, side left_right, uplo upper_lower, transpose transa, + diag unit_diag, int64_t m, int64_t n, std::complex alpha, + const std::complex *a, int64_t lda, std::complex *b, int64_t ldb, + const cl::sycl::vector_class &dependencies) { + auto done = queue.submit([&](cl::sycl::handler &cgh) { + int64_t num_events = dependencies.size(); + for (int64_t i = 0; i < num_events; i++) { + cgh.depends_on(dependencies[i]); + } + const char left_right_ = *fortran_char(left_right); + const char upper_lower_ = *fortran_char(upper_lower); + const char transa_ = *fortran_char(transa); + const char unit_diag_ = *fortran_char(unit_diag); + float alpha_real = alpha.real(), alpha_imag = alpha.imag(); + host_task(cgh, [=]() { + MKL_Complex8 alpha_ = { alpha_real, alpha_imag }; + ::ctrsm((const char *)&left_right_, (const char *)&upper_lower_, (const char *)&transa_, + (const char *)&unit_diag_, (const MKL_INT *)&m, (const MKL_INT *)&n, + (const MKL_Complex8 *)&alpha_, a, (const MKL_INT *)&lda, b, + (const MKL_INT *)&ldb); + }); + }); + return done; +} + +cl::sycl::event trsm(cl::sycl::queue &queue, side left_right, uplo upper_lower, transpose transa, + diag unit_diag, int64_t m, int64_t n, std::complex alpha, + const std::complex *a, int64_t lda, std::complex *b, + int64_t ldb, const cl::sycl::vector_class &dependencies) { + auto done = queue.submit([&](cl::sycl::handler &cgh) { + int64_t num_events = dependencies.size(); + for (int64_t i = 0; i < num_events; i++) { + cgh.depends_on(dependencies[i]); + } + const char left_right_ = *fortran_char(left_right); + const char upper_lower_ = *fortran_char(upper_lower); + const char transa_ = *fortran_char(transa); + const char 
unit_diag_ = *fortran_char(unit_diag); + double alpha_real = alpha.real(), alpha_imag = alpha.imag(); + host_task(cgh, [=]() { + MKL_Complex16 alpha_ = { alpha_real, alpha_imag }; + ::ztrsm((const char *)&left_right_, (const char *)&upper_lower_, (const char *)&transa_, + (const char *)&unit_diag_, (const MKL_INT *)&m, (const MKL_INT *)&n, + (const MKL_Complex16 *)&alpha_, a, (const MKL_INT *)&lda, b, + (const MKL_INT *)&ldb); + }); + }); + return done; +} + } // namespace mklcpu } // namespace onemkl diff --git a/src/blas/backends/mklcpu/mkl_blas_cpu_wrappers.cpp b/src/blas/backends/mklcpu/mkl_blas_cpu_wrappers.cpp index 0cfa0483a..00d89481f 100644 --- a/src/blas/backends/mklcpu/mkl_blas_cpu_wrappers.cpp +++ b/src/blas/backends/mklcpu/mkl_blas_cpu_wrappers.cpp @@ -179,14 +179,6 @@ extern "C" ONEMKL_EXPORT function_table_t mkl_blas_table = { onemkl::mklcpu::gemm_batch, onemkl::mklcpu::gemm_batch, onemkl::mklcpu::gemm_batch, - onemkl::mklcpu::gemm_batch, - onemkl::mklcpu::gemm_batch, - onemkl::mklcpu::gemm_batch, - onemkl::mklcpu::gemm_batch, - onemkl::mklcpu::trsm_batch, - onemkl::mklcpu::trsm_batch, - onemkl::mklcpu::trsm_batch, - onemkl::mklcpu::trsm_batch, onemkl::mklcpu::trsm_batch, onemkl::mklcpu::trsm_batch, onemkl::mklcpu::trsm_batch, @@ -202,4 +194,170 @@ extern "C" ONEMKL_EXPORT function_table_t mkl_blas_table = { onemkl::mklcpu::gemm_ext, onemkl::mklcpu::gemm_ext, onemkl::mklcpu::gemm_ext, + onemkl::mklcpu::asum, + onemkl::mklcpu::asum, + onemkl::mklcpu::asum, + onemkl::mklcpu::asum, + onemkl::mklcpu::axpy, + onemkl::mklcpu::axpy, + onemkl::mklcpu::axpy, + onemkl::mklcpu::axpy, + onemkl::mklcpu::axpy_batch, + onemkl::mklcpu::axpy_batch, + onemkl::mklcpu::axpy_batch, + onemkl::mklcpu::axpy_batch, + onemkl::mklcpu::copy, + onemkl::mklcpu::copy, + onemkl::mklcpu::copy, + onemkl::mklcpu::copy, + onemkl::mklcpu::dot, + onemkl::mklcpu::dot, + onemkl::mklcpu::dot, + onemkl::mklcpu::dotc, + onemkl::mklcpu::dotc, + onemkl::mklcpu::dotu, + onemkl::mklcpu::dotu, + 
onemkl::mklcpu::iamin, + onemkl::mklcpu::iamin, + onemkl::mklcpu::iamin, + onemkl::mklcpu::iamin, + onemkl::mklcpu::iamax, + onemkl::mklcpu::iamax, + onemkl::mklcpu::iamax, + onemkl::mklcpu::iamax, + onemkl::mklcpu::nrm2, + onemkl::mklcpu::nrm2, + onemkl::mklcpu::nrm2, + onemkl::mklcpu::nrm2, + onemkl::mklcpu::rot, + onemkl::mklcpu::rot, + onemkl::mklcpu::rot, + onemkl::mklcpu::rot, + onemkl::mklcpu::rotg, + onemkl::mklcpu::rotg, + onemkl::mklcpu::rotg, + onemkl::mklcpu::rotg, + onemkl::mklcpu::rotm, + onemkl::mklcpu::rotm, + onemkl::mklcpu::rotmg, + onemkl::mklcpu::rotmg, + onemkl::mklcpu::scal, + onemkl::mklcpu::scal, + onemkl::mklcpu::scal, + onemkl::mklcpu::scal, + onemkl::mklcpu::scal, + onemkl::mklcpu::scal, + onemkl::mklcpu::sdsdot, + onemkl::mklcpu::swap, + onemkl::mklcpu::swap, + onemkl::mklcpu::swap, + onemkl::mklcpu::swap, + onemkl::mklcpu::gbmv, + onemkl::mklcpu::gbmv, + onemkl::mklcpu::gbmv, + onemkl::mklcpu::gbmv, + onemkl::mklcpu::gemv, + onemkl::mklcpu::gemv, + onemkl::mklcpu::gemv, + onemkl::mklcpu::gemv, + onemkl::mklcpu::ger, + onemkl::mklcpu::ger, + onemkl::mklcpu::gerc, + onemkl::mklcpu::gerc, + onemkl::mklcpu::geru, + onemkl::mklcpu::geru, + onemkl::mklcpu::hbmv, + onemkl::mklcpu::hbmv, + onemkl::mklcpu::hemv, + onemkl::mklcpu::hemv, + onemkl::mklcpu::her, + onemkl::mklcpu::her, + onemkl::mklcpu::her2, + onemkl::mklcpu::her2, + onemkl::mklcpu::hpmv, + onemkl::mklcpu::hpmv, + onemkl::mklcpu::hpr, + onemkl::mklcpu::hpr, + onemkl::mklcpu::hpr2, + onemkl::mklcpu::hpr2, + onemkl::mklcpu::sbmv, + onemkl::mklcpu::sbmv, + onemkl::mklcpu::spmv, + onemkl::mklcpu::spmv, + onemkl::mklcpu::spr, + onemkl::mklcpu::spr, + onemkl::mklcpu::spr2, + onemkl::mklcpu::spr2, + onemkl::mklcpu::symv, + onemkl::mklcpu::symv, + onemkl::mklcpu::syr, + onemkl::mklcpu::syr, + onemkl::mklcpu::syr2, + onemkl::mklcpu::syr2, + onemkl::mklcpu::tbmv, + onemkl::mklcpu::tbmv, + onemkl::mklcpu::tbmv, + onemkl::mklcpu::tbmv, + onemkl::mklcpu::tbsv, + onemkl::mklcpu::tbsv, + 
onemkl::mklcpu::tbsv, + onemkl::mklcpu::tbsv, + onemkl::mklcpu::tpmv, + onemkl::mklcpu::tpmv, + onemkl::mklcpu::tpmv, + onemkl::mklcpu::tpmv, + onemkl::mklcpu::tpsv, + onemkl::mklcpu::tpsv, + onemkl::mklcpu::tpsv, + onemkl::mklcpu::tpsv, + onemkl::mklcpu::trmv, + onemkl::mklcpu::trmv, + onemkl::mklcpu::trmv, + onemkl::mklcpu::trmv, + onemkl::mklcpu::trsv, + onemkl::mklcpu::trsv, + onemkl::mklcpu::trsv, + onemkl::mklcpu::trsv, + onemkl::mklcpu::gemm, + onemkl::mklcpu::gemm, + onemkl::mklcpu::gemm, + onemkl::mklcpu::gemm, + onemkl::mklcpu::hemm, + onemkl::mklcpu::hemm, + onemkl::mklcpu::herk, + onemkl::mklcpu::herk, + onemkl::mklcpu::her2k, + onemkl::mklcpu::her2k, + onemkl::mklcpu::symm, + onemkl::mklcpu::symm, + onemkl::mklcpu::symm, + onemkl::mklcpu::symm, + onemkl::mklcpu::syrk, + onemkl::mklcpu::syrk, + onemkl::mklcpu::syrk, + onemkl::mklcpu::syrk, + onemkl::mklcpu::syr2k, + onemkl::mklcpu::syr2k, + onemkl::mklcpu::syr2k, + onemkl::mklcpu::syr2k, + onemkl::mklcpu::trmm, + onemkl::mklcpu::trmm, + onemkl::mklcpu::trmm, + onemkl::mklcpu::trmm, + onemkl::mklcpu::trsm, + onemkl::mklcpu::trsm, + onemkl::mklcpu::trsm, + onemkl::mklcpu::trsm, + onemkl::mklcpu::gemm_batch, + onemkl::mklcpu::gemm_batch, + onemkl::mklcpu::gemm_batch, + onemkl::mklcpu::gemm_batch, + onemkl::mklcpu::gemm_batch, + onemkl::mklcpu::gemm_batch, + onemkl::mklcpu::gemm_batch, + onemkl::mklcpu::gemm_batch, + onemkl::mklcpu::gemmt, + onemkl::mklcpu::gemmt, + onemkl::mklcpu::gemmt, + onemkl::mklcpu::gemmt, }; diff --git a/src/blas/backends/mklgpu/CMakeLists.txt b/src/blas/backends/mklgpu/CMakeLists.txt index 44f42facd..f2f35c831 100644 --- a/src/blas/backends/mklgpu/CMakeLists.txt +++ b/src/blas/backends/mklgpu/CMakeLists.txt @@ -25,6 +25,7 @@ add_library(${LIB_NAME}) add_library(${LIB_OBJ} OBJECT mkl_internal_blas_gpu_wrappers.cpp mkl_blas_sycl_buffer.cpp + mkl_blas_sycl_usm.cpp $<$: mkl_blas_gpu_wrappers.cpp> ) @@ -44,7 +45,6 @@ set_target_properties(${LIB_OBJ} PROPERTIES ) 
target_link_libraries(${LIB_NAME} PUBLIC ${LIB_OBJ}) -#Set MKL libraries as not transitive for dynamic if(BUILD_SHARED_LIBS) set_target_properties(${LIB_NAME} PROPERTIES INTERFACE_LINK_LIBRARIES ONEMKL::SYCL::SYCL diff --git a/src/blas/backends/mklgpu/mkl_blas_gpu_wrappers.cpp b/src/blas/backends/mklgpu/mkl_blas_gpu_wrappers.cpp index a54717dd0..5d4f81e70 100644 --- a/src/blas/backends/mklgpu/mkl_blas_gpu_wrappers.cpp +++ b/src/blas/backends/mklgpu/mkl_blas_gpu_wrappers.cpp @@ -179,14 +179,6 @@ extern "C" ONEMKL_EXPORT function_table_t mkl_blas_table = { onemkl::mklgpu::gemm_batch, onemkl::mklgpu::gemm_batch, onemkl::mklgpu::gemm_batch, - onemkl::mklgpu::gemm_batch, - onemkl::mklgpu::gemm_batch, - onemkl::mklgpu::gemm_batch, - onemkl::mklgpu::gemm_batch, - onemkl::mklgpu::trsm_batch, - onemkl::mklgpu::trsm_batch, - onemkl::mklgpu::trsm_batch, - onemkl::mklgpu::trsm_batch, onemkl::mklgpu::trsm_batch, onemkl::mklgpu::trsm_batch, onemkl::mklgpu::trsm_batch, @@ -202,4 +194,170 @@ extern "C" ONEMKL_EXPORT function_table_t mkl_blas_table = { onemkl::mklgpu::gemm_ext, onemkl::mklgpu::gemm_ext, onemkl::mklgpu::gemm_ext, + onemkl::mklgpu::asum, + onemkl::mklgpu::asum, + onemkl::mklgpu::asum, + onemkl::mklgpu::asum, + onemkl::mklgpu::axpy, + onemkl::mklgpu::axpy, + onemkl::mklgpu::axpy, + onemkl::mklgpu::axpy, + onemkl::mklgpu::axpy_batch, + onemkl::mklgpu::axpy_batch, + onemkl::mklgpu::axpy_batch, + onemkl::mklgpu::axpy_batch, + onemkl::mklgpu::copy, + onemkl::mklgpu::copy, + onemkl::mklgpu::copy, + onemkl::mklgpu::copy, + onemkl::mklgpu::dot, + onemkl::mklgpu::dot, + onemkl::mklgpu::dot, + onemkl::mklgpu::dotc, + onemkl::mklgpu::dotc, + onemkl::mklgpu::dotu, + onemkl::mklgpu::dotu, + onemkl::mklgpu::iamin, + onemkl::mklgpu::iamin, + onemkl::mklgpu::iamin, + onemkl::mklgpu::iamin, + onemkl::mklgpu::iamax, + onemkl::mklgpu::iamax, + onemkl::mklgpu::iamax, + onemkl::mklgpu::iamax, + onemkl::mklgpu::nrm2, + onemkl::mklgpu::nrm2, + onemkl::mklgpu::nrm2, + onemkl::mklgpu::nrm2, 
+ onemkl::mklgpu::rot, + onemkl::mklgpu::rot, + onemkl::mklgpu::rot, + onemkl::mklgpu::rot, + onemkl::mklgpu::rotg, + onemkl::mklgpu::rotg, + onemkl::mklgpu::rotg, + onemkl::mklgpu::rotg, + onemkl::mklgpu::rotm, + onemkl::mklgpu::rotm, + onemkl::mklgpu::rotmg, + onemkl::mklgpu::rotmg, + onemkl::mklgpu::scal, + onemkl::mklgpu::scal, + onemkl::mklgpu::scal, + onemkl::mklgpu::scal, + onemkl::mklgpu::scal, + onemkl::mklgpu::scal, + onemkl::mklgpu::sdsdot, + onemkl::mklgpu::swap, + onemkl::mklgpu::swap, + onemkl::mklgpu::swap, + onemkl::mklgpu::swap, + onemkl::mklgpu::gbmv, + onemkl::mklgpu::gbmv, + onemkl::mklgpu::gbmv, + onemkl::mklgpu::gbmv, + onemkl::mklgpu::gemv, + onemkl::mklgpu::gemv, + onemkl::mklgpu::gemv, + onemkl::mklgpu::gemv, + onemkl::mklgpu::ger, + onemkl::mklgpu::ger, + onemkl::mklgpu::gerc, + onemkl::mklgpu::gerc, + onemkl::mklgpu::geru, + onemkl::mklgpu::geru, + onemkl::mklgpu::hbmv, + onemkl::mklgpu::hbmv, + onemkl::mklgpu::hemv, + onemkl::mklgpu::hemv, + onemkl::mklgpu::her, + onemkl::mklgpu::her, + onemkl::mklgpu::her2, + onemkl::mklgpu::her2, + onemkl::mklgpu::hpmv, + onemkl::mklgpu::hpmv, + onemkl::mklgpu::hpr, + onemkl::mklgpu::hpr, + onemkl::mklgpu::hpr2, + onemkl::mklgpu::hpr2, + onemkl::mklgpu::sbmv, + onemkl::mklgpu::sbmv, + onemkl::mklgpu::spmv, + onemkl::mklgpu::spmv, + onemkl::mklgpu::spr, + onemkl::mklgpu::spr, + onemkl::mklgpu::spr2, + onemkl::mklgpu::spr2, + onemkl::mklgpu::symv, + onemkl::mklgpu::symv, + onemkl::mklgpu::syr, + onemkl::mklgpu::syr, + onemkl::mklgpu::syr2, + onemkl::mklgpu::syr2, + onemkl::mklgpu::tbmv, + onemkl::mklgpu::tbmv, + onemkl::mklgpu::tbmv, + onemkl::mklgpu::tbmv, + onemkl::mklgpu::tbsv, + onemkl::mklgpu::tbsv, + onemkl::mklgpu::tbsv, + onemkl::mklgpu::tbsv, + onemkl::mklgpu::tpmv, + onemkl::mklgpu::tpmv, + onemkl::mklgpu::tpmv, + onemkl::mklgpu::tpmv, + onemkl::mklgpu::tpsv, + onemkl::mklgpu::tpsv, + onemkl::mklgpu::tpsv, + onemkl::mklgpu::tpsv, + onemkl::mklgpu::trmv, + onemkl::mklgpu::trmv, + 
onemkl::mklgpu::trmv, + onemkl::mklgpu::trmv, + onemkl::mklgpu::trsv, + onemkl::mklgpu::trsv, + onemkl::mklgpu::trsv, + onemkl::mklgpu::trsv, + onemkl::mklgpu::gemm, + onemkl::mklgpu::gemm, + onemkl::mklgpu::gemm, + onemkl::mklgpu::gemm, + onemkl::mklgpu::hemm, + onemkl::mklgpu::hemm, + onemkl::mklgpu::herk, + onemkl::mklgpu::herk, + onemkl::mklgpu::her2k, + onemkl::mklgpu::her2k, + onemkl::mklgpu::symm, + onemkl::mklgpu::symm, + onemkl::mklgpu::symm, + onemkl::mklgpu::symm, + onemkl::mklgpu::syrk, + onemkl::mklgpu::syrk, + onemkl::mklgpu::syrk, + onemkl::mklgpu::syrk, + onemkl::mklgpu::syr2k, + onemkl::mklgpu::syr2k, + onemkl::mklgpu::syr2k, + onemkl::mklgpu::syr2k, + onemkl::mklgpu::trmm, + onemkl::mklgpu::trmm, + onemkl::mklgpu::trmm, + onemkl::mklgpu::trmm, + onemkl::mklgpu::trsm, + onemkl::mklgpu::trsm, + onemkl::mklgpu::trsm, + onemkl::mklgpu::trsm, + onemkl::mklgpu::gemm_batch, + onemkl::mklgpu::gemm_batch, + onemkl::mklgpu::gemm_batch, + onemkl::mklgpu::gemm_batch, + onemkl::mklgpu::gemm_batch, + onemkl::mklgpu::gemm_batch, + onemkl::mklgpu::gemm_batch, + onemkl::mklgpu::gemm_batch, + onemkl::mklgpu::gemmt, + onemkl::mklgpu::gemmt, + onemkl::mklgpu::gemmt, + onemkl::mklgpu::gemmt, }; diff --git a/src/blas/backends/mklgpu/mkl_blas_sycl_buffer.cpp b/src/blas/backends/mklgpu/mkl_blas_sycl_buffer.cpp index 877254e5d..489bf8a2f 100644 --- a/src/blas/backends/mklgpu/mkl_blas_sycl_buffer.cpp +++ b/src/blas/backends/mklgpu/mkl_blas_sycl_buffer.cpp @@ -989,59 +989,6 @@ void swap(cl::sycl::queue &queue, std::int64_t n, cl::sycl::buffer &transa, - cl::sycl::buffer &transb, - cl::sycl::buffer &m, cl::sycl::buffer &n, - cl::sycl::buffer &k, cl::sycl::buffer &alpha, - cl::sycl::buffer &a, cl::sycl::buffer &lda, - cl::sycl::buffer &b, cl::sycl::buffer &ldb, - cl::sycl::buffer &beta, cl::sycl::buffer &c, - cl::sycl::buffer &ldc, std::int64_t group_count, - cl::sycl::buffer &group_size) { - onemkl::mklgpu::internal::gemm_batch(queue, transa, transb, m, n, k, alpha, a, lda, 
b, ldb, - beta, c, ldc, group_count, group_size); -} - -void gemm_batch(cl::sycl::queue &queue, cl::sycl::buffer &transa, - cl::sycl::buffer &transb, - cl::sycl::buffer &m, cl::sycl::buffer &n, - cl::sycl::buffer &k, cl::sycl::buffer &alpha, - cl::sycl::buffer &a, cl::sycl::buffer &lda, - cl::sycl::buffer &b, cl::sycl::buffer &ldb, - cl::sycl::buffer &beta, cl::sycl::buffer &c, - cl::sycl::buffer &ldc, std::int64_t group_count, - cl::sycl::buffer &group_size) { - onemkl::mklgpu::internal::gemm_batch(queue, transa, transb, m, n, k, alpha, a, lda, b, ldb, - beta, c, ldc, group_count, group_size); -} - -void gemm_batch(cl::sycl::queue &queue, cl::sycl::buffer &transa, - cl::sycl::buffer &transb, - cl::sycl::buffer &m, cl::sycl::buffer &n, - cl::sycl::buffer &k, - cl::sycl::buffer, 1> &alpha, - cl::sycl::buffer, 1> &a, cl::sycl::buffer &lda, - cl::sycl::buffer, 1> &b, cl::sycl::buffer &ldb, - cl::sycl::buffer, 1> &beta, - cl::sycl::buffer, 1> &c, cl::sycl::buffer &ldc, - std::int64_t group_count, cl::sycl::buffer &group_size) { - onemkl::mklgpu::internal::gemm_batch(queue, transa, transb, m, n, k, alpha, a, lda, b, ldb, - beta, c, ldc, group_count, group_size); -} - -void gemm_batch( - cl::sycl::queue &queue, cl::sycl::buffer &transa, - cl::sycl::buffer &transb, cl::sycl::buffer &m, - cl::sycl::buffer &n, cl::sycl::buffer &k, - cl::sycl::buffer, 1> &alpha, cl::sycl::buffer, 1> &a, - cl::sycl::buffer &lda, cl::sycl::buffer, 1> &b, - cl::sycl::buffer &ldb, cl::sycl::buffer, 1> &beta, - cl::sycl::buffer, 1> &c, cl::sycl::buffer &ldc, - std::int64_t group_count, cl::sycl::buffer &group_size) { - onemkl::mklgpu::internal::gemm_batch(queue, transa, transb, m, n, k, alpha, a, lda, b, ldb, - beta, c, ldc, group_count, group_size); -} - void gemm_batch(cl::sycl::queue &queue, onemkl::transpose transa, onemkl::transpose transb, std::int64_t m, std::int64_t n, std::int64_t k, float alpha, cl::sycl::buffer &a, std::int64_t lda, std::int64_t stride_a, @@ -1084,55 +1031,6 @@ void 
gemm_batch(cl::sycl::queue &queue, onemkl::transpose transa, onemkl::transp ldb, stride_b, beta, c, ldc, stride_c, batch_size); } -void trsm_batch(cl::sycl::queue &queue, cl::sycl::buffer &left_right, - cl::sycl::buffer &upper_lower, - cl::sycl::buffer &trans, - cl::sycl::buffer &unit_diag, cl::sycl::buffer &m, - cl::sycl::buffer &n, cl::sycl::buffer &alpha, - cl::sycl::buffer &a, cl::sycl::buffer &lda, - cl::sycl::buffer &b, cl::sycl::buffer &ldb, - std::int64_t group_count, cl::sycl::buffer &group_size) { - onemkl::mklgpu::internal::trsm_batch(queue, left_right, upper_lower, trans, unit_diag, m, n, - alpha, a, lda, b, ldb, group_count, group_size); -} - -void trsm_batch(cl::sycl::queue &queue, cl::sycl::buffer &left_right, - cl::sycl::buffer &upper_lower, - cl::sycl::buffer &trans, - cl::sycl::buffer &unit_diag, cl::sycl::buffer &m, - cl::sycl::buffer &n, cl::sycl::buffer &alpha, - cl::sycl::buffer &a, cl::sycl::buffer &lda, - cl::sycl::buffer &b, cl::sycl::buffer &ldb, - std::int64_t group_count, cl::sycl::buffer &group_size) { - onemkl::mklgpu::internal::trsm_batch(queue, left_right, upper_lower, trans, unit_diag, m, n, - alpha, a, lda, b, ldb, group_count, group_size); -} - -void trsm_batch(cl::sycl::queue &queue, cl::sycl::buffer &left_right, - cl::sycl::buffer &upper_lower, - cl::sycl::buffer &trans, - cl::sycl::buffer &unit_diag, cl::sycl::buffer &m, - cl::sycl::buffer &n, - cl::sycl::buffer, 1> &alpha, - cl::sycl::buffer, 1> &a, cl::sycl::buffer &lda, - cl::sycl::buffer, 1> &b, cl::sycl::buffer &ldb, - std::int64_t group_count, cl::sycl::buffer &group_size) { - onemkl::mklgpu::internal::trsm_batch(queue, left_right, upper_lower, trans, unit_diag, m, n, - alpha, a, lda, b, ldb, group_count, group_size); -} - -void trsm_batch( - cl::sycl::queue &queue, cl::sycl::buffer &left_right, - cl::sycl::buffer &upper_lower, cl::sycl::buffer &trans, - cl::sycl::buffer &unit_diag, cl::sycl::buffer &m, - cl::sycl::buffer &n, cl::sycl::buffer, 1> &alpha, - 
cl::sycl::buffer, 1> &a, cl::sycl::buffer &lda, - cl::sycl::buffer, 1> &b, cl::sycl::buffer &ldb, - std::int64_t group_count, cl::sycl::buffer &group_size) { - onemkl::mklgpu::internal::trsm_batch(queue, left_right, upper_lower, trans, unit_diag, m, n, - alpha, a, lda, b, ldb, group_count, group_size); -} - void trsm_batch(cl::sycl::queue &queue, onemkl::side left_right, onemkl::uplo upper_lower, onemkl::transpose trans, onemkl::diag unit_diag, std::int64_t m, std::int64_t n, float alpha, cl::sycl::buffer &a, std::int64_t lda, std::int64_t stride_a, diff --git a/src/blas/backends/mklgpu/mkl_blas_sycl_usm.cpp b/src/blas/backends/mklgpu/mkl_blas_sycl_usm.cpp new file mode 100644 index 000000000..325aec6ee --- /dev/null +++ b/src/blas/backends/mklgpu/mkl_blas_sycl_usm.cpp @@ -0,0 +1,1332 @@ +/******************************************************************************* +* Copyright 2020 Intel Corporation +* +* Licensed under the Apache License, Version 2.0 (the "License"); +* you may not use this file except in compliance with the License. +* You may obtain a copy of the License at +* +* http://www.apache.org/licenses/LICENSE-2.0 +* +* Unless required by applicable law or agreed to in writing, +* software distributed under the License is distributed on an "AS IS" BASIS, +* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +* See the License for the specific language governing permissions +* and limitations under the License. 
+* +* +* SPDX-License-Identifier: Apache-2.0 +*******************************************************************************/ + +#include + +#include "mkl_internal_blas_gpu_wrappers.hpp" +#include "onemkl/blas/detail/mklgpu/onemkl_blas_mklgpu.hpp" +#include "onemkl/types.hpp" + +namespace onemkl { +namespace mklgpu { + +cl::sycl::event gemm(cl::sycl::queue &queue, onemkl::transpose transa, onemkl::transpose transb, + std::int64_t m, std::int64_t n, std::int64_t k, float alpha, const float *a, + std::int64_t lda, const float *b, std::int64_t ldb, float beta, float *c, + std::int64_t ldc, + const cl::sycl::vector_class &dependencies) { + return onemkl::mklgpu::internal::gemm(queue, transa, transb, m, n, k, alpha, a, lda, b, ldb, + beta, c, ldc, dependencies); +} + +cl::sycl::event gemm(cl::sycl::queue &queue, onemkl::transpose transa, onemkl::transpose transb, + std::int64_t m, std::int64_t n, std::int64_t k, double alpha, const double *a, + std::int64_t lda, const double *b, std::int64_t ldb, double beta, double *c, + std::int64_t ldc, + const cl::sycl::vector_class &dependencies) { + return onemkl::mklgpu::internal::gemm(queue, transa, transb, m, n, k, alpha, a, lda, b, ldb, + beta, c, ldc, dependencies); +} + +cl::sycl::event gemm(cl::sycl::queue &queue, onemkl::transpose transa, onemkl::transpose transb, + std::int64_t m, std::int64_t n, std::int64_t k, std::complex alpha, + const std::complex *a, std::int64_t lda, const std::complex *b, + std::int64_t ldb, std::complex beta, std::complex *c, + std::int64_t ldc, + const cl::sycl::vector_class &dependencies) { + return onemkl::mklgpu::internal::gemm(queue, transa, transb, m, n, k, alpha, a, lda, b, ldb, + beta, c, ldc, dependencies); +} + +cl::sycl::event gemm(cl::sycl::queue &queue, onemkl::transpose transa, onemkl::transpose transb, + std::int64_t m, std::int64_t n, std::int64_t k, std::complex alpha, + const std::complex *a, std::int64_t lda, const std::complex *b, + std::int64_t ldb, std::complex beta, 
std::complex *c, + std::int64_t ldc, + const cl::sycl::vector_class &dependencies) { + return onemkl::mklgpu::internal::gemm(queue, transa, transb, m, n, k, alpha, a, lda, b, ldb, + beta, c, ldc, dependencies); +} + +cl::sycl::event symm(cl::sycl::queue &queue, onemkl::side left_right, onemkl::uplo upper_lower, + std::int64_t m, std::int64_t n, float alpha, const float *a, std::int64_t lda, + const float *b, std::int64_t ldb, float beta, float *c, std::int64_t ldc, + const cl::sycl::vector_class &dependencies) { + return onemkl::mklgpu::internal::symm(queue, left_right, upper_lower, m, n, alpha, a, lda, b, + ldb, beta, c, ldc, dependencies); +} + +cl::sycl::event symm(cl::sycl::queue &queue, onemkl::side left_right, onemkl::uplo upper_lower, + std::int64_t m, std::int64_t n, double alpha, const double *a, + std::int64_t lda, const double *b, std::int64_t ldb, double beta, double *c, + std::int64_t ldc, + const cl::sycl::vector_class &dependencies) { + return onemkl::mklgpu::internal::symm(queue, left_right, upper_lower, m, n, alpha, a, lda, b, + ldb, beta, c, ldc, dependencies); +} + +cl::sycl::event symm(cl::sycl::queue &queue, onemkl::side left_right, onemkl::uplo upper_lower, + std::int64_t m, std::int64_t n, std::complex alpha, + const std::complex *a, std::int64_t lda, const std::complex *b, + std::int64_t ldb, std::complex beta, std::complex *c, + std::int64_t ldc, + const cl::sycl::vector_class &dependencies) { + return onemkl::mklgpu::internal::symm(queue, left_right, upper_lower, m, n, alpha, a, lda, b, + ldb, beta, c, ldc, dependencies); +} + +cl::sycl::event symm(cl::sycl::queue &queue, onemkl::side left_right, onemkl::uplo upper_lower, + std::int64_t m, std::int64_t n, std::complex alpha, + const std::complex *a, std::int64_t lda, const std::complex *b, + std::int64_t ldb, std::complex beta, std::complex *c, + std::int64_t ldc, + const cl::sycl::vector_class &dependencies) { + return onemkl::mklgpu::internal::symm(queue, left_right, upper_lower, m, n, 
alpha, a, lda, b, + ldb, beta, c, ldc, dependencies); +} + +cl::sycl::event hemm(cl::sycl::queue &queue, onemkl::side left_right, onemkl::uplo upper_lower, + std::int64_t m, std::int64_t n, std::complex alpha, + const std::complex *a, std::int64_t lda, const std::complex *b, + std::int64_t ldb, std::complex beta, std::complex *c, + std::int64_t ldc, + const cl::sycl::vector_class &dependencies) { + return onemkl::mklgpu::internal::hemm(queue, left_right, upper_lower, m, n, alpha, a, lda, b, + ldb, beta, c, ldc, dependencies); +} + +cl::sycl::event hemm(cl::sycl::queue &queue, onemkl::side left_right, onemkl::uplo upper_lower, + std::int64_t m, std::int64_t n, std::complex alpha, + const std::complex *a, std::int64_t lda, const std::complex *b, + std::int64_t ldb, std::complex beta, std::complex *c, + std::int64_t ldc, + const cl::sycl::vector_class &dependencies) { + return onemkl::mklgpu::internal::hemm(queue, left_right, upper_lower, m, n, alpha, a, lda, b, + ldb, beta, c, ldc, dependencies); +} + +cl::sycl::event syrk(cl::sycl::queue &queue, onemkl::uplo upper_lower, onemkl::transpose trans, + std::int64_t n, std::int64_t k, float alpha, const float *a, std::int64_t lda, + float beta, float *c, std::int64_t ldc, + const cl::sycl::vector_class &dependencies) { + return onemkl::mklgpu::internal::syrk(queue, upper_lower, trans, n, k, alpha, a, lda, beta, c, + ldc, dependencies); +} + +cl::sycl::event syrk(cl::sycl::queue &queue, onemkl::uplo upper_lower, onemkl::transpose trans, + std::int64_t n, std::int64_t k, double alpha, const double *a, + std::int64_t lda, double beta, double *c, std::int64_t ldc, + const cl::sycl::vector_class &dependencies) { + return onemkl::mklgpu::internal::syrk(queue, upper_lower, trans, n, k, alpha, a, lda, beta, c, + ldc, dependencies); +} + +cl::sycl::event syrk(cl::sycl::queue &queue, onemkl::uplo upper_lower, onemkl::transpose trans, + std::int64_t n, std::int64_t k, std::complex alpha, + const std::complex *a, std::int64_t lda, 
std::complex beta, + std::complex *c, std::int64_t ldc, + const cl::sycl::vector_class &dependencies) { + return onemkl::mklgpu::internal::syrk(queue, upper_lower, trans, n, k, alpha, a, lda, beta, c, + ldc, dependencies); +} + +cl::sycl::event syrk(cl::sycl::queue &queue, onemkl::uplo upper_lower, onemkl::transpose trans, + std::int64_t n, std::int64_t k, std::complex alpha, + const std::complex *a, std::int64_t lda, std::complex beta, + std::complex *c, std::int64_t ldc, + const cl::sycl::vector_class &dependencies) { + return onemkl::mklgpu::internal::syrk(queue, upper_lower, trans, n, k, alpha, a, lda, beta, c, + ldc, dependencies); +} + +cl::sycl::event herk(cl::sycl::queue &queue, onemkl::uplo upper_lower, onemkl::transpose trans, + std::int64_t n, std::int64_t k, float alpha, const std::complex *a, + std::int64_t lda, float beta, std::complex *c, std::int64_t ldc, + const cl::sycl::vector_class &dependencies) { + return onemkl::mklgpu::internal::herk(queue, upper_lower, trans, n, k, alpha, a, lda, beta, c, + ldc, dependencies); +} + +cl::sycl::event herk(cl::sycl::queue &queue, onemkl::uplo upper_lower, onemkl::transpose trans, + std::int64_t n, std::int64_t k, double alpha, const std::complex *a, + std::int64_t lda, double beta, std::complex *c, std::int64_t ldc, + const cl::sycl::vector_class &dependencies) { + return onemkl::mklgpu::internal::herk(queue, upper_lower, trans, n, k, alpha, a, lda, beta, c, + ldc, dependencies); +} + +cl::sycl::event syr2k(cl::sycl::queue &queue, onemkl::uplo upper_lower, onemkl::transpose trans, + std::int64_t n, std::int64_t k, float alpha, const float *a, std::int64_t lda, + const float *b, std::int64_t ldb, float beta, float *c, std::int64_t ldc, + const cl::sycl::vector_class &dependencies) { + return onemkl::mklgpu::internal::syr2k(queue, upper_lower, trans, n, k, alpha, a, lda, b, ldb, + beta, c, ldc, dependencies); +} + +cl::sycl::event syr2k(cl::sycl::queue &queue, onemkl::uplo upper_lower, onemkl::transpose trans, + 
std::int64_t n, std::int64_t k, double alpha, const double *a, + std::int64_t lda, const double *b, std::int64_t ldb, double beta, double *c, + std::int64_t ldc, + const cl::sycl::vector_class &dependencies) { + return onemkl::mklgpu::internal::syr2k(queue, upper_lower, trans, n, k, alpha, a, lda, b, ldb, + beta, c, ldc, dependencies); +} + +cl::sycl::event syr2k(cl::sycl::queue &queue, onemkl::uplo upper_lower, onemkl::transpose trans, + std::int64_t n, std::int64_t k, std::complex alpha, + const std::complex *a, std::int64_t lda, const std::complex *b, + std::int64_t ldb, std::complex beta, std::complex *c, + std::int64_t ldc, + const cl::sycl::vector_class &dependencies) { + return onemkl::mklgpu::internal::syr2k(queue, upper_lower, trans, n, k, alpha, a, lda, b, ldb, + beta, c, ldc, dependencies); +} + +cl::sycl::event syr2k(cl::sycl::queue &queue, onemkl::uplo upper_lower, onemkl::transpose trans, + std::int64_t n, std::int64_t k, std::complex alpha, + const std::complex *a, std::int64_t lda, + const std::complex *b, std::int64_t ldb, std::complex beta, + std::complex *c, std::int64_t ldc, + const cl::sycl::vector_class &dependencies) { + return onemkl::mklgpu::internal::syr2k(queue, upper_lower, trans, n, k, alpha, a, lda, b, ldb, + beta, c, ldc, dependencies); +} + +cl::sycl::event her2k(cl::sycl::queue &queue, onemkl::uplo upper_lower, onemkl::transpose trans, + std::int64_t n, std::int64_t k, std::complex alpha, + const std::complex *a, std::int64_t lda, const std::complex *b, + std::int64_t ldb, float beta, std::complex *c, std::int64_t ldc, + const cl::sycl::vector_class &dependencies) { + return onemkl::mklgpu::internal::her2k(queue, upper_lower, trans, n, k, alpha, a, lda, b, ldb, + beta, c, ldc, dependencies); +} + +cl::sycl::event her2k(cl::sycl::queue &queue, onemkl::uplo upper_lower, onemkl::transpose trans, + std::int64_t n, std::int64_t k, std::complex alpha, + const std::complex *a, std::int64_t lda, + const std::complex *b, std::int64_t ldb, 
double beta, + std::complex *c, std::int64_t ldc, + const cl::sycl::vector_class &dependencies) { + return onemkl::mklgpu::internal::her2k(queue, upper_lower, trans, n, k, alpha, a, lda, b, ldb, + beta, c, ldc, dependencies); +} + +cl::sycl::event trmm(cl::sycl::queue &queue, onemkl::side left_right, onemkl::uplo upper_lower, + onemkl::transpose trans, onemkl::diag unit_diag, std::int64_t m, + std::int64_t n, float alpha, const float *a, std::int64_t lda, float *b, + std::int64_t ldb, + const cl::sycl::vector_class &dependencies) { + return onemkl::mklgpu::internal::trmm(queue, left_right, upper_lower, trans, unit_diag, m, n, + alpha, a, lda, b, ldb, dependencies); +} + +cl::sycl::event trmm(cl::sycl::queue &queue, onemkl::side left_right, onemkl::uplo upper_lower, + onemkl::transpose trans, onemkl::diag unit_diag, std::int64_t m, + std::int64_t n, double alpha, const double *a, std::int64_t lda, double *b, + std::int64_t ldb, + const cl::sycl::vector_class &dependencies) { + return onemkl::mklgpu::internal::trmm(queue, left_right, upper_lower, trans, unit_diag, m, n, + alpha, a, lda, b, ldb, dependencies); +} + +cl::sycl::event trmm(cl::sycl::queue &queue, onemkl::side left_right, onemkl::uplo upper_lower, + onemkl::transpose trans, onemkl::diag unit_diag, std::int64_t m, + std::int64_t n, std::complex alpha, const std::complex *a, + std::int64_t lda, std::complex *b, std::int64_t ldb, + const cl::sycl::vector_class &dependencies) { + return onemkl::mklgpu::internal::trmm(queue, left_right, upper_lower, trans, unit_diag, m, n, + alpha, a, lda, b, ldb, dependencies); +} + +cl::sycl::event trmm(cl::sycl::queue &queue, onemkl::side left_right, onemkl::uplo upper_lower, + onemkl::transpose trans, onemkl::diag unit_diag, std::int64_t m, + std::int64_t n, std::complex alpha, const std::complex *a, + std::int64_t lda, std::complex *b, std::int64_t ldb, + const cl::sycl::vector_class &dependencies) { + return onemkl::mklgpu::internal::trmm(queue, left_right, upper_lower, 
trans, unit_diag, m, n, + alpha, a, lda, b, ldb, dependencies); +} + +cl::sycl::event trsm(cl::sycl::queue &queue, onemkl::side left_right, onemkl::uplo upper_lower, + onemkl::transpose trans, onemkl::diag unit_diag, std::int64_t m, + std::int64_t n, float alpha, const float *a, std::int64_t lda, float *b, + std::int64_t ldb, + const cl::sycl::vector_class &dependencies) { + return onemkl::mklgpu::internal::trsm(queue, left_right, upper_lower, trans, unit_diag, m, n, + alpha, a, lda, b, ldb, dependencies); +} + +cl::sycl::event trsm(cl::sycl::queue &queue, onemkl::side left_right, onemkl::uplo upper_lower, + onemkl::transpose trans, onemkl::diag unit_diag, std::int64_t m, + std::int64_t n, double alpha, const double *a, std::int64_t lda, double *b, + std::int64_t ldb, + const cl::sycl::vector_class &dependencies) { + return onemkl::mklgpu::internal::trsm(queue, left_right, upper_lower, trans, unit_diag, m, n, + alpha, a, lda, b, ldb, dependencies); +} + +cl::sycl::event trsm(cl::sycl::queue &queue, onemkl::side left_right, onemkl::uplo upper_lower, + onemkl::transpose trans, onemkl::diag unit_diag, std::int64_t m, + std::int64_t n, std::complex alpha, const std::complex *a, + std::int64_t lda, std::complex *b, std::int64_t ldb, + const cl::sycl::vector_class &dependencies) { + return onemkl::mklgpu::internal::trsm(queue, left_right, upper_lower, trans, unit_diag, m, n, + alpha, a, lda, b, ldb, dependencies); +} + +cl::sycl::event trsm(cl::sycl::queue &queue, onemkl::side left_right, onemkl::uplo upper_lower, + onemkl::transpose trans, onemkl::diag unit_diag, std::int64_t m, + std::int64_t n, std::complex alpha, const std::complex *a, + std::int64_t lda, std::complex *b, std::int64_t ldb, + const cl::sycl::vector_class &dependencies) { + return onemkl::mklgpu::internal::trsm(queue, left_right, upper_lower, trans, unit_diag, m, n, + alpha, a, lda, b, ldb, dependencies); +} + +cl::sycl::event gemv(cl::sycl::queue &queue, onemkl::transpose trans, std::int64_t m, + 
std::int64_t n, float alpha, const float *a, std::int64_t lda, const float *x, + std::int64_t incx, float beta, float *y, std::int64_t incy, + const cl::sycl::vector_class &dependencies) { + return onemkl::mklgpu::internal::gemv(queue, trans, m, n, alpha, a, lda, x, incx, beta, y, incy, + dependencies); +} + +cl::sycl::event gemv(cl::sycl::queue &queue, onemkl::transpose trans, std::int64_t m, + std::int64_t n, double alpha, const double *a, std::int64_t lda, + const double *x, std::int64_t incx, double beta, double *y, std::int64_t incy, + const cl::sycl::vector_class &dependencies) { + return onemkl::mklgpu::internal::gemv(queue, trans, m, n, alpha, a, lda, x, incx, beta, y, incy, + dependencies); +} + +cl::sycl::event gemv(cl::sycl::queue &queue, onemkl::transpose trans, std::int64_t m, + std::int64_t n, std::complex alpha, const std::complex *a, + std::int64_t lda, const std::complex *x, std::int64_t incx, + std::complex beta, std::complex *y, std::int64_t incy, + const cl::sycl::vector_class &dependencies) { + return onemkl::mklgpu::internal::gemv(queue, trans, m, n, alpha, a, lda, x, incx, beta, y, incy, + dependencies); +} + +cl::sycl::event gemv(cl::sycl::queue &queue, onemkl::transpose trans, std::int64_t m, + std::int64_t n, std::complex alpha, const std::complex *a, + std::int64_t lda, const std::complex *x, std::int64_t incx, + std::complex beta, std::complex *y, std::int64_t incy, + const cl::sycl::vector_class &dependencies) { + return onemkl::mklgpu::internal::gemv(queue, trans, m, n, alpha, a, lda, x, incx, beta, y, incy, + dependencies); +} + +cl::sycl::event gbmv(cl::sycl::queue &queue, onemkl::transpose trans, std::int64_t m, + std::int64_t n, std::int64_t kl, std::int64_t ku, float alpha, const float *a, + std::int64_t lda, const float *x, std::int64_t incx, float beta, float *y, + std::int64_t incy, + const cl::sycl::vector_class &dependencies) { + return onemkl::mklgpu::internal::gbmv(queue, trans, m, n, kl, ku, alpha, a, lda, x, incx, beta, + 
y, incy, dependencies); +} + +cl::sycl::event gbmv(cl::sycl::queue &queue, onemkl::transpose trans, std::int64_t m, + std::int64_t n, std::int64_t kl, std::int64_t ku, double alpha, + const double *a, std::int64_t lda, const double *x, std::int64_t incx, + double beta, double *y, std::int64_t incy, + const cl::sycl::vector_class &dependencies) { + return onemkl::mklgpu::internal::gbmv(queue, trans, m, n, kl, ku, alpha, a, lda, x, incx, beta, + y, incy, dependencies); +} + +cl::sycl::event gbmv(cl::sycl::queue &queue, onemkl::transpose trans, std::int64_t m, + std::int64_t n, std::int64_t kl, std::int64_t ku, std::complex alpha, + const std::complex *a, std::int64_t lda, const std::complex *x, + std::int64_t incx, std::complex beta, std::complex *y, + std::int64_t incy, + const cl::sycl::vector_class &dependencies) { + return onemkl::mklgpu::internal::gbmv(queue, trans, m, n, kl, ku, alpha, a, lda, x, incx, beta, + y, incy, dependencies); +} + +cl::sycl::event gbmv(cl::sycl::queue &queue, onemkl::transpose trans, std::int64_t m, + std::int64_t n, std::int64_t kl, std::int64_t ku, std::complex alpha, + const std::complex *a, std::int64_t lda, const std::complex *x, + std::int64_t incx, std::complex beta, std::complex *y, + std::int64_t incy, + const cl::sycl::vector_class &dependencies) { + return onemkl::mklgpu::internal::gbmv(queue, trans, m, n, kl, ku, alpha, a, lda, x, incx, beta, + y, incy, dependencies); +} + +cl::sycl::event ger(cl::sycl::queue &queue, std::int64_t m, std::int64_t n, float alpha, + const float *x, std::int64_t incx, const float *y, std::int64_t incy, float *a, + std::int64_t lda, const cl::sycl::vector_class &dependencies) { + return onemkl::mklgpu::internal::ger(queue, m, n, alpha, x, incx, y, incy, a, lda, + dependencies); +} + +cl::sycl::event ger(cl::sycl::queue &queue, std::int64_t m, std::int64_t n, double alpha, + const double *x, std::int64_t incx, const double *y, std::int64_t incy, + double *a, std::int64_t lda, + const 
cl::sycl::vector_class &dependencies) { + return onemkl::mklgpu::internal::ger(queue, m, n, alpha, x, incx, y, incy, a, lda, + dependencies); +} + +cl::sycl::event gerc(cl::sycl::queue &queue, std::int64_t m, std::int64_t n, + std::complex alpha, const std::complex *x, std::int64_t incx, + const std::complex *y, std::int64_t incy, std::complex *a, + std::int64_t lda, + const cl::sycl::vector_class &dependencies) { + return onemkl::mklgpu::internal::gerc(queue, m, n, alpha, x, incx, y, incy, a, lda, + dependencies); +} + +cl::sycl::event gerc(cl::sycl::queue &queue, std::int64_t m, std::int64_t n, + std::complex alpha, const std::complex *x, std::int64_t incx, + const std::complex *y, std::int64_t incy, std::complex *a, + std::int64_t lda, + const cl::sycl::vector_class &dependencies) { + return onemkl::mklgpu::internal::gerc(queue, m, n, alpha, x, incx, y, incy, a, lda, + dependencies); +} + +cl::sycl::event geru(cl::sycl::queue &queue, std::int64_t m, std::int64_t n, + std::complex alpha, const std::complex *x, std::int64_t incx, + const std::complex *y, std::int64_t incy, std::complex *a, + std::int64_t lda, + const cl::sycl::vector_class &dependencies) { + return onemkl::mklgpu::internal::geru(queue, m, n, alpha, x, incx, y, incy, a, lda, + dependencies); +} + +cl::sycl::event geru(cl::sycl::queue &queue, std::int64_t m, std::int64_t n, + std::complex alpha, const std::complex *x, std::int64_t incx, + const std::complex *y, std::int64_t incy, std::complex *a, + std::int64_t lda, + const cl::sycl::vector_class &dependencies) { + return onemkl::mklgpu::internal::geru(queue, m, n, alpha, x, incx, y, incy, a, lda, + dependencies); +} + +cl::sycl::event hbmv(cl::sycl::queue &queue, onemkl::uplo upper_lower, std::int64_t n, + std::int64_t k, std::complex alpha, const std::complex *a, + std::int64_t lda, const std::complex *x, std::int64_t incx, + std::complex beta, std::complex *y, std::int64_t incy, + const cl::sycl::vector_class &dependencies) { + return 
onemkl::mklgpu::internal::hbmv(queue, upper_lower, n, k, alpha, a, lda, x, incx, beta, y, + incy, dependencies); +} + +cl::sycl::event hbmv(cl::sycl::queue &queue, onemkl::uplo upper_lower, std::int64_t n, + std::int64_t k, std::complex alpha, const std::complex *a, + std::int64_t lda, const std::complex *x, std::int64_t incx, + std::complex beta, std::complex *y, std::int64_t incy, + const cl::sycl::vector_class &dependencies) { + return onemkl::mklgpu::internal::hbmv(queue, upper_lower, n, k, alpha, a, lda, x, incx, beta, y, + incy, dependencies); +} + +cl::sycl::event hemv(cl::sycl::queue &queue, onemkl::uplo upper_lower, std::int64_t n, + std::complex alpha, const std::complex *a, std::int64_t lda, + const std::complex *x, std::int64_t incx, std::complex beta, + std::complex *y, std::int64_t incy, + const cl::sycl::vector_class &dependencies) { + return onemkl::mklgpu::internal::hemv(queue, upper_lower, n, alpha, a, lda, x, incx, beta, y, + incy, dependencies); +} + +cl::sycl::event hemv(cl::sycl::queue &queue, onemkl::uplo upper_lower, std::int64_t n, + std::complex alpha, const std::complex *a, std::int64_t lda, + const std::complex *x, std::int64_t incx, std::complex beta, + std::complex *y, std::int64_t incy, + const cl::sycl::vector_class &dependencies) { + return onemkl::mklgpu::internal::hemv(queue, upper_lower, n, alpha, a, lda, x, incx, beta, y, + incy, dependencies); +} + +cl::sycl::event her(cl::sycl::queue &queue, onemkl::uplo upper_lower, std::int64_t n, float alpha, + const std::complex *x, std::int64_t incx, std::complex *a, + std::int64_t lda, const cl::sycl::vector_class &dependencies) { + return onemkl::mklgpu::internal::her(queue, upper_lower, n, alpha, x, incx, a, lda, + dependencies); +} + +cl::sycl::event her(cl::sycl::queue &queue, onemkl::uplo upper_lower, std::int64_t n, double alpha, + const std::complex *x, std::int64_t incx, std::complex *a, + std::int64_t lda, const cl::sycl::vector_class &dependencies) { + return 
onemkl::mklgpu::internal::her(queue, upper_lower, n, alpha, x, incx, a, lda, + dependencies); +} + +cl::sycl::event her2(cl::sycl::queue &queue, onemkl::uplo upper_lower, std::int64_t n, + std::complex alpha, const std::complex *x, std::int64_t incx, + const std::complex *y, std::int64_t incy, std::complex *a, + std::int64_t lda, + const cl::sycl::vector_class &dependencies) { + return onemkl::mklgpu::internal::her2(queue, upper_lower, n, alpha, x, incx, y, incy, a, lda, + dependencies); +} + +cl::sycl::event her2(cl::sycl::queue &queue, onemkl::uplo upper_lower, std::int64_t n, + std::complex alpha, const std::complex *x, std::int64_t incx, + const std::complex *y, std::int64_t incy, std::complex *a, + std::int64_t lda, + const cl::sycl::vector_class &dependencies) { + return onemkl::mklgpu::internal::her2(queue, upper_lower, n, alpha, x, incx, y, incy, a, lda, + dependencies); +} + +cl::sycl::event hpmv(cl::sycl::queue &queue, onemkl::uplo upper_lower, std::int64_t n, + std::complex alpha, const std::complex *a, + const std::complex *x, std::int64_t incx, std::complex beta, + std::complex *y, std::int64_t incy, + const cl::sycl::vector_class &dependencies) { + return onemkl::mklgpu::internal::hpmv(queue, upper_lower, n, alpha, a, x, incx, beta, y, incy, + dependencies); +} + +cl::sycl::event hpmv(cl::sycl::queue &queue, onemkl::uplo upper_lower, std::int64_t n, + std::complex alpha, const std::complex *a, + const std::complex *x, std::int64_t incx, std::complex beta, + std::complex *y, std::int64_t incy, + const cl::sycl::vector_class &dependencies) { + return onemkl::mklgpu::internal::hpmv(queue, upper_lower, n, alpha, a, x, incx, beta, y, incy, + dependencies); +} + +cl::sycl::event hpr(cl::sycl::queue &queue, onemkl::uplo upper_lower, std::int64_t n, float alpha, + const std::complex *x, std::int64_t incx, std::complex *a, + const cl::sycl::vector_class &dependencies) { + return onemkl::mklgpu::internal::hpr(queue, upper_lower, n, alpha, x, incx, a, 
dependencies); +} + +cl::sycl::event hpr(cl::sycl::queue &queue, onemkl::uplo upper_lower, std::int64_t n, double alpha, + const std::complex *x, std::int64_t incx, std::complex *a, + const cl::sycl::vector_class &dependencies) { + return onemkl::mklgpu::internal::hpr(queue, upper_lower, n, alpha, x, incx, a, dependencies); +} + +cl::sycl::event hpr2(cl::sycl::queue &queue, onemkl::uplo upper_lower, std::int64_t n, + std::complex alpha, const std::complex *x, std::int64_t incx, + const std::complex *y, std::int64_t incy, std::complex *a, + const cl::sycl::vector_class &dependencies) { + return onemkl::mklgpu::internal::hpr2(queue, upper_lower, n, alpha, x, incx, y, incy, a, + dependencies); +} + +cl::sycl::event hpr2(cl::sycl::queue &queue, onemkl::uplo upper_lower, std::int64_t n, + std::complex alpha, const std::complex *x, std::int64_t incx, + const std::complex *y, std::int64_t incy, std::complex *a, + const cl::sycl::vector_class &dependencies) { + return onemkl::mklgpu::internal::hpr2(queue, upper_lower, n, alpha, x, incx, y, incy, a, + dependencies); +} + +cl::sycl::event sbmv(cl::sycl::queue &queue, onemkl::uplo upper_lower, std::int64_t n, + std::int64_t k, float alpha, const float *a, std::int64_t lda, const float *x, + std::int64_t incx, float beta, float *y, std::int64_t incy, + const cl::sycl::vector_class &dependencies) { + return onemkl::mklgpu::internal::sbmv(queue, upper_lower, n, k, alpha, a, lda, x, incx, beta, y, + incy, dependencies); +} + +cl::sycl::event sbmv(cl::sycl::queue &queue, onemkl::uplo upper_lower, std::int64_t n, + std::int64_t k, double alpha, const double *a, std::int64_t lda, + const double *x, std::int64_t incx, double beta, double *y, std::int64_t incy, + const cl::sycl::vector_class &dependencies) { + return onemkl::mklgpu::internal::sbmv(queue, upper_lower, n, k, alpha, a, lda, x, incx, beta, y, + incy, dependencies); +} + +cl::sycl::event symv(cl::sycl::queue &queue, onemkl::uplo upper_lower, std::int64_t n, float alpha, + 
const float *a, std::int64_t lda, const float *x, std::int64_t incx, + float beta, float *y, std::int64_t incy, + const cl::sycl::vector_class &dependencies) { + return onemkl::mklgpu::internal::symv(queue, upper_lower, n, alpha, a, lda, x, incx, beta, y, + incy, dependencies); +} + +cl::sycl::event symv(cl::sycl::queue &queue, onemkl::uplo upper_lower, std::int64_t n, double alpha, + const double *a, std::int64_t lda, const double *x, std::int64_t incx, + double beta, double *y, std::int64_t incy, + const cl::sycl::vector_class &dependencies) { + return onemkl::mklgpu::internal::symv(queue, upper_lower, n, alpha, a, lda, x, incx, beta, y, + incy, dependencies); +} + +cl::sycl::event syr(cl::sycl::queue &queue, onemkl::uplo upper_lower, std::int64_t n, float alpha, + const float *x, std::int64_t incx, float *a, std::int64_t lda, + const cl::sycl::vector_class &dependencies) { + return onemkl::mklgpu::internal::syr(queue, upper_lower, n, alpha, x, incx, a, lda, + dependencies); +} + +cl::sycl::event syr(cl::sycl::queue &queue, onemkl::uplo upper_lower, std::int64_t n, double alpha, + const double *x, std::int64_t incx, double *a, std::int64_t lda, + const cl::sycl::vector_class &dependencies) { + return onemkl::mklgpu::internal::syr(queue, upper_lower, n, alpha, x, incx, a, lda, + dependencies); +} + +cl::sycl::event syr2(cl::sycl::queue &queue, onemkl::uplo upper_lower, std::int64_t n, float alpha, + const float *x, std::int64_t incx, const float *y, std::int64_t incy, float *a, + std::int64_t lda, + const cl::sycl::vector_class &dependencies) { + return onemkl::mklgpu::internal::syr2(queue, upper_lower, n, alpha, x, incx, y, incy, a, lda, + dependencies); +} + +cl::sycl::event syr2(cl::sycl::queue &queue, onemkl::uplo upper_lower, std::int64_t n, double alpha, + const double *x, std::int64_t incx, const double *y, std::int64_t incy, + double *a, std::int64_t lda, + const cl::sycl::vector_class &dependencies) { + return onemkl::mklgpu::internal::syr2(queue, 
upper_lower, n, alpha, x, incx, y, incy, a, lda, + dependencies); +} + +cl::sycl::event spmv(cl::sycl::queue &queue, onemkl::uplo upper_lower, std::int64_t n, float alpha, + const float *a, const float *x, std::int64_t incx, float beta, float *y, + std::int64_t incy, + const cl::sycl::vector_class &dependencies) { + return onemkl::mklgpu::internal::spmv(queue, upper_lower, n, alpha, a, x, incx, beta, y, incy, + dependencies); +} + +cl::sycl::event spmv(cl::sycl::queue &queue, onemkl::uplo upper_lower, std::int64_t n, double alpha, + const double *a, const double *x, std::int64_t incx, double beta, double *y, + std::int64_t incy, + const cl::sycl::vector_class &dependencies) { + return onemkl::mklgpu::internal::spmv(queue, upper_lower, n, alpha, a, x, incx, beta, y, incy, + dependencies); +} + +cl::sycl::event spr(cl::sycl::queue &queue, onemkl::uplo upper_lower, std::int64_t n, float alpha, + const float *x, std::int64_t incx, float *a, + const cl::sycl::vector_class &dependencies) { + return onemkl::mklgpu::internal::spr(queue, upper_lower, n, alpha, x, incx, a, dependencies); +} + +cl::sycl::event spr(cl::sycl::queue &queue, onemkl::uplo upper_lower, std::int64_t n, double alpha, + const double *x, std::int64_t incx, double *a, + const cl::sycl::vector_class &dependencies) { + return onemkl::mklgpu::internal::spr(queue, upper_lower, n, alpha, x, incx, a, dependencies); +} + +cl::sycl::event spr2(cl::sycl::queue &queue, onemkl::uplo upper_lower, std::int64_t n, float alpha, + const float *x, std::int64_t incx, const float *y, std::int64_t incy, float *a, + const cl::sycl::vector_class &dependencies) { + return onemkl::mklgpu::internal::spr2(queue, upper_lower, n, alpha, x, incx, y, incy, a, + dependencies); +} + +cl::sycl::event spr2(cl::sycl::queue &queue, onemkl::uplo upper_lower, std::int64_t n, double alpha, + const double *x, std::int64_t incx, const double *y, std::int64_t incy, + double *a, const cl::sycl::vector_class &dependencies) { + return 
onemkl::mklgpu::internal::spr2(queue, upper_lower, n, alpha, x, incx, y, incy, a, + dependencies); +} + +cl::sycl::event tbmv(cl::sycl::queue &queue, onemkl::uplo upper_lower, onemkl::transpose trans, + onemkl::diag unit_diag, std::int64_t n, std::int64_t k, const float *a, + std::int64_t lda, float *x, std::int64_t incx, + const cl::sycl::vector_class &dependencies) { + return onemkl::mklgpu::internal::tbmv(queue, upper_lower, trans, unit_diag, n, k, a, lda, x, + incx, dependencies); +} + +cl::sycl::event tbmv(cl::sycl::queue &queue, onemkl::uplo upper_lower, onemkl::transpose trans, + onemkl::diag unit_diag, std::int64_t n, std::int64_t k, const double *a, + std::int64_t lda, double *x, std::int64_t incx, + const cl::sycl::vector_class &dependencies) { + return onemkl::mklgpu::internal::tbmv(queue, upper_lower, trans, unit_diag, n, k, a, lda, x, + incx, dependencies); +} + +cl::sycl::event tbmv(cl::sycl::queue &queue, onemkl::uplo upper_lower, onemkl::transpose trans, + onemkl::diag unit_diag, std::int64_t n, std::int64_t k, + const std::complex *a, std::int64_t lda, std::complex *x, + std::int64_t incx, + const cl::sycl::vector_class &dependencies) { + return onemkl::mklgpu::internal::tbmv(queue, upper_lower, trans, unit_diag, n, k, a, lda, x, + incx, dependencies); +} + +cl::sycl::event tbmv(cl::sycl::queue &queue, onemkl::uplo upper_lower, onemkl::transpose trans, + onemkl::diag unit_diag, std::int64_t n, std::int64_t k, + const std::complex *a, std::int64_t lda, std::complex *x, + std::int64_t incx, + const cl::sycl::vector_class &dependencies) { + return onemkl::mklgpu::internal::tbmv(queue, upper_lower, trans, unit_diag, n, k, a, lda, x, + incx, dependencies); +} + +cl::sycl::event tbsv(cl::sycl::queue &queue, onemkl::uplo upper_lower, onemkl::transpose trans, + onemkl::diag unit_diag, std::int64_t n, std::int64_t k, const float *a, + std::int64_t lda, float *x, std::int64_t incx, + const cl::sycl::vector_class &dependencies) { + return 
onemkl::mklgpu::internal::tbsv(queue, upper_lower, trans, unit_diag, n, k, a, lda, x, + incx, dependencies); +} + +cl::sycl::event tbsv(cl::sycl::queue &queue, onemkl::uplo upper_lower, onemkl::transpose trans, + onemkl::diag unit_diag, std::int64_t n, std::int64_t k, const double *a, + std::int64_t lda, double *x, std::int64_t incx, + const cl::sycl::vector_class &dependencies) { + return onemkl::mklgpu::internal::tbsv(queue, upper_lower, trans, unit_diag, n, k, a, lda, x, + incx, dependencies); +} + +cl::sycl::event tbsv(cl::sycl::queue &queue, onemkl::uplo upper_lower, onemkl::transpose trans, + onemkl::diag unit_diag, std::int64_t n, std::int64_t k, + const std::complex *a, std::int64_t lda, std::complex *x, + std::int64_t incx, + const cl::sycl::vector_class &dependencies) { + return onemkl::mklgpu::internal::tbsv(queue, upper_lower, trans, unit_diag, n, k, a, lda, x, + incx, dependencies); +} + +cl::sycl::event tbsv(cl::sycl::queue &queue, onemkl::uplo upper_lower, onemkl::transpose trans, + onemkl::diag unit_diag, std::int64_t n, std::int64_t k, + const std::complex *a, std::int64_t lda, std::complex *x, + std::int64_t incx, + const cl::sycl::vector_class &dependencies) { + return onemkl::mklgpu::internal::tbsv(queue, upper_lower, trans, unit_diag, n, k, a, lda, x, + incx, dependencies); +} + +cl::sycl::event tpmv(cl::sycl::queue &queue, onemkl::uplo upper_lower, onemkl::transpose trans, + onemkl::diag unit_diag, std::int64_t n, const float *a, float *x, + std::int64_t incx, + const cl::sycl::vector_class &dependencies) { + return onemkl::mklgpu::internal::tpmv(queue, upper_lower, trans, unit_diag, n, a, x, incx, + dependencies); +} + +cl::sycl::event tpmv(cl::sycl::queue &queue, onemkl::uplo upper_lower, onemkl::transpose trans, + onemkl::diag unit_diag, std::int64_t n, const double *a, double *x, + std::int64_t incx, + const cl::sycl::vector_class &dependencies) { + return onemkl::mklgpu::internal::tpmv(queue, upper_lower, trans, unit_diag, n, a, x, incx, 
+ dependencies); +} + +cl::sycl::event tpmv(cl::sycl::queue &queue, onemkl::uplo upper_lower, onemkl::transpose trans, + onemkl::diag unit_diag, std::int64_t n, const std::complex *a, + std::complex *x, std::int64_t incx, + const cl::sycl::vector_class &dependencies) { + return onemkl::mklgpu::internal::tpmv(queue, upper_lower, trans, unit_diag, n, a, x, incx, + dependencies); +} + +cl::sycl::event tpmv(cl::sycl::queue &queue, onemkl::uplo upper_lower, onemkl::transpose trans, + onemkl::diag unit_diag, std::int64_t n, const std::complex *a, + std::complex *x, std::int64_t incx, + const cl::sycl::vector_class &dependencies) { + return onemkl::mklgpu::internal::tpmv(queue, upper_lower, trans, unit_diag, n, a, x, incx, + dependencies); +} + +cl::sycl::event tpsv(cl::sycl::queue &queue, onemkl::uplo upper_lower, onemkl::transpose trans, + onemkl::diag unit_diag, std::int64_t n, const float *a, float *x, + std::int64_t incx, + const cl::sycl::vector_class &dependencies) { + return onemkl::mklgpu::internal::tpsv(queue, upper_lower, trans, unit_diag, n, a, x, incx, + dependencies); +} + +cl::sycl::event tpsv(cl::sycl::queue &queue, onemkl::uplo upper_lower, onemkl::transpose trans, + onemkl::diag unit_diag, std::int64_t n, const double *a, double *x, + std::int64_t incx, + const cl::sycl::vector_class &dependencies) { + return onemkl::mklgpu::internal::tpsv(queue, upper_lower, trans, unit_diag, n, a, x, incx, + dependencies); +} + +cl::sycl::event tpsv(cl::sycl::queue &queue, onemkl::uplo upper_lower, onemkl::transpose trans, + onemkl::diag unit_diag, std::int64_t n, const std::complex *a, + std::complex *x, std::int64_t incx, + const cl::sycl::vector_class &dependencies) { + return onemkl::mklgpu::internal::tpsv(queue, upper_lower, trans, unit_diag, n, a, x, incx, + dependencies); +} + +cl::sycl::event tpsv(cl::sycl::queue &queue, onemkl::uplo upper_lower, onemkl::transpose trans, + onemkl::diag unit_diag, std::int64_t n, const std::complex *a, + std::complex *x, 
std::int64_t incx, + const cl::sycl::vector_class &dependencies) { + return onemkl::mklgpu::internal::tpsv(queue, upper_lower, trans, unit_diag, n, a, x, incx, + dependencies); +} + +cl::sycl::event trmv(cl::sycl::queue &queue, onemkl::uplo upper_lower, onemkl::transpose trans, + onemkl::diag unit_diag, std::int64_t n, const float *a, std::int64_t lda, + float *x, std::int64_t incx, + const cl::sycl::vector_class &dependencies) { + return onemkl::mklgpu::internal::trmv(queue, upper_lower, trans, unit_diag, n, a, lda, x, incx, + dependencies); +} + +cl::sycl::event trmv(cl::sycl::queue &queue, onemkl::uplo upper_lower, onemkl::transpose trans, + onemkl::diag unit_diag, std::int64_t n, const double *a, std::int64_t lda, + double *x, std::int64_t incx, + const cl::sycl::vector_class &dependencies) { + return onemkl::mklgpu::internal::trmv(queue, upper_lower, trans, unit_diag, n, a, lda, x, incx, + dependencies); +} + +cl::sycl::event trmv(cl::sycl::queue &queue, onemkl::uplo upper_lower, onemkl::transpose trans, + onemkl::diag unit_diag, std::int64_t n, const std::complex *a, + std::int64_t lda, std::complex *x, std::int64_t incx, + const cl::sycl::vector_class &dependencies) { + return onemkl::mklgpu::internal::trmv(queue, upper_lower, trans, unit_diag, n, a, lda, x, incx, + dependencies); +} + +cl::sycl::event trmv(cl::sycl::queue &queue, onemkl::uplo upper_lower, onemkl::transpose trans, + onemkl::diag unit_diag, std::int64_t n, const std::complex *a, + std::int64_t lda, std::complex *x, std::int64_t incx, + const cl::sycl::vector_class &dependencies) { + return onemkl::mklgpu::internal::trmv(queue, upper_lower, trans, unit_diag, n, a, lda, x, incx, + dependencies); +} + +cl::sycl::event trsv(cl::sycl::queue &queue, onemkl::uplo upper_lower, onemkl::transpose trans, + onemkl::diag unit_diag, std::int64_t n, const float *a, std::int64_t lda, + float *x, std::int64_t incx, + const cl::sycl::vector_class &dependencies) { + return onemkl::mklgpu::internal::trsv(queue, 
upper_lower, trans, unit_diag, n, a, lda, x, incx, + dependencies); +} + +cl::sycl::event trsv(cl::sycl::queue &queue, onemkl::uplo upper_lower, onemkl::transpose trans, + onemkl::diag unit_diag, std::int64_t n, const double *a, std::int64_t lda, + double *x, std::int64_t incx, + const cl::sycl::vector_class &dependencies) { + return onemkl::mklgpu::internal::trsv(queue, upper_lower, trans, unit_diag, n, a, lda, x, incx, + dependencies); +} + +cl::sycl::event trsv(cl::sycl::queue &queue, onemkl::uplo upper_lower, onemkl::transpose trans, + onemkl::diag unit_diag, std::int64_t n, const std::complex *a, + std::int64_t lda, std::complex *x, std::int64_t incx, + const cl::sycl::vector_class &dependencies) { + return onemkl::mklgpu::internal::trsv(queue, upper_lower, trans, unit_diag, n, a, lda, x, incx, + dependencies); +} + +cl::sycl::event trsv(cl::sycl::queue &queue, onemkl::uplo upper_lower, onemkl::transpose trans, + onemkl::diag unit_diag, std::int64_t n, const std::complex *a, + std::int64_t lda, std::complex *x, std::int64_t incx, + const cl::sycl::vector_class &dependencies) { + return onemkl::mklgpu::internal::trsv(queue, upper_lower, trans, unit_diag, n, a, lda, x, incx, + dependencies); +} + +cl::sycl::event dotc(cl::sycl::queue &queue, std::int64_t n, const std::complex *x, + std::int64_t incx, const std::complex *y, std::int64_t incy, + std::complex *result, + const cl::sycl::vector_class &dependencies) { + return onemkl::mklgpu::internal::dotc(queue, n, x, incx, y, incy, result, dependencies); +} + +cl::sycl::event dotc(cl::sycl::queue &queue, std::int64_t n, const std::complex *x, + std::int64_t incx, const std::complex *y, std::int64_t incy, + std::complex *result, + const cl::sycl::vector_class &dependencies) { + return onemkl::mklgpu::internal::dotc(queue, n, x, incx, y, incy, result, dependencies); +} + +cl::sycl::event dotu(cl::sycl::queue &queue, std::int64_t n, const std::complex *x, + std::int64_t incx, const std::complex *y, std::int64_t incy, 
+ std::complex *result, + const cl::sycl::vector_class &dependencies) { + return onemkl::mklgpu::internal::dotu(queue, n, x, incx, y, incy, result, dependencies); +} + +cl::sycl::event dotu(cl::sycl::queue &queue, std::int64_t n, const std::complex *x, + std::int64_t incx, const std::complex *y, std::int64_t incy, + std::complex *result, + const cl::sycl::vector_class &dependencies) { + return onemkl::mklgpu::internal::dotu(queue, n, x, incx, y, incy, result, dependencies); +} + +cl::sycl::event iamax(cl::sycl::queue &queue, std::int64_t n, const float *x, std::int64_t incx, + std::int64_t *result, + const cl::sycl::vector_class &dependencies) { + return onemkl::mklgpu::internal::iamax(queue, n, x, incx, result, dependencies); +} + +cl::sycl::event iamax(cl::sycl::queue &queue, std::int64_t n, const double *x, std::int64_t incx, + std::int64_t *result, + const cl::sycl::vector_class &dependencies) { + return onemkl::mklgpu::internal::iamax(queue, n, x, incx, result, dependencies); +} + +cl::sycl::event iamax(cl::sycl::queue &queue, std::int64_t n, const std::complex *x, + std::int64_t incx, std::int64_t *result, + const cl::sycl::vector_class &dependencies) { + return onemkl::mklgpu::internal::iamax(queue, n, x, incx, result, dependencies); +} + +cl::sycl::event iamax(cl::sycl::queue &queue, std::int64_t n, const std::complex *x, + std::int64_t incx, std::int64_t *result, + const cl::sycl::vector_class &dependencies) { + return onemkl::mklgpu::internal::iamax(queue, n, x, incx, result, dependencies); +} + +cl::sycl::event iamin(cl::sycl::queue &queue, std::int64_t n, const float *x, std::int64_t incx, + std::int64_t *result, + const cl::sycl::vector_class &dependencies) { + return onemkl::mklgpu::internal::iamin(queue, n, x, incx, result, dependencies); +} + +cl::sycl::event iamin(cl::sycl::queue &queue, std::int64_t n, const double *x, std::int64_t incx, + std::int64_t *result, + const cl::sycl::vector_class &dependencies) { + return 
onemkl::mklgpu::internal::iamin(queue, n, x, incx, result, dependencies); +} + +cl::sycl::event iamin(cl::sycl::queue &queue, std::int64_t n, const std::complex *x, + std::int64_t incx, std::int64_t *result, + const cl::sycl::vector_class &dependencies) { + return onemkl::mklgpu::internal::iamin(queue, n, x, incx, result, dependencies); +} + +cl::sycl::event iamin(cl::sycl::queue &queue, std::int64_t n, const std::complex *x, + std::int64_t incx, std::int64_t *result, + const cl::sycl::vector_class &dependencies) { + return onemkl::mklgpu::internal::iamin(queue, n, x, incx, result, dependencies); +} + +cl::sycl::event asum(cl::sycl::queue &queue, std::int64_t n, const std::complex *x, + std::int64_t incx, float *result, + const cl::sycl::vector_class &dependencies) { + return onemkl::mklgpu::internal::asum(queue, n, x, incx, result, dependencies); +} + +cl::sycl::event asum(cl::sycl::queue &queue, std::int64_t n, const std::complex *x, + std::int64_t incx, double *result, + const cl::sycl::vector_class &dependencies) { + return onemkl::mklgpu::internal::asum(queue, n, x, incx, result, dependencies); +} + +cl::sycl::event asum(cl::sycl::queue &queue, std::int64_t n, const float *x, std::int64_t incx, + float *result, const cl::sycl::vector_class &dependencies) { + return onemkl::mklgpu::internal::asum(queue, n, x, incx, result, dependencies); +} + +cl::sycl::event asum(cl::sycl::queue &queue, std::int64_t n, const double *x, std::int64_t incx, + double *result, const cl::sycl::vector_class &dependencies) { + return onemkl::mklgpu::internal::asum(queue, n, x, incx, result, dependencies); +} + +cl::sycl::event axpy(cl::sycl::queue &queue, std::int64_t n, float alpha, const float *x, + std::int64_t incx, float *y, std::int64_t incy, + const cl::sycl::vector_class &dependencies) { + return onemkl::mklgpu::internal::axpy(queue, n, alpha, x, incx, y, incy, dependencies); +} + +cl::sycl::event axpy(cl::sycl::queue &queue, std::int64_t n, double alpha, const double *x, + 
std::int64_t incx, double *y, std::int64_t incy, + const cl::sycl::vector_class &dependencies) { + return onemkl::mklgpu::internal::axpy(queue, n, alpha, x, incx, y, incy, dependencies); +} + +cl::sycl::event axpy(cl::sycl::queue &queue, std::int64_t n, std::complex alpha, + const std::complex *x, std::int64_t incx, std::complex *y, + std::int64_t incy, + const cl::sycl::vector_class &dependencies) { + return onemkl::mklgpu::internal::axpy(queue, n, alpha, x, incx, y, incy, dependencies); +} + +cl::sycl::event axpy(cl::sycl::queue &queue, std::int64_t n, std::complex alpha, + const std::complex *x, std::int64_t incx, std::complex *y, + std::int64_t incy, + const cl::sycl::vector_class &dependencies) { + return onemkl::mklgpu::internal::axpy(queue, n, alpha, x, incx, y, incy, dependencies); +} + +cl::sycl::event axpy_batch(cl::sycl::queue &queue, std::int64_t *n, float *alpha, const float **x, + std::int64_t *incx, float **y, std::int64_t *incy, + std::int64_t group_count, std::int64_t *group_size, + const cl::sycl::vector_class &dependencies) { + return onemkl::mklgpu::internal::axpy_batch(queue, n, alpha, x, incx, y, incy, group_count, + group_size, dependencies); +} + +cl::sycl::event axpy_batch(cl::sycl::queue &queue, std::int64_t *n, double *alpha, const double **x, + std::int64_t *incx, double **y, std::int64_t *incy, + std::int64_t group_count, std::int64_t *group_size, + const cl::sycl::vector_class &dependencies) { + return onemkl::mklgpu::internal::axpy_batch(queue, n, alpha, x, incx, y, incy, group_count, + group_size, dependencies); +} + +cl::sycl::event axpy_batch(cl::sycl::queue &queue, std::int64_t *n, std::complex *alpha, + const std::complex **x, std::int64_t *incx, + std::complex **y, std::int64_t *incy, std::int64_t group_count, + std::int64_t *group_size, + const cl::sycl::vector_class &dependencies) { + return onemkl::mklgpu::internal::axpy_batch(queue, n, alpha, x, incx, y, incy, group_count, + group_size, dependencies); +} + +cl::sycl::event 
axpy_batch(cl::sycl::queue &queue, std::int64_t *n, std::complex *alpha, + const std::complex **x, std::int64_t *incx, + std::complex **y, std::int64_t *incy, std::int64_t group_count, + std::int64_t *group_size, + const cl::sycl::vector_class &dependencies) { + return onemkl::mklgpu::internal::axpy_batch(queue, n, alpha, x, incx, y, incy, group_count, + group_size, dependencies); +} + +cl::sycl::event copy(cl::sycl::queue &queue, std::int64_t n, const float *x, std::int64_t incx, + float *y, std::int64_t incy, + const cl::sycl::vector_class &dependencies) { + return onemkl::mklgpu::internal::copy(queue, n, x, incx, y, incy, dependencies); +} + +cl::sycl::event copy(cl::sycl::queue &queue, std::int64_t n, const double *x, std::int64_t incx, + double *y, std::int64_t incy, + const cl::sycl::vector_class &dependencies) { + return onemkl::mklgpu::internal::copy(queue, n, x, incx, y, incy, dependencies); +} + +cl::sycl::event copy(cl::sycl::queue &queue, std::int64_t n, const std::complex *x, + std::int64_t incx, std::complex *y, std::int64_t incy, + const cl::sycl::vector_class &dependencies) { + return onemkl::mklgpu::internal::copy(queue, n, x, incx, y, incy, dependencies); +} + +cl::sycl::event copy(cl::sycl::queue &queue, std::int64_t n, const std::complex *x, + std::int64_t incx, std::complex *y, std::int64_t incy, + const cl::sycl::vector_class &dependencies) { + return onemkl::mklgpu::internal::copy(queue, n, x, incx, y, incy, dependencies); +} + +cl::sycl::event dot(cl::sycl::queue &queue, std::int64_t n, const float *x, std::int64_t incx, + const float *y, std::int64_t incy, float *result, + const cl::sycl::vector_class &dependencies) { + return onemkl::mklgpu::internal::dot(queue, n, x, incx, y, incy, result, dependencies); +} + +cl::sycl::event dot(cl::sycl::queue &queue, std::int64_t n, const double *x, std::int64_t incx, + const double *y, std::int64_t incy, double *result, + const cl::sycl::vector_class &dependencies) { + return 
onemkl::mklgpu::internal::dot(queue, n, x, incx, y, incy, result, dependencies); +} + +cl::sycl::event sdsdot(cl::sycl::queue &queue, std::int64_t n, float sb, const float *x, + std::int64_t incx, const float *y, std::int64_t incy, float *result, + const cl::sycl::vector_class &dependencies) { + return onemkl::mklgpu::internal::sdsdot(queue, n, sb, x, incx, y, incy, result, dependencies); +} + +cl::sycl::event dot(cl::sycl::queue &queue, std::int64_t n, const float *x, std::int64_t incx, + const float *y, std::int64_t incy, double *result, + const cl::sycl::vector_class &dependencies) { + return onemkl::mklgpu::internal::dot(queue, n, x, incx, y, incy, result, dependencies); +} + +cl::sycl::event nrm2(cl::sycl::queue &queue, std::int64_t n, const std::complex *x, + std::int64_t incx, float *result, + const cl::sycl::vector_class &dependencies) { + return onemkl::mklgpu::internal::nrm2(queue, n, x, incx, result, dependencies); +} + +cl::sycl::event nrm2(cl::sycl::queue &queue, std::int64_t n, const std::complex *x, + std::int64_t incx, double *result, + const cl::sycl::vector_class &dependencies) { + return onemkl::mklgpu::internal::nrm2(queue, n, x, incx, result, dependencies); +} + +cl::sycl::event nrm2(cl::sycl::queue &queue, std::int64_t n, const float *x, std::int64_t incx, + float *result, const cl::sycl::vector_class &dependencies) { + return onemkl::mklgpu::internal::nrm2(queue, n, x, incx, result, dependencies); +} + +cl::sycl::event nrm2(cl::sycl::queue &queue, std::int64_t n, const double *x, std::int64_t incx, + double *result, const cl::sycl::vector_class &dependencies) { + return onemkl::mklgpu::internal::nrm2(queue, n, x, incx, result, dependencies); +} + +cl::sycl::event rot(cl::sycl::queue &queue, std::int64_t n, std::complex *x, + std::int64_t incx, std::complex *y, std::int64_t incy, float c, float s, + const cl::sycl::vector_class &dependencies) { + return onemkl::mklgpu::internal::rot(queue, n, x, incx, y, incy, c, s, dependencies); +} + 
+cl::sycl::event rot(cl::sycl::queue &queue, std::int64_t n, std::complex *x, + std::int64_t incx, std::complex *y, std::int64_t incy, double c, + double s, const cl::sycl::vector_class &dependencies) { + return onemkl::mklgpu::internal::rot(queue, n, x, incx, y, incy, c, s, dependencies); +} + +cl::sycl::event rot(cl::sycl::queue &queue, std::int64_t n, float *x, std::int64_t incx, float *y, + std::int64_t incy, float c, float s, + const cl::sycl::vector_class &dependencies) { + return onemkl::mklgpu::internal::rot(queue, n, x, incx, y, incy, c, s, dependencies); +} + +cl::sycl::event rot(cl::sycl::queue &queue, std::int64_t n, double *x, std::int64_t incx, double *y, + std::int64_t incy, double c, double s, + const cl::sycl::vector_class &dependencies) { + return onemkl::mklgpu::internal::rot(queue, n, x, incx, y, incy, c, s, dependencies); +} + +cl::sycl::event rotg(cl::sycl::queue &queue, float *a, float *b, float *c, float *s, + const cl::sycl::vector_class &dependencies) { + return onemkl::mklgpu::internal::rotg(queue, a, b, c, s, dependencies); +} + +cl::sycl::event rotg(cl::sycl::queue &queue, double *a, double *b, double *c, double *s, + const cl::sycl::vector_class &dependencies) { + return onemkl::mklgpu::internal::rotg(queue, a, b, c, s, dependencies); +} + +cl::sycl::event rotg(cl::sycl::queue &queue, std::complex *a, std::complex *b, + float *c, std::complex *s, + const cl::sycl::vector_class &dependencies) { + return onemkl::mklgpu::internal::rotg(queue, a, b, c, s, dependencies); +} + +cl::sycl::event rotg(cl::sycl::queue &queue, std::complex *a, std::complex *b, + double *c, std::complex *s, + const cl::sycl::vector_class &dependencies) { + return onemkl::mklgpu::internal::rotg(queue, a, b, c, s, dependencies); +} + +cl::sycl::event rotm(cl::sycl::queue &queue, std::int64_t n, float *x, std::int64_t incx, float *y, + std::int64_t incy, float *param, + const cl::sycl::vector_class &dependencies) { + return onemkl::mklgpu::internal::rotm(queue, n, x, 
incx, y, incy, param, dependencies); +} + +cl::sycl::event rotm(cl::sycl::queue &queue, std::int64_t n, double *x, std::int64_t incx, + double *y, std::int64_t incy, double *param, + const cl::sycl::vector_class &dependencies) { + return onemkl::mklgpu::internal::rotm(queue, n, x, incx, y, incy, param, dependencies); +} + +cl::sycl::event rotmg(cl::sycl::queue &queue, float *d1, float *d2, float *x1, float y1, + float *param, const cl::sycl::vector_class &dependencies) { + return onemkl::mklgpu::internal::rotmg(queue, d1, d2, x1, y1, param, dependencies); +} + +cl::sycl::event rotmg(cl::sycl::queue &queue, double *d1, double *d2, double *x1, double y1, + double *param, const cl::sycl::vector_class &dependencies) { + return onemkl::mklgpu::internal::rotmg(queue, d1, d2, x1, y1, param, dependencies); +} + +cl::sycl::event scal(cl::sycl::queue &queue, std::int64_t n, float alpha, float *x, + std::int64_t incx, + const cl::sycl::vector_class &dependencies) { + return onemkl::mklgpu::internal::scal(queue, n, alpha, x, incx, dependencies); +} + +cl::sycl::event scal(cl::sycl::queue &queue, std::int64_t n, double alpha, double *x, + std::int64_t incx, + const cl::sycl::vector_class &dependencies) { + return onemkl::mklgpu::internal::scal(queue, n, alpha, x, incx, dependencies); +} + +cl::sycl::event scal(cl::sycl::queue &queue, std::int64_t n, std::complex alpha, + std::complex *x, std::int64_t incx, + const cl::sycl::vector_class &dependencies) { + return onemkl::mklgpu::internal::scal(queue, n, alpha, x, incx, dependencies); +} + +cl::sycl::event scal(cl::sycl::queue &queue, std::int64_t n, std::complex alpha, + std::complex *x, std::int64_t incx, + const cl::sycl::vector_class &dependencies) { + return onemkl::mklgpu::internal::scal(queue, n, alpha, x, incx, dependencies); +} + +cl::sycl::event scal(cl::sycl::queue &queue, std::int64_t n, float alpha, std::complex *x, + std::int64_t incx, + const cl::sycl::vector_class &dependencies) { + return 
onemkl::mklgpu::internal::scal(queue, n, alpha, x, incx, dependencies); +} + +cl::sycl::event scal(cl::sycl::queue &queue, std::int64_t n, double alpha, std::complex *x, + std::int64_t incx, + const cl::sycl::vector_class &dependencies) { + return onemkl::mklgpu::internal::scal(queue, n, alpha, x, incx, dependencies); +} + +cl::sycl::event swap(cl::sycl::queue &queue, std::int64_t n, float *x, std::int64_t incx, float *y, + std::int64_t incy, + const cl::sycl::vector_class &dependencies) { + return onemkl::mklgpu::internal::swap(queue, n, x, incx, y, incy, dependencies); +} + +cl::sycl::event swap(cl::sycl::queue &queue, std::int64_t n, double *x, std::int64_t incx, + double *y, std::int64_t incy, + const cl::sycl::vector_class &dependencies) { + return onemkl::mklgpu::internal::swap(queue, n, x, incx, y, incy, dependencies); +} + +cl::sycl::event swap(cl::sycl::queue &queue, std::int64_t n, std::complex *x, + std::int64_t incx, std::complex *y, std::int64_t incy, + const cl::sycl::vector_class &dependencies) { + return onemkl::mklgpu::internal::swap(queue, n, x, incx, y, incy, dependencies); +} + +cl::sycl::event swap(cl::sycl::queue &queue, std::int64_t n, std::complex *x, + std::int64_t incx, std::complex *y, std::int64_t incy, + const cl::sycl::vector_class &dependencies) { + return onemkl::mklgpu::internal::swap(queue, n, x, incx, y, incy, dependencies); +} + +cl::sycl::event gemm_batch(cl::sycl::queue &queue, onemkl::transpose *transa, + onemkl::transpose *transb, std::int64_t *m, std::int64_t *n, + std::int64_t *k, float *alpha, const float **a, std::int64_t *lda, + const float **b, std::int64_t *ldb, float *beta, float **c, + std::int64_t *ldc, std::int64_t group_count, std::int64_t *group_size, + const cl::sycl::vector_class &dependencies) { + return onemkl::mklgpu::internal::gemm_batch(queue, transa, transb, m, n, k, alpha, a, lda, b, + ldb, beta, c, ldc, group_count, group_size, + dependencies); +} + +cl::sycl::event gemm_batch(cl::sycl::queue &queue, 
onemkl::transpose *transa, + onemkl::transpose *transb, std::int64_t *m, std::int64_t *n, + std::int64_t *k, double *alpha, const double **a, std::int64_t *lda, + const double **b, std::int64_t *ldb, double *beta, double **c, + std::int64_t *ldc, std::int64_t group_count, std::int64_t *group_size, + const cl::sycl::vector_class &dependencies) { + return onemkl::mklgpu::internal::gemm_batch(queue, transa, transb, m, n, k, alpha, a, lda, b, + ldb, beta, c, ldc, group_count, group_size, + dependencies); +} + +cl::sycl::event gemm_batch(cl::sycl::queue &queue, onemkl::transpose *transa, + onemkl::transpose *transb, std::int64_t *m, std::int64_t *n, + std::int64_t *k, std::complex *alpha, + const std::complex **a, std::int64_t *lda, + const std::complex **b, std::int64_t *ldb, + std::complex *beta, std::complex **c, std::int64_t *ldc, + std::int64_t group_count, std::int64_t *group_size, + const cl::sycl::vector_class &dependencies) { + return onemkl::mklgpu::internal::gemm_batch(queue, transa, transb, m, n, k, alpha, a, lda, b, + ldb, beta, c, ldc, group_count, group_size, + dependencies); +} + +cl::sycl::event gemm_batch(cl::sycl::queue &queue, onemkl::transpose *transa, + onemkl::transpose *transb, std::int64_t *m, std::int64_t *n, + std::int64_t *k, std::complex *alpha, + const std::complex **a, std::int64_t *lda, + const std::complex **b, std::int64_t *ldb, + std::complex *beta, std::complex **c, std::int64_t *ldc, + std::int64_t group_count, std::int64_t *group_size, + const cl::sycl::vector_class &dependencies) { + return onemkl::mklgpu::internal::gemm_batch(queue, transa, transb, m, n, k, alpha, a, lda, b, + ldb, beta, c, ldc, group_count, group_size, + dependencies); +} + +cl::sycl::event gemm_batch(cl::sycl::queue &queue, onemkl::transpose transa, + onemkl::transpose transb, std::int64_t m, std::int64_t n, std::int64_t k, + float alpha, const float *a, std::int64_t lda, std::int64_t stride_a, + const float *b, std::int64_t ldb, std::int64_t stride_b, float 
beta, + float *c, std::int64_t ldc, std::int64_t stride_c, + std::int64_t batch_size, + const cl::sycl::vector_class &dependencies) { + return onemkl::mklgpu::internal::gemm_batch(queue, transa, transb, m, n, k, alpha, a, lda, + stride_a, b, ldb, stride_b, beta, c, ldc, stride_c, + batch_size, dependencies); +} + +cl::sycl::event gemm_batch(cl::sycl::queue &queue, onemkl::transpose transa, + onemkl::transpose transb, std::int64_t m, std::int64_t n, std::int64_t k, + double alpha, const double *a, std::int64_t lda, std::int64_t stride_a, + const double *b, std::int64_t ldb, std::int64_t stride_b, double beta, + double *c, std::int64_t ldc, std::int64_t stride_c, + std::int64_t batch_size, + const cl::sycl::vector_class &dependencies) { + return onemkl::mklgpu::internal::gemm_batch(queue, transa, transb, m, n, k, alpha, a, lda, + stride_a, b, ldb, stride_b, beta, c, ldc, stride_c, + batch_size, dependencies); +} + +cl::sycl::event gemm_batch(cl::sycl::queue &queue, onemkl::transpose transa, + onemkl::transpose transb, std::int64_t m, std::int64_t n, std::int64_t k, + std::complex alpha, const std::complex *a, + std::int64_t lda, std::int64_t stride_a, const std::complex *b, + std::int64_t ldb, std::int64_t stride_b, std::complex beta, + std::complex *c, std::int64_t ldc, std::int64_t stride_c, + std::int64_t batch_size, + const cl::sycl::vector_class &dependencies) { + return onemkl::mklgpu::internal::gemm_batch(queue, transa, transb, m, n, k, alpha, a, lda, + stride_a, b, ldb, stride_b, beta, c, ldc, stride_c, + batch_size, dependencies); +} + +cl::sycl::event gemm_batch(cl::sycl::queue &queue, onemkl::transpose transa, + onemkl::transpose transb, std::int64_t m, std::int64_t n, std::int64_t k, + std::complex alpha, const std::complex *a, + std::int64_t lda, std::int64_t stride_a, const std::complex *b, + std::int64_t ldb, std::int64_t stride_b, std::complex beta, + std::complex *c, std::int64_t ldc, std::int64_t stride_c, + std::int64_t batch_size, + const 
cl::sycl::vector_class &dependencies) { + return onemkl::mklgpu::internal::gemm_batch(queue, transa, transb, m, n, k, alpha, a, lda, + stride_a, b, ldb, stride_b, beta, c, ldc, stride_c, + batch_size, dependencies); +} + +cl::sycl::event gemmt(cl::sycl::queue &queue, onemkl::uplo upper_lower, onemkl::transpose transa, + onemkl::transpose transb, std::int64_t n, std::int64_t k, float alpha, + const float *a, std::int64_t lda, const float *b, std::int64_t ldb, + float beta, float *c, std::int64_t ldc, + const cl::sycl::vector_class &dependencies) { + return onemkl::mklgpu::internal::gemmt(queue, upper_lower, transa, transb, n, k, alpha, a, lda, + b, ldb, beta, c, ldc, dependencies); +} + +cl::sycl::event gemmt(cl::sycl::queue &queue, onemkl::uplo upper_lower, onemkl::transpose transa, + onemkl::transpose transb, std::int64_t n, std::int64_t k, double alpha, + const double *a, std::int64_t lda, const double *b, std::int64_t ldb, + double beta, double *c, std::int64_t ldc, + const cl::sycl::vector_class &dependencies) { + return onemkl::mklgpu::internal::gemmt(queue, upper_lower, transa, transb, n, k, alpha, a, lda, + b, ldb, beta, c, ldc, dependencies); +} + +cl::sycl::event gemmt(cl::sycl::queue &queue, onemkl::uplo upper_lower, onemkl::transpose transa, + onemkl::transpose transb, std::int64_t n, std::int64_t k, + std::complex alpha, const std::complex *a, std::int64_t lda, + const std::complex *b, std::int64_t ldb, std::complex beta, + std::complex *c, std::int64_t ldc, + const cl::sycl::vector_class &dependencies) { + return onemkl::mklgpu::internal::gemmt(queue, upper_lower, transa, transb, n, k, alpha, a, lda, + b, ldb, beta, c, ldc, dependencies); +} + +cl::sycl::event gemmt(cl::sycl::queue &queue, onemkl::uplo upper_lower, onemkl::transpose transa, + onemkl::transpose transb, std::int64_t n, std::int64_t k, + std::complex alpha, const std::complex *a, std::int64_t lda, + const std::complex *b, std::int64_t ldb, std::complex beta, + std::complex *c, 
std::int64_t ldc, + const cl::sycl::vector_class &dependencies) { + return onemkl::mklgpu::internal::gemmt(queue, upper_lower, transa, transb, n, k, alpha, a, lda, + b, ldb, beta, c, ldc, dependencies); +} + +} // namespace mklgpu +} // namespace onemkl diff --git a/src/blas/backends/mklgpu/mkl_internal_blas_gpu_wrappers.cpp b/src/blas/backends/mklgpu/mkl_internal_blas_gpu_wrappers.cpp index 2181a4aba..ea02a5cc1 100644 --- a/src/blas/backends/mklgpu/mkl_internal_blas_gpu_wrappers.cpp +++ b/src/blas/backends/mklgpu/mkl_internal_blas_gpu_wrappers.cpp @@ -18,6 +18,7 @@ *******************************************************************************/ #include +#include #include "include/allocator_helper.hpp" #include "mkl_internal_blas_gpu_wrappers.hpp" @@ -27,833 +28,860 @@ namespace onemkl { namespace mklgpu { namespace internal { -void gemm(cl::sycl::queue &queue, onemkl::transpose transa, onemkl::transpose transb, int64_t m, - int64_t n, int64_t k, float alpha, cl::sycl::buffer &a, int64_t lda, - cl::sycl::buffer &b, int64_t ldb, float beta, cl::sycl::buffer &c, - int64_t ldc) { +// Buffer APIs + +void gemm(cl::sycl::queue &queue, onemkl::transpose transa, onemkl::transpose transb, + std::int64_t m, std::int64_t n, std::int64_t k, float alpha, + cl::sycl::buffer &a, std::int64_t lda, cl::sycl::buffer &b, + std::int64_t ldb, float beta, cl::sycl::buffer &c, std::int64_t ldc) { mkl::gpu::sgemm(queue, mkl::cblas_convert(transa), mkl::cblas_convert(transb), m, n, k, alpha, a, lda, b, ldb, beta, c, ldc); } -void gemm(cl::sycl::queue &queue, onemkl::transpose transa, onemkl::transpose transb, int64_t m, - int64_t n, int64_t k, double alpha, cl::sycl::buffer &a, int64_t lda, - cl::sycl::buffer &b, int64_t ldb, double beta, cl::sycl::buffer &c, - int64_t ldc) { +void gemm(cl::sycl::queue &queue, onemkl::transpose transa, onemkl::transpose transb, + std::int64_t m, std::int64_t n, std::int64_t k, double alpha, + cl::sycl::buffer &a, std::int64_t lda, cl::sycl::buffer &b, + 
std::int64_t ldb, double beta, cl::sycl::buffer &c, std::int64_t ldc) { mkl::gpu::dgemm(queue, mkl::cblas_convert(transa), mkl::cblas_convert(transb), m, n, k, alpha, a, lda, b, ldb, beta, c, ldc); } -void gemm(cl::sycl::queue &queue, onemkl::transpose transa, onemkl::transpose transb, int64_t m, - int64_t n, int64_t k, std::complex alpha, - cl::sycl::buffer, 1> &a, int64_t lda, - cl::sycl::buffer, 1> &b, int64_t ldb, std::complex beta, - cl::sycl::buffer, 1> &c, int64_t ldc) { +void gemm(cl::sycl::queue &queue, onemkl::transpose transa, onemkl::transpose transb, + std::int64_t m, std::int64_t n, std::int64_t k, std::complex alpha, + cl::sycl::buffer, 1> &a, std::int64_t lda, + cl::sycl::buffer, 1> &b, std::int64_t ldb, std::complex beta, + cl::sycl::buffer, 1> &c, std::int64_t ldc) { mkl::gpu::cgemm(queue, mkl::cblas_convert(transa), mkl::cblas_convert(transb), m, n, k, alpha, a, lda, b, ldb, beta, c, ldc); } -void gemm(cl::sycl::queue &queue, onemkl::transpose transa, onemkl::transpose transb, int64_t m, - int64_t n, int64_t k, std::complex alpha, - cl::sycl::buffer, 1> &a, int64_t lda, - cl::sycl::buffer, 1> &b, int64_t ldb, std::complex beta, - cl::sycl::buffer, 1> &c, int64_t ldc) { +void gemm(cl::sycl::queue &queue, onemkl::transpose transa, onemkl::transpose transb, + std::int64_t m, std::int64_t n, std::int64_t k, std::complex alpha, + cl::sycl::buffer, 1> &a, std::int64_t lda, + cl::sycl::buffer, 1> &b, std::int64_t ldb, std::complex beta, + cl::sycl::buffer, 1> &c, std::int64_t ldc) { mkl::gpu::zgemm(queue, mkl::cblas_convert(transa), mkl::cblas_convert(transb), m, n, k, alpha, a, lda, b, ldb, beta, c, ldc); } -void symm(cl::sycl::queue &queue, onemkl::side left_right, onemkl::uplo upper_lower, int64_t m, - int64_t n, float alpha, cl::sycl::buffer &a, int64_t lda, - cl::sycl::buffer &b, int64_t ldb, float beta, cl::sycl::buffer &c, - int64_t ldc) { +void symm(cl::sycl::queue &queue, onemkl::side left_right, onemkl::uplo upper_lower, std::int64_t m, + 
std::int64_t n, float alpha, cl::sycl::buffer &a, std::int64_t lda, + cl::sycl::buffer &b, std::int64_t ldb, float beta, + cl::sycl::buffer &c, std::int64_t ldc) { mkl::gpu::ssymm(queue, mkl::cblas_convert(left_right), mkl::cblas_convert(upper_lower), m, n, alpha, a, lda, b, ldb, beta, c, ldc); } -void symm(cl::sycl::queue &queue, onemkl::side left_right, onemkl::uplo upper_lower, int64_t m, - int64_t n, double alpha, cl::sycl::buffer &a, int64_t lda, - cl::sycl::buffer &b, int64_t ldb, double beta, cl::sycl::buffer &c, - int64_t ldc) { +void symm(cl::sycl::queue &queue, onemkl::side left_right, onemkl::uplo upper_lower, std::int64_t m, + std::int64_t n, double alpha, cl::sycl::buffer &a, std::int64_t lda, + cl::sycl::buffer &b, std::int64_t ldb, double beta, + cl::sycl::buffer &c, std::int64_t ldc) { mkl::gpu::dsymm(queue, mkl::cblas_convert(left_right), mkl::cblas_convert(upper_lower), m, n, alpha, a, lda, b, ldb, beta, c, ldc); } -void symm(cl::sycl::queue &queue, onemkl::side left_right, onemkl::uplo upper_lower, int64_t m, - int64_t n, std::complex alpha, cl::sycl::buffer, 1> &a, - int64_t lda, cl::sycl::buffer, 1> &b, int64_t ldb, - std::complex beta, cl::sycl::buffer, 1> &c, int64_t ldc) { +void symm(cl::sycl::queue &queue, onemkl::side left_right, onemkl::uplo upper_lower, std::int64_t m, + std::int64_t n, std::complex alpha, cl::sycl::buffer, 1> &a, + std::int64_t lda, cl::sycl::buffer, 1> &b, std::int64_t ldb, + std::complex beta, cl::sycl::buffer, 1> &c, std::int64_t ldc) { mkl::gpu::csymm(queue, mkl::cblas_convert(left_right), mkl::cblas_convert(upper_lower), m, n, alpha, a, lda, b, ldb, beta, c, ldc); } -void symm(cl::sycl::queue &queue, onemkl::side left_right, onemkl::uplo upper_lower, int64_t m, - int64_t n, std::complex alpha, cl::sycl::buffer, 1> &a, - int64_t lda, cl::sycl::buffer, 1> &b, int64_t ldb, - std::complex beta, cl::sycl::buffer, 1> &c, int64_t ldc) { +void symm(cl::sycl::queue &queue, onemkl::side left_right, onemkl::uplo upper_lower, 
std::int64_t m, + std::int64_t n, std::complex alpha, cl::sycl::buffer, 1> &a, + std::int64_t lda, cl::sycl::buffer, 1> &b, std::int64_t ldb, + std::complex beta, cl::sycl::buffer, 1> &c, + std::int64_t ldc) { mkl::gpu::zsymm(queue, mkl::cblas_convert(left_right), mkl::cblas_convert(upper_lower), m, n, alpha, a, lda, b, ldb, beta, c, ldc); } -void hemm(cl::sycl::queue &queue, onemkl::side left_right, onemkl::uplo upper_lower, int64_t m, - int64_t n, std::complex alpha, cl::sycl::buffer, 1> &a, - int64_t lda, cl::sycl::buffer, 1> &b, int64_t ldb, - std::complex beta, cl::sycl::buffer, 1> &c, int64_t ldc) { +void hemm(cl::sycl::queue &queue, onemkl::side left_right, onemkl::uplo upper_lower, std::int64_t m, + std::int64_t n, std::complex alpha, cl::sycl::buffer, 1> &a, + std::int64_t lda, cl::sycl::buffer, 1> &b, std::int64_t ldb, + std::complex beta, cl::sycl::buffer, 1> &c, std::int64_t ldc) { mkl::gpu::chemm(queue, mkl::cblas_convert(left_right), mkl::cblas_convert(upper_lower), m, n, alpha, a, lda, b, ldb, beta, c, ldc); } -void hemm(cl::sycl::queue &queue, onemkl::side left_right, onemkl::uplo upper_lower, int64_t m, - int64_t n, std::complex alpha, cl::sycl::buffer, 1> &a, - int64_t lda, cl::sycl::buffer, 1> &b, int64_t ldb, - std::complex beta, cl::sycl::buffer, 1> &c, int64_t ldc) { +void hemm(cl::sycl::queue &queue, onemkl::side left_right, onemkl::uplo upper_lower, std::int64_t m, + std::int64_t n, std::complex alpha, cl::sycl::buffer, 1> &a, + std::int64_t lda, cl::sycl::buffer, 1> &b, std::int64_t ldb, + std::complex beta, cl::sycl::buffer, 1> &c, + std::int64_t ldc) { mkl::gpu::zhemm(queue, mkl::cblas_convert(left_right), mkl::cblas_convert(upper_lower), m, n, alpha, a, lda, b, ldb, beta, c, ldc); } -void syrk(cl::sycl::queue &queue, onemkl::uplo upper_lower, onemkl::transpose trans, int64_t n, - int64_t k, float alpha, cl::sycl::buffer &a, int64_t lda, float beta, - cl::sycl::buffer &c, int64_t ldc) { +void syrk(cl::sycl::queue &queue, onemkl::uplo 
upper_lower, onemkl::transpose trans, std::int64_t n, + std::int64_t k, float alpha, cl::sycl::buffer &a, std::int64_t lda, float beta, + cl::sycl::buffer &c, std::int64_t ldc) { mkl::gpu::ssyrk(queue, mkl::cblas_convert(upper_lower), mkl::cblas_convert(trans), n, k, alpha, a, lda, beta, c, ldc); } -void syrk(cl::sycl::queue &queue, onemkl::uplo upper_lower, onemkl::transpose trans, int64_t n, - int64_t k, double alpha, cl::sycl::buffer &a, int64_t lda, double beta, - cl::sycl::buffer &c, int64_t ldc) { +void syrk(cl::sycl::queue &queue, onemkl::uplo upper_lower, onemkl::transpose trans, std::int64_t n, + std::int64_t k, double alpha, cl::sycl::buffer &a, std::int64_t lda, + double beta, cl::sycl::buffer &c, std::int64_t ldc) { mkl::gpu::dsyrk(queue, mkl::cblas_convert(upper_lower), mkl::cblas_convert(trans), n, k, alpha, a, lda, beta, c, ldc); } -void syrk(cl::sycl::queue &queue, onemkl::uplo upper_lower, onemkl::transpose trans, int64_t n, - int64_t k, std::complex alpha, cl::sycl::buffer, 1> &a, - int64_t lda, std::complex beta, cl::sycl::buffer, 1> &c, - int64_t ldc) { +void syrk(cl::sycl::queue &queue, onemkl::uplo upper_lower, onemkl::transpose trans, std::int64_t n, + std::int64_t k, std::complex alpha, cl::sycl::buffer, 1> &a, + std::int64_t lda, std::complex beta, cl::sycl::buffer, 1> &c, + std::int64_t ldc) { mkl::gpu::csyrk(queue, mkl::cblas_convert(upper_lower), mkl::cblas_convert(trans), n, k, alpha, a, lda, beta, c, ldc); } -void syrk(cl::sycl::queue &queue, onemkl::uplo upper_lower, onemkl::transpose trans, int64_t n, - int64_t k, std::complex alpha, cl::sycl::buffer, 1> &a, - int64_t lda, std::complex beta, cl::sycl::buffer, 1> &c, - int64_t ldc) { +void syrk(cl::sycl::queue &queue, onemkl::uplo upper_lower, onemkl::transpose trans, std::int64_t n, + std::int64_t k, std::complex alpha, cl::sycl::buffer, 1> &a, + std::int64_t lda, std::complex beta, cl::sycl::buffer, 1> &c, + std::int64_t ldc) { mkl::gpu::zsyrk(queue, mkl::cblas_convert(upper_lower), 
mkl::cblas_convert(trans), n, k, alpha, a, lda, beta, c, ldc); } -void herk(cl::sycl::queue &queue, onemkl::uplo upper_lower, onemkl::transpose trans, int64_t n, - int64_t k, float alpha, cl::sycl::buffer, 1> &a, int64_t lda, - float beta, cl::sycl::buffer, 1> &c, int64_t ldc) { +void herk(cl::sycl::queue &queue, onemkl::uplo upper_lower, onemkl::transpose trans, std::int64_t n, + std::int64_t k, float alpha, cl::sycl::buffer, 1> &a, + std::int64_t lda, float beta, cl::sycl::buffer, 1> &c, + std::int64_t ldc) { mkl::gpu::cherk(queue, mkl::cblas_convert(upper_lower), mkl::cblas_convert(trans), n, k, alpha, a, lda, beta, c, ldc); } -void herk(cl::sycl::queue &queue, onemkl::uplo upper_lower, onemkl::transpose trans, int64_t n, - int64_t k, double alpha, cl::sycl::buffer, 1> &a, int64_t lda, - double beta, cl::sycl::buffer, 1> &c, int64_t ldc) { +void herk(cl::sycl::queue &queue, onemkl::uplo upper_lower, onemkl::transpose trans, std::int64_t n, + std::int64_t k, double alpha, cl::sycl::buffer, 1> &a, + std::int64_t lda, double beta, cl::sycl::buffer, 1> &c, + std::int64_t ldc) { mkl::gpu::zherk(queue, mkl::cblas_convert(upper_lower), mkl::cblas_convert(trans), n, k, alpha, a, lda, beta, c, ldc); } -void syr2k(cl::sycl::queue &queue, onemkl::uplo upper_lower, onemkl::transpose trans, int64_t n, - int64_t k, float alpha, cl::sycl::buffer &a, int64_t lda, - cl::sycl::buffer &b, int64_t ldb, float beta, cl::sycl::buffer &c, - int64_t ldc) { +void syr2k(cl::sycl::queue &queue, onemkl::uplo upper_lower, onemkl::transpose trans, + std::int64_t n, std::int64_t k, float alpha, cl::sycl::buffer &a, + std::int64_t lda, cl::sycl::buffer &b, std::int64_t ldb, float beta, + cl::sycl::buffer &c, std::int64_t ldc) { mkl::gpu::ssyr2k(queue, mkl::cblas_convert(upper_lower), mkl::cblas_convert(trans), n, k, alpha, a, lda, b, ldb, beta, c, ldc); } -void syr2k(cl::sycl::queue &queue, onemkl::uplo upper_lower, onemkl::transpose trans, int64_t n, - int64_t k, double alpha, cl::sycl::buffer 
&a, int64_t lda, - cl::sycl::buffer &b, int64_t ldb, double beta, cl::sycl::buffer &c, - int64_t ldc) { +void syr2k(cl::sycl::queue &queue, onemkl::uplo upper_lower, onemkl::transpose trans, + std::int64_t n, std::int64_t k, double alpha, cl::sycl::buffer &a, + std::int64_t lda, cl::sycl::buffer &b, std::int64_t ldb, double beta, + cl::sycl::buffer &c, std::int64_t ldc) { mkl::gpu::dsyr2k(queue, mkl::cblas_convert(upper_lower), mkl::cblas_convert(trans), n, k, alpha, a, lda, b, ldb, beta, c, ldc); } -void syr2k(cl::sycl::queue &queue, onemkl::uplo upper_lower, onemkl::transpose trans, int64_t n, - int64_t k, std::complex alpha, cl::sycl::buffer, 1> &a, - int64_t lda, cl::sycl::buffer, 1> &b, int64_t ldb, - std::complex beta, cl::sycl::buffer, 1> &c, int64_t ldc) { +void syr2k(cl::sycl::queue &queue, onemkl::uplo upper_lower, onemkl::transpose trans, + std::int64_t n, std::int64_t k, std::complex alpha, + cl::sycl::buffer, 1> &a, std::int64_t lda, + cl::sycl::buffer, 1> &b, std::int64_t ldb, std::complex beta, + cl::sycl::buffer, 1> &c, std::int64_t ldc) { mkl::gpu::csyr2k(queue, mkl::cblas_convert(upper_lower), mkl::cblas_convert(trans), n, k, alpha, a, lda, b, ldb, beta, c, ldc); } -void syr2k(cl::sycl::queue &queue, onemkl::uplo upper_lower, onemkl::transpose trans, int64_t n, - int64_t k, std::complex alpha, cl::sycl::buffer, 1> &a, - int64_t lda, cl::sycl::buffer, 1> &b, int64_t ldb, - std::complex beta, cl::sycl::buffer, 1> &c, int64_t ldc) { +void syr2k(cl::sycl::queue &queue, onemkl::uplo upper_lower, onemkl::transpose trans, + std::int64_t n, std::int64_t k, std::complex alpha, + cl::sycl::buffer, 1> &a, std::int64_t lda, + cl::sycl::buffer, 1> &b, std::int64_t ldb, + std::complex beta, cl::sycl::buffer, 1> &c, + std::int64_t ldc) { mkl::gpu::zsyr2k(queue, mkl::cblas_convert(upper_lower), mkl::cblas_convert(trans), n, k, alpha, a, lda, b, ldb, beta, c, ldc); } -void her2k(cl::sycl::queue &queue, onemkl::uplo upper_lower, onemkl::transpose trans, int64_t n, 
- int64_t k, std::complex alpha, cl::sycl::buffer, 1> &a, - int64_t lda, cl::sycl::buffer, 1> &b, int64_t ldb, float beta, - cl::sycl::buffer, 1> &c, int64_t ldc) { +void her2k(cl::sycl::queue &queue, onemkl::uplo upper_lower, onemkl::transpose trans, + std::int64_t n, std::int64_t k, std::complex alpha, + cl::sycl::buffer, 1> &a, std::int64_t lda, + cl::sycl::buffer, 1> &b, std::int64_t ldb, float beta, + cl::sycl::buffer, 1> &c, std::int64_t ldc) { mkl::gpu::cher2k(queue, mkl::cblas_convert(upper_lower), mkl::cblas_convert(trans), n, k, alpha, a, lda, b, ldb, beta, c, ldc); } -void her2k(cl::sycl::queue &queue, onemkl::uplo upper_lower, onemkl::transpose trans, int64_t n, - int64_t k, std::complex alpha, cl::sycl::buffer, 1> &a, - int64_t lda, cl::sycl::buffer, 1> &b, int64_t ldb, double beta, - cl::sycl::buffer, 1> &c, int64_t ldc) { +void her2k(cl::sycl::queue &queue, onemkl::uplo upper_lower, onemkl::transpose trans, + std::int64_t n, std::int64_t k, std::complex alpha, + cl::sycl::buffer, 1> &a, std::int64_t lda, + cl::sycl::buffer, 1> &b, std::int64_t ldb, double beta, + cl::sycl::buffer, 1> &c, std::int64_t ldc) { mkl::gpu::zher2k(queue, mkl::cblas_convert(upper_lower), mkl::cblas_convert(trans), n, k, alpha, a, lda, b, ldb, beta, c, ldc); } void trmm(cl::sycl::queue &queue, onemkl::side left_right, onemkl::uplo upper_lower, - onemkl::transpose transa, onemkl::diag unit_diag, int64_t m, int64_t n, float alpha, - cl::sycl::buffer &a, int64_t lda, cl::sycl::buffer &b, int64_t ldb) { + onemkl::transpose transa, onemkl::diag unit_diag, std::int64_t m, std::int64_t n, + float alpha, cl::sycl::buffer &a, std::int64_t lda, + cl::sycl::buffer &b, std::int64_t ldb) { mkl::gpu::strmm(queue, mkl::cblas_convert(left_right), mkl::cblas_convert(upper_lower), mkl::cblas_convert(transa), mkl::cblas_convert(unit_diag), m, n, alpha, a, lda, b, ldb); } void trmm(cl::sycl::queue &queue, onemkl::side left_right, onemkl::uplo upper_lower, - onemkl::transpose transa, onemkl::diag 
unit_diag, int64_t m, int64_t n, double alpha, - cl::sycl::buffer &a, int64_t lda, cl::sycl::buffer &b, - int64_t ldb) { + onemkl::transpose transa, onemkl::diag unit_diag, std::int64_t m, std::int64_t n, + double alpha, cl::sycl::buffer &a, std::int64_t lda, + cl::sycl::buffer &b, std::int64_t ldb) { mkl::gpu::dtrmm(queue, mkl::cblas_convert(left_right), mkl::cblas_convert(upper_lower), mkl::cblas_convert(transa), mkl::cblas_convert(unit_diag), m, n, alpha, a, lda, b, ldb); } void trmm(cl::sycl::queue &queue, onemkl::side left_right, onemkl::uplo upper_lower, - onemkl::transpose transa, onemkl::diag unit_diag, int64_t m, int64_t n, - std::complex alpha, cl::sycl::buffer, 1> &a, int64_t lda, - cl::sycl::buffer, 1> &b, int64_t ldb) { + onemkl::transpose transa, onemkl::diag unit_diag, std::int64_t m, std::int64_t n, + std::complex alpha, cl::sycl::buffer, 1> &a, std::int64_t lda, + cl::sycl::buffer, 1> &b, std::int64_t ldb) { mkl::gpu::ctrmm(queue, mkl::cblas_convert(left_right), mkl::cblas_convert(upper_lower), mkl::cblas_convert(transa), mkl::cblas_convert(unit_diag), m, n, alpha, a, lda, b, ldb); } void trmm(cl::sycl::queue &queue, onemkl::side left_right, onemkl::uplo upper_lower, - onemkl::transpose transa, onemkl::diag unit_diag, int64_t m, int64_t n, - std::complex alpha, cl::sycl::buffer, 1> &a, int64_t lda, - cl::sycl::buffer, 1> &b, int64_t ldb) { + onemkl::transpose transa, onemkl::diag unit_diag, std::int64_t m, std::int64_t n, + std::complex alpha, cl::sycl::buffer, 1> &a, + std::int64_t lda, cl::sycl::buffer, 1> &b, std::int64_t ldb) { mkl::gpu::ztrmm(queue, mkl::cblas_convert(left_right), mkl::cblas_convert(upper_lower), mkl::cblas_convert(transa), mkl::cblas_convert(unit_diag), m, n, alpha, a, lda, b, ldb); } void trsm(cl::sycl::queue &queue, onemkl::side left_right, onemkl::uplo upper_lower, - onemkl::transpose transa, onemkl::diag unit_diag, int64_t m, int64_t n, float alpha, - cl::sycl::buffer &a, int64_t lda, cl::sycl::buffer &b, int64_t ldb) { + 
onemkl::transpose transa, onemkl::diag unit_diag, std::int64_t m, std::int64_t n, + float alpha, cl::sycl::buffer &a, std::int64_t lda, + cl::sycl::buffer &b, std::int64_t ldb) { mkl::gpu::strsm(queue, mkl::cblas_convert(left_right), mkl::cblas_convert(upper_lower), mkl::cblas_convert(transa), mkl::cblas_convert(unit_diag), m, n, alpha, a, lda, b, ldb); } void trsm(cl::sycl::queue &queue, onemkl::side left_right, onemkl::uplo upper_lower, - onemkl::transpose transa, onemkl::diag unit_diag, int64_t m, int64_t n, double alpha, - cl::sycl::buffer &a, int64_t lda, cl::sycl::buffer &b, - int64_t ldb) { + onemkl::transpose transa, onemkl::diag unit_diag, std::int64_t m, std::int64_t n, + double alpha, cl::sycl::buffer &a, std::int64_t lda, + cl::sycl::buffer &b, std::int64_t ldb) { mkl::gpu::dtrsm(queue, mkl::cblas_convert(left_right), mkl::cblas_convert(upper_lower), mkl::cblas_convert(transa), mkl::cblas_convert(unit_diag), m, n, alpha, a, lda, b, ldb); } void trsm(cl::sycl::queue &queue, onemkl::side left_right, onemkl::uplo upper_lower, - onemkl::transpose transa, onemkl::diag unit_diag, int64_t m, int64_t n, - std::complex alpha, cl::sycl::buffer, 1> &a, int64_t lda, - cl::sycl::buffer, 1> &b, int64_t ldb) { + onemkl::transpose transa, onemkl::diag unit_diag, std::int64_t m, std::int64_t n, + std::complex alpha, cl::sycl::buffer, 1> &a, std::int64_t lda, + cl::sycl::buffer, 1> &b, std::int64_t ldb) { mkl::gpu::ctrsm(queue, mkl::cblas_convert(left_right), mkl::cblas_convert(upper_lower), mkl::cblas_convert(transa), mkl::cblas_convert(unit_diag), m, n, alpha, a, lda, b, ldb); } void trsm(cl::sycl::queue &queue, onemkl::side left_right, onemkl::uplo upper_lower, - onemkl::transpose transa, onemkl::diag unit_diag, int64_t m, int64_t n, - std::complex alpha, cl::sycl::buffer, 1> &a, int64_t lda, - cl::sycl::buffer, 1> &b, int64_t ldb) { + onemkl::transpose transa, onemkl::diag unit_diag, std::int64_t m, std::int64_t n, + std::complex alpha, cl::sycl::buffer, 1> &a, + 
std::int64_t lda, cl::sycl::buffer, 1> &b, std::int64_t ldb) { mkl::gpu::ztrsm(queue, mkl::cblas_convert(left_right), mkl::cblas_convert(upper_lower), mkl::cblas_convert(transa), mkl::cblas_convert(unit_diag), m, n, alpha, a, lda, b, ldb); } -void gemv(cl::sycl::queue &queue, onemkl::transpose trans, int64_t m, int64_t n, float alpha, - cl::sycl::buffer &a, int64_t lda, cl::sycl::buffer &x, int64_t incx, - float beta, cl::sycl::buffer &y, int64_t incy) { +void gemv(cl::sycl::queue &queue, onemkl::transpose trans, std::int64_t m, std::int64_t n, + float alpha, cl::sycl::buffer &a, std::int64_t lda, + cl::sycl::buffer &x, std::int64_t incx, float beta, + cl::sycl::buffer &y, std::int64_t incy) { mkl::gpu::sgemv(queue, mkl::cblas_convert(trans), m, n, alpha, a, lda, x, incx, beta, y, incy); } -void gemv(cl::sycl::queue &queue, onemkl::transpose trans, int64_t m, int64_t n, double alpha, - cl::sycl::buffer &a, int64_t lda, cl::sycl::buffer &x, int64_t incx, - double beta, cl::sycl::buffer &y, int64_t incy) { +void gemv(cl::sycl::queue &queue, onemkl::transpose trans, std::int64_t m, std::int64_t n, + double alpha, cl::sycl::buffer &a, std::int64_t lda, + cl::sycl::buffer &x, std::int64_t incx, double beta, + cl::sycl::buffer &y, std::int64_t incy) { mkl::gpu::dgemv(queue, mkl::cblas_convert(trans), m, n, alpha, a, lda, x, incx, beta, y, incy); } -void gemv(cl::sycl::queue &queue, onemkl::transpose trans, int64_t m, int64_t n, - std::complex alpha, cl::sycl::buffer, 1> &a, int64_t lda, - cl::sycl::buffer, 1> &x, int64_t incx, std::complex beta, - cl::sycl::buffer, 1> &y, int64_t incy) { +void gemv(cl::sycl::queue &queue, onemkl::transpose trans, std::int64_t m, std::int64_t n, + std::complex alpha, cl::sycl::buffer, 1> &a, std::int64_t lda, + cl::sycl::buffer, 1> &x, std::int64_t incx, std::complex beta, + cl::sycl::buffer, 1> &y, std::int64_t incy) { mkl::gpu::cgemv(queue, mkl::cblas_convert(trans), m, n, alpha, a, lda, x, incx, beta, y, incy); } -void 
gemv(cl::sycl::queue &queue, onemkl::transpose trans, int64_t m, int64_t n, - std::complex alpha, cl::sycl::buffer, 1> &a, int64_t lda, - cl::sycl::buffer, 1> &x, int64_t incx, std::complex beta, - cl::sycl::buffer, 1> &y, int64_t incy) { +void gemv(cl::sycl::queue &queue, onemkl::transpose trans, std::int64_t m, std::int64_t n, + std::complex alpha, cl::sycl::buffer, 1> &a, + std::int64_t lda, cl::sycl::buffer, 1> &x, std::int64_t incx, + std::complex beta, cl::sycl::buffer, 1> &y, + std::int64_t incy) { mkl::gpu::zgemv(queue, mkl::cblas_convert(trans), m, n, alpha, a, lda, x, incx, beta, y, incy); } -void gbmv(cl::sycl::queue &queue, onemkl::transpose trans, int64_t m, int64_t n, int64_t kl, - int64_t ku, float alpha, cl::sycl::buffer &a, int64_t lda, - cl::sycl::buffer &x, int64_t incx, float beta, cl::sycl::buffer &y, - int64_t incy) { +void gbmv(cl::sycl::queue &queue, onemkl::transpose trans, std::int64_t m, std::int64_t n, + std::int64_t kl, std::int64_t ku, float alpha, cl::sycl::buffer &a, + std::int64_t lda, cl::sycl::buffer &x, std::int64_t incx, float beta, + cl::sycl::buffer &y, std::int64_t incy) { mkl::gpu::sgbmv(queue, mkl::cblas_convert(trans), m, n, kl, ku, alpha, a, lda, x, incx, beta, y, incy); } -void gbmv(cl::sycl::queue &queue, onemkl::transpose trans, int64_t m, int64_t n, int64_t kl, - int64_t ku, double alpha, cl::sycl::buffer &a, int64_t lda, - cl::sycl::buffer &x, int64_t incx, double beta, cl::sycl::buffer &y, - int64_t incy) { +void gbmv(cl::sycl::queue &queue, onemkl::transpose trans, std::int64_t m, std::int64_t n, + std::int64_t kl, std::int64_t ku, double alpha, cl::sycl::buffer &a, + std::int64_t lda, cl::sycl::buffer &x, std::int64_t incx, double beta, + cl::sycl::buffer &y, std::int64_t incy) { mkl::gpu::dgbmv(queue, mkl::cblas_convert(trans), m, n, kl, ku, alpha, a, lda, x, incx, beta, y, incy); } -void gbmv(cl::sycl::queue &queue, onemkl::transpose trans, int64_t m, int64_t n, int64_t kl, - int64_t ku, std::complex alpha, 
cl::sycl::buffer, 1> &a, - int64_t lda, cl::sycl::buffer, 1> &x, int64_t incx, - std::complex beta, cl::sycl::buffer, 1> &y, int64_t incy) { +void gbmv(cl::sycl::queue &queue, onemkl::transpose trans, std::int64_t m, std::int64_t n, + std::int64_t kl, std::int64_t ku, std::complex alpha, + cl::sycl::buffer, 1> &a, std::int64_t lda, + cl::sycl::buffer, 1> &x, std::int64_t incx, std::complex beta, + cl::sycl::buffer, 1> &y, std::int64_t incy) { mkl::gpu::cgbmv(queue, mkl::cblas_convert(trans), m, n, kl, ku, alpha, a, lda, x, incx, beta, y, incy); } -void gbmv(cl::sycl::queue &queue, onemkl::transpose trans, int64_t m, int64_t n, int64_t kl, - int64_t ku, std::complex alpha, cl::sycl::buffer, 1> &a, - int64_t lda, cl::sycl::buffer, 1> &x, int64_t incx, - std::complex beta, cl::sycl::buffer, 1> &y, int64_t incy) { +void gbmv(cl::sycl::queue &queue, onemkl::transpose trans, std::int64_t m, std::int64_t n, + std::int64_t kl, std::int64_t ku, std::complex alpha, + cl::sycl::buffer, 1> &a, std::int64_t lda, + cl::sycl::buffer, 1> &x, std::int64_t incx, + std::complex beta, cl::sycl::buffer, 1> &y, + std::int64_t incy) { mkl::gpu::zgbmv(queue, mkl::cblas_convert(trans), m, n, kl, ku, alpha, a, lda, x, incx, beta, y, incy); } -void ger(cl::sycl::queue &queue, int64_t m, int64_t n, float alpha, cl::sycl::buffer &x, - int64_t incx, cl::sycl::buffer &y, int64_t incy, cl::sycl::buffer &a, - int64_t lda) { +void ger(cl::sycl::queue &queue, std::int64_t m, std::int64_t n, float alpha, + cl::sycl::buffer &x, std::int64_t incx, cl::sycl::buffer &y, + std::int64_t incy, cl::sycl::buffer &a, std::int64_t lda) { mkl::gpu::sger(queue, m, n, alpha, x, incx, y, incy, a, lda); } -void ger(cl::sycl::queue &queue, int64_t m, int64_t n, double alpha, cl::sycl::buffer &x, - int64_t incx, cl::sycl::buffer &y, int64_t incy, cl::sycl::buffer &a, - int64_t lda) { +void ger(cl::sycl::queue &queue, std::int64_t m, std::int64_t n, double alpha, + cl::sycl::buffer &x, std::int64_t incx, 
cl::sycl::buffer &y, + std::int64_t incy, cl::sycl::buffer &a, std::int64_t lda) { mkl::gpu::dger(queue, m, n, alpha, x, incx, y, incy, a, lda); } -void gerc(cl::sycl::queue &queue, int64_t m, int64_t n, std::complex alpha, - cl::sycl::buffer, 1> &x, int64_t incx, - cl::sycl::buffer, 1> &y, int64_t incy, - cl::sycl::buffer, 1> &a, int64_t lda) { +void gerc(cl::sycl::queue &queue, std::int64_t m, std::int64_t n, std::complex alpha, + cl::sycl::buffer, 1> &x, std::int64_t incx, + cl::sycl::buffer, 1> &y, std::int64_t incy, + cl::sycl::buffer, 1> &a, std::int64_t lda) { mkl::gpu::cgerc(queue, m, n, alpha, x, incx, y, incy, a, lda); } -void gerc(cl::sycl::queue &queue, int64_t m, int64_t n, std::complex alpha, - cl::sycl::buffer, 1> &x, int64_t incx, - cl::sycl::buffer, 1> &y, int64_t incy, - cl::sycl::buffer, 1> &a, int64_t lda) { +void gerc(cl::sycl::queue &queue, std::int64_t m, std::int64_t n, std::complex alpha, + cl::sycl::buffer, 1> &x, std::int64_t incx, + cl::sycl::buffer, 1> &y, std::int64_t incy, + cl::sycl::buffer, 1> &a, std::int64_t lda) { mkl::gpu::zgerc(queue, m, n, alpha, x, incx, y, incy, a, lda); } -void geru(cl::sycl::queue &queue, int64_t m, int64_t n, std::complex alpha, - cl::sycl::buffer, 1> &x, int64_t incx, - cl::sycl::buffer, 1> &y, int64_t incy, - cl::sycl::buffer, 1> &a, int64_t lda) { +void geru(cl::sycl::queue &queue, std::int64_t m, std::int64_t n, std::complex alpha, + cl::sycl::buffer, 1> &x, std::int64_t incx, + cl::sycl::buffer, 1> &y, std::int64_t incy, + cl::sycl::buffer, 1> &a, std::int64_t lda) { mkl::gpu::cgeru(queue, m, n, alpha, x, incx, y, incy, a, lda); } -void geru(cl::sycl::queue &queue, int64_t m, int64_t n, std::complex alpha, - cl::sycl::buffer, 1> &x, int64_t incx, - cl::sycl::buffer, 1> &y, int64_t incy, - cl::sycl::buffer, 1> &a, int64_t lda) { +void geru(cl::sycl::queue &queue, std::int64_t m, std::int64_t n, std::complex alpha, + cl::sycl::buffer, 1> &x, std::int64_t incx, + cl::sycl::buffer, 1> &y, std::int64_t 
incy, + cl::sycl::buffer, 1> &a, std::int64_t lda) { mkl::gpu::zgeru(queue, m, n, alpha, x, incx, y, incy, a, lda); } -void hbmv(cl::sycl::queue &queue, onemkl::uplo uplo, int64_t n, int64_t k, - std::complex alpha, cl::sycl::buffer, 1> &a, int64_t lda, - cl::sycl::buffer, 1> &x, int64_t incx, std::complex beta, - cl::sycl::buffer, 1> &y, int64_t incy) { +void hbmv(cl::sycl::queue &queue, onemkl::uplo uplo, std::int64_t n, std::int64_t k, + std::complex alpha, cl::sycl::buffer, 1> &a, std::int64_t lda, + cl::sycl::buffer, 1> &x, std::int64_t incx, std::complex beta, + cl::sycl::buffer, 1> &y, std::int64_t incy) { mkl::gpu::chbmv(queue, mkl::cblas_convert(uplo), n, k, alpha, a, lda, x, incx, beta, y, incy); } -void hbmv(cl::sycl::queue &queue, onemkl::uplo uplo, int64_t n, int64_t k, - std::complex alpha, cl::sycl::buffer, 1> &a, int64_t lda, - cl::sycl::buffer, 1> &x, int64_t incx, std::complex beta, - cl::sycl::buffer, 1> &y, int64_t incy) { +void hbmv(cl::sycl::queue &queue, onemkl::uplo uplo, std::int64_t n, std::int64_t k, + std::complex alpha, cl::sycl::buffer, 1> &a, + std::int64_t lda, cl::sycl::buffer, 1> &x, std::int64_t incx, + std::complex beta, cl::sycl::buffer, 1> &y, + std::int64_t incy) { mkl::gpu::zhbmv(queue, mkl::cblas_convert(uplo), n, k, alpha, a, lda, x, incx, beta, y, incy); } -void hemv(cl::sycl::queue &queue, onemkl::uplo uplo, int64_t n, std::complex alpha, - cl::sycl::buffer, 1> &a, int64_t lda, - cl::sycl::buffer, 1> &x, int64_t incx, std::complex beta, - cl::sycl::buffer, 1> &y, int64_t incy) { +void hemv(cl::sycl::queue &queue, onemkl::uplo uplo, std::int64_t n, std::complex alpha, + cl::sycl::buffer, 1> &a, std::int64_t lda, + cl::sycl::buffer, 1> &x, std::int64_t incx, std::complex beta, + cl::sycl::buffer, 1> &y, std::int64_t incy) { mkl::gpu::chemv(queue, mkl::cblas_convert(uplo), n, alpha, a, lda, x, incx, beta, y, incy); } -void hemv(cl::sycl::queue &queue, onemkl::uplo uplo, int64_t n, std::complex alpha, - cl::sycl::buffer, 1> 
&a, int64_t lda, - cl::sycl::buffer, 1> &x, int64_t incx, std::complex beta, - cl::sycl::buffer, 1> &y, int64_t incy) { +void hemv(cl::sycl::queue &queue, onemkl::uplo uplo, std::int64_t n, std::complex alpha, + cl::sycl::buffer, 1> &a, std::int64_t lda, + cl::sycl::buffer, 1> &x, std::int64_t incx, + std::complex beta, cl::sycl::buffer, 1> &y, + std::int64_t incy) { mkl::gpu::zhemv(queue, mkl::cblas_convert(uplo), n, alpha, a, lda, x, incx, beta, y, incy); } -void her(cl::sycl::queue &queue, onemkl::uplo upplo, int64_t n, float alpha, - cl::sycl::buffer, 1> &x, int64_t incx, - cl::sycl::buffer, 1> &a, int64_t lda) { +void her(cl::sycl::queue &queue, onemkl::uplo upplo, std::int64_t n, float alpha, + cl::sycl::buffer, 1> &x, std::int64_t incx, + cl::sycl::buffer, 1> &a, std::int64_t lda) { mkl::gpu::cher(queue, mkl::cblas_convert(upplo), n, alpha, x, incx, a, lda); } -void her(cl::sycl::queue &queue, onemkl::uplo upplo, int64_t n, double alpha, - cl::sycl::buffer, 1> &x, int64_t incx, - cl::sycl::buffer, 1> &a, int64_t lda) { +void her(cl::sycl::queue &queue, onemkl::uplo upplo, std::int64_t n, double alpha, + cl::sycl::buffer, 1> &x, std::int64_t incx, + cl::sycl::buffer, 1> &a, std::int64_t lda) { mkl::gpu::zher(queue, mkl::cblas_convert(upplo), n, alpha, x, incx, a, lda); } -void her2(cl::sycl::queue &queue, onemkl::uplo upplo, int64_t n, std::complex alpha, - cl::sycl::buffer, 1> &x, int64_t incx, - cl::sycl::buffer, 1> &y, int64_t incy, - cl::sycl::buffer, 1> &a, int64_t lda) { +void her2(cl::sycl::queue &queue, onemkl::uplo upplo, std::int64_t n, std::complex alpha, + cl::sycl::buffer, 1> &x, std::int64_t incx, + cl::sycl::buffer, 1> &y, std::int64_t incy, + cl::sycl::buffer, 1> &a, std::int64_t lda) { mkl::gpu::cher2(queue, mkl::cblas_convert(upplo), n, alpha, x, incx, y, incy, a, lda); } -void her2(cl::sycl::queue &queue, onemkl::uplo upplo, int64_t n, std::complex alpha, - cl::sycl::buffer, 1> &x, int64_t incx, - cl::sycl::buffer, 1> &y, int64_t incy, - 
cl::sycl::buffer, 1> &a, int64_t lda) { +void her2(cl::sycl::queue &queue, onemkl::uplo upplo, std::int64_t n, std::complex alpha, + cl::sycl::buffer, 1> &x, std::int64_t incx, + cl::sycl::buffer, 1> &y, std::int64_t incy, + cl::sycl::buffer, 1> &a, std::int64_t lda) { mkl::gpu::zher2(queue, mkl::cblas_convert(upplo), n, alpha, x, incx, y, incy, a, lda); } -void hpmv(cl::sycl::queue &queue, onemkl::uplo uplo, int64_t n, std::complex alpha, +void hpmv(cl::sycl::queue &queue, onemkl::uplo uplo, std::int64_t n, std::complex alpha, cl::sycl::buffer, 1> &a, cl::sycl::buffer, 1> &x, - int64_t incx, std::complex beta, cl::sycl::buffer, 1> &y, - int64_t incy) { + std::int64_t incx, std::complex beta, cl::sycl::buffer, 1> &y, + std::int64_t incy) { mkl::gpu::chpmv(queue, mkl::cblas_convert(uplo), n, alpha, a, x, incx, beta, y, incy); } -void hpmv(cl::sycl::queue &queue, onemkl::uplo uplo, int64_t n, std::complex alpha, +void hpmv(cl::sycl::queue &queue, onemkl::uplo uplo, std::int64_t n, std::complex alpha, cl::sycl::buffer, 1> &a, - cl::sycl::buffer, 1> &x, int64_t incx, std::complex beta, - cl::sycl::buffer, 1> &y, int64_t incy) { + cl::sycl::buffer, 1> &x, std::int64_t incx, + std::complex beta, cl::sycl::buffer, 1> &y, + std::int64_t incy) { mkl::gpu::zhpmv(queue, mkl::cblas_convert(uplo), n, alpha, a, x, incx, beta, y, incy); } -void hpr(cl::sycl::queue &queue, onemkl::uplo upplo, int64_t n, float alpha, - cl::sycl::buffer, 1> &x, int64_t incx, +void hpr(cl::sycl::queue &queue, onemkl::uplo upplo, std::int64_t n, float alpha, + cl::sycl::buffer, 1> &x, std::int64_t incx, cl::sycl::buffer, 1> &a) { mkl::gpu::chpr(queue, mkl::cblas_convert(upplo), n, alpha, x, incx, a); } -void hpr(cl::sycl::queue &queue, onemkl::uplo upplo, int64_t n, double alpha, - cl::sycl::buffer, 1> &x, int64_t incx, +void hpr(cl::sycl::queue &queue, onemkl::uplo upplo, std::int64_t n, double alpha, + cl::sycl::buffer, 1> &x, std::int64_t incx, cl::sycl::buffer, 1> &a) { mkl::gpu::zhpr(queue, 
mkl::cblas_convert(upplo), n, alpha, x, incx, a); } -void hpr2(cl::sycl::queue &queue, onemkl::uplo upplo, int64_t n, std::complex alpha, - cl::sycl::buffer, 1> &x, int64_t incx, - cl::sycl::buffer, 1> &y, int64_t incy, +void hpr2(cl::sycl::queue &queue, onemkl::uplo upplo, std::int64_t n, std::complex alpha, + cl::sycl::buffer, 1> &x, std::int64_t incx, + cl::sycl::buffer, 1> &y, std::int64_t incy, cl::sycl::buffer, 1> &a) { mkl::gpu::chpr2(queue, mkl::cblas_convert(upplo), n, alpha, x, incx, y, incy, a); } -void hpr2(cl::sycl::queue &queue, onemkl::uplo upplo, int64_t n, std::complex alpha, - cl::sycl::buffer, 1> &x, int64_t incx, - cl::sycl::buffer, 1> &y, int64_t incy, +void hpr2(cl::sycl::queue &queue, onemkl::uplo upplo, std::int64_t n, std::complex alpha, + cl::sycl::buffer, 1> &x, std::int64_t incx, + cl::sycl::buffer, 1> &y, std::int64_t incy, cl::sycl::buffer, 1> &a) { mkl::gpu::zhpr2(queue, mkl::cblas_convert(upplo), n, alpha, x, incx, y, incy, a); } -void sbmv(cl::sycl::queue &queue, onemkl::uplo uplo, int64_t n, int64_t k, float alpha, - cl::sycl::buffer &a, int64_t lda, cl::sycl::buffer &x, int64_t incx, - float beta, cl::sycl::buffer &y, int64_t incy) { +void sbmv(cl::sycl::queue &queue, onemkl::uplo uplo, std::int64_t n, std::int64_t k, float alpha, + cl::sycl::buffer &a, std::int64_t lda, cl::sycl::buffer &x, + std::int64_t incx, float beta, cl::sycl::buffer &y, std::int64_t incy) { mkl::gpu::ssbmv(queue, mkl::cblas_convert(uplo), n, k, alpha, a, lda, x, incx, beta, y, incy); } -void sbmv(cl::sycl::queue &queue, onemkl::uplo uplo, int64_t n, int64_t k, double alpha, - cl::sycl::buffer &a, int64_t lda, cl::sycl::buffer &x, int64_t incx, - double beta, cl::sycl::buffer &y, int64_t incy) { +void sbmv(cl::sycl::queue &queue, onemkl::uplo uplo, std::int64_t n, std::int64_t k, double alpha, + cl::sycl::buffer &a, std::int64_t lda, cl::sycl::buffer &x, + std::int64_t incx, double beta, cl::sycl::buffer &y, std::int64_t incy) { mkl::gpu::dsbmv(queue, 
mkl::cblas_convert(uplo), n, k, alpha, a, lda, x, incx, beta, y, incy); } -void spmv(cl::sycl::queue &queue, onemkl::uplo uplo, int64_t n, float alpha, - cl::sycl::buffer &a, cl::sycl::buffer &x, int64_t incx, float beta, - cl::sycl::buffer &y, int64_t incy) { +void spmv(cl::sycl::queue &queue, onemkl::uplo uplo, std::int64_t n, float alpha, + cl::sycl::buffer &a, cl::sycl::buffer &x, std::int64_t incx, + float beta, cl::sycl::buffer &y, std::int64_t incy) { mkl::gpu::sspmv(queue, mkl::cblas_convert(uplo), n, alpha, a, x, incx, beta, y, incy); } -void spmv(cl::sycl::queue &queue, onemkl::uplo uplo, int64_t n, double alpha, - cl::sycl::buffer &a, cl::sycl::buffer &x, int64_t incx, double beta, - cl::sycl::buffer &y, int64_t incy) { +void spmv(cl::sycl::queue &queue, onemkl::uplo uplo, std::int64_t n, double alpha, + cl::sycl::buffer &a, cl::sycl::buffer &x, std::int64_t incx, + double beta, cl::sycl::buffer &y, std::int64_t incy) { mkl::gpu::dspmv(queue, mkl::cblas_convert(uplo), n, alpha, a, x, incx, beta, y, incy); } -void spr(cl::sycl::queue &queue, onemkl::uplo upplo, int64_t n, float alpha, - cl::sycl::buffer &x, int64_t incx, cl::sycl::buffer &a) { +void spr(cl::sycl::queue &queue, onemkl::uplo upplo, std::int64_t n, float alpha, + cl::sycl::buffer &x, std::int64_t incx, cl::sycl::buffer &a) { mkl::gpu::sspr(queue, mkl::cblas_convert(upplo), n, alpha, x, incx, a); } -void spr(cl::sycl::queue &queue, onemkl::uplo upplo, int64_t n, double alpha, - cl::sycl::buffer &x, int64_t incx, cl::sycl::buffer &a) { +void spr(cl::sycl::queue &queue, onemkl::uplo upplo, std::int64_t n, double alpha, + cl::sycl::buffer &x, std::int64_t incx, cl::sycl::buffer &a) { mkl::gpu::dspr(queue, mkl::cblas_convert(upplo), n, alpha, x, incx, a); } -void spr2(cl::sycl::queue &queue, onemkl::uplo upplo, int64_t n, float alpha, - cl::sycl::buffer &x, int64_t incx, cl::sycl::buffer &y, int64_t incy, - cl::sycl::buffer &a) { +void spr2(cl::sycl::queue &queue, onemkl::uplo upplo, std::int64_t 
n, float alpha, + cl::sycl::buffer &x, std::int64_t incx, cl::sycl::buffer &y, + std::int64_t incy, cl::sycl::buffer &a) { mkl::gpu::sspr2(queue, mkl::cblas_convert(upplo), n, alpha, x, incx, y, incy, a); } -void spr2(cl::sycl::queue &queue, onemkl::uplo upplo, int64_t n, double alpha, - cl::sycl::buffer &x, int64_t incx, cl::sycl::buffer &y, - int64_t incy, cl::sycl::buffer &a) { +void spr2(cl::sycl::queue &queue, onemkl::uplo upplo, std::int64_t n, double alpha, + cl::sycl::buffer &x, std::int64_t incx, cl::sycl::buffer &y, + std::int64_t incy, cl::sycl::buffer &a) { mkl::gpu::dspr2(queue, mkl::cblas_convert(upplo), n, alpha, x, incx, y, incy, a); } -void symv(cl::sycl::queue &queue, onemkl::uplo uplo, int64_t n, float alpha, - cl::sycl::buffer &a, int64_t lda, cl::sycl::buffer &x, int64_t incx, - float beta, cl::sycl::buffer &y, int64_t incy) { +void symv(cl::sycl::queue &queue, onemkl::uplo uplo, std::int64_t n, float alpha, + cl::sycl::buffer &a, std::int64_t lda, cl::sycl::buffer &x, + std::int64_t incx, float beta, cl::sycl::buffer &y, std::int64_t incy) { mkl::gpu::ssymv(queue, mkl::cblas_convert(uplo), n, alpha, a, lda, x, incx, beta, y, incy); } -void symv(cl::sycl::queue &queue, onemkl::uplo uplo, int64_t n, double alpha, - cl::sycl::buffer &a, int64_t lda, cl::sycl::buffer &x, int64_t incx, - double beta, cl::sycl::buffer &y, int64_t incy) { +void symv(cl::sycl::queue &queue, onemkl::uplo uplo, std::int64_t n, double alpha, + cl::sycl::buffer &a, std::int64_t lda, cl::sycl::buffer &x, + std::int64_t incx, double beta, cl::sycl::buffer &y, std::int64_t incy) { mkl::gpu::dsymv(queue, mkl::cblas_convert(uplo), n, alpha, a, lda, x, incx, beta, y, incy); } -void syr(cl::sycl::queue &queue, onemkl::uplo upplo, int64_t n, float alpha, - cl::sycl::buffer &x, int64_t incx, cl::sycl::buffer &a, int64_t lda) { +void syr(cl::sycl::queue &queue, onemkl::uplo upplo, std::int64_t n, float alpha, + cl::sycl::buffer &x, std::int64_t incx, cl::sycl::buffer &a, + 
std::int64_t lda) { mkl::gpu::ssyr(queue, mkl::cblas_convert(upplo), n, alpha, x, incx, a, lda); } -void syr(cl::sycl::queue &queue, onemkl::uplo upplo, int64_t n, double alpha, - cl::sycl::buffer &x, int64_t incx, cl::sycl::buffer &a, - int64_t lda) { +void syr(cl::sycl::queue &queue, onemkl::uplo upplo, std::int64_t n, double alpha, + cl::sycl::buffer &x, std::int64_t incx, cl::sycl::buffer &a, + std::int64_t lda) { mkl::gpu::dsyr(queue, mkl::cblas_convert(upplo), n, alpha, x, incx, a, lda); } -void syr2(cl::sycl::queue &queue, onemkl::uplo upplo, int64_t n, float alpha, - cl::sycl::buffer &x, int64_t incx, cl::sycl::buffer &y, int64_t incy, - cl::sycl::buffer &a, int64_t lda) { +void syr2(cl::sycl::queue &queue, onemkl::uplo upplo, std::int64_t n, float alpha, + cl::sycl::buffer &x, std::int64_t incx, cl::sycl::buffer &y, + std::int64_t incy, cl::sycl::buffer &a, std::int64_t lda) { mkl::gpu::ssyr2(queue, mkl::cblas_convert(upplo), n, alpha, x, incx, y, incy, a, lda); } -void syr2(cl::sycl::queue &queue, onemkl::uplo upplo, int64_t n, double alpha, - cl::sycl::buffer &x, int64_t incx, cl::sycl::buffer &y, - int64_t incy, cl::sycl::buffer &a, int64_t lda) { +void syr2(cl::sycl::queue &queue, onemkl::uplo upplo, std::int64_t n, double alpha, + cl::sycl::buffer &x, std::int64_t incx, cl::sycl::buffer &y, + std::int64_t incy, cl::sycl::buffer &a, std::int64_t lda) { mkl::gpu::dsyr2(queue, mkl::cblas_convert(upplo), n, alpha, x, incx, y, incy, a, lda); } void tbmv(cl::sycl::queue &queue, onemkl::uplo upplo, onemkl::transpose trans, onemkl::diag diag, - int64_t n, int64_t k, cl::sycl::buffer &a, int64_t lda, - cl::sycl::buffer &x, int64_t incx) { + std::int64_t n, std::int64_t k, cl::sycl::buffer &a, std::int64_t lda, + cl::sycl::buffer &x, std::int64_t incx) { mkl::gpu::stbmv(queue, mkl::cblas_convert(upplo), mkl::cblas_convert(trans), mkl::cblas_convert(diag), n, k, a, lda, x, incx); } void tbmv(cl::sycl::queue &queue, onemkl::uplo upplo, onemkl::transpose trans, 
onemkl::diag diag, - int64_t n, int64_t k, cl::sycl::buffer &a, int64_t lda, - cl::sycl::buffer &x, int64_t incx) { + std::int64_t n, std::int64_t k, cl::sycl::buffer &a, std::int64_t lda, + cl::sycl::buffer &x, std::int64_t incx) { mkl::gpu::dtbmv(queue, mkl::cblas_convert(upplo), mkl::cblas_convert(trans), mkl::cblas_convert(diag), n, k, a, lda, x, incx); } void tbmv(cl::sycl::queue &queue, onemkl::uplo upplo, onemkl::transpose trans, onemkl::diag diag, - int64_t n, int64_t k, cl::sycl::buffer, 1> &a, int64_t lda, - cl::sycl::buffer, 1> &x, int64_t incx) { + std::int64_t n, std::int64_t k, cl::sycl::buffer, 1> &a, + std::int64_t lda, cl::sycl::buffer, 1> &x, std::int64_t incx) { mkl::gpu::ctbmv(queue, mkl::cblas_convert(upplo), mkl::cblas_convert(trans), mkl::cblas_convert(diag), n, k, a, lda, x, incx); } void tbmv(cl::sycl::queue &queue, onemkl::uplo upplo, onemkl::transpose trans, onemkl::diag diag, - int64_t n, int64_t k, cl::sycl::buffer, 1> &a, int64_t lda, - cl::sycl::buffer, 1> &x, int64_t incx) { + std::int64_t n, std::int64_t k, cl::sycl::buffer, 1> &a, + std::int64_t lda, cl::sycl::buffer, 1> &x, std::int64_t incx) { mkl::gpu::ztbmv(queue, mkl::cblas_convert(upplo), mkl::cblas_convert(trans), mkl::cblas_convert(diag), n, k, a, lda, x, incx); } void tbsv(cl::sycl::queue &queue, onemkl::uplo upplo, onemkl::transpose trans, onemkl::diag diag, - int64_t n, int64_t k, cl::sycl::buffer &a, int64_t lda, - cl::sycl::buffer &x, int64_t incx) { + std::int64_t n, std::int64_t k, cl::sycl::buffer &a, std::int64_t lda, + cl::sycl::buffer &x, std::int64_t incx) { mkl::gpu::stbsv(queue, mkl::cblas_convert(upplo), mkl::cblas_convert(trans), mkl::cblas_convert(diag), n, k, a, lda, x, incx); } void tbsv(cl::sycl::queue &queue, onemkl::uplo upplo, onemkl::transpose trans, onemkl::diag diag, - int64_t n, int64_t k, cl::sycl::buffer &a, int64_t lda, - cl::sycl::buffer &x, int64_t incx) { + std::int64_t n, std::int64_t k, cl::sycl::buffer &a, std::int64_t lda, + 
cl::sycl::buffer &x, std::int64_t incx) { mkl::gpu::dtbsv(queue, mkl::cblas_convert(upplo), mkl::cblas_convert(trans), mkl::cblas_convert(diag), n, k, a, lda, x, incx); } void tbsv(cl::sycl::queue &queue, onemkl::uplo upplo, onemkl::transpose trans, onemkl::diag diag, - int64_t n, int64_t k, cl::sycl::buffer, 1> &a, int64_t lda, - cl::sycl::buffer, 1> &x, int64_t incx) { + std::int64_t n, std::int64_t k, cl::sycl::buffer, 1> &a, + std::int64_t lda, cl::sycl::buffer, 1> &x, std::int64_t incx) { mkl::gpu::ctbsv(queue, mkl::cblas_convert(upplo), mkl::cblas_convert(trans), mkl::cblas_convert(diag), n, k, a, lda, x, incx); } void tbsv(cl::sycl::queue &queue, onemkl::uplo upplo, onemkl::transpose trans, onemkl::diag diag, - int64_t n, int64_t k, cl::sycl::buffer, 1> &a, int64_t lda, - cl::sycl::buffer, 1> &x, int64_t incx) { + std::int64_t n, std::int64_t k, cl::sycl::buffer, 1> &a, + std::int64_t lda, cl::sycl::buffer, 1> &x, std::int64_t incx) { mkl::gpu::ztbsv(queue, mkl::cblas_convert(upplo), mkl::cblas_convert(trans), mkl::cblas_convert(diag), n, k, a, lda, x, incx); } void tpmv(cl::sycl::queue &queue, onemkl::uplo upplo, onemkl::transpose trans, onemkl::diag diag, - int64_t n, cl::sycl::buffer &a, cl::sycl::buffer &x, int64_t incx) { + std::int64_t n, cl::sycl::buffer &a, cl::sycl::buffer &x, + std::int64_t incx) { mkl::gpu::stpmv(queue, mkl::cblas_convert(upplo), mkl::cblas_convert(trans), mkl::cblas_convert(diag), n, a, x, incx); } void tpmv(cl::sycl::queue &queue, onemkl::uplo upplo, onemkl::transpose trans, onemkl::diag diag, - int64_t n, cl::sycl::buffer &a, cl::sycl::buffer &x, int64_t incx) { + std::int64_t n, cl::sycl::buffer &a, cl::sycl::buffer &x, + std::int64_t incx) { mkl::gpu::dtpmv(queue, mkl::cblas_convert(upplo), mkl::cblas_convert(trans), mkl::cblas_convert(diag), n, a, x, incx); } void tpmv(cl::sycl::queue &queue, onemkl::uplo upplo, onemkl::transpose trans, onemkl::diag diag, - int64_t n, cl::sycl::buffer, 1> &a, - cl::sycl::buffer, 1> &x, 
int64_t incx) { + std::int64_t n, cl::sycl::buffer, 1> &a, + cl::sycl::buffer, 1> &x, std::int64_t incx) { mkl::gpu::ctpmv(queue, mkl::cblas_convert(upplo), mkl::cblas_convert(trans), mkl::cblas_convert(diag), n, a, x, incx); } void tpmv(cl::sycl::queue &queue, onemkl::uplo upplo, onemkl::transpose trans, onemkl::diag diag, - int64_t n, cl::sycl::buffer, 1> &a, - cl::sycl::buffer, 1> &x, int64_t incx) { + std::int64_t n, cl::sycl::buffer, 1> &a, + cl::sycl::buffer, 1> &x, std::int64_t incx) { mkl::gpu::ztpmv(queue, mkl::cblas_convert(upplo), mkl::cblas_convert(trans), mkl::cblas_convert(diag), n, a, x, incx); } void tpsv(cl::sycl::queue &queue, onemkl::uplo upplo, onemkl::transpose trans, onemkl::diag diag, - int64_t n, cl::sycl::buffer &a, cl::sycl::buffer &x, int64_t incx) { + std::int64_t n, cl::sycl::buffer &a, cl::sycl::buffer &x, + std::int64_t incx) { mkl::gpu::stpsv(queue, mkl::cblas_convert(upplo), mkl::cblas_convert(trans), mkl::cblas_convert(diag), n, a, x, incx); } void tpsv(cl::sycl::queue &queue, onemkl::uplo upplo, onemkl::transpose trans, onemkl::diag diag, - int64_t n, cl::sycl::buffer &a, cl::sycl::buffer &x, int64_t incx) { + std::int64_t n, cl::sycl::buffer &a, cl::sycl::buffer &x, + std::int64_t incx) { mkl::gpu::dtpsv(queue, mkl::cblas_convert(upplo), mkl::cblas_convert(trans), mkl::cblas_convert(diag), n, a, x, incx); } void tpsv(cl::sycl::queue &queue, onemkl::uplo upplo, onemkl::transpose trans, onemkl::diag diag, - int64_t n, cl::sycl::buffer, 1> &a, - cl::sycl::buffer, 1> &x, int64_t incx) { + std::int64_t n, cl::sycl::buffer, 1> &a, + cl::sycl::buffer, 1> &x, std::int64_t incx) { mkl::gpu::ctpsv(queue, mkl::cblas_convert(upplo), mkl::cblas_convert(trans), mkl::cblas_convert(diag), n, a, x, incx); } void tpsv(cl::sycl::queue &queue, onemkl::uplo upplo, onemkl::transpose trans, onemkl::diag diag, - int64_t n, cl::sycl::buffer, 1> &a, - cl::sycl::buffer, 1> &x, int64_t incx) { + std::int64_t n, cl::sycl::buffer, 1> &a, + cl::sycl::buffer, 
1> &x, std::int64_t incx) { mkl::gpu::ztpsv(queue, mkl::cblas_convert(upplo), mkl::cblas_convert(trans), mkl::cblas_convert(diag), n, a, x, incx); } void trmv(cl::sycl::queue &queue, onemkl::uplo upplo, onemkl::transpose trans, onemkl::diag diag, - int64_t n, cl::sycl::buffer &a, int64_t lda, cl::sycl::buffer &x, - int64_t incx) { + std::int64_t n, cl::sycl::buffer &a, std::int64_t lda, + cl::sycl::buffer &x, std::int64_t incx) { mkl::gpu::strmv(queue, mkl::cblas_convert(upplo), mkl::cblas_convert(trans), mkl::cblas_convert(diag), n, a, lda, x, incx); } void trmv(cl::sycl::queue &queue, onemkl::uplo upplo, onemkl::transpose trans, onemkl::diag diag, - int64_t n, cl::sycl::buffer &a, int64_t lda, cl::sycl::buffer &x, - int64_t incx) { + std::int64_t n, cl::sycl::buffer &a, std::int64_t lda, + cl::sycl::buffer &x, std::int64_t incx) { mkl::gpu::dtrmv(queue, mkl::cblas_convert(upplo), mkl::cblas_convert(trans), mkl::cblas_convert(diag), n, a, lda, x, incx); } void trmv(cl::sycl::queue &queue, onemkl::uplo upplo, onemkl::transpose trans, onemkl::diag diag, - int64_t n, cl::sycl::buffer, 1> &a, int64_t lda, - cl::sycl::buffer, 1> &x, int64_t incx) { + std::int64_t n, cl::sycl::buffer, 1> &a, std::int64_t lda, + cl::sycl::buffer, 1> &x, std::int64_t incx) { mkl::gpu::ctrmv(queue, mkl::cblas_convert(upplo), mkl::cblas_convert(trans), mkl::cblas_convert(diag), n, a, lda, x, incx); } void trmv(cl::sycl::queue &queue, onemkl::uplo upplo, onemkl::transpose trans, onemkl::diag diag, - int64_t n, cl::sycl::buffer, 1> &a, int64_t lda, - cl::sycl::buffer, 1> &x, int64_t incx) { + std::int64_t n, cl::sycl::buffer, 1> &a, std::int64_t lda, + cl::sycl::buffer, 1> &x, std::int64_t incx) { mkl::gpu::ztrmv(queue, mkl::cblas_convert(upplo), mkl::cblas_convert(trans), mkl::cblas_convert(diag), n, a, lda, x, incx); } void trsv(cl::sycl::queue &queue, onemkl::uplo upplo, onemkl::transpose trans, onemkl::diag diag, - int64_t n, cl::sycl::buffer &a, int64_t lda, cl::sycl::buffer &x, - 
int64_t incx) { + std::int64_t n, cl::sycl::buffer &a, std::int64_t lda, + cl::sycl::buffer &x, std::int64_t incx) { mkl::gpu::strsv(queue, mkl::cblas_convert(upplo), mkl::cblas_convert(trans), mkl::cblas_convert(diag), n, a, lda, x, incx); } void trsv(cl::sycl::queue &queue, onemkl::uplo upplo, onemkl::transpose trans, onemkl::diag diag, - int64_t n, cl::sycl::buffer &a, int64_t lda, cl::sycl::buffer &x, - int64_t incx) { + std::int64_t n, cl::sycl::buffer &a, std::int64_t lda, + cl::sycl::buffer &x, std::int64_t incx) { mkl::gpu::dtrsv(queue, mkl::cblas_convert(upplo), mkl::cblas_convert(trans), mkl::cblas_convert(diag), n, a, lda, x, incx); } void trsv(cl::sycl::queue &queue, onemkl::uplo upplo, onemkl::transpose trans, onemkl::diag diag, - int64_t n, cl::sycl::buffer, 1> &a, int64_t lda, - cl::sycl::buffer, 1> &x, int64_t incx) { + std::int64_t n, cl::sycl::buffer, 1> &a, std::int64_t lda, + cl::sycl::buffer, 1> &x, std::int64_t incx) { mkl::gpu::ctrsv(queue, mkl::cblas_convert(upplo), mkl::cblas_convert(trans), mkl::cblas_convert(diag), n, a, lda, x, incx); } void trsv(cl::sycl::queue &queue, onemkl::uplo upplo, onemkl::transpose trans, onemkl::diag diag, - int64_t n, cl::sycl::buffer, 1> &a, int64_t lda, - cl::sycl::buffer, 1> &x, int64_t incx) { + std::int64_t n, cl::sycl::buffer, 1> &a, std::int64_t lda, + cl::sycl::buffer, 1> &x, std::int64_t incx) { mkl::gpu::ztrsv(queue, mkl::cblas_convert(upplo), mkl::cblas_convert(trans), mkl::cblas_convert(diag), n, a, lda, x, incx); } -void asum(cl::sycl::queue &queue, int64_t n, cl::sycl::buffer, 1> &x, - int64_t incx, cl::sycl::buffer &result) { +void asum(cl::sycl::queue &queue, std::int64_t n, cl::sycl::buffer, 1> &x, + std::int64_t incx, cl::sycl::buffer &result) { mkl::gpu::scasum(queue, n, x, incx, result); } -void asum(cl::sycl::queue &queue, int64_t n, cl::sycl::buffer, 1> &x, - int64_t incx, cl::sycl::buffer &result) { +void asum(cl::sycl::queue &queue, std::int64_t n, cl::sycl::buffer, 1> &x, + 
std::int64_t incx, cl::sycl::buffer &result) { mkl::gpu::dzasum(queue, n, x, incx, result); } -void asum(cl::sycl::queue &queue, int64_t n, cl::sycl::buffer &x, int64_t incx, +void asum(cl::sycl::queue &queue, std::int64_t n, cl::sycl::buffer &x, std::int64_t incx, cl::sycl::buffer &result) { mkl::gpu::sasum(queue, n, x, incx, result); } -void asum(cl::sycl::queue &queue, int64_t n, cl::sycl::buffer &x, int64_t incx, +void asum(cl::sycl::queue &queue, std::int64_t n, cl::sycl::buffer &x, std::int64_t incx, cl::sycl::buffer &result) { mkl::gpu::dasum(queue, n, x, incx, result); } -void axpy(cl::sycl::queue &queue, int64_t n, float alpha, cl::sycl::buffer &x, - int64_t incx, cl::sycl::buffer &y, int64_t incy) { +void axpy(cl::sycl::queue &queue, std::int64_t n, float alpha, cl::sycl::buffer &x, + std::int64_t incx, cl::sycl::buffer &y, std::int64_t incy) { mkl::gpu::saxpy(queue, n, alpha, x, incx, y, incy); } -void axpy(cl::sycl::queue &queue, int64_t n, double alpha, cl::sycl::buffer &x, - int64_t incx, cl::sycl::buffer &y, int64_t incy) { +void axpy(cl::sycl::queue &queue, std::int64_t n, double alpha, cl::sycl::buffer &x, + std::int64_t incx, cl::sycl::buffer &y, std::int64_t incy) { mkl::gpu::daxpy(queue, n, alpha, x, incx, y, incy); } -void axpy(cl::sycl::queue &queue, int64_t n, std::complex alpha, - cl::sycl::buffer, 1> &x, int64_t incx, - cl::sycl::buffer, 1> &y, int64_t incy) { +void axpy(cl::sycl::queue &queue, std::int64_t n, std::complex alpha, + cl::sycl::buffer, 1> &x, std::int64_t incx, + cl::sycl::buffer, 1> &y, std::int64_t incy) { mkl::gpu::caxpy(queue, n, alpha, x, incx, y, incy); } -void axpy(cl::sycl::queue &queue, int64_t n, std::complex alpha, - cl::sycl::buffer, 1> &x, int64_t incx, - cl::sycl::buffer, 1> &y, int64_t incy) { +void axpy(cl::sycl::queue &queue, std::int64_t n, std::complex alpha, + cl::sycl::buffer, 1> &x, std::int64_t incx, + cl::sycl::buffer, 1> &y, std::int64_t incy) { mkl::gpu::zaxpy(queue, n, alpha, x, incx, y, incy); } 
-void copy(cl::sycl::queue &queue, int64_t n, cl::sycl::buffer &x, int64_t incx, - cl::sycl::buffer &y, int64_t incy) { +void copy(cl::sycl::queue &queue, std::int64_t n, cl::sycl::buffer &x, std::int64_t incx, + cl::sycl::buffer &y, std::int64_t incy) { mkl::gpu::scopy(queue, n, x, incx, y, incy); } -void copy(cl::sycl::queue &queue, int64_t n, cl::sycl::buffer &x, int64_t incx, - cl::sycl::buffer &y, int64_t incy) { +void copy(cl::sycl::queue &queue, std::int64_t n, cl::sycl::buffer &x, std::int64_t incx, + cl::sycl::buffer &y, std::int64_t incy) { mkl::gpu::dcopy(queue, n, x, incx, y, incy); } -void copy(cl::sycl::queue &queue, int64_t n, cl::sycl::buffer, 1> &x, - int64_t incx, cl::sycl::buffer, 1> &y, int64_t incy) { +void copy(cl::sycl::queue &queue, std::int64_t n, cl::sycl::buffer, 1> &x, + std::int64_t incx, cl::sycl::buffer, 1> &y, std::int64_t incy) { mkl::gpu::ccopy(queue, n, x, incx, y, incy); } -void copy(cl::sycl::queue &queue, int64_t n, cl::sycl::buffer, 1> &x, - int64_t incx, cl::sycl::buffer, 1> &y, int64_t incy) { +void copy(cl::sycl::queue &queue, std::int64_t n, cl::sycl::buffer, 1> &x, + std::int64_t incx, cl::sycl::buffer, 1> &y, std::int64_t incy) { mkl::gpu::zcopy(queue, n, x, incx, y, incy); } -void dot(cl::sycl::queue &queue, int64_t n, cl::sycl::buffer &x, int64_t incx, - cl::sycl::buffer &y, int64_t incy, cl::sycl::buffer &result) { +void dot(cl::sycl::queue &queue, std::int64_t n, cl::sycl::buffer &x, std::int64_t incx, + cl::sycl::buffer &y, std::int64_t incy, cl::sycl::buffer &result) { mkl::gpu::sdot(queue, n, x, incx, y, incy, result); } -void dot(cl::sycl::queue &queue, int64_t n, cl::sycl::buffer &x, int64_t incx, - cl::sycl::buffer &y, int64_t incy, cl::sycl::buffer &result) { +void dot(cl::sycl::queue &queue, std::int64_t n, cl::sycl::buffer &x, std::int64_t incx, + cl::sycl::buffer &y, std::int64_t incy, cl::sycl::buffer &result) { mkl::gpu::ddot(queue, n, x, incx, y, incy, result); } -void sdsdot(cl::sycl::queue &queue, 
int64_t n, float sb, cl::sycl::buffer &x, - int64_t incx, cl::sycl::buffer &y, int64_t incy, +void sdsdot(cl::sycl::queue &queue, std::int64_t n, float sb, cl::sycl::buffer &x, + std::int64_t incx, cl::sycl::buffer &y, std::int64_t incy, cl::sycl::buffer &result) { mkl::gpu::sdsdot(queue, n, sb, x, incx, y, incy, result); } -void dot(cl::sycl::queue &queue, int64_t n, cl::sycl::buffer &x, int64_t incx, - cl::sycl::buffer &y, int64_t incy, cl::sycl::buffer &result) { +void dot(cl::sycl::queue &queue, std::int64_t n, cl::sycl::buffer &x, std::int64_t incx, + cl::sycl::buffer &y, std::int64_t incy, cl::sycl::buffer &result) { mkl::gpu::dsdot(queue, n, x, incx, y, incy, result); } -void dotc(cl::sycl::queue &queue, int64_t n, cl::sycl::buffer, 1> &x, - int64_t incx, cl::sycl::buffer, 1> &y, int64_t incy, +void dotc(cl::sycl::queue &queue, std::int64_t n, cl::sycl::buffer, 1> &x, + std::int64_t incx, cl::sycl::buffer, 1> &y, std::int64_t incy, cl::sycl::buffer, 1> &result) { mkl::gpu::cdotc(queue, n, x, incx, y, incy, result); } -void dotc(cl::sycl::queue &queue, int64_t n, cl::sycl::buffer, 1> &x, - int64_t incx, cl::sycl::buffer, 1> &y, int64_t incy, +void dotc(cl::sycl::queue &queue, std::int64_t n, cl::sycl::buffer, 1> &x, + std::int64_t incx, cl::sycl::buffer, 1> &y, std::int64_t incy, cl::sycl::buffer, 1> &result) { mkl::gpu::zdotc(queue, n, x, incx, y, incy, result); } -void dotu(cl::sycl::queue &queue, int64_t n, cl::sycl::buffer, 1> &x, - int64_t incx, cl::sycl::buffer, 1> &y, int64_t incy, +void dotu(cl::sycl::queue &queue, std::int64_t n, cl::sycl::buffer, 1> &x, + std::int64_t incx, cl::sycl::buffer, 1> &y, std::int64_t incy, cl::sycl::buffer, 1> &result) { mkl::gpu::cdotu(queue, n, x, incx, y, incy, result); } -void dotu(cl::sycl::queue &queue, int64_t n, cl::sycl::buffer, 1> &x, - int64_t incx, cl::sycl::buffer, 1> &y, int64_t incy, +void dotu(cl::sycl::queue &queue, std::int64_t n, cl::sycl::buffer, 1> &x, + std::int64_t incx, cl::sycl::buffer, 1> &y, 
std::int64_t incy, cl::sycl::buffer, 1> &result) { mkl::gpu::zdotu(queue, n, x, incx, y, incy, result); } -void nrm2(cl::sycl::queue &queue, int64_t n, cl::sycl::buffer, 1> &x, - int64_t incx, cl::sycl::buffer &result) { +void nrm2(cl::sycl::queue &queue, std::int64_t n, cl::sycl::buffer, 1> &x, + std::int64_t incx, cl::sycl::buffer &result) { mkl::gpu::scnrm2(queue, n, x, incx, result); } -void nrm2(cl::sycl::queue &queue, int64_t n, cl::sycl::buffer, 1> &x, - int64_t incx, cl::sycl::buffer &result) { +void nrm2(cl::sycl::queue &queue, std::int64_t n, cl::sycl::buffer, 1> &x, + std::int64_t incx, cl::sycl::buffer &result) { mkl::gpu::dznrm2(queue, n, x, incx, result); } -void nrm2(cl::sycl::queue &queue, int64_t n, cl::sycl::buffer &x, int64_t incx, +void nrm2(cl::sycl::queue &queue, std::int64_t n, cl::sycl::buffer &x, std::int64_t incx, cl::sycl::buffer &result) { mkl::gpu::snrm2(queue, n, x, incx, result); } -void nrm2(cl::sycl::queue &queue, int64_t n, cl::sycl::buffer &x, int64_t incx, +void nrm2(cl::sycl::queue &queue, std::int64_t n, cl::sycl::buffer &x, std::int64_t incx, cl::sycl::buffer &result) { mkl::gpu::dnrm2(queue, n, x, incx, result); } -void rot(cl::sycl::queue &queue, int64_t n, cl::sycl::buffer, 1> &x, - int64_t incx, cl::sycl::buffer, 1> &y, int64_t incy, float c, +void rot(cl::sycl::queue &queue, std::int64_t n, cl::sycl::buffer, 1> &x, + std::int64_t incx, cl::sycl::buffer, 1> &y, std::int64_t incy, float c, float s) { mkl::gpu::csrot(queue, n, x, incx, y, incy, c, s); } -void rot(cl::sycl::queue &queue, int64_t n, cl::sycl::buffer, 1> &x, - int64_t incx, cl::sycl::buffer, 1> &y, int64_t incy, double c, - double s) { +void rot(cl::sycl::queue &queue, std::int64_t n, cl::sycl::buffer, 1> &x, + std::int64_t incx, cl::sycl::buffer, 1> &y, std::int64_t incy, + double c, double s) { mkl::gpu::zdrot(queue, n, x, incx, y, incy, c, s); } -void rot(cl::sycl::queue &queue, int64_t n, cl::sycl::buffer &x, int64_t incx, - cl::sycl::buffer &y, int64_t 
incy, float c, float s) { +void rot(cl::sycl::queue &queue, std::int64_t n, cl::sycl::buffer &x, std::int64_t incx, + cl::sycl::buffer &y, std::int64_t incy, float c, float s) { mkl::gpu::srot(queue, n, x, incx, y, incy, c, s); } -void rot(cl::sycl::queue &queue, int64_t n, cl::sycl::buffer &x, int64_t incx, - cl::sycl::buffer &y, int64_t incy, double c, double s) { +void rot(cl::sycl::queue &queue, std::int64_t n, cl::sycl::buffer &x, std::int64_t incx, + cl::sycl::buffer &y, std::int64_t incy, double c, double s) { mkl::gpu::drot(queue, n, x, incx, y, incy, c, s); } @@ -879,13 +907,13 @@ void rotg(cl::sycl::queue &queue, cl::sycl::buffer, 1> &a, mkl::gpu::zrotg(queue, a, b, c, s); } -void rotm(cl::sycl::queue &queue, int64_t n, cl::sycl::buffer &x, int64_t incx, - cl::sycl::buffer &y, int64_t incy, cl::sycl::buffer ¶m) { +void rotm(cl::sycl::queue &queue, std::int64_t n, cl::sycl::buffer &x, std::int64_t incx, + cl::sycl::buffer &y, std::int64_t incy, cl::sycl::buffer ¶m) { mkl::gpu::srotm(queue, n, x, incx, y, incy, param); } -void rotm(cl::sycl::queue &queue, int64_t n, cl::sycl::buffer &x, int64_t incx, - cl::sycl::buffer &y, int64_t incy, cl::sycl::buffer ¶m) { +void rotm(cl::sycl::queue &queue, std::int64_t n, cl::sycl::buffer &x, std::int64_t incx, + cl::sycl::buffer &y, std::int64_t incy, cl::sycl::buffer ¶m) { mkl::gpu::drotm(queue, n, x, incx, y, incy, param); } @@ -899,245 +927,96 @@ void rotmg(cl::sycl::queue &queue, cl::sycl::buffer &d1, cl::sycl::bu mkl::gpu::drotmg(queue, d1, d2, x1, y1, param); } -void scal(cl::sycl::queue &queue, int64_t n, float alpha, cl::sycl::buffer &x, - int64_t incx) { +void scal(cl::sycl::queue &queue, std::int64_t n, float alpha, cl::sycl::buffer &x, + std::int64_t incx) { mkl::gpu::sscal(queue, n, alpha, x, incx); } -void scal(cl::sycl::queue &queue, int64_t n, double alpha, cl::sycl::buffer &x, - int64_t incx) { +void scal(cl::sycl::queue &queue, std::int64_t n, double alpha, cl::sycl::buffer &x, + std::int64_t incx) { 
mkl::gpu::dscal(queue, n, alpha, x, incx); } -void scal(cl::sycl::queue &queue, int64_t n, std::complex alpha, - cl::sycl::buffer, 1> &x, int64_t incx) { +void scal(cl::sycl::queue &queue, std::int64_t n, std::complex alpha, + cl::sycl::buffer, 1> &x, std::int64_t incx) { mkl::gpu::cscal(queue, n, alpha, x, incx); } -void scal(cl::sycl::queue &queue, int64_t n, std::complex alpha, - cl::sycl::buffer, 1> &x, int64_t incx) { +void scal(cl::sycl::queue &queue, std::int64_t n, std::complex alpha, + cl::sycl::buffer, 1> &x, std::int64_t incx) { mkl::gpu::zscal(queue, n, alpha, x, incx); } -void scal(cl::sycl::queue &queue, int64_t n, float alpha, - cl::sycl::buffer, 1> &x, int64_t incx) { +void scal(cl::sycl::queue &queue, std::int64_t n, float alpha, + cl::sycl::buffer, 1> &x, std::int64_t incx) { mkl::gpu::csscal(queue, n, alpha, x, incx); } -void scal(cl::sycl::queue &queue, int64_t n, double alpha, - cl::sycl::buffer, 1> &x, int64_t incx) { +void scal(cl::sycl::queue &queue, std::int64_t n, double alpha, + cl::sycl::buffer, 1> &x, std::int64_t incx) { mkl::gpu::zdscal(queue, n, alpha, x, incx); } -void swap(cl::sycl::queue &queue, int64_t n, cl::sycl::buffer &x, int64_t incx, - cl::sycl::buffer &y, int64_t incy) { +void swap(cl::sycl::queue &queue, std::int64_t n, cl::sycl::buffer &x, std::int64_t incx, + cl::sycl::buffer &y, std::int64_t incy) { mkl::gpu::sswap(queue, n, x, incx, y, incy); } -void swap(cl::sycl::queue &queue, int64_t n, cl::sycl::buffer &x, int64_t incx, - cl::sycl::buffer &y, int64_t incy) { +void swap(cl::sycl::queue &queue, std::int64_t n, cl::sycl::buffer &x, std::int64_t incx, + cl::sycl::buffer &y, std::int64_t incy) { mkl::gpu::dswap(queue, n, x, incx, y, incy); } -void swap(cl::sycl::queue &queue, int64_t n, cl::sycl::buffer, 1> &x, - int64_t incx, cl::sycl::buffer, 1> &y, int64_t incy) { +void swap(cl::sycl::queue &queue, std::int64_t n, cl::sycl::buffer, 1> &x, + std::int64_t incx, cl::sycl::buffer, 1> &y, std::int64_t incy) { 
mkl::gpu::cswap(queue, n, x, incx, y, incy); } -void swap(cl::sycl::queue &queue, int64_t n, cl::sycl::buffer, 1> &x, - int64_t incx, cl::sycl::buffer, 1> &y, int64_t incy) { +void swap(cl::sycl::queue &queue, std::int64_t n, cl::sycl::buffer, 1> &x, + std::int64_t incx, cl::sycl::buffer, 1> &y, std::int64_t incy) { mkl::gpu::zswap(queue, n, x, incx, y, incy); } -void iamax(cl::sycl::queue &queue, int64_t n, cl::sycl::buffer &x, int64_t incx, - cl::sycl::buffer &result) { +void iamax(cl::sycl::queue &queue, std::int64_t n, cl::sycl::buffer &x, std::int64_t incx, + cl::sycl::buffer &result) { mkl::gpu::isamax(queue, n, x, incx, result); } -void iamax(cl::sycl::queue &queue, int64_t n, cl::sycl::buffer &x, int64_t incx, - cl::sycl::buffer &result) { +void iamax(cl::sycl::queue &queue, std::int64_t n, cl::sycl::buffer &x, + std::int64_t incx, cl::sycl::buffer &result) { mkl::gpu::idamax(queue, n, x, incx, result); } -void iamax(cl::sycl::queue &queue, int64_t n, cl::sycl::buffer, 1> &x, - int64_t incx, cl::sycl::buffer &result) { +void iamax(cl::sycl::queue &queue, std::int64_t n, cl::sycl::buffer, 1> &x, + std::int64_t incx, cl::sycl::buffer &result) { mkl::gpu::icamax(queue, n, x, incx, result); } -void iamax(cl::sycl::queue &queue, int64_t n, cl::sycl::buffer, 1> &x, - int64_t incx, cl::sycl::buffer &result) { +void iamax(cl::sycl::queue &queue, std::int64_t n, cl::sycl::buffer, 1> &x, + std::int64_t incx, cl::sycl::buffer &result) { mkl::gpu::izamax(queue, n, x, incx, result); } -void iamin(cl::sycl::queue &queue, int64_t n, cl::sycl::buffer &x, int64_t incx, - cl::sycl::buffer &result) { +void iamin(cl::sycl::queue &queue, std::int64_t n, cl::sycl::buffer &x, std::int64_t incx, + cl::sycl::buffer &result) { mkl::gpu::isamin(queue, n, x, incx, result); } -void iamin(cl::sycl::queue &queue, int64_t n, cl::sycl::buffer &x, int64_t incx, - cl::sycl::buffer &result) { +void iamin(cl::sycl::queue &queue, std::int64_t n, cl::sycl::buffer &x, + std::int64_t incx, 
cl::sycl::buffer &result) { mkl::gpu::idamin(queue, n, x, incx, result); } -void iamin(cl::sycl::queue &queue, int64_t n, cl::sycl::buffer, 1> &x, - int64_t incx, cl::sycl::buffer &result) { +void iamin(cl::sycl::queue &queue, std::int64_t n, cl::sycl::buffer, 1> &x, + std::int64_t incx, cl::sycl::buffer &result) { mkl::gpu::icamin(queue, n, x, incx, result); } -void iamin(cl::sycl::queue &queue, int64_t n, cl::sycl::buffer, 1> &x, - int64_t incx, cl::sycl::buffer &result) { +void iamin(cl::sycl::queue &queue, std::int64_t n, cl::sycl::buffer, 1> &x, + std::int64_t incx, cl::sycl::buffer &result) { mkl::gpu::izamin(queue, n, x, incx, result); } -void gemm_batch(cl::sycl::queue &queue, cl::sycl::buffer &transa, - cl::sycl::buffer &transb, - cl::sycl::buffer &m, cl::sycl::buffer &n, - cl::sycl::buffer &k, cl::sycl::buffer &alpha, - cl::sycl::buffer &a, cl::sycl::buffer &lda, - cl::sycl::buffer &b, cl::sycl::buffer &ldb, - cl::sycl::buffer &beta, cl::sycl::buffer &c, - cl::sycl::buffer &ldc, std::int64_t group_count, - cl::sycl::buffer &group_size) { - auto transa_acc = transa.get_access(); - auto transb_acc = transb.get_access(); - auto m_acc = m.get_access(); - auto n_acc = n.get_access(); - auto k_acc = k.get_access(); - auto alpha_acc = alpha.get_access(); - auto lda_acc = lda.get_access(); - auto ldb_acc = ldb.get_access(); - auto beta_acc = beta.get_access(); - auto ldc_acc = ldc.get_access(); - auto group_size_acc = group_size.get_access(); - int64_t stride_a, stride_b, stride_c, off_a = 0, off_b = 0, off_c = 0; - for (int64_t i = 0; i < group_count; i++) { - stride_a = - (transa_acc[i] == transpose::nontrans) ? lda_acc[i] * k_acc[i] : lda_acc[i] * m_acc[i]; - stride_b = - (transb_acc[i] == transpose::nontrans) ? 
ldb_acc[i] * n_acc[i] : ldb_acc[i] * k_acc[i]; - stride_c = ldc_acc[i] * n_acc[i]; - mkl::gpu::sgemm_batch( - queue, mkl::cblas_convert(transa_acc[i]), mkl::cblas_convert(transb_acc[i]), m_acc[i], - n_acc[i], k_acc[i], alpha_acc[i], a, lda_acc[i], stride_a, b, ldb_acc[i], stride_b, - beta_acc[i], c, ldc_acc[i], stride_c, group_size_acc[i], off_a, off_b, off_c); - off_a += stride_a * group_size_acc[i]; - off_b += stride_b * group_size_acc[i]; - off_c += stride_c * group_size_acc[i]; - } -} - -void gemm_batch(cl::sycl::queue &queue, cl::sycl::buffer &transa, - cl::sycl::buffer &transb, - cl::sycl::buffer &m, cl::sycl::buffer &n, - cl::sycl::buffer &k, cl::sycl::buffer &alpha, - cl::sycl::buffer &a, cl::sycl::buffer &lda, - cl::sycl::buffer &b, cl::sycl::buffer &ldb, - cl::sycl::buffer &beta, cl::sycl::buffer &c, - cl::sycl::buffer &ldc, std::int64_t group_count, - cl::sycl::buffer &group_size) { - auto transa_acc = transa.get_access(); - auto transb_acc = transb.get_access(); - auto m_acc = m.get_access(); - auto n_acc = n.get_access(); - auto k_acc = k.get_access(); - auto alpha_acc = alpha.get_access(); - auto lda_acc = lda.get_access(); - auto ldb_acc = ldb.get_access(); - auto beta_acc = beta.get_access(); - auto ldc_acc = ldc.get_access(); - auto group_size_acc = group_size.get_access(); - int64_t stride_a, stride_b, stride_c, off_a = 0, off_b = 0, off_c = 0; - for (int64_t i = 0; i < group_count; i++) { - stride_a = - (transa_acc[i] == transpose::nontrans) ? lda_acc[i] * k_acc[i] : lda_acc[i] * m_acc[i]; - stride_b = - (transb_acc[i] == transpose::nontrans) ? 
ldb_acc[i] * n_acc[i] : ldb_acc[i] * k_acc[i]; - stride_c = ldc_acc[i] * n_acc[i]; - mkl::gpu::dgemm_batch( - queue, mkl::cblas_convert(transa_acc[i]), mkl::cblas_convert(transb_acc[i]), m_acc[i], - n_acc[i], k_acc[i], alpha_acc[i], a, lda_acc[i], stride_a, b, ldb_acc[i], stride_b, - beta_acc[i], c, ldc_acc[i], stride_c, group_size_acc[i], off_a, off_b, off_c); - off_a += stride_a * group_size_acc[i]; - off_b += stride_b * group_size_acc[i]; - off_c += stride_c * group_size_acc[i]; - } -} - -void gemm_batch(cl::sycl::queue &queue, cl::sycl::buffer &transa, - cl::sycl::buffer &transb, - cl::sycl::buffer &m, cl::sycl::buffer &n, - cl::sycl::buffer &k, - cl::sycl::buffer, 1> &alpha, - cl::sycl::buffer, 1> &a, cl::sycl::buffer &lda, - cl::sycl::buffer, 1> &b, cl::sycl::buffer &ldb, - cl::sycl::buffer, 1> &beta, - cl::sycl::buffer, 1> &c, cl::sycl::buffer &ldc, - std::int64_t group_count, cl::sycl::buffer &group_size) { - auto transa_acc = transa.get_access(); - auto transb_acc = transb.get_access(); - auto m_acc = m.get_access(); - auto n_acc = n.get_access(); - auto k_acc = k.get_access(); - auto alpha_acc = alpha.get_access(); - auto lda_acc = lda.get_access(); - auto ldb_acc = ldb.get_access(); - auto beta_acc = beta.get_access(); - auto ldc_acc = ldc.get_access(); - auto group_size_acc = group_size.get_access(); - int64_t stride_a, stride_b, stride_c, off_a = 0, off_b = 0, off_c = 0; - for (int64_t i = 0; i < group_count; i++) { - stride_a = - (transa_acc[i] == transpose::nontrans) ? lda_acc[i] * k_acc[i] : lda_acc[i] * m_acc[i]; - stride_b = - (transb_acc[i] == transpose::nontrans) ? 
ldb_acc[i] * n_acc[i] : ldb_acc[i] * k_acc[i]; - stride_c = ldc_acc[i] * n_acc[i]; - mkl::gpu::cgemm_batch( - queue, mkl::cblas_convert(transa_acc[i]), mkl::cblas_convert(transb_acc[i]), m_acc[i], - n_acc[i], k_acc[i], alpha_acc[i], a, lda_acc[i], stride_a, b, ldb_acc[i], stride_b, - beta_acc[i], c, ldc_acc[i], stride_c, group_size_acc[i], off_a, off_b, off_c); - off_a += stride_a * group_size_acc[i]; - off_b += stride_b * group_size_acc[i]; - off_c += stride_c * group_size_acc[i]; - } -} - -void gemm_batch( - cl::sycl::queue &queue, cl::sycl::buffer &transa, - cl::sycl::buffer &transb, cl::sycl::buffer &m, - cl::sycl::buffer &n, cl::sycl::buffer &k, - cl::sycl::buffer, 1> &alpha, cl::sycl::buffer, 1> &a, - cl::sycl::buffer &lda, cl::sycl::buffer, 1> &b, - cl::sycl::buffer &ldb, cl::sycl::buffer, 1> &beta, - cl::sycl::buffer, 1> &c, cl::sycl::buffer &ldc, - std::int64_t group_count, cl::sycl::buffer &group_size) { - auto transa_acc = transa.get_access(); - auto transb_acc = transb.get_access(); - auto m_acc = m.get_access(); - auto n_acc = n.get_access(); - auto k_acc = k.get_access(); - auto alpha_acc = alpha.get_access(); - auto lda_acc = lda.get_access(); - auto ldb_acc = ldb.get_access(); - auto beta_acc = beta.get_access(); - auto ldc_acc = ldc.get_access(); - auto group_size_acc = group_size.get_access(); - int64_t stride_a, stride_b, stride_c, off_a = 0, off_b = 0, off_c = 0; - for (int64_t i = 0; i < group_count; i++) { - stride_a = - (transa_acc[i] == transpose::nontrans) ? lda_acc[i] * k_acc[i] : lda_acc[i] * m_acc[i]; - stride_b = - (transb_acc[i] == transpose::nontrans) ? 
ldb_acc[i] * n_acc[i] : ldb_acc[i] * k_acc[i]; - stride_c = ldc_acc[i] * n_acc[i]; - mkl::gpu::zgemm_batch( - queue, mkl::cblas_convert(transa_acc[i]), mkl::cblas_convert(transb_acc[i]), m_acc[i], - n_acc[i], k_acc[i], alpha_acc[i], a, lda_acc[i], stride_a, b, ldb_acc[i], stride_b, - beta_acc[i], c, ldc_acc[i], stride_c, group_size_acc[i], off_a, off_b, off_c); - off_a += stride_a * group_size_acc[i]; - off_b += stride_b * group_size_acc[i]; - off_c += stride_c * group_size_acc[i]; - } -} - void gemm_batch(cl::sycl::queue &queue, onemkl::transpose transa, onemkl::transpose transb, std::int64_t m, std::int64_t n, std::int64_t k, float alpha, cl::sycl::buffer &a, std::int64_t lda, std::int64_t stride_a, @@ -1184,135 +1063,6 @@ void gemm_batch(cl::sycl::queue &queue, onemkl::transpose transa, onemkl::transp batch_size); } -void trsm_batch(cl::sycl::queue &queue, cl::sycl::buffer &left_right, - cl::sycl::buffer &upper_lower, - cl::sycl::buffer &trans, - cl::sycl::buffer &unit_diag, cl::sycl::buffer &m, - cl::sycl::buffer &n, cl::sycl::buffer &alpha, - cl::sycl::buffer &a, cl::sycl::buffer &lda, - cl::sycl::buffer &b, cl::sycl::buffer &ldb, - std::int64_t group_count, cl::sycl::buffer &group_size) { - auto side_acc = left_right.get_access(); - auto uplo_acc = upper_lower.get_access(); - auto trans_acc = trans.get_access(); - auto diag_acc = unit_diag.get_access(); - auto m_acc = m.get_access(); - auto n_acc = n.get_access(); - auto alpha_acc = alpha.get_access(); - auto lda_acc = lda.get_access(); - auto ldb_acc = ldb.get_access(); - auto group_size_acc = group_size.get_access(); - int64_t stride_a, stride_b, off_a = 0, off_b = 0; - for (int64_t i = 0; i < group_count; i++) { - stride_a = (side_acc[i] == side::left) ? 
lda_acc[i] * m_acc[i] : lda_acc[i] * n_acc[i]; - stride_b = ldb_acc[i] * n_acc[i]; - mkl::gpu::strsm_batch(queue, mkl::cblas_convert(side_acc[i]), - mkl::cblas_convert(uplo_acc[i]), mkl::cblas_convert(trans_acc[i]), - mkl::cblas_convert(diag_acc[i]), m_acc[i], n_acc[i], alpha_acc[i], a, - lda_acc[i], stride_a, b, ldb_acc[i], stride_b, group_size_acc[i], - off_a, off_b); - off_a += stride_a * group_size_acc[i]; - off_b += stride_b * group_size_acc[i]; - } -} - -void trsm_batch(cl::sycl::queue &queue, cl::sycl::buffer &left_right, - cl::sycl::buffer &upper_lower, - cl::sycl::buffer &trans, - cl::sycl::buffer &unit_diag, cl::sycl::buffer &m, - cl::sycl::buffer &n, cl::sycl::buffer &alpha, - cl::sycl::buffer &a, cl::sycl::buffer &lda, - cl::sycl::buffer &b, cl::sycl::buffer &ldb, - std::int64_t group_count, cl::sycl::buffer &group_size) { - auto side_acc = left_right.get_access(); - auto uplo_acc = upper_lower.get_access(); - auto trans_acc = trans.get_access(); - auto diag_acc = unit_diag.get_access(); - auto m_acc = m.get_access(); - auto n_acc = n.get_access(); - auto alpha_acc = alpha.get_access(); - auto lda_acc = lda.get_access(); - auto ldb_acc = ldb.get_access(); - auto group_size_acc = group_size.get_access(); - int64_t stride_a, stride_b, off_a = 0, off_b = 0; - for (int64_t i = 0; i < group_count; i++) { - stride_a = (side_acc[i] == side::left) ? 
lda_acc[i] * m_acc[i] : lda_acc[i] * n_acc[i]; - stride_b = ldb_acc[i] * n_acc[i]; - mkl::gpu::dtrsm_batch(queue, mkl::cblas_convert(side_acc[i]), - mkl::cblas_convert(uplo_acc[i]), mkl::cblas_convert(trans_acc[i]), - mkl::cblas_convert(diag_acc[i]), m_acc[i], n_acc[i], alpha_acc[i], a, - lda_acc[i], stride_a, b, ldb_acc[i], stride_b, group_size_acc[i], - off_a, off_b); - off_a += stride_a * group_size_acc[i]; - off_b += stride_b * group_size_acc[i]; - } -} - -void trsm_batch(cl::sycl::queue &queue, cl::sycl::buffer &left_right, - cl::sycl::buffer &upper_lower, - cl::sycl::buffer &trans, - cl::sycl::buffer &unit_diag, cl::sycl::buffer &m, - cl::sycl::buffer &n, - cl::sycl::buffer, 1> &alpha, - cl::sycl::buffer, 1> &a, cl::sycl::buffer &lda, - cl::sycl::buffer, 1> &b, cl::sycl::buffer &ldb, - std::int64_t group_count, cl::sycl::buffer &group_size) { - auto side_acc = left_right.get_access(); - auto uplo_acc = upper_lower.get_access(); - auto trans_acc = trans.get_access(); - auto diag_acc = unit_diag.get_access(); - auto m_acc = m.get_access(); - auto n_acc = n.get_access(); - auto alpha_acc = alpha.get_access(); - auto lda_acc = lda.get_access(); - auto ldb_acc = ldb.get_access(); - auto group_size_acc = group_size.get_access(); - int64_t stride_a, stride_b, off_a = 0, off_b = 0; - for (int64_t i = 0; i < group_count; i++) { - stride_a = (side_acc[i] == side::left) ? 
lda_acc[i] * m_acc[i] : lda_acc[i] * n_acc[i]; - stride_b = ldb_acc[i] * n_acc[i]; - mkl::gpu::ctrsm_batch(queue, mkl::cblas_convert(side_acc[i]), - mkl::cblas_convert(uplo_acc[i]), mkl::cblas_convert(trans_acc[i]), - mkl::cblas_convert(diag_acc[i]), m_acc[i], n_acc[i], alpha_acc[i], a, - lda_acc[i], stride_a, b, ldb_acc[i], stride_b, group_size_acc[i], - off_a, off_b); - off_a += stride_a * group_size_acc[i]; - off_b += stride_b * group_size_acc[i]; - } -} - -void trsm_batch( - cl::sycl::queue &queue, cl::sycl::buffer &left_right, - cl::sycl::buffer &upper_lower, cl::sycl::buffer &trans, - cl::sycl::buffer &unit_diag, cl::sycl::buffer &m, - cl::sycl::buffer &n, cl::sycl::buffer, 1> &alpha, - cl::sycl::buffer, 1> &a, cl::sycl::buffer &lda, - cl::sycl::buffer, 1> &b, cl::sycl::buffer &ldb, - std::int64_t group_count, cl::sycl::buffer &group_size) { - auto side_acc = left_right.get_access(); - auto uplo_acc = upper_lower.get_access(); - auto trans_acc = trans.get_access(); - auto diag_acc = unit_diag.get_access(); - auto m_acc = m.get_access(); - auto n_acc = n.get_access(); - auto alpha_acc = alpha.get_access(); - auto lda_acc = lda.get_access(); - auto ldb_acc = ldb.get_access(); - auto group_size_acc = group_size.get_access(); - int64_t stride_a, stride_b, off_a = 0, off_b = 0; - for (int64_t i = 0; i < group_count; i++) { - stride_a = (side_acc[i] == side::left) ? 
lda_acc[i] * m_acc[i] : lda_acc[i] * n_acc[i]; - stride_b = ldb_acc[i] * n_acc[i]; - mkl::gpu::ztrsm_batch(queue, mkl::cblas_convert(side_acc[i]), - mkl::cblas_convert(uplo_acc[i]), mkl::cblas_convert(trans_acc[i]), - mkl::cblas_convert(diag_acc[i]), m_acc[i], n_acc[i], alpha_acc[i], a, - lda_acc[i], stride_a, b, ldb_acc[i], stride_b, group_size_acc[i], - off_a, off_b); - off_a += stride_a * group_size_acc[i]; - off_b += stride_b * group_size_acc[i]; - } -} - void trsm_batch(cl::sycl::queue &queue, onemkl::side left_right, onemkl::uplo upper_lower, onemkl::transpose trans, onemkl::diag unit_diag, std::int64_t m, std::int64_t n, float alpha, cl::sycl::buffer &a, std::int64_t lda, std::int64_t stride_a, @@ -1356,51 +1106,52 @@ void trsm_batch(cl::sycl::queue &queue, onemkl::side left_right, onemkl::uplo up } void gemmt(cl::sycl::queue &queue, onemkl::uplo upper_lower, onemkl::transpose transa, - onemkl::transpose transb, int64_t n, int64_t k, float alpha, - cl::sycl::buffer &a, int64_t lda, cl::sycl::buffer &b, int64_t ldb, - float beta, cl::sycl::buffer &c, int64_t ldc) { + onemkl::transpose transb, std::int64_t n, std::int64_t k, float alpha, + cl::sycl::buffer &a, std::int64_t lda, cl::sycl::buffer &b, + std::int64_t ldb, float beta, cl::sycl::buffer &c, std::int64_t ldc) { mkl::gpu::sgemmt(queue, mkl::cblas_convert(upper_lower), mkl::cblas_convert(transa), mkl::cblas_convert(transb), n, k, alpha, a, lda, b, ldb, beta, c, ldc); } void gemmt(cl::sycl::queue &queue, onemkl::uplo upper_lower, onemkl::transpose transa, - onemkl::transpose transb, int64_t n, int64_t k, double alpha, - cl::sycl::buffer &a, int64_t lda, cl::sycl::buffer &b, int64_t ldb, - double beta, cl::sycl::buffer &c, int64_t ldc) { + onemkl::transpose transb, std::int64_t n, std::int64_t k, double alpha, + cl::sycl::buffer &a, std::int64_t lda, cl::sycl::buffer &b, + std::int64_t ldb, double beta, cl::sycl::buffer &c, std::int64_t ldc) { mkl::gpu::dgemmt(queue, mkl::cblas_convert(upper_lower), 
mkl::cblas_convert(transa), mkl::cblas_convert(transb), n, k, alpha, a, lda, b, ldb, beta, c, ldc); } void gemmt(cl::sycl::queue &queue, onemkl::uplo upper_lower, onemkl::transpose transa, - onemkl::transpose transb, int64_t n, int64_t k, std::complex alpha, - cl::sycl::buffer, 1> &a, int64_t lda, - cl::sycl::buffer, 1> &b, int64_t ldb, std::complex beta, - cl::sycl::buffer, 1> &c, int64_t ldc) { + onemkl::transpose transb, std::int64_t n, std::int64_t k, std::complex alpha, + cl::sycl::buffer, 1> &a, std::int64_t lda, + cl::sycl::buffer, 1> &b, std::int64_t ldb, + std::complex beta, cl::sycl::buffer, 1> &c, + std::int64_t ldc) { mkl::gpu::zgemmt(queue, mkl::cblas_convert(upper_lower), mkl::cblas_convert(transa), mkl::cblas_convert(transb), n, k, alpha, a, lda, b, ldb, beta, c, ldc); } void gemmt(cl::sycl::queue &queue, onemkl::uplo upper_lower, onemkl::transpose transa, - onemkl::transpose transb, int64_t n, int64_t k, std::complex alpha, - cl::sycl::buffer, 1> &a, int64_t lda, - cl::sycl::buffer, 1> &b, int64_t ldb, std::complex beta, - cl::sycl::buffer, 1> &c, int64_t ldc) { + onemkl::transpose transb, std::int64_t n, std::int64_t k, std::complex alpha, + cl::sycl::buffer, 1> &a, std::int64_t lda, + cl::sycl::buffer, 1> &b, std::int64_t ldb, std::complex beta, + cl::sycl::buffer, 1> &c, std::int64_t ldc) { mkl::gpu::cgemmt(queue, mkl::cblas_convert(upper_lower), mkl::cblas_convert(transa), mkl::cblas_convert(transb), n, k, alpha, a, lda, b, ldb, beta, c, ldc); } -void gemm(cl::sycl::queue &queue, onemkl::transpose transa, onemkl::transpose transb, int64_t m, - int64_t n, int64_t k, half alpha, cl::sycl::buffer &a, int64_t lda, - cl::sycl::buffer &b, int64_t ldb, half beta, cl::sycl::buffer &c, - int64_t ldc) { +void gemm(cl::sycl::queue &queue, onemkl::transpose transa, onemkl::transpose transb, + std::int64_t m, std::int64_t n, std::int64_t k, half alpha, cl::sycl::buffer &a, + std::int64_t lda, cl::sycl::buffer &b, std::int64_t ldb, half beta, + 
cl::sycl::buffer &c, std::int64_t ldc) { mkl::gpu::hgemm(queue, mkl::cblas_convert(transa), mkl::cblas_convert(transb), m, n, k, alpha, a, lda, b, ldb, beta, c, ldc); } -void gemm_ext(cl::sycl::queue &queue, onemkl::transpose transa, onemkl::transpose transb, int64_t m, - int64_t n, int64_t k, float alpha, cl::sycl::buffer &a, int64_t lda, - cl::sycl::buffer &b, int64_t ldb, float beta, cl::sycl::buffer &c, - int64_t ldc) { +void gemm_ext(cl::sycl::queue &queue, onemkl::transpose transa, onemkl::transpose transb, + std::int64_t m, std::int64_t n, std::int64_t k, float alpha, + cl::sycl::buffer &a, std::int64_t lda, cl::sycl::buffer &b, + std::int64_t ldb, float beta, cl::sycl::buffer &c, std::int64_t ldc) { mkl::gpu::gemm_f16f16f32(queue, mkl::cblas_convert(transa), mkl::cblas_convert(transb), m, n, k, alpha, a, lda, b, ldb, beta, c, ldc); } @@ -1459,40 +1210,1457 @@ static inline void copy_mat(T_src &src, int row, int col, int ld, onemkl::offset } void gemm_ext(cl::sycl::queue &queue, onemkl::transpose transa, onemkl::transpose transb, - onemkl::offset offsetc, int64_t m, int64_t n, int64_t k, float alpha, - cl::sycl::buffer &a, int64_t lda, int8_t ao, - cl::sycl::buffer &b, int64_t ldb, uint8_t bo, float beta, - cl::sycl::buffer &c, int64_t ldc, cl::sycl::buffer &co) { + onemkl::offset offsetc, std::int64_t m, std::int64_t n, std::int64_t k, float alpha, + cl::sycl::buffer &a, std::int64_t lda, int8_t ao, + cl::sycl::buffer &b, std::int64_t ldb, uint8_t bo, float beta, + cl::sycl::buffer &c, std::int64_t ldc, cl::sycl::buffer &co) { // DGEMM is used for reference implementation to maximize accuracy. // Optimized implementation for specific architectures will be added in future releases. - int64_t sizea, sizeb, sizec; - sizea = (transa == onemkl::transpose::nontrans) ? lda * k : lda * m; - sizeb = (transb == onemkl::transpose::nontrans) ? 
ldb * n : ldb * k; - sizec = ldc * n; - double *ad = (double *)onemkl::aligned_alloc(64, sizeof(double) * sizea); - double *bd = (double *)onemkl::aligned_alloc(64, sizeof(double) * sizeb); - double *cd = (double *)onemkl::aligned_alloc(64, sizeof(double) * sizec); - double aod = ao; - double bod = bo; - auto acc_a = a.template get_access(); - auto acc_b = b.template get_access(); - auto acc_c = c.template get_access(); - auto acc_co = co.template get_access(); - copy_mat(acc_a, transa, m, k, lda, aod, ad); - copy_mat(acc_b, transb, k, n, ldb, bod, bd); - copy_mat(acc_c, onemkl::transpose::nontrans, m, n, ldc, 0.0, cd); - cl::sycl::buffer A_buf(ad, sizea); - cl::sycl::buffer B_buf(bd, sizeb); - cl::sycl::buffer C_buf(cd, sizec); - mkl::gpu::dgemm(queue, mkl::cblas_convert(transa), mkl::cblas_convert(transb), m, n, k, alpha, - A_buf, lda, B_buf, ldb, beta, C_buf, ldc); - auto acc_cd = C_buf.template get_access(); - copy_mat(acc_cd, m, n, ldc, offsetc, acc_co, acc_c); + std::int64_t sizea, sizeb, sizec; + sizea = (transa == onemkl::transpose::nontrans) ? lda * k : lda * m; + sizeb = (transb == onemkl::transpose::nontrans) ? 
ldb * n : ldb * k; + sizec = ldc * n; + double *ad = (double *)onemkl::aligned_alloc(64, sizeof(double) * sizea); + double *bd = (double *)onemkl::aligned_alloc(64, sizeof(double) * sizeb); + double *cd = (double *)onemkl::aligned_alloc(64, sizeof(double) * sizec); + { + double alphad = alpha; + double betad = beta; + double aod = ao; + double bod = bo; + auto acc_a = a.template get_access(); + auto acc_b = b.template get_access(); + auto acc_c = c.template get_access(); + auto acc_co = co.template get_access(); + copy_mat(acc_a, transa, m, k, lda, aod, ad); + copy_mat(acc_b, transb, k, n, ldb, bod, bd); + copy_mat(acc_c, onemkl::transpose::nontrans, m, n, ldc, 0.0, cd); + cl::sycl::buffer A_buf(ad, sizea); + cl::sycl::buffer B_buf(bd, sizeb); + cl::sycl::buffer C_buf(cd, sizec); + mkl::gpu::dgemm(queue, mkl::cblas_convert(transa), mkl::cblas_convert(transb), m, n, k, + alphad, A_buf, lda, B_buf, ldb, betad, C_buf, ldc); + auto acc_cd = C_buf.template get_access(); + copy_mat(acc_cd, m, n, ldc, offsetc, acc_co, acc_c); + } onemkl::aligned_free(ad); onemkl::aligned_free(bd); onemkl::aligned_free(cd); } +// USM APIs + +cl::sycl::event gemm(cl::sycl::queue &queue, onemkl::transpose transa, onemkl::transpose transb, + std::int64_t m, std::int64_t n, std::int64_t k, float alpha, const float *a, + std::int64_t lda, const float *b, std::int64_t ldb, float beta, float *c, + std::int64_t ldc, + const cl::sycl::vector_class &dependencies) { + return mkl::gpu::sgemm_sycl(&queue, mkl::cblas_convert(transa), mkl::cblas_convert(transb), m, + n, k, alpha, a, lda, b, ldb, beta, c, ldc, dependencies); +} + +cl::sycl::event gemm(cl::sycl::queue &queue, onemkl::transpose transa, onemkl::transpose transb, + std::int64_t m, std::int64_t n, std::int64_t k, double alpha, const double *a, + std::int64_t lda, const double *b, std::int64_t ldb, double beta, double *c, + std::int64_t ldc, + const cl::sycl::vector_class &dependencies) { + return mkl::gpu::dgemm_sycl(&queue, 
mkl::cblas_convert(transa), mkl::cblas_convert(transb), m, + n, k, alpha, a, lda, b, ldb, beta, c, ldc, dependencies); +} + +cl::sycl::event gemm(cl::sycl::queue &queue, onemkl::transpose transa, onemkl::transpose transb, + std::int64_t m, std::int64_t n, std::int64_t k, std::complex alpha, + const std::complex *a, std::int64_t lda, const std::complex *b, + std::int64_t ldb, std::complex beta, std::complex *c, + std::int64_t ldc, + const cl::sycl::vector_class &dependencies) { + return mkl::gpu::cgemm_sycl(&queue, mkl::cblas_convert(transa), mkl::cblas_convert(transb), m, + n, k, alpha, a, lda, b, ldb, beta, c, ldc, dependencies); +} + +cl::sycl::event gemm(cl::sycl::queue &queue, onemkl::transpose transa, onemkl::transpose transb, + std::int64_t m, std::int64_t n, std::int64_t k, std::complex alpha, + const std::complex *a, std::int64_t lda, const std::complex *b, + std::int64_t ldb, std::complex beta, std::complex *c, + std::int64_t ldc, + const cl::sycl::vector_class &dependencies) { + return mkl::gpu::zgemm_sycl(&queue, mkl::cblas_convert(transa), mkl::cblas_convert(transb), m, + n, k, alpha, a, lda, b, ldb, beta, c, ldc, dependencies); +} + +cl::sycl::event symm(cl::sycl::queue &queue, onemkl::side left_right, onemkl::uplo upper_lower, + std::int64_t m, std::int64_t n, float alpha, const float *a, std::int64_t lda, + const float *b, std::int64_t ldb, float beta, float *c, std::int64_t ldc, + const cl::sycl::vector_class &dependencies) { + return mkl::gpu::ssymm_sycl(&queue, mkl::cblas_convert(left_right), + mkl::cblas_convert(upper_lower), m, n, alpha, a, lda, b, ldb, beta, + c, ldc, dependencies); +} + +cl::sycl::event symm(cl::sycl::queue &queue, onemkl::side left_right, onemkl::uplo upper_lower, + std::int64_t m, std::int64_t n, double alpha, const double *a, + std::int64_t lda, const double *b, std::int64_t ldb, double beta, double *c, + std::int64_t ldc, + const cl::sycl::vector_class &dependencies) { + return mkl::gpu::dsymm_sycl(&queue, 
mkl::cblas_convert(left_right), + mkl::cblas_convert(upper_lower), m, n, alpha, a, lda, b, ldb, beta, + c, ldc, dependencies); +} + +cl::sycl::event symm(cl::sycl::queue &queue, onemkl::side left_right, onemkl::uplo upper_lower, + std::int64_t m, std::int64_t n, std::complex alpha, + const std::complex *a, std::int64_t lda, const std::complex *b, + std::int64_t ldb, std::complex beta, std::complex *c, + std::int64_t ldc, + const cl::sycl::vector_class &dependencies) { + return mkl::gpu::csymm_sycl(&queue, mkl::cblas_convert(left_right), + mkl::cblas_convert(upper_lower), m, n, alpha, a, lda, b, ldb, beta, + c, ldc, dependencies); +} + +cl::sycl::event symm(cl::sycl::queue &queue, onemkl::side left_right, onemkl::uplo upper_lower, + std::int64_t m, std::int64_t n, std::complex alpha, + const std::complex *a, std::int64_t lda, const std::complex *b, + std::int64_t ldb, std::complex beta, std::complex *c, + std::int64_t ldc, + const cl::sycl::vector_class &dependencies) { + return mkl::gpu::zsymm_sycl(&queue, mkl::cblas_convert(left_right), + mkl::cblas_convert(upper_lower), m, n, alpha, a, lda, b, ldb, beta, + c, ldc, dependencies); +} + +cl::sycl::event hemm(cl::sycl::queue &queue, onemkl::side left_right, onemkl::uplo upper_lower, + std::int64_t m, std::int64_t n, std::complex alpha, + const std::complex *a, std::int64_t lda, const std::complex *b, + std::int64_t ldb, std::complex beta, std::complex *c, + std::int64_t ldc, + const cl::sycl::vector_class &dependencies) { + return mkl::gpu::chemm_sycl(&queue, mkl::cblas_convert(left_right), + mkl::cblas_convert(upper_lower), m, n, alpha, a, lda, b, ldb, beta, + c, ldc, dependencies); +} + +cl::sycl::event hemm(cl::sycl::queue &queue, onemkl::side left_right, onemkl::uplo upper_lower, + std::int64_t m, std::int64_t n, std::complex alpha, + const std::complex *a, std::int64_t lda, const std::complex *b, + std::int64_t ldb, std::complex beta, std::complex *c, + std::int64_t ldc, + const cl::sycl::vector_class 
&dependencies) { + return mkl::gpu::zhemm_sycl(&queue, mkl::cblas_convert(left_right), + mkl::cblas_convert(upper_lower), m, n, alpha, a, lda, b, ldb, beta, + c, ldc, dependencies); +} + +cl::sycl::event syrk(cl::sycl::queue &queue, onemkl::uplo upper_lower, onemkl::transpose trans, + std::int64_t n, std::int64_t k, float alpha, const float *a, std::int64_t lda, + float beta, float *c, std::int64_t ldc, + const cl::sycl::vector_class &dependencies) { + return mkl::gpu::ssyrk_sycl(&queue, mkl::cblas_convert(upper_lower), mkl::cblas_convert(trans), + n, k, alpha, a, lda, beta, c, ldc, dependencies); +} + +cl::sycl::event syrk(cl::sycl::queue &queue, onemkl::uplo upper_lower, onemkl::transpose trans, + std::int64_t n, std::int64_t k, double alpha, const double *a, + std::int64_t lda, double beta, double *c, std::int64_t ldc, + const cl::sycl::vector_class &dependencies) { + return mkl::gpu::dsyrk_sycl(&queue, mkl::cblas_convert(upper_lower), mkl::cblas_convert(trans), + n, k, alpha, a, lda, beta, c, ldc, dependencies); +} + +cl::sycl::event syrk(cl::sycl::queue &queue, onemkl::uplo upper_lower, onemkl::transpose trans, + std::int64_t n, std::int64_t k, std::complex alpha, + const std::complex *a, std::int64_t lda, std::complex beta, + std::complex *c, std::int64_t ldc, + const cl::sycl::vector_class &dependencies) { + return mkl::gpu::csyrk_sycl(&queue, mkl::cblas_convert(upper_lower), mkl::cblas_convert(trans), + n, k, alpha, a, lda, beta, c, ldc, dependencies); +} + +cl::sycl::event syrk(cl::sycl::queue &queue, onemkl::uplo upper_lower, onemkl::transpose trans, + std::int64_t n, std::int64_t k, std::complex alpha, + const std::complex *a, std::int64_t lda, std::complex beta, + std::complex *c, std::int64_t ldc, + const cl::sycl::vector_class &dependencies) { + return mkl::gpu::zsyrk_sycl(&queue, mkl::cblas_convert(upper_lower), mkl::cblas_convert(trans), + n, k, alpha, a, lda, beta, c, ldc, dependencies); +} + +cl::sycl::event herk(cl::sycl::queue &queue, 
onemkl::uplo upper_lower, onemkl::transpose trans, + std::int64_t n, std::int64_t k, float alpha, const std::complex *a, + std::int64_t lda, float beta, std::complex *c, std::int64_t ldc, + const cl::sycl::vector_class &dependencies) { + return mkl::gpu::cherk_sycl(&queue, mkl::cblas_convert(upper_lower), mkl::cblas_convert(trans), + n, k, alpha, a, lda, beta, c, ldc, dependencies); +} + +cl::sycl::event herk(cl::sycl::queue &queue, onemkl::uplo upper_lower, onemkl::transpose trans, + std::int64_t n, std::int64_t k, double alpha, const std::complex *a, + std::int64_t lda, double beta, std::complex *c, std::int64_t ldc, + const cl::sycl::vector_class &dependencies) { + return mkl::gpu::zherk_sycl(&queue, mkl::cblas_convert(upper_lower), mkl::cblas_convert(trans), + n, k, alpha, a, lda, beta, c, ldc, dependencies); +} + +cl::sycl::event syr2k(cl::sycl::queue &queue, onemkl::uplo upper_lower, onemkl::transpose trans, + std::int64_t n, std::int64_t k, float alpha, const float *a, std::int64_t lda, + const float *b, std::int64_t ldb, float beta, float *c, std::int64_t ldc, + const cl::sycl::vector_class &dependencies) { + return mkl::gpu::ssyr2k_sycl(&queue, mkl::cblas_convert(upper_lower), mkl::cblas_convert(trans), + n, k, alpha, a, lda, b, ldb, beta, c, ldc, dependencies); +} + +cl::sycl::event syr2k(cl::sycl::queue &queue, onemkl::uplo upper_lower, onemkl::transpose trans, + std::int64_t n, std::int64_t k, double alpha, const double *a, + std::int64_t lda, const double *b, std::int64_t ldb, double beta, double *c, + std::int64_t ldc, + const cl::sycl::vector_class &dependencies) { + return mkl::gpu::dsyr2k_sycl(&queue, mkl::cblas_convert(upper_lower), mkl::cblas_convert(trans), + n, k, alpha, a, lda, b, ldb, beta, c, ldc, dependencies); +} + +cl::sycl::event syr2k(cl::sycl::queue &queue, onemkl::uplo upper_lower, onemkl::transpose trans, + std::int64_t n, std::int64_t k, std::complex alpha, + const std::complex *a, std::int64_t lda, const std::complex *b, + 
std::int64_t ldb, std::complex beta, std::complex *c, + std::int64_t ldc, + const cl::sycl::vector_class &dependencies) { + return mkl::gpu::csyr2k_sycl(&queue, mkl::cblas_convert(upper_lower), mkl::cblas_convert(trans), + n, k, alpha, a, lda, b, ldb, beta, c, ldc, dependencies); +} + +cl::sycl::event syr2k(cl::sycl::queue &queue, onemkl::uplo upper_lower, onemkl::transpose trans, + std::int64_t n, std::int64_t k, std::complex alpha, + const std::complex *a, std::int64_t lda, + const std::complex *b, std::int64_t ldb, std::complex beta, + std::complex *c, std::int64_t ldc, + const cl::sycl::vector_class &dependencies) { + return mkl::gpu::zsyr2k_sycl(&queue, mkl::cblas_convert(upper_lower), mkl::cblas_convert(trans), + n, k, alpha, a, lda, b, ldb, beta, c, ldc, dependencies); +} + +cl::sycl::event her2k(cl::sycl::queue &queue, onemkl::uplo upper_lower, onemkl::transpose trans, + std::int64_t n, std::int64_t k, std::complex alpha, + const std::complex *a, std::int64_t lda, const std::complex *b, + std::int64_t ldb, float beta, std::complex *c, std::int64_t ldc, + const cl::sycl::vector_class &dependencies) { + return mkl::gpu::cher2k_sycl(&queue, mkl::cblas_convert(upper_lower), mkl::cblas_convert(trans), + n, k, alpha, a, lda, b, ldb, beta, c, ldc, dependencies); +} + +cl::sycl::event her2k(cl::sycl::queue &queue, onemkl::uplo upper_lower, onemkl::transpose trans, + std::int64_t n, std::int64_t k, std::complex alpha, + const std::complex *a, std::int64_t lda, + const std::complex *b, std::int64_t ldb, double beta, + std::complex *c, std::int64_t ldc, + const cl::sycl::vector_class &dependencies) { + return mkl::gpu::zher2k_sycl(&queue, mkl::cblas_convert(upper_lower), mkl::cblas_convert(trans), + n, k, alpha, a, lda, b, ldb, beta, c, ldc, dependencies); +} + +cl::sycl::event trmm(cl::sycl::queue &queue, onemkl::side left_right, onemkl::uplo upper_lower, + onemkl::transpose transa, onemkl::diag unit_diag, std::int64_t m, + std::int64_t n, float alpha, const float 
*a, std::int64_t lda, float *b, + std::int64_t ldb, + const cl::sycl::vector_class &dependencies) { + return mkl::gpu::strmm_sycl(&queue, mkl::cblas_convert(left_right), + mkl::cblas_convert(upper_lower), mkl::cblas_convert(transa), + mkl::cblas_convert(unit_diag), m, n, alpha, a, lda, b, ldb, + dependencies); +} + +cl::sycl::event trmm(cl::sycl::queue &queue, onemkl::side left_right, onemkl::uplo upper_lower, + onemkl::transpose transa, onemkl::diag unit_diag, std::int64_t m, + std::int64_t n, double alpha, const double *a, std::int64_t lda, double *b, + std::int64_t ldb, + const cl::sycl::vector_class &dependencies) { + return mkl::gpu::dtrmm_sycl(&queue, mkl::cblas_convert(left_right), + mkl::cblas_convert(upper_lower), mkl::cblas_convert(transa), + mkl::cblas_convert(unit_diag), m, n, alpha, a, lda, b, ldb, + dependencies); +} + +cl::sycl::event trmm(cl::sycl::queue &queue, onemkl::side left_right, onemkl::uplo upper_lower, + onemkl::transpose transa, onemkl::diag unit_diag, std::int64_t m, + std::int64_t n, std::complex alpha, const std::complex *a, + std::int64_t lda, std::complex *b, std::int64_t ldb, + const cl::sycl::vector_class &dependencies) { + return mkl::gpu::ctrmm_sycl(&queue, mkl::cblas_convert(left_right), + mkl::cblas_convert(upper_lower), mkl::cblas_convert(transa), + mkl::cblas_convert(unit_diag), m, n, alpha, a, lda, b, ldb, + dependencies); +} + +cl::sycl::event trmm(cl::sycl::queue &queue, onemkl::side left_right, onemkl::uplo upper_lower, + onemkl::transpose transa, onemkl::diag unit_diag, std::int64_t m, + std::int64_t n, std::complex alpha, const std::complex *a, + std::int64_t lda, std::complex *b, std::int64_t ldb, + const cl::sycl::vector_class &dependencies) { + return mkl::gpu::ztrmm_sycl(&queue, mkl::cblas_convert(left_right), + mkl::cblas_convert(upper_lower), mkl::cblas_convert(transa), + mkl::cblas_convert(unit_diag), m, n, alpha, a, lda, b, ldb, + dependencies); +} + +cl::sycl::event trsm(cl::sycl::queue &queue, onemkl::side 
left_right, onemkl::uplo upper_lower, + onemkl::transpose transa, onemkl::diag unit_diag, std::int64_t m, + std::int64_t n, float alpha, const float *a, std::int64_t lda, float *b, + std::int64_t ldb, + const cl::sycl::vector_class &dependencies) { + return mkl::gpu::strsm_sycl(&queue, mkl::cblas_convert(left_right), + mkl::cblas_convert(upper_lower), mkl::cblas_convert(transa), + mkl::cblas_convert(unit_diag), m, n, alpha, a, lda, b, ldb, + dependencies); +} + +cl::sycl::event trsm(cl::sycl::queue &queue, onemkl::side left_right, onemkl::uplo upper_lower, + onemkl::transpose transa, onemkl::diag unit_diag, std::int64_t m, + std::int64_t n, double alpha, const double *a, std::int64_t lda, double *b, + std::int64_t ldb, + const cl::sycl::vector_class &dependencies) { + return mkl::gpu::dtrsm_sycl(&queue, mkl::cblas_convert(left_right), + mkl::cblas_convert(upper_lower), mkl::cblas_convert(transa), + mkl::cblas_convert(unit_diag), m, n, alpha, a, lda, b, ldb, + dependencies); +} + +cl::sycl::event trsm(cl::sycl::queue &queue, onemkl::side left_right, onemkl::uplo upper_lower, + onemkl::transpose transa, onemkl::diag unit_diag, std::int64_t m, + std::int64_t n, std::complex alpha, const std::complex *a, + std::int64_t lda, std::complex *b, std::int64_t ldb, + const cl::sycl::vector_class &dependencies) { + return mkl::gpu::ctrsm_sycl(&queue, mkl::cblas_convert(left_right), + mkl::cblas_convert(upper_lower), mkl::cblas_convert(transa), + mkl::cblas_convert(unit_diag), m, n, alpha, a, lda, b, ldb, + dependencies); +} + +cl::sycl::event trsm(cl::sycl::queue &queue, onemkl::side left_right, onemkl::uplo upper_lower, + onemkl::transpose transa, onemkl::diag unit_diag, std::int64_t m, + std::int64_t n, std::complex alpha, const std::complex *a, + std::int64_t lda, std::complex *b, std::int64_t ldb, + const cl::sycl::vector_class &dependencies) { + return mkl::gpu::ztrsm_sycl(&queue, mkl::cblas_convert(left_right), + mkl::cblas_convert(upper_lower), 
mkl::cblas_convert(transa), + mkl::cblas_convert(unit_diag), m, n, alpha, a, lda, b, ldb, + dependencies); +} + +cl::sycl::event gemv(cl::sycl::queue &queue, onemkl::transpose trans, std::int64_t m, + std::int64_t n, float alpha, const float *a, std::int64_t lda, const float *x, + std::int64_t incx, float beta, float *y, std::int64_t incy, + const cl::sycl::vector_class &dependencies) { + return mkl::gpu::sgemv_sycl(&queue, mkl::cblas_convert(trans), m, n, alpha, a, lda, x, incx, + beta, y, incy, dependencies); +} + +cl::sycl::event gemv(cl::sycl::queue &queue, onemkl::transpose trans, std::int64_t m, + std::int64_t n, double alpha, const double *a, std::int64_t lda, + const double *x, std::int64_t incx, double beta, double *y, std::int64_t incy, + const cl::sycl::vector_class &dependencies) { + return mkl::gpu::dgemv_sycl(&queue, mkl::cblas_convert(trans), m, n, alpha, a, lda, x, incx, + beta, y, incy, dependencies); +} + +cl::sycl::event gemv(cl::sycl::queue &queue, onemkl::transpose trans, std::int64_t m, + std::int64_t n, std::complex alpha, const std::complex *a, + std::int64_t lda, const std::complex *x, std::int64_t incx, + std::complex beta, std::complex *y, std::int64_t incy, + const cl::sycl::vector_class &dependencies) { + return mkl::gpu::cgemv_sycl(&queue, mkl::cblas_convert(trans), m, n, alpha, a, lda, x, incx, + beta, y, incy, dependencies); +} + +cl::sycl::event gemv(cl::sycl::queue &queue, onemkl::transpose trans, std::int64_t m, + std::int64_t n, std::complex alpha, const std::complex *a, + std::int64_t lda, const std::complex *x, std::int64_t incx, + std::complex beta, std::complex *y, std::int64_t incy, + const cl::sycl::vector_class &dependencies) { + return mkl::gpu::zgemv_sycl(&queue, mkl::cblas_convert(trans), m, n, alpha, a, lda, x, incx, + beta, y, incy, dependencies); +} + +cl::sycl::event gbmv(cl::sycl::queue &queue, onemkl::transpose trans, std::int64_t m, + std::int64_t n, std::int64_t kl, std::int64_t ku, float alpha, const float *a, 
+ std::int64_t lda, const float *x, std::int64_t incx, float beta, float *y, + std::int64_t incy, + const cl::sycl::vector_class &dependencies) { + return mkl::gpu::sgbmv_sycl(&queue, mkl::cblas_convert(trans), m, n, kl, ku, alpha, a, lda, x, + incx, beta, y, incy, dependencies); +} + +cl::sycl::event gbmv(cl::sycl::queue &queue, onemkl::transpose trans, std::int64_t m, + std::int64_t n, std::int64_t kl, std::int64_t ku, double alpha, + const double *a, std::int64_t lda, const double *x, std::int64_t incx, + double beta, double *y, std::int64_t incy, + const cl::sycl::vector_class &dependencies) { + return mkl::gpu::dgbmv_sycl(&queue, mkl::cblas_convert(trans), m, n, kl, ku, alpha, a, lda, x, + incx, beta, y, incy, dependencies); +} + +cl::sycl::event gbmv(cl::sycl::queue &queue, onemkl::transpose trans, std::int64_t m, + std::int64_t n, std::int64_t kl, std::int64_t ku, std::complex alpha, + const std::complex *a, std::int64_t lda, const std::complex *x, + std::int64_t incx, std::complex beta, std::complex *y, + std::int64_t incy, + const cl::sycl::vector_class &dependencies) { + return mkl::gpu::cgbmv_sycl(&queue, mkl::cblas_convert(trans), m, n, kl, ku, alpha, a, lda, x, + incx, beta, y, incy, dependencies); +} + +cl::sycl::event gbmv(cl::sycl::queue &queue, onemkl::transpose trans, std::int64_t m, + std::int64_t n, std::int64_t kl, std::int64_t ku, std::complex alpha, + const std::complex *a, std::int64_t lda, const std::complex *x, + std::int64_t incx, std::complex beta, std::complex *y, + std::int64_t incy, + const cl::sycl::vector_class &dependencies) { + return mkl::gpu::zgbmv_sycl(&queue, mkl::cblas_convert(trans), m, n, kl, ku, alpha, a, lda, x, + incx, beta, y, incy, dependencies); +} + +cl::sycl::event ger(cl::sycl::queue &queue, std::int64_t m, std::int64_t n, float alpha, + const float *x, std::int64_t incx, const float *y, std::int64_t incy, float *a, + std::int64_t lda, const cl::sycl::vector_class &dependencies) { + return 
mkl::gpu::sger_sycl(&queue, m, n, alpha, x, incx, y, incy, a, lda, dependencies); +} + +cl::sycl::event ger(cl::sycl::queue &queue, std::int64_t m, std::int64_t n, double alpha, + const double *x, std::int64_t incx, const double *y, std::int64_t incy, + double *a, std::int64_t lda, + const cl::sycl::vector_class &dependencies) { + return mkl::gpu::dger_sycl(&queue, m, n, alpha, x, incx, y, incy, a, lda, dependencies); +} + +cl::sycl::event gerc(cl::sycl::queue &queue, std::int64_t m, std::int64_t n, + std::complex alpha, const std::complex *x, std::int64_t incx, + const std::complex *y, std::int64_t incy, std::complex *a, + std::int64_t lda, + const cl::sycl::vector_class &dependencies) { + return mkl::gpu::cgerc_sycl(&queue, m, n, alpha, x, incx, y, incy, a, lda, dependencies); +} + +cl::sycl::event gerc(cl::sycl::queue &queue, std::int64_t m, std::int64_t n, + std::complex alpha, const std::complex *x, std::int64_t incx, + const std::complex *y, std::int64_t incy, std::complex *a, + std::int64_t lda, + const cl::sycl::vector_class &dependencies) { + return mkl::gpu::zgerc_sycl(&queue, m, n, alpha, x, incx, y, incy, a, lda, dependencies); +} + +cl::sycl::event geru(cl::sycl::queue &queue, std::int64_t m, std::int64_t n, + std::complex alpha, const std::complex *x, std::int64_t incx, + const std::complex *y, std::int64_t incy, std::complex *a, + std::int64_t lda, + const cl::sycl::vector_class &dependencies) { + return mkl::gpu::cgeru_sycl(&queue, m, n, alpha, x, incx, y, incy, a, lda, dependencies); +} + +cl::sycl::event geru(cl::sycl::queue &queue, std::int64_t m, std::int64_t n, + std::complex alpha, const std::complex *x, std::int64_t incx, + const std::complex *y, std::int64_t incy, std::complex *a, + std::int64_t lda, + const cl::sycl::vector_class &dependencies) { + return mkl::gpu::zgeru_sycl(&queue, m, n, alpha, x, incx, y, incy, a, lda, dependencies); +} + +cl::sycl::event hbmv(cl::sycl::queue &queue, onemkl::uplo uplo, std::int64_t n, std::int64_t k, + 
std::complex alpha, const std::complex *a, std::int64_t lda, + const std::complex *x, std::int64_t incx, std::complex beta, + std::complex *y, std::int64_t incy, + const cl::sycl::vector_class &dependencies) { + return mkl::gpu::chbmv_sycl(&queue, mkl::cblas_convert(uplo), n, k, alpha, a, lda, x, incx, + beta, y, incy, dependencies); +} + +cl::sycl::event hbmv(cl::sycl::queue &queue, onemkl::uplo uplo, std::int64_t n, std::int64_t k, + std::complex alpha, const std::complex *a, std::int64_t lda, + const std::complex *x, std::int64_t incx, std::complex beta, + std::complex *y, std::int64_t incy, + const cl::sycl::vector_class &dependencies) { + return mkl::gpu::zhbmv_sycl(&queue, mkl::cblas_convert(uplo), n, k, alpha, a, lda, x, incx, + beta, y, incy, dependencies); +} + +cl::sycl::event hemv(cl::sycl::queue &queue, onemkl::uplo uplo, std::int64_t n, + std::complex alpha, const std::complex *a, std::int64_t lda, + const std::complex *x, std::int64_t incx, std::complex beta, + std::complex *y, std::int64_t incy, + const cl::sycl::vector_class &dependencies) { + return mkl::gpu::chemv_sycl(&queue, mkl::cblas_convert(uplo), n, alpha, a, lda, x, incx, beta, + y, incy, dependencies); +} + +cl::sycl::event hemv(cl::sycl::queue &queue, onemkl::uplo uplo, std::int64_t n, + std::complex alpha, const std::complex *a, std::int64_t lda, + const std::complex *x, std::int64_t incx, std::complex beta, + std::complex *y, std::int64_t incy, + const cl::sycl::vector_class &dependencies) { + return mkl::gpu::zhemv_sycl(&queue, mkl::cblas_convert(uplo), n, alpha, a, lda, x, incx, beta, + y, incy, dependencies); +} + +cl::sycl::event her(cl::sycl::queue &queue, onemkl::uplo upplo, std::int64_t n, float alpha, + const std::complex *x, std::int64_t incx, std::complex *a, + std::int64_t lda, const cl::sycl::vector_class &dependencies) { + return mkl::gpu::cher_sycl(&queue, mkl::cblas_convert(upplo), n, alpha, x, incx, a, lda, + dependencies); +} + +cl::sycl::event her(cl::sycl::queue 
&queue, onemkl::uplo upplo, std::int64_t n, double alpha, + const std::complex *x, std::int64_t incx, std::complex *a, + std::int64_t lda, const cl::sycl::vector_class &dependencies) { + return mkl::gpu::zher_sycl(&queue, mkl::cblas_convert(upplo), n, alpha, x, incx, a, lda, + dependencies); +} + +cl::sycl::event her2(cl::sycl::queue &queue, onemkl::uplo upplo, std::int64_t n, + std::complex alpha, const std::complex *x, std::int64_t incx, + const std::complex *y, std::int64_t incy, std::complex *a, + std::int64_t lda, + const cl::sycl::vector_class &dependencies) { + return mkl::gpu::cher2_sycl(&queue, mkl::cblas_convert(upplo), n, alpha, x, incx, y, incy, a, + lda, dependencies); +} + +cl::sycl::event her2(cl::sycl::queue &queue, onemkl::uplo upplo, std::int64_t n, + std::complex alpha, const std::complex *x, std::int64_t incx, + const std::complex *y, std::int64_t incy, std::complex *a, + std::int64_t lda, + const cl::sycl::vector_class &dependencies) { + return mkl::gpu::zher2_sycl(&queue, mkl::cblas_convert(upplo), n, alpha, x, incx, y, incy, a, + lda, dependencies); +} + +cl::sycl::event hpmv(cl::sycl::queue &queue, onemkl::uplo uplo, std::int64_t n, + std::complex alpha, const std::complex *a, + const std::complex *x, std::int64_t incx, std::complex beta, + std::complex *y, std::int64_t incy, + const cl::sycl::vector_class &dependencies) { + return mkl::gpu::chpmv_sycl(&queue, mkl::cblas_convert(uplo), n, alpha, a, x, incx, beta, y, + incy, dependencies); +} + +cl::sycl::event hpmv(cl::sycl::queue &queue, onemkl::uplo uplo, std::int64_t n, + std::complex alpha, const std::complex *a, + const std::complex *x, std::int64_t incx, std::complex beta, + std::complex *y, std::int64_t incy, + const cl::sycl::vector_class &dependencies) { + return mkl::gpu::zhpmv_sycl(&queue, mkl::cblas_convert(uplo), n, alpha, a, x, incx, beta, y, + incy, dependencies); +} + +cl::sycl::event hpr(cl::sycl::queue &queue, onemkl::uplo upplo, std::int64_t n, float alpha, + const 
std::complex *x, std::int64_t incx, std::complex *a, + const cl::sycl::vector_class &dependencies) { + return mkl::gpu::chpr_sycl(&queue, mkl::cblas_convert(upplo), n, alpha, x, incx, a, + dependencies); +} + +cl::sycl::event hpr(cl::sycl::queue &queue, onemkl::uplo upplo, std::int64_t n, double alpha, + const std::complex *x, std::int64_t incx, std::complex *a, + const cl::sycl::vector_class &dependencies) { + return mkl::gpu::zhpr_sycl(&queue, mkl::cblas_convert(upplo), n, alpha, x, incx, a, + dependencies); +} + +cl::sycl::event hpr2(cl::sycl::queue &queue, onemkl::uplo upplo, std::int64_t n, + std::complex alpha, const std::complex *x, std::int64_t incx, + const std::complex *y, std::int64_t incy, std::complex *a, + const cl::sycl::vector_class &dependencies) { + return mkl::gpu::chpr2_sycl(&queue, mkl::cblas_convert(upplo), n, alpha, x, incx, y, incy, a, + dependencies); +} + +cl::sycl::event hpr2(cl::sycl::queue &queue, onemkl::uplo upplo, std::int64_t n, + std::complex alpha, const std::complex *x, std::int64_t incx, + const std::complex *y, std::int64_t incy, std::complex *a, + const cl::sycl::vector_class &dependencies) { + return mkl::gpu::zhpr2_sycl(&queue, mkl::cblas_convert(upplo), n, alpha, x, incx, y, incy, a, + dependencies); +} + +cl::sycl::event sbmv(cl::sycl::queue &queue, onemkl::uplo uplo, std::int64_t n, std::int64_t k, + float alpha, const float *a, std::int64_t lda, const float *x, + std::int64_t incx, float beta, float *y, std::int64_t incy, + const cl::sycl::vector_class &dependencies) { + return mkl::gpu::ssbmv_sycl(&queue, mkl::cblas_convert(uplo), n, k, alpha, a, lda, x, incx, + beta, y, incy, dependencies); +} + +cl::sycl::event sbmv(cl::sycl::queue &queue, onemkl::uplo uplo, std::int64_t n, std::int64_t k, + double alpha, const double *a, std::int64_t lda, const double *x, + std::int64_t incx, double beta, double *y, std::int64_t incy, + const cl::sycl::vector_class &dependencies) { + return mkl::gpu::dsbmv_sycl(&queue, 
mkl::cblas_convert(uplo), n, k, alpha, a, lda, x, incx, + beta, y, incy, dependencies); +} + +cl::sycl::event spmv(cl::sycl::queue &queue, onemkl::uplo uplo, std::int64_t n, float alpha, + const float *a, const float *x, std::int64_t incx, float beta, float *y, + std::int64_t incy, + const cl::sycl::vector_class &dependencies) { + return mkl::gpu::sspmv_sycl(&queue, mkl::cblas_convert(uplo), n, alpha, a, x, incx, beta, y, + incy, dependencies); +} + +cl::sycl::event spmv(cl::sycl::queue &queue, onemkl::uplo uplo, std::int64_t n, double alpha, + const double *a, const double *x, std::int64_t incx, double beta, double *y, + std::int64_t incy, + const cl::sycl::vector_class &dependencies) { + return mkl::gpu::dspmv_sycl(&queue, mkl::cblas_convert(uplo), n, alpha, a, x, incx, beta, y, + incy, dependencies); +} + +cl::sycl::event spr(cl::sycl::queue &queue, onemkl::uplo upplo, std::int64_t n, float alpha, + const float *x, std::int64_t incx, float *a, + const cl::sycl::vector_class &dependencies) { + return mkl::gpu::sspr_sycl(&queue, mkl::cblas_convert(upplo), n, alpha, x, incx, a, + dependencies); +} + +cl::sycl::event spr(cl::sycl::queue &queue, onemkl::uplo upplo, std::int64_t n, double alpha, + const double *x, std::int64_t incx, double *a, + const cl::sycl::vector_class &dependencies) { + return mkl::gpu::dspr_sycl(&queue, mkl::cblas_convert(upplo), n, alpha, x, incx, a, + dependencies); +} + +cl::sycl::event spr2(cl::sycl::queue &queue, onemkl::uplo upplo, std::int64_t n, float alpha, + const float *x, std::int64_t incx, const float *y, std::int64_t incy, float *a, + const cl::sycl::vector_class &dependencies) { + return mkl::gpu::sspr2_sycl(&queue, mkl::cblas_convert(upplo), n, alpha, x, incx, y, incy, a, + dependencies); +} + +cl::sycl::event spr2(cl::sycl::queue &queue, onemkl::uplo upplo, std::int64_t n, double alpha, + const double *x, std::int64_t incx, const double *y, std::int64_t incy, + double *a, const cl::sycl::vector_class &dependencies) { + return 
mkl::gpu::dspr2_sycl(&queue, mkl::cblas_convert(upplo), n, alpha, x, incx, y, incy, a, + dependencies); +} + +cl::sycl::event symv(cl::sycl::queue &queue, onemkl::uplo uplo, std::int64_t n, float alpha, + const float *a, std::int64_t lda, const float *x, std::int64_t incx, + float beta, float *y, std::int64_t incy, + const cl::sycl::vector_class &dependencies) { + return mkl::gpu::ssymv_sycl(&queue, mkl::cblas_convert(uplo), n, alpha, a, lda, x, incx, beta, + y, incy, dependencies); +} + +cl::sycl::event symv(cl::sycl::queue &queue, onemkl::uplo uplo, std::int64_t n, double alpha, + const double *a, std::int64_t lda, const double *x, std::int64_t incx, + double beta, double *y, std::int64_t incy, + const cl::sycl::vector_class &dependencies) { + return mkl::gpu::dsymv_sycl(&queue, mkl::cblas_convert(uplo), n, alpha, a, lda, x, incx, beta, + y, incy, dependencies); +} + +cl::sycl::event syr(cl::sycl::queue &queue, onemkl::uplo upplo, std::int64_t n, float alpha, + const float *x, std::int64_t incx, float *a, std::int64_t lda, + const cl::sycl::vector_class &dependencies) { + return mkl::gpu::ssyr_sycl(&queue, mkl::cblas_convert(upplo), n, alpha, x, incx, a, lda, + dependencies); +} + +cl::sycl::event syr(cl::sycl::queue &queue, onemkl::uplo upplo, std::int64_t n, double alpha, + const double *x, std::int64_t incx, double *a, std::int64_t lda, + const cl::sycl::vector_class &dependencies) { + return mkl::gpu::dsyr_sycl(&queue, mkl::cblas_convert(upplo), n, alpha, x, incx, a, lda, + dependencies); +} + +cl::sycl::event syr2(cl::sycl::queue &queue, onemkl::uplo upplo, std::int64_t n, float alpha, + const float *x, std::int64_t incx, const float *y, std::int64_t incy, float *a, + std::int64_t lda, + const cl::sycl::vector_class &dependencies) { + return mkl::gpu::ssyr2_sycl(&queue, mkl::cblas_convert(upplo), n, alpha, x, incx, y, incy, a, + lda, dependencies); +} + +cl::sycl::event syr2(cl::sycl::queue &queue, onemkl::uplo upplo, std::int64_t n, double alpha, + const 
double *x, std::int64_t incx, const double *y, std::int64_t incy, + double *a, std::int64_t lda, + const cl::sycl::vector_class &dependencies) { + return mkl::gpu::dsyr2_sycl(&queue, mkl::cblas_convert(upplo), n, alpha, x, incx, y, incy, a, + lda, dependencies); +} + +cl::sycl::event tbmv(cl::sycl::queue &queue, onemkl::uplo upplo, onemkl::transpose trans, + onemkl::diag diag, std::int64_t n, std::int64_t k, const float *a, + std::int64_t lda, float *x, std::int64_t incx, + const cl::sycl::vector_class &dependencies) { + return mkl::gpu::stbmv_sycl(&queue, mkl::cblas_convert(upplo), mkl::cblas_convert(trans), + mkl::cblas_convert(diag), n, k, a, lda, x, incx, dependencies); +} + +cl::sycl::event tbmv(cl::sycl::queue &queue, onemkl::uplo upplo, onemkl::transpose trans, + onemkl::diag diag, std::int64_t n, std::int64_t k, const double *a, + std::int64_t lda, double *x, std::int64_t incx, + const cl::sycl::vector_class &dependencies) { + return mkl::gpu::dtbmv_sycl(&queue, mkl::cblas_convert(upplo), mkl::cblas_convert(trans), + mkl::cblas_convert(diag), n, k, a, lda, x, incx, dependencies); +} + +cl::sycl::event tbmv(cl::sycl::queue &queue, onemkl::uplo upplo, onemkl::transpose trans, + onemkl::diag diag, std::int64_t n, std::int64_t k, + const std::complex *a, std::int64_t lda, std::complex *x, + std::int64_t incx, + const cl::sycl::vector_class &dependencies) { + return mkl::gpu::ctbmv_sycl(&queue, mkl::cblas_convert(upplo), mkl::cblas_convert(trans), + mkl::cblas_convert(diag), n, k, a, lda, x, incx, dependencies); +} + +cl::sycl::event tbmv(cl::sycl::queue &queue, onemkl::uplo upplo, onemkl::transpose trans, + onemkl::diag diag, std::int64_t n, std::int64_t k, + const std::complex *a, std::int64_t lda, std::complex *x, + std::int64_t incx, + const cl::sycl::vector_class &dependencies) { + return mkl::gpu::ztbmv_sycl(&queue, mkl::cblas_convert(upplo), mkl::cblas_convert(trans), + mkl::cblas_convert(diag), n, k, a, lda, x, incx, dependencies); +} + +cl::sycl::event 
tbsv(cl::sycl::queue &queue, onemkl::uplo upplo, onemkl::transpose trans, + onemkl::diag diag, std::int64_t n, std::int64_t k, const float *a, + std::int64_t lda, float *x, std::int64_t incx, + const cl::sycl::vector_class &dependencies) { + return mkl::gpu::stbsv_sycl(&queue, mkl::cblas_convert(upplo), mkl::cblas_convert(trans), + mkl::cblas_convert(diag), n, k, a, lda, x, incx, dependencies); +} + +cl::sycl::event tbsv(cl::sycl::queue &queue, onemkl::uplo upplo, onemkl::transpose trans, + onemkl::diag diag, std::int64_t n, std::int64_t k, const double *a, + std::int64_t lda, double *x, std::int64_t incx, + const cl::sycl::vector_class &dependencies) { + return mkl::gpu::dtbsv_sycl(&queue, mkl::cblas_convert(upplo), mkl::cblas_convert(trans), + mkl::cblas_convert(diag), n, k, a, lda, x, incx, dependencies); +} + +cl::sycl::event tbsv(cl::sycl::queue &queue, onemkl::uplo upplo, onemkl::transpose trans, + onemkl::diag diag, std::int64_t n, std::int64_t k, + const std::complex *a, std::int64_t lda, std::complex *x, + std::int64_t incx, + const cl::sycl::vector_class &dependencies) { + return mkl::gpu::ctbsv_sycl(&queue, mkl::cblas_convert(upplo), mkl::cblas_convert(trans), + mkl::cblas_convert(diag), n, k, a, lda, x, incx, dependencies); +} + +cl::sycl::event tbsv(cl::sycl::queue &queue, onemkl::uplo upplo, onemkl::transpose trans, + onemkl::diag diag, std::int64_t n, std::int64_t k, + const std::complex *a, std::int64_t lda, std::complex *x, + std::int64_t incx, + const cl::sycl::vector_class &dependencies) { + return mkl::gpu::ztbsv_sycl(&queue, mkl::cblas_convert(upplo), mkl::cblas_convert(trans), + mkl::cblas_convert(diag), n, k, a, lda, x, incx, dependencies); +} + +cl::sycl::event tpmv(cl::sycl::queue &queue, onemkl::uplo upplo, onemkl::transpose trans, + onemkl::diag diag, std::int64_t n, const float *a, float *x, std::int64_t incx, + const cl::sycl::vector_class &dependencies) { + return mkl::gpu::stpmv_sycl(&queue, mkl::cblas_convert(upplo), 
mkl::cblas_convert(trans), + mkl::cblas_convert(diag), n, a, x, incx, dependencies); +} + +cl::sycl::event tpmv(cl::sycl::queue &queue, onemkl::uplo upplo, onemkl::transpose trans, + onemkl::diag diag, std::int64_t n, const double *a, double *x, + std::int64_t incx, + const cl::sycl::vector_class &dependencies) { + return mkl::gpu::dtpmv_sycl(&queue, mkl::cblas_convert(upplo), mkl::cblas_convert(trans), + mkl::cblas_convert(diag), n, a, x, incx, dependencies); +} + +cl::sycl::event tpmv(cl::sycl::queue &queue, onemkl::uplo upplo, onemkl::transpose trans, + onemkl::diag diag, std::int64_t n, const std::complex *a, + std::complex *x, std::int64_t incx, + const cl::sycl::vector_class &dependencies) { + return mkl::gpu::ctpmv_sycl(&queue, mkl::cblas_convert(upplo), mkl::cblas_convert(trans), + mkl::cblas_convert(diag), n, a, x, incx, dependencies); +} + +cl::sycl::event tpmv(cl::sycl::queue &queue, onemkl::uplo upplo, onemkl::transpose trans, + onemkl::diag diag, std::int64_t n, const std::complex *a, + std::complex *x, std::int64_t incx, + const cl::sycl::vector_class &dependencies) { + return mkl::gpu::ztpmv_sycl(&queue, mkl::cblas_convert(upplo), mkl::cblas_convert(trans), + mkl::cblas_convert(diag), n, a, x, incx, dependencies); +} + +cl::sycl::event tpsv(cl::sycl::queue &queue, onemkl::uplo upplo, onemkl::transpose trans, + onemkl::diag diag, std::int64_t n, const float *a, float *x, std::int64_t incx, + const cl::sycl::vector_class &dependencies) { + return mkl::gpu::stpsv_sycl(&queue, mkl::cblas_convert(upplo), mkl::cblas_convert(trans), + mkl::cblas_convert(diag), n, a, x, incx, dependencies); +} + +cl::sycl::event tpsv(cl::sycl::queue &queue, onemkl::uplo upplo, onemkl::transpose trans, + onemkl::diag diag, std::int64_t n, const double *a, double *x, + std::int64_t incx, + const cl::sycl::vector_class &dependencies) { + return mkl::gpu::dtpsv_sycl(&queue, mkl::cblas_convert(upplo), mkl::cblas_convert(trans), + mkl::cblas_convert(diag), n, a, x, incx, 
dependencies); +} + +cl::sycl::event tpsv(cl::sycl::queue &queue, onemkl::uplo upplo, onemkl::transpose trans, + onemkl::diag diag, std::int64_t n, const std::complex *a, + std::complex *x, std::int64_t incx, + const cl::sycl::vector_class &dependencies) { + return mkl::gpu::ctpsv_sycl(&queue, mkl::cblas_convert(upplo), mkl::cblas_convert(trans), + mkl::cblas_convert(diag), n, a, x, incx, dependencies); +} + +cl::sycl::event tpsv(cl::sycl::queue &queue, onemkl::uplo upplo, onemkl::transpose trans, + onemkl::diag diag, std::int64_t n, const std::complex *a, + std::complex *x, std::int64_t incx, + const cl::sycl::vector_class &dependencies) { + return mkl::gpu::ztpsv_sycl(&queue, mkl::cblas_convert(upplo), mkl::cblas_convert(trans), + mkl::cblas_convert(diag), n, a, x, incx, dependencies); +} + +cl::sycl::event trmv(cl::sycl::queue &queue, onemkl::uplo upplo, onemkl::transpose trans, + onemkl::diag diag, std::int64_t n, const float *a, std::int64_t lda, float *x, + std::int64_t incx, + const cl::sycl::vector_class &dependencies) { + return mkl::gpu::strmv_sycl(&queue, mkl::cblas_convert(upplo), mkl::cblas_convert(trans), + mkl::cblas_convert(diag), n, a, lda, x, incx, dependencies); +} + +cl::sycl::event trmv(cl::sycl::queue &queue, onemkl::uplo upplo, onemkl::transpose trans, + onemkl::diag diag, std::int64_t n, const double *a, std::int64_t lda, + double *x, std::int64_t incx, + const cl::sycl::vector_class &dependencies) { + return mkl::gpu::dtrmv_sycl(&queue, mkl::cblas_convert(upplo), mkl::cblas_convert(trans), + mkl::cblas_convert(diag), n, a, lda, x, incx, dependencies); +} + +cl::sycl::event trmv(cl::sycl::queue &queue, onemkl::uplo upplo, onemkl::transpose trans, + onemkl::diag diag, std::int64_t n, const std::complex *a, + std::int64_t lda, std::complex *x, std::int64_t incx, + const cl::sycl::vector_class &dependencies) { + return mkl::gpu::ctrmv_sycl(&queue, mkl::cblas_convert(upplo), mkl::cblas_convert(trans), + mkl::cblas_convert(diag), n, a, lda, x, 
incx, dependencies); +} + +cl::sycl::event trmv(cl::sycl::queue &queue, onemkl::uplo upplo, onemkl::transpose trans, + onemkl::diag diag, std::int64_t n, const std::complex *a, + std::int64_t lda, std::complex *x, std::int64_t incx, + const cl::sycl::vector_class &dependencies) { + return mkl::gpu::ztrmv_sycl(&queue, mkl::cblas_convert(upplo), mkl::cblas_convert(trans), + mkl::cblas_convert(diag), n, a, lda, x, incx, dependencies); +} + +cl::sycl::event trsv(cl::sycl::queue &queue, onemkl::uplo upplo, onemkl::transpose trans, + onemkl::diag diag, std::int64_t n, const float *a, std::int64_t lda, float *x, + std::int64_t incx, + const cl::sycl::vector_class &dependencies) { + return mkl::gpu::strsv_sycl(&queue, mkl::cblas_convert(upplo), mkl::cblas_convert(trans), + mkl::cblas_convert(diag), n, a, lda, x, incx, dependencies); +} + +cl::sycl::event trsv(cl::sycl::queue &queue, onemkl::uplo upplo, onemkl::transpose trans, + onemkl::diag diag, std::int64_t n, const double *a, std::int64_t lda, + double *x, std::int64_t incx, + const cl::sycl::vector_class &dependencies) { + return mkl::gpu::dtrsv_sycl(&queue, mkl::cblas_convert(upplo), mkl::cblas_convert(trans), + mkl::cblas_convert(diag), n, a, lda, x, incx, dependencies); +} + +cl::sycl::event trsv(cl::sycl::queue &queue, onemkl::uplo upplo, onemkl::transpose trans, + onemkl::diag diag, std::int64_t n, const std::complex *a, + std::int64_t lda, std::complex *x, std::int64_t incx, + const cl::sycl::vector_class &dependencies) { + return mkl::gpu::ctrsv_sycl(&queue, mkl::cblas_convert(upplo), mkl::cblas_convert(trans), + mkl::cblas_convert(diag), n, a, lda, x, incx, dependencies); +} + +cl::sycl::event trsv(cl::sycl::queue &queue, onemkl::uplo upplo, onemkl::transpose trans, + onemkl::diag diag, std::int64_t n, const std::complex *a, + std::int64_t lda, std::complex *x, std::int64_t incx, + const cl::sycl::vector_class &dependencies) { + return mkl::gpu::ztrsv_sycl(&queue, mkl::cblas_convert(upplo), 
mkl::cblas_convert(trans), + mkl::cblas_convert(diag), n, a, lda, x, incx, dependencies); +} + +cl::sycl::event asum(cl::sycl::queue &queue, std::int64_t n, const std::complex *x, + std::int64_t incx, float *result, + const cl::sycl::vector_class &dependencies) { + return mkl::gpu::scasum_sycl(&queue, n, x, incx, result, dependencies); +} + +cl::sycl::event asum(cl::sycl::queue &queue, std::int64_t n, const std::complex *x, + std::int64_t incx, double *result, + const cl::sycl::vector_class &dependencies) { + return mkl::gpu::dzasum_sycl(&queue, n, x, incx, result, dependencies); +} + +cl::sycl::event asum(cl::sycl::queue &queue, std::int64_t n, const float *x, std::int64_t incx, + float *result, const cl::sycl::vector_class &dependencies) { + return mkl::gpu::sasum_sycl(&queue, n, x, incx, result, dependencies); +} + +cl::sycl::event asum(cl::sycl::queue &queue, std::int64_t n, const double *x, std::int64_t incx, + double *result, const cl::sycl::vector_class &dependencies) { + return mkl::gpu::dasum_sycl(&queue, n, x, incx, result, dependencies); +} + +cl::sycl::event axpy(cl::sycl::queue &queue, std::int64_t n, float alpha, const float *x, + std::int64_t incx, float *y, std::int64_t incy, + const cl::sycl::vector_class &dependencies) { + return mkl::gpu::saxpy_sycl(&queue, n, alpha, x, incx, y, incy, dependencies); +} + +cl::sycl::event axpy(cl::sycl::queue &queue, std::int64_t n, double alpha, const double *x, + std::int64_t incx, double *y, std::int64_t incy, + const cl::sycl::vector_class &dependencies) { + return mkl::gpu::daxpy_sycl(&queue, n, alpha, x, incx, y, incy, dependencies); +} + +cl::sycl::event axpy(cl::sycl::queue &queue, std::int64_t n, std::complex alpha, + const std::complex *x, std::int64_t incx, std::complex *y, + std::int64_t incy, + const cl::sycl::vector_class &dependencies) { + return mkl::gpu::caxpy_sycl(&queue, n, alpha, x, incx, y, incy, dependencies); +} + +cl::sycl::event axpy(cl::sycl::queue &queue, std::int64_t n, std::complex 
alpha, + const std::complex *x, std::int64_t incx, std::complex *y, + std::int64_t incy, + const cl::sycl::vector_class &dependencies) { + return mkl::gpu::zaxpy_sycl(&queue, n, alpha, x, incx, y, incy, dependencies); +} + +cl::sycl::event copy(cl::sycl::queue &queue, std::int64_t n, const float *x, std::int64_t incx, + float *y, std::int64_t incy, + const cl::sycl::vector_class &dependencies) { + return mkl::gpu::scopy_sycl(&queue, n, x, incx, y, incy, dependencies); +} + +cl::sycl::event copy(cl::sycl::queue &queue, std::int64_t n, const double *x, std::int64_t incx, + double *y, std::int64_t incy, + const cl::sycl::vector_class &dependencies) { + return mkl::gpu::dcopy_sycl(&queue, n, x, incx, y, incy, dependencies); +} + +cl::sycl::event copy(cl::sycl::queue &queue, std::int64_t n, const std::complex *x, + std::int64_t incx, std::complex *y, std::int64_t incy, + const cl::sycl::vector_class &dependencies) { + return mkl::gpu::ccopy_sycl(&queue, n, x, incx, y, incy, dependencies); +} + +cl::sycl::event copy(cl::sycl::queue &queue, std::int64_t n, const std::complex *x, + std::int64_t incx, std::complex *y, std::int64_t incy, + const cl::sycl::vector_class &dependencies) { + return mkl::gpu::zcopy_sycl(&queue, n, x, incx, y, incy, dependencies); +} + +cl::sycl::event dot(cl::sycl::queue &queue, std::int64_t n, const float *x, std::int64_t incx, + const float *y, std::int64_t incy, float *result, + const cl::sycl::vector_class &dependencies) { + return mkl::gpu::sdot_sycl(&queue, n, x, incx, y, incy, result, dependencies); +} + +cl::sycl::event dot(cl::sycl::queue &queue, std::int64_t n, const double *x, std::int64_t incx, + const double *y, std::int64_t incy, double *result, + const cl::sycl::vector_class &dependencies) { + return mkl::gpu::ddot_sycl(&queue, n, x, incx, y, incy, result, dependencies); +} + +cl::sycl::event sdsdot(cl::sycl::queue &queue, std::int64_t n, float sb, const float *x, + std::int64_t incx, const float *y, std::int64_t incy, float 
*result, + const cl::sycl::vector_class &dependencies) { + return mkl::gpu::sdsdot_sycl(&queue, n, sb, x, incx, y, incy, result, dependencies); +} + +cl::sycl::event dot(cl::sycl::queue &queue, std::int64_t n, const float *x, std::int64_t incx, + const float *y, std::int64_t incy, double *result, + const cl::sycl::vector_class &dependencies) { + return mkl::gpu::dsdot_sycl(&queue, n, x, incx, y, incy, result, dependencies); +} + +cl::sycl::event dotc(cl::sycl::queue &queue, std::int64_t n, const std::complex *x, + std::int64_t incx, const std::complex *y, std::int64_t incy, + std::complex *result, + const cl::sycl::vector_class &dependencies) { + return mkl::gpu::cdotc_sycl(&queue, n, x, incx, y, incy, result, dependencies); +} + +cl::sycl::event dotc(cl::sycl::queue &queue, std::int64_t n, const std::complex *x, + std::int64_t incx, const std::complex *y, std::int64_t incy, + std::complex *result, + const cl::sycl::vector_class &dependencies) { + return mkl::gpu::zdotc_sycl(&queue, n, x, incx, y, incy, result, dependencies); +} + +cl::sycl::event dotu(cl::sycl::queue &queue, std::int64_t n, const std::complex *x, + std::int64_t incx, const std::complex *y, std::int64_t incy, + std::complex *result, + const cl::sycl::vector_class &dependencies) { + return mkl::gpu::cdotu_sycl(&queue, n, x, incx, y, incy, result, dependencies); +} + +cl::sycl::event dotu(cl::sycl::queue &queue, std::int64_t n, const std::complex *x, + std::int64_t incx, const std::complex *y, std::int64_t incy, + std::complex *result, + const cl::sycl::vector_class &dependencies) { + return mkl::gpu::zdotu_sycl(&queue, n, x, incx, y, incy, result, dependencies); +} + +cl::sycl::event nrm2(cl::sycl::queue &queue, std::int64_t n, const std::complex *x, + std::int64_t incx, float *result, + const cl::sycl::vector_class &dependencies) { + return mkl::gpu::scnrm2_sycl(&queue, n, x, incx, result, dependencies); +} + +cl::sycl::event nrm2(cl::sycl::queue &queue, std::int64_t n, const std::complex *x, + 
std::int64_t incx, double *result, + const cl::sycl::vector_class &dependencies) { + return mkl::gpu::dznrm2_sycl(&queue, n, x, incx, result, dependencies); +} + +cl::sycl::event nrm2(cl::sycl::queue &queue, std::int64_t n, const float *x, std::int64_t incx, + float *result, const cl::sycl::vector_class &dependencies) { + return mkl::gpu::snrm2_sycl(&queue, n, x, incx, result, dependencies); +} + +cl::sycl::event nrm2(cl::sycl::queue &queue, std::int64_t n, const double *x, std::int64_t incx, + double *result, const cl::sycl::vector_class &dependencies) { + return mkl::gpu::dnrm2_sycl(&queue, n, x, incx, result, dependencies); +} + +cl::sycl::event rot(cl::sycl::queue &queue, std::int64_t n, std::complex *x, + std::int64_t incx, std::complex *y, std::int64_t incy, float c, float s, + const cl::sycl::vector_class &dependencies) { + return mkl::gpu::csrot_sycl(&queue, n, x, incx, y, incy, c, s, dependencies); +} + +cl::sycl::event rot(cl::sycl::queue &queue, std::int64_t n, std::complex *x, + std::int64_t incx, std::complex *y, std::int64_t incy, double c, + double s, const cl::sycl::vector_class &dependencies) { + return mkl::gpu::zdrot_sycl(&queue, n, x, incx, y, incy, c, s, dependencies); +} + +cl::sycl::event rot(cl::sycl::queue &queue, std::int64_t n, float *x, std::int64_t incx, float *y, + std::int64_t incy, float c, float s, + const cl::sycl::vector_class &dependencies) { + return mkl::gpu::srot_sycl(&queue, n, x, incx, y, incy, c, s, dependencies); +} + +cl::sycl::event rot(cl::sycl::queue &queue, std::int64_t n, double *x, std::int64_t incx, double *y, + std::int64_t incy, double c, double s, + const cl::sycl::vector_class &dependencies) { + return mkl::gpu::drot_sycl(&queue, n, x, incx, y, incy, c, s, dependencies); +} + +cl::sycl::event rotg(cl::sycl::queue &queue, float *a, float *b, float *c, float *s, + const cl::sycl::vector_class &dependencies) { + return mkl::gpu::srotg_sycl(&queue, a, b, c, s, dependencies); +} + +cl::sycl::event 
rotg(cl::sycl::queue &queue, double *a, double *b, double *c, double *s, + const cl::sycl::vector_class &dependencies) { + return mkl::gpu::drotg_sycl(&queue, a, b, c, s, dependencies); +} + +cl::sycl::event rotg(cl::sycl::queue &queue, std::complex *a, std::complex *b, + float *c, std::complex *s, + const cl::sycl::vector_class &dependencies) { + return mkl::gpu::crotg_sycl(&queue, a, b, c, s, dependencies); +} + +cl::sycl::event rotg(cl::sycl::queue &queue, std::complex *a, std::complex *b, + double *c, std::complex *s, + const cl::sycl::vector_class &dependencies) { + return mkl::gpu::zrotg_sycl(&queue, a, b, c, s, dependencies); +} + +cl::sycl::event rotm(cl::sycl::queue &queue, std::int64_t n, float *x, std::int64_t incx, float *y, + std::int64_t incy, float *param, + const cl::sycl::vector_class &dependencies) { + return mkl::gpu::srotm_sycl(&queue, n, x, incx, y, incy, param, dependencies); +} + +cl::sycl::event rotm(cl::sycl::queue &queue, std::int64_t n, double *x, std::int64_t incx, + double *y, std::int64_t incy, double *param, + const cl::sycl::vector_class &dependencies) { + return mkl::gpu::drotm_sycl(&queue, n, x, incx, y, incy, param, dependencies); +} + +cl::sycl::event rotmg(cl::sycl::queue &queue, float *d1, float *d2, float *x1, float y1, + float *param, const cl::sycl::vector_class &dependencies) { + return mkl::gpu::srotmg_sycl(&queue, d1, d2, x1, y1, param, dependencies); +} + +cl::sycl::event rotmg(cl::sycl::queue &queue, double *d1, double *d2, double *x1, double y1, + double *param, const cl::sycl::vector_class &dependencies) { + return mkl::gpu::drotmg_sycl(&queue, d1, d2, x1, y1, param, dependencies); +} + +cl::sycl::event scal(cl::sycl::queue &queue, std::int64_t n, float alpha, float *x, + std::int64_t incx, + const cl::sycl::vector_class &dependencies) { + return mkl::gpu::sscal_sycl(&queue, n, alpha, x, incx, dependencies); +} + +cl::sycl::event scal(cl::sycl::queue &queue, std::int64_t n, double alpha, double *x, + std::int64_t 
incx, + const cl::sycl::vector_class &dependencies) { + return mkl::gpu::dscal_sycl(&queue, n, alpha, x, incx, dependencies); +} + +cl::sycl::event scal(cl::sycl::queue &queue, std::int64_t n, std::complex alpha, + std::complex *x, std::int64_t incx, + const cl::sycl::vector_class &dependencies) { + return mkl::gpu::cscal_sycl(&queue, n, alpha, x, incx, dependencies); +} + +cl::sycl::event scal(cl::sycl::queue &queue, std::int64_t n, std::complex alpha, + std::complex *x, std::int64_t incx, + const cl::sycl::vector_class &dependencies) { + return mkl::gpu::zscal_sycl(&queue, n, alpha, x, incx, dependencies); +} + +cl::sycl::event scal(cl::sycl::queue &queue, std::int64_t n, float alpha, std::complex *x, + std::int64_t incx, + const cl::sycl::vector_class &dependencies) { + return mkl::gpu::csscal_sycl(&queue, n, alpha, x, incx, dependencies); +} + +cl::sycl::event scal(cl::sycl::queue &queue, std::int64_t n, double alpha, std::complex *x, + std::int64_t incx, + const cl::sycl::vector_class &dependencies) { + return mkl::gpu::zdscal_sycl(&queue, n, alpha, x, incx, dependencies); +} + +cl::sycl::event swap(cl::sycl::queue &queue, std::int64_t n, float *x, std::int64_t incx, float *y, + std::int64_t incy, + const cl::sycl::vector_class &dependencies) { + return mkl::gpu::sswap_sycl(&queue, n, x, incx, y, incy, dependencies); +} + +cl::sycl::event swap(cl::sycl::queue &queue, std::int64_t n, double *x, std::int64_t incx, + double *y, std::int64_t incy, + const cl::sycl::vector_class &dependencies) { + return mkl::gpu::dswap_sycl(&queue, n, x, incx, y, incy, dependencies); +} + +cl::sycl::event swap(cl::sycl::queue &queue, std::int64_t n, std::complex *x, + std::int64_t incx, std::complex *y, std::int64_t incy, + const cl::sycl::vector_class &dependencies) { + return mkl::gpu::cswap_sycl(&queue, n, x, incx, y, incy, dependencies); +} + +cl::sycl::event swap(cl::sycl::queue &queue, std::int64_t n, std::complex *x, + std::int64_t incx, std::complex *y, std::int64_t incy, 
+ const cl::sycl::vector_class &dependencies) { + return mkl::gpu::zswap_sycl(&queue, n, x, incx, y, incy, dependencies); +} + +cl::sycl::event iamax(cl::sycl::queue &queue, std::int64_t n, const float *x, std::int64_t incx, + std::int64_t *result, + const cl::sycl::vector_class &dependencies) { + return mkl::gpu::isamax_sycl(&queue, n, x, incx, result, dependencies); +} + +cl::sycl::event iamax(cl::sycl::queue &queue, std::int64_t n, const double *x, std::int64_t incx, + std::int64_t *result, + const cl::sycl::vector_class &dependencies) { + return mkl::gpu::idamax_sycl(&queue, n, x, incx, result, dependencies); +} + +cl::sycl::event iamax(cl::sycl::queue &queue, std::int64_t n, const std::complex *x, + std::int64_t incx, std::int64_t *result, + const cl::sycl::vector_class &dependencies) { + return mkl::gpu::icamax_sycl(&queue, n, x, incx, result, dependencies); +} + +cl::sycl::event iamax(cl::sycl::queue &queue, std::int64_t n, const std::complex *x, + std::int64_t incx, std::int64_t *result, + const cl::sycl::vector_class &dependencies) { + return mkl::gpu::izamax_sycl(&queue, n, x, incx, result, dependencies); +} + +cl::sycl::event iamin(cl::sycl::queue &queue, std::int64_t n, const float *x, std::int64_t incx, + std::int64_t *result, + const cl::sycl::vector_class &dependencies) { + return mkl::gpu::isamin_sycl(&queue, n, x, incx, result, dependencies); +} + +cl::sycl::event iamin(cl::sycl::queue &queue, std::int64_t n, const double *x, std::int64_t incx, + std::int64_t *result, + const cl::sycl::vector_class &dependencies) { + return mkl::gpu::idamin_sycl(&queue, n, x, incx, result, dependencies); +} + +cl::sycl::event iamin(cl::sycl::queue &queue, std::int64_t n, const std::complex *x, + std::int64_t incx, std::int64_t *result, + const cl::sycl::vector_class &dependencies) { + return mkl::gpu::icamin_sycl(&queue, n, x, incx, result, dependencies); +} + +cl::sycl::event iamin(cl::sycl::queue &queue, std::int64_t n, const std::complex *x, + std::int64_t incx, 
std::int64_t *result, + const cl::sycl::vector_class &dependencies) { + return mkl::gpu::izamin_sycl(&queue, n, x, incx, result, dependencies); +} + +cl::sycl::event gemm_batch(cl::sycl::queue &queue, transpose transa, transpose transb, + std::int64_t m, std::int64_t n, std::int64_t k, float alpha, + const float *a, std::int64_t lda, std::int64_t stride_a, const float *b, + std::int64_t ldb, std::int64_t stride_b, float beta, float *c, + std::int64_t ldc, std::int64_t stride_c, std::int64_t batch_size, + const cl::sycl::vector_class &dependencies) { + return mkl::gpu::sgemm_batch(queue, mkl::cblas_convert(transa), mkl::cblas_convert(transb), m, + n, k, alpha, a, lda, stride_a, b, ldb, stride_b, beta, c, ldc, + stride_c, batch_size, dependencies); +} + +cl::sycl::event gemm_batch(cl::sycl::queue &queue, transpose transa, transpose transb, + std::int64_t m, std::int64_t n, std::int64_t k, double alpha, + const double *a, std::int64_t lda, std::int64_t stride_a, + const double *b, std::int64_t ldb, std::int64_t stride_b, double beta, + double *c, std::int64_t ldc, std::int64_t stride_c, + std::int64_t batch_size, + const cl::sycl::vector_class &dependencies) { + return mkl::gpu::dgemm_batch(queue, mkl::cblas_convert(transa), mkl::cblas_convert(transb), m, + n, k, alpha, a, lda, stride_a, b, ldb, stride_b, beta, c, ldc, + stride_c, batch_size, dependencies); +} + +cl::sycl::event gemm_batch(cl::sycl::queue &queue, transpose transa, transpose transb, + std::int64_t m, std::int64_t n, std::int64_t k, + std::complex alpha, const std::complex *a, + std::int64_t lda, std::int64_t stride_a, const std::complex *b, + std::int64_t ldb, std::int64_t stride_b, std::complex beta, + std::complex *c, std::int64_t ldc, std::int64_t stride_c, + std::int64_t batch_size, + const cl::sycl::vector_class &dependencies) { + return mkl::gpu::cgemm_batch(queue, mkl::cblas_convert(transa), mkl::cblas_convert(transb), m, + n, k, alpha, a, lda, stride_a, b, ldb, stride_b, beta, c, ldc, + 
stride_c, batch_size, dependencies);
+}
+
+// Strided-batch GEMM, double-precision complex: converts the transpose enums with
+// mkl::cblas_convert and forwards every argument unchanged to mkl::gpu::zgemm_batch.
+cl::sycl::event gemm_batch(cl::sycl::queue &queue, transpose transa, transpose transb,
+                           std::int64_t m, std::int64_t n, std::int64_t k,
+                           std::complex<double> alpha, const std::complex<double> *a,
+                           std::int64_t lda, std::int64_t stride_a, const std::complex<double> *b,
+                           std::int64_t ldb, std::int64_t stride_b, std::complex<double> beta,
+                           std::complex<double> *c, std::int64_t ldc, std::int64_t stride_c,
+                           std::int64_t batch_size,
+                           const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+    return mkl::gpu::zgemm_batch(queue, mkl::cblas_convert(transa), mkl::cblas_convert(transb), m,
+                                 n, k, alpha, a, lda, stride_a, b, ldb, stride_b, beta, c, ldc,
+                                 stride_c, batch_size, dependencies);
+}
+
+// Collapses a list of events into one heap-allocated event.
+//
+// Ownership: the returned pointer comes from `new` and must be deleted by the caller. The
+// events referenced by `prereqs` are only read here; they are never freed.
+//
+//  - _WIN64 builds: blocks on every prerequisite, then returns a default-constructed event.
+//  - other builds:  submits an empty single_task that depends on every prerequisite and
+//                   returns that task's event. With no prerequisites nothing is submitted and
+//                   a default-constructed event is returned.
+cl::sycl::event *coalesce_events(cl::sycl::queue &queue, std::vector<cl::sycl::event *> &prereqs) {
+#ifdef _WIN64
+    for (cl::sycl::event *prereq : prereqs)
+        prereq->wait();
+    return new cl::sycl::event();
+#else
+    if (prereqs.empty())
+        return new cl::sycl::event();
+    return new cl::sycl::event(queue.submit([&](cl::sycl::handler &cgh) {
+        for (cl::sycl::event *prereq : prereqs)
+            cgh.depends_on(*prereq);
+        cgh.single_task([]() {});
+    }));
+#endif
+}
+
+// Merges per-group events (held by value) through coalesce_events() and returns the merged
+// event by value. The heap event handed back by coalesce_events() is deleted here, so callers
+// own no raw pointers and nothing leaks if a backend call throws part-way through a loop.
+static cl::sycl::event coalesce_group_events(cl::sycl::queue &queue,
+                                             std::vector<cl::sycl::event> &events) {
+    // coalesce_events() takes pointers; these point into `events`, which outlives the call.
+    std::vector<cl::sycl::event *> prereqs;
+    prereqs.reserve(events.size());
+    for (cl::sycl::event &ev : events)
+        prereqs.push_back(&ev);
+
+    cl::sycl::event *merged = coalesce_events(queue, prereqs);
+    cl::sycl::event result = *merged;
+    delete merged;
+    return result;
+}
+
+// Grouped-batch GEMM, single precision. Issues one mkl::gpu::sgemm_batch call per group using
+// that group's transa/transb/m/n/k/alpha/lda/ldb/beta/ldc entry, then returns one event that
+// stands for all of them. `total_group_size` is the running sum of the preceding group sizes
+// (presumably the index of the group's first matrix in a/b/c - confirm against mkl::gpu).
+cl::sycl::event gemm_batch(cl::sycl::queue &queue, transpose *transa, transpose *transb,
+                           std::int64_t *m, std::int64_t *n, std::int64_t *k, float *alpha,
+                           const float **a, std::int64_t *lda, const float **b, std::int64_t *ldb,
+                           float *beta, float **c, std::int64_t *ldc, std::int64_t group_count,
+                           std::int64_t *group_size,
+                           const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+    std::vector<cl::sycl::event> group_events;
+    group_events.reserve(group_count);
+    std::int64_t total_group_size = 0;
+    for (std::int64_t i = 0; i < group_count; i++) {
+        group_events.emplace_back(mkl::gpu::sgemm_batch(
+            queue, mkl::cblas_convert(transa[i]), mkl::cblas_convert(transb[i]), m[i], n[i], k[i],
+            alpha[i], a, lda[i], b, ldb[i], beta[i], c, ldc[i], total_group_size, group_size[i],
+            dependencies));
+        total_group_size += group_size[i];
+    }
+    return coalesce_group_events(queue, group_events);
+}
+
+// Grouped-batch GEMM, double precision. Same scheme as the float overload, dispatching each
+// group to mkl::gpu::dgemm_batch.
+cl::sycl::event gemm_batch(cl::sycl::queue &queue, transpose *transa, transpose *transb,
+                           std::int64_t *m, std::int64_t *n, std::int64_t *k, double *alpha,
+                           const double **a, std::int64_t *lda, const double **b, std::int64_t *ldb,
+                           double *beta, double **c, std::int64_t *ldc, std::int64_t group_count,
+                           std::int64_t *group_size,
+                           const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+    std::vector<cl::sycl::event> group_events;
+    group_events.reserve(group_count);
+    std::int64_t total_group_size = 0;
+    for (std::int64_t i = 0; i < group_count; i++) {
+        group_events.emplace_back(mkl::gpu::dgemm_batch(
+            queue, mkl::cblas_convert(transa[i]), mkl::cblas_convert(transb[i]), m[i], n[i], k[i],
+            alpha[i], a, lda[i], b, ldb[i], beta[i], c, ldc[i], total_group_size, group_size[i],
+            dependencies));
+        total_group_size += group_size[i];
+    }
+    return coalesce_group_events(queue, group_events);
+}
+
+// Grouped-batch GEMM, single-precision complex. Same scheme as the float overload, dispatching
+// each group to mkl::gpu::cgemm_batch.
+cl::sycl::event gemm_batch(cl::sycl::queue &queue, transpose *transa, transpose *transb,
+                           std::int64_t *m, std::int64_t *n, std::int64_t *k,
+                           std::complex<float> *alpha, const std::complex<float> **a,
+                           std::int64_t *lda, const std::complex<float> **b, std::int64_t *ldb,
+                           std::complex<float> *beta, std::complex<float> **c, std::int64_t *ldc,
+                           std::int64_t group_count, std::int64_t *group_size,
+                           const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+    std::vector<cl::sycl::event> group_events;
+    group_events.reserve(group_count);
+    std::int64_t total_group_size = 0;
+    for (std::int64_t i = 0; i < group_count; i++) {
+        group_events.emplace_back(mkl::gpu::cgemm_batch(
+            queue, mkl::cblas_convert(transa[i]), mkl::cblas_convert(transb[i]), m[i], n[i], k[i],
+            alpha[i], a, lda[i], b, ldb[i], beta[i], c, ldc[i], total_group_size, group_size[i],
+            dependencies));
+        total_group_size += group_size[i];
+    }
+    return coalesce_group_events(queue, group_events);
+}
+
+// Grouped-batch GEMM, double-precision complex. Same scheme as the float overload, dispatching
+// each group to mkl::gpu::zgemm_batch.
+cl::sycl::event gemm_batch(cl::sycl::queue &queue, transpose *transa, transpose *transb,
+                           std::int64_t *m, std::int64_t *n, std::int64_t *k,
+                           std::complex<double> *alpha, const std::complex<double> **a,
+                           std::int64_t *lda, const std::complex<double> **b, std::int64_t *ldb,
+                           std::complex<double> *beta, std::complex<double> **c, std::int64_t *ldc,
+                           std::int64_t group_count, std::int64_t *group_size,
+                           const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+    std::vector<cl::sycl::event> group_events;
+    group_events.reserve(group_count);
+    std::int64_t total_group_size = 0;
+    for (std::int64_t i = 0; i < group_count; i++) {
+        group_events.emplace_back(mkl::gpu::zgemm_batch(
+            queue, mkl::cblas_convert(transa[i]), mkl::cblas_convert(transb[i]), m[i], n[i], k[i],
+            alpha[i], a, lda[i], b, ldb[i], beta[i], c, ldc[i], total_group_size, group_size[i],
+            dependencies));
+        total_group_size += group_size[i];
+    }
+    return coalesce_group_events(queue, group_events);
+}
+
+// Grouped-batch AXPY, single precision. Issues one mkl::gpu::saxpy_batch call per group with
+// that group's n/alpha/incx/incy entry and returns one event standing for all of them.
+cl::sycl::event axpy_batch(cl::sycl::queue &queue, std::int64_t *n, float *alpha, const float **x,
+                           std::int64_t *incx, float **y, std::int64_t *incy,
+                           std::int64_t group_count, std::int64_t *group_size,
+                           const cl::sycl::vector_class<cl::sycl::event> &dependencies) {
+    std::vector<cl::sycl::event> group_events;
+    group_events.reserve(group_count);
+    std::int64_t total_group_size = 0;
+    for (std::int64_t i = 0; i < group_count; i++) {
+        // NOTE(review): saxpy_batch is given (group_size[i], total_group_size) while the
+        // gemm_batch calls pass (total_group_size, group_size[i]); the order is kept as
+        // written - confirm against the mkl::gpu::saxpy_batch declaration.
+        group_events.emplace_back(
+            mkl::gpu::saxpy_batch(queue, n[i], alpha[i], x, incx[i], y, incy[i], group_size[i],
+                                  total_group_size, dependencies));
+        total_group_size += group_size[i];
+    }
+    return coalesce_group_events(queue, group_events);
+}
+
+cl::sycl::event axpy_batch(cl::sycl::queue &queue, std::int64_t *n, double *alpha, const double **x, +
std::int64_t *incx, double **y, std::int64_t *incy, + std::int64_t group_count, std::int64_t *group_size, + const cl::sycl::vector_class &dependencies) { + std::vector coalesced_events; + coalesced_events.reserve(group_count); + std::int64_t total_group_size = 0; + for (std::int64_t i = 0; i < group_count; i++) { + cl::sycl::event *axpy_batch_event = new cl::sycl::event( + mkl::gpu::daxpy_batch(queue, n[i], alpha[i], x, incx[i], y, incy[i], group_size[i], + total_group_size, dependencies)); + coalesced_events.push_back(axpy_batch_event); + total_group_size += group_size[i]; + } + return *coalesce_events(queue, coalesced_events); +} + +cl::sycl::event axpy_batch(cl::sycl::queue &queue, std::int64_t *n, std::complex *alpha, + const std::complex **x, std::int64_t *incx, + std::complex **y, std::int64_t *incy, std::int64_t group_count, + std::int64_t *group_size, + const cl::sycl::vector_class &dependencies) { + std::vector coalesced_events; + coalesced_events.reserve(group_count); + std::int64_t total_group_size = 0; + for (std::int64_t i = 0; i < group_count; i++) { + cl::sycl::event *axpy_batch_event = new cl::sycl::event( + mkl::gpu::caxpy_batch(queue, n[i], alpha[i], x, incx[i], y, incy[i], group_size[i], + total_group_size, dependencies)); + coalesced_events.push_back(axpy_batch_event); + total_group_size += group_size[i]; + } + return *coalesce_events(queue, coalesced_events); +} + +cl::sycl::event axpy_batch(cl::sycl::queue &queue, std::int64_t *n, std::complex *alpha, + const std::complex **x, std::int64_t *incx, + std::complex **y, std::int64_t *incy, std::int64_t group_count, + std::int64_t *group_size, + const cl::sycl::vector_class &dependencies) { + std::vector coalesced_events; + coalesced_events.reserve(group_count); + std::int64_t total_group_size = 0; + for (std::int64_t i = 0; i < group_count; i++) { + cl::sycl::event *axpy_batch_event = new cl::sycl::event( + mkl::gpu::zaxpy_batch(queue, n[i], alpha[i], x, incx[i], y, incy[i], group_size[i], + 
total_group_size, dependencies)); + coalesced_events.push_back(axpy_batch_event); + total_group_size += group_size[i]; + } + return *coalesce_events(queue, coalesced_events); +} + +cl::sycl::event gemmt(cl::sycl::queue &queue, uplo upper_lower, transpose transa, transpose transb, + std::int64_t n, std::int64_t k, float alpha, const float *a, std::int64_t lda, + const float *b, std::int64_t ldb, float beta, float *c, std::int64_t ldc, + const cl::sycl::vector_class &dependencies) { + return mkl::gpu::sgemmt_sycl(&queue, mkl::cblas_convert(upper_lower), + mkl::cblas_convert(transa), mkl::cblas_convert(transb), n, k, + alpha, a, lda, b, ldb, beta, c, ldc, dependencies); +} + +cl::sycl::event gemmt(cl::sycl::queue &queue, uplo upper_lower, transpose transa, transpose transb, + std::int64_t n, std::int64_t k, double alpha, const double *a, + std::int64_t lda, const double *b, std::int64_t ldb, double beta, double *c, + std::int64_t ldc, + const cl::sycl::vector_class &dependencies) { + return mkl::gpu::dgemmt_sycl(&queue, mkl::cblas_convert(upper_lower), + mkl::cblas_convert(transa), mkl::cblas_convert(transb), n, k, + alpha, a, lda, b, ldb, beta, c, ldc, dependencies); +} + +cl::sycl::event gemmt(cl::sycl::queue &queue, uplo upper_lower, transpose transa, transpose transb, + std::int64_t n, std::int64_t k, std::complex alpha, + const std::complex *a, std::int64_t lda, const std::complex *b, + std::int64_t ldb, std::complex beta, std::complex *c, + std::int64_t ldc, + const cl::sycl::vector_class &dependencies) { + return mkl::gpu::cgemmt_sycl(&queue, mkl::cblas_convert(upper_lower), + mkl::cblas_convert(transa), mkl::cblas_convert(transb), n, k, + alpha, a, lda, b, ldb, beta, c, ldc, dependencies); +} + +cl::sycl::event gemmt(cl::sycl::queue &queue, uplo upper_lower, transpose transa, transpose transb, + std::int64_t n, std::int64_t k, std::complex alpha, + const std::complex *a, std::int64_t lda, + const std::complex *b, std::int64_t ldb, std::complex beta, + 
std::complex *c, std::int64_t ldc, + const cl::sycl::vector_class &dependencies) { + return mkl::gpu::zgemmt_sycl(&queue, mkl::cblas_convert(upper_lower), + mkl::cblas_convert(transa), mkl::cblas_convert(transb), n, k, + alpha, a, lda, b, ldb, beta, c, ldc, dependencies); +} + } //namespace internal } //namespace mklgpu } //namespace onemkl diff --git a/src/blas/backends/mklgpu/mkl_internal_blas_gpu_wrappers.hpp b/src/blas/backends/mklgpu/mkl_internal_blas_gpu_wrappers.hpp index e8de7044b..41288a12d 100644 --- a/src/blas/backends/mklgpu/mkl_internal_blas_gpu_wrappers.hpp +++ b/src/blas/backends/mklgpu/mkl_internal_blas_gpu_wrappers.hpp @@ -28,526 +28,644 @@ namespace onemkl { namespace mklgpu { namespace internal { + +// Buffer APIs + void gemm(cl::sycl::queue &queue, onemkl::transpose transa, onemkl::transpose transb, std::int64_t m, std::int64_t n, std::int64_t k, float alpha, cl::sycl::buffer &a, std::int64_t lda, cl::sycl::buffer &b, std::int64_t ldb, float beta, cl::sycl::buffer &c, std::int64_t ldc); + void gemm(cl::sycl::queue &queue, onemkl::transpose transa, onemkl::transpose transb, std::int64_t m, std::int64_t n, std::int64_t k, double alpha, cl::sycl::buffer &a, std::int64_t lda, cl::sycl::buffer &b, std::int64_t ldb, double beta, cl::sycl::buffer &c, std::int64_t ldc); + void gemm(cl::sycl::queue &queue, onemkl::transpose transa, onemkl::transpose transb, std::int64_t m, std::int64_t n, std::int64_t k, std::complex alpha, cl::sycl::buffer, 1> &a, std::int64_t lda, cl::sycl::buffer, 1> &b, std::int64_t ldb, std::complex beta, cl::sycl::buffer, 1> &c, std::int64_t ldc); + void gemm(cl::sycl::queue &queue, onemkl::transpose transa, onemkl::transpose transb, std::int64_t m, std::int64_t n, std::int64_t k, std::complex alpha, cl::sycl::buffer, 1> &a, std::int64_t lda, cl::sycl::buffer, 1> &b, std::int64_t ldb, std::complex beta, cl::sycl::buffer, 1> &c, std::int64_t ldc); + void symm(cl::sycl::queue &queue, onemkl::side left_right, onemkl::uplo upper_lower, 
std::int64_t m, std::int64_t n, float alpha, cl::sycl::buffer &a, std::int64_t lda, cl::sycl::buffer &b, std::int64_t ldb, float beta, cl::sycl::buffer &c, std::int64_t ldc); + void symm(cl::sycl::queue &queue, onemkl::side left_right, onemkl::uplo upper_lower, std::int64_t m, std::int64_t n, double alpha, cl::sycl::buffer &a, std::int64_t lda, cl::sycl::buffer &b, std::int64_t ldb, double beta, cl::sycl::buffer &c, std::int64_t ldc); + void symm(cl::sycl::queue &queue, onemkl::side left_right, onemkl::uplo upper_lower, std::int64_t m, std::int64_t n, std::complex alpha, cl::sycl::buffer, 1> &a, std::int64_t lda, cl::sycl::buffer, 1> &b, std::int64_t ldb, std::complex beta, cl::sycl::buffer, 1> &c, std::int64_t ldc); + void symm(cl::sycl::queue &queue, onemkl::side left_right, onemkl::uplo upper_lower, std::int64_t m, std::int64_t n, std::complex alpha, cl::sycl::buffer, 1> &a, std::int64_t lda, cl::sycl::buffer, 1> &b, std::int64_t ldb, std::complex beta, cl::sycl::buffer, 1> &c, std::int64_t ldc); + void hemm(cl::sycl::queue &queue, onemkl::side left_right, onemkl::uplo upper_lower, std::int64_t m, std::int64_t n, std::complex alpha, cl::sycl::buffer, 1> &a, std::int64_t lda, cl::sycl::buffer, 1> &b, std::int64_t ldb, std::complex beta, cl::sycl::buffer, 1> &c, std::int64_t ldc); + void hemm(cl::sycl::queue &queue, onemkl::side left_right, onemkl::uplo upper_lower, std::int64_t m, std::int64_t n, std::complex alpha, cl::sycl::buffer, 1> &a, std::int64_t lda, cl::sycl::buffer, 1> &b, std::int64_t ldb, std::complex beta, cl::sycl::buffer, 1> &c, std::int64_t ldc); + void syrk(cl::sycl::queue &queue, onemkl::uplo upper_lower, onemkl::transpose trans, std::int64_t n, std::int64_t k, float alpha, cl::sycl::buffer &a, std::int64_t lda, float beta, cl::sycl::buffer &c, std::int64_t ldc); + void syrk(cl::sycl::queue &queue, onemkl::uplo upper_lower, onemkl::transpose trans, std::int64_t n, std::int64_t k, double alpha, cl::sycl::buffer &a, std::int64_t lda, double beta, 
cl::sycl::buffer &c, std::int64_t ldc); + void syrk(cl::sycl::queue &queue, onemkl::uplo upper_lower, onemkl::transpose trans, std::int64_t n, std::int64_t k, std::complex alpha, cl::sycl::buffer, 1> &a, std::int64_t lda, std::complex beta, cl::sycl::buffer, 1> &c, std::int64_t ldc); + void syrk(cl::sycl::queue &queue, onemkl::uplo upper_lower, onemkl::transpose trans, std::int64_t n, std::int64_t k, std::complex alpha, cl::sycl::buffer, 1> &a, std::int64_t lda, std::complex beta, cl::sycl::buffer, 1> &c, std::int64_t ldc); + void herk(cl::sycl::queue &queue, onemkl::uplo upper_lower, onemkl::transpose trans, std::int64_t n, std::int64_t k, float alpha, cl::sycl::buffer, 1> &a, std::int64_t lda, float beta, cl::sycl::buffer, 1> &c, std::int64_t ldc); + void herk(cl::sycl::queue &queue, onemkl::uplo upper_lower, onemkl::transpose trans, std::int64_t n, std::int64_t k, double alpha, cl::sycl::buffer, 1> &a, std::int64_t lda, double beta, cl::sycl::buffer, 1> &c, std::int64_t ldc); + void syr2k(cl::sycl::queue &queue, onemkl::uplo upper_lower, onemkl::transpose trans, std::int64_t n, std::int64_t k, float alpha, cl::sycl::buffer &a, std::int64_t lda, cl::sycl::buffer &b, std::int64_t ldb, float beta, cl::sycl::buffer &c, std::int64_t ldc); + void syr2k(cl::sycl::queue &queue, onemkl::uplo upper_lower, onemkl::transpose trans, std::int64_t n, std::int64_t k, double alpha, cl::sycl::buffer &a, std::int64_t lda, cl::sycl::buffer &b, std::int64_t ldb, double beta, cl::sycl::buffer &c, std::int64_t ldc); + void syr2k(cl::sycl::queue &queue, onemkl::uplo upper_lower, onemkl::transpose trans, std::int64_t n, std::int64_t k, std::complex alpha, cl::sycl::buffer, 1> &a, std::int64_t lda, cl::sycl::buffer, 1> &b, std::int64_t ldb, std::complex beta, cl::sycl::buffer, 1> &c, std::int64_t ldc); + void syr2k(cl::sycl::queue &queue, onemkl::uplo upper_lower, onemkl::transpose trans, std::int64_t n, std::int64_t k, std::complex alpha, cl::sycl::buffer, 1> &a, std::int64_t lda, 
cl::sycl::buffer, 1> &b, std::int64_t ldb, std::complex beta, cl::sycl::buffer, 1> &c, std::int64_t ldc); + void her2k(cl::sycl::queue &queue, onemkl::uplo upper_lower, onemkl::transpose trans, std::int64_t n, std::int64_t k, std::complex alpha, cl::sycl::buffer, 1> &a, std::int64_t lda, cl::sycl::buffer, 1> &b, std::int64_t ldb, float beta, cl::sycl::buffer, 1> &c, std::int64_t ldc); + void her2k(cl::sycl::queue &queue, onemkl::uplo upper_lower, onemkl::transpose trans, std::int64_t n, std::int64_t k, std::complex alpha, cl::sycl::buffer, 1> &a, std::int64_t lda, cl::sycl::buffer, 1> &b, std::int64_t ldb, double beta, cl::sycl::buffer, 1> &c, std::int64_t ldc); + void trmm(cl::sycl::queue &queue, onemkl::side left_right, onemkl::uplo upper_lower, onemkl::transpose transa, onemkl::diag unit_diag, std::int64_t m, std::int64_t n, float alpha, cl::sycl::buffer &a, std::int64_t lda, cl::sycl::buffer &b, std::int64_t ldb); + void trmm(cl::sycl::queue &queue, onemkl::side left_right, onemkl::uplo upper_lower, onemkl::transpose transa, onemkl::diag unit_diag, std::int64_t m, std::int64_t n, double alpha, cl::sycl::buffer &a, std::int64_t lda, cl::sycl::buffer &b, std::int64_t ldb); + void trmm(cl::sycl::queue &queue, onemkl::side left_right, onemkl::uplo upper_lower, onemkl::transpose transa, onemkl::diag unit_diag, std::int64_t m, std::int64_t n, std::complex alpha, cl::sycl::buffer, 1> &a, std::int64_t lda, cl::sycl::buffer, 1> &b, std::int64_t ldb); + void trmm(cl::sycl::queue &queue, onemkl::side left_right, onemkl::uplo upper_lower, onemkl::transpose transa, onemkl::diag unit_diag, std::int64_t m, std::int64_t n, std::complex alpha, cl::sycl::buffer, 1> &a, std::int64_t lda, cl::sycl::buffer, 1> &b, std::int64_t ldb); + void trsm(cl::sycl::queue &queue, onemkl::side left_right, onemkl::uplo upper_lower, onemkl::transpose transa, onemkl::diag unit_diag, std::int64_t m, std::int64_t n, float alpha, cl::sycl::buffer &a, std::int64_t lda, cl::sycl::buffer &b, 
std::int64_t ldb); + void trsm(cl::sycl::queue &queue, onemkl::side left_right, onemkl::uplo upper_lower, onemkl::transpose transa, onemkl::diag unit_diag, std::int64_t m, std::int64_t n, double alpha, cl::sycl::buffer &a, std::int64_t lda, cl::sycl::buffer &b, std::int64_t ldb); + void trsm(cl::sycl::queue &queue, onemkl::side left_right, onemkl::uplo upper_lower, onemkl::transpose transa, onemkl::diag unit_diag, std::int64_t m, std::int64_t n, std::complex alpha, cl::sycl::buffer, 1> &a, std::int64_t lda, cl::sycl::buffer, 1> &b, std::int64_t ldb); + void trsm(cl::sycl::queue &queue, onemkl::side left_right, onemkl::uplo upper_lower, onemkl::transpose transa, onemkl::diag unit_diag, std::int64_t m, std::int64_t n, std::complex alpha, cl::sycl::buffer, 1> &a, std::int64_t lda, cl::sycl::buffer, 1> &b, std::int64_t ldb); + void gemv(cl::sycl::queue &queue, onemkl::transpose trans, std::int64_t m, std::int64_t n, float alpha, cl::sycl::buffer &a, std::int64_t lda, cl::sycl::buffer &x, std::int64_t incx, float beta, cl::sycl::buffer &y, std::int64_t incy); + void gemv(cl::sycl::queue &queue, onemkl::transpose trans, std::int64_t m, std::int64_t n, double alpha, cl::sycl::buffer &a, std::int64_t lda, cl::sycl::buffer &x, std::int64_t incx, double beta, cl::sycl::buffer &y, std::int64_t incy); + void gemv(cl::sycl::queue &queue, onemkl::transpose trans, std::int64_t m, std::int64_t n, std::complex alpha, cl::sycl::buffer, 1> &a, std::int64_t lda, cl::sycl::buffer, 1> &x, std::int64_t incx, std::complex beta, cl::sycl::buffer, 1> &y, std::int64_t incy); + void gemv(cl::sycl::queue &queue, onemkl::transpose trans, std::int64_t m, std::int64_t n, std::complex alpha, cl::sycl::buffer, 1> &a, std::int64_t lda, cl::sycl::buffer, 1> &x, std::int64_t incx, std::complex beta, cl::sycl::buffer, 1> &y, std::int64_t incy); + void gbmv(cl::sycl::queue &queue, onemkl::transpose trans, std::int64_t m, std::int64_t n, std::int64_t kl, std::int64_t ku, float alpha, cl::sycl::buffer &a, 
std::int64_t lda, cl::sycl::buffer &x, std::int64_t incx, float beta, cl::sycl::buffer &y, std::int64_t incy); + void gbmv(cl::sycl::queue &queue, onemkl::transpose trans, std::int64_t m, std::int64_t n, std::int64_t kl, std::int64_t ku, double alpha, cl::sycl::buffer &a, std::int64_t lda, cl::sycl::buffer &x, std::int64_t incx, double beta, cl::sycl::buffer &y, std::int64_t incy); + void gbmv(cl::sycl::queue &queue, onemkl::transpose trans, std::int64_t m, std::int64_t n, std::int64_t kl, std::int64_t ku, std::complex alpha, cl::sycl::buffer, 1> &a, std::int64_t lda, cl::sycl::buffer, 1> &x, std::int64_t incx, std::complex beta, cl::sycl::buffer, 1> &y, std::int64_t incy); + void gbmv(cl::sycl::queue &queue, onemkl::transpose trans, std::int64_t m, std::int64_t n, std::int64_t kl, std::int64_t ku, std::complex alpha, cl::sycl::buffer, 1> &a, std::int64_t lda, cl::sycl::buffer, 1> &x, std::int64_t incx, std::complex beta, cl::sycl::buffer, 1> &y, std::int64_t incy); + void ger(cl::sycl::queue &queue, std::int64_t m, std::int64_t n, float alpha, cl::sycl::buffer &x, std::int64_t incx, cl::sycl::buffer &y, std::int64_t incy, cl::sycl::buffer &a, std::int64_t lda); + void ger(cl::sycl::queue &queue, std::int64_t m, std::int64_t n, double alpha, cl::sycl::buffer &x, std::int64_t incx, cl::sycl::buffer &y, std::int64_t incy, cl::sycl::buffer &a, std::int64_t lda); + void gerc(cl::sycl::queue &queue, std::int64_t m, std::int64_t n, std::complex alpha, cl::sycl::buffer, 1> &x, std::int64_t incx, cl::sycl::buffer, 1> &y, std::int64_t incy, cl::sycl::buffer, 1> &a, std::int64_t lda); + void gerc(cl::sycl::queue &queue, std::int64_t m, std::int64_t n, std::complex alpha, cl::sycl::buffer, 1> &x, std::int64_t incx, cl::sycl::buffer, 1> &y, std::int64_t incy, cl::sycl::buffer, 1> &a, std::int64_t lda); + void geru(cl::sycl::queue &queue, std::int64_t m, std::int64_t n, std::complex alpha, cl::sycl::buffer, 1> &x, std::int64_t incx, cl::sycl::buffer, 1> &y, std::int64_t incy, 
cl::sycl::buffer, 1> &a, std::int64_t lda); + void geru(cl::sycl::queue &queue, std::int64_t m, std::int64_t n, std::complex alpha, cl::sycl::buffer, 1> &x, std::int64_t incx, cl::sycl::buffer, 1> &y, std::int64_t incy, cl::sycl::buffer, 1> &a, std::int64_t lda); + void hbmv(cl::sycl::queue &queue, onemkl::uplo uplo, std::int64_t n, std::int64_t k, std::complex alpha, cl::sycl::buffer, 1> &a, std::int64_t lda, cl::sycl::buffer, 1> &x, std::int64_t incx, std::complex beta, cl::sycl::buffer, 1> &y, std::int64_t incy); + void hbmv(cl::sycl::queue &queue, onemkl::uplo uplo, std::int64_t n, std::int64_t k, std::complex alpha, cl::sycl::buffer, 1> &a, std::int64_t lda, cl::sycl::buffer, 1> &x, std::int64_t incx, std::complex beta, cl::sycl::buffer, 1> &y, std::int64_t incy); + void hemv(cl::sycl::queue &queue, onemkl::uplo uplo, std::int64_t n, std::complex alpha, cl::sycl::buffer, 1> &a, std::int64_t lda, cl::sycl::buffer, 1> &x, std::int64_t incx, std::complex beta, cl::sycl::buffer, 1> &y, std::int64_t incy); + void hemv(cl::sycl::queue &queue, onemkl::uplo uplo, std::int64_t n, std::complex alpha, cl::sycl::buffer, 1> &a, std::int64_t lda, cl::sycl::buffer, 1> &x, std::int64_t incx, std::complex beta, cl::sycl::buffer, 1> &y, std::int64_t incy); + void her(cl::sycl::queue &queue, onemkl::uplo upplo, std::int64_t n, float alpha, cl::sycl::buffer, 1> &x, std::int64_t incx, cl::sycl::buffer, 1> &a, std::int64_t lda); + void her(cl::sycl::queue &queue, onemkl::uplo upplo, std::int64_t n, double alpha, cl::sycl::buffer, 1> &x, std::int64_t incx, cl::sycl::buffer, 1> &a, std::int64_t lda); + void her2(cl::sycl::queue &queue, onemkl::uplo upplo, std::int64_t n, std::complex alpha, cl::sycl::buffer, 1> &x, std::int64_t incx, cl::sycl::buffer, 1> &y, std::int64_t incy, cl::sycl::buffer, 1> &a, std::int64_t lda); + void her2(cl::sycl::queue &queue, onemkl::uplo upplo, std::int64_t n, std::complex alpha, cl::sycl::buffer, 1> &x, std::int64_t incx, cl::sycl::buffer, 1> &y, 
std::int64_t incy, cl::sycl::buffer, 1> &a, std::int64_t lda); + void hpmv(cl::sycl::queue &queue, onemkl::uplo uplo, std::int64_t n, std::complex alpha, cl::sycl::buffer, 1> &a, cl::sycl::buffer, 1> &x, std::int64_t incx, std::complex beta, cl::sycl::buffer, 1> &y, std::int64_t incy); + void hpmv(cl::sycl::queue &queue, onemkl::uplo uplo, std::int64_t n, std::complex alpha, cl::sycl::buffer, 1> &a, cl::sycl::buffer, 1> &x, std::int64_t incx, std::complex beta, cl::sycl::buffer, 1> &y, std::int64_t incy); + void hpr(cl::sycl::queue &queue, onemkl::uplo upplo, std::int64_t n, float alpha, cl::sycl::buffer, 1> &x, std::int64_t incx, cl::sycl::buffer, 1> &a); + void hpr(cl::sycl::queue &queue, onemkl::uplo upplo, std::int64_t n, double alpha, cl::sycl::buffer, 1> &x, std::int64_t incx, cl::sycl::buffer, 1> &a); + void hpr2(cl::sycl::queue &queue, onemkl::uplo upplo, std::int64_t n, std::complex alpha, cl::sycl::buffer, 1> &x, std::int64_t incx, cl::sycl::buffer, 1> &y, std::int64_t incy, cl::sycl::buffer, 1> &a); + void hpr2(cl::sycl::queue &queue, onemkl::uplo upplo, std::int64_t n, std::complex alpha, cl::sycl::buffer, 1> &x, std::int64_t incx, cl::sycl::buffer, 1> &y, std::int64_t incy, cl::sycl::buffer, 1> &a); + void sbmv(cl::sycl::queue &queue, onemkl::uplo uplo, std::int64_t n, std::int64_t k, float alpha, cl::sycl::buffer &a, std::int64_t lda, cl::sycl::buffer &x, std::int64_t incx, float beta, cl::sycl::buffer &y, std::int64_t incy); + void sbmv(cl::sycl::queue &queue, onemkl::uplo uplo, std::int64_t n, std::int64_t k, double alpha, cl::sycl::buffer &a, std::int64_t lda, cl::sycl::buffer &x, std::int64_t incx, double beta, cl::sycl::buffer &y, std::int64_t incy); + void spmv(cl::sycl::queue &queue, onemkl::uplo uplo, std::int64_t n, float alpha, cl::sycl::buffer &a, cl::sycl::buffer &x, std::int64_t incx, float beta, cl::sycl::buffer &y, std::int64_t incy); + void spmv(cl::sycl::queue &queue, onemkl::uplo uplo, std::int64_t n, double alpha, cl::sycl::buffer 
&a, cl::sycl::buffer &x, std::int64_t incx, double beta, cl::sycl::buffer &y, std::int64_t incy); + void spr(cl::sycl::queue &queue, onemkl::uplo upplo, std::int64_t n, float alpha, cl::sycl::buffer &x, std::int64_t incx, cl::sycl::buffer &a); + void spr(cl::sycl::queue &queue, onemkl::uplo upplo, std::int64_t n, double alpha, cl::sycl::buffer &x, std::int64_t incx, cl::sycl::buffer &a); + void spr2(cl::sycl::queue &queue, onemkl::uplo upplo, std::int64_t n, float alpha, cl::sycl::buffer &x, std::int64_t incx, cl::sycl::buffer &y, std::int64_t incy, cl::sycl::buffer &a); + void spr2(cl::sycl::queue &queue, onemkl::uplo upplo, std::int64_t n, double alpha, cl::sycl::buffer &x, std::int64_t incx, cl::sycl::buffer &y, std::int64_t incy, cl::sycl::buffer &a); + void symv(cl::sycl::queue &queue, onemkl::uplo uplo, std::int64_t n, float alpha, cl::sycl::buffer &a, std::int64_t lda, cl::sycl::buffer &x, std::int64_t incx, float beta, cl::sycl::buffer &y, std::int64_t incy); + void symv(cl::sycl::queue &queue, onemkl::uplo uplo, std::int64_t n, double alpha, cl::sycl::buffer &a, std::int64_t lda, cl::sycl::buffer &x, std::int64_t incx, double beta, cl::sycl::buffer &y, std::int64_t incy); + void syr(cl::sycl::queue &queue, onemkl::uplo upplo, std::int64_t n, float alpha, cl::sycl::buffer &x, std::int64_t incx, cl::sycl::buffer &a, std::int64_t lda); + void syr(cl::sycl::queue &queue, onemkl::uplo upplo, std::int64_t n, double alpha, cl::sycl::buffer &x, std::int64_t incx, cl::sycl::buffer &a, std::int64_t lda); + void syr2(cl::sycl::queue &queue, onemkl::uplo upplo, std::int64_t n, float alpha, cl::sycl::buffer &x, std::int64_t incx, cl::sycl::buffer &y, std::int64_t incy, cl::sycl::buffer &a, std::int64_t lda); + void syr2(cl::sycl::queue &queue, onemkl::uplo upplo, std::int64_t n, double alpha, cl::sycl::buffer &x, std::int64_t incx, cl::sycl::buffer &y, std::int64_t incy, cl::sycl::buffer &a, std::int64_t lda); + void tbmv(cl::sycl::queue &queue, onemkl::uplo upplo, 
onemkl::transpose trans, onemkl::diag diag, std::int64_t n, std::int64_t k, cl::sycl::buffer &a, std::int64_t lda, cl::sycl::buffer &x, std::int64_t incx); + void tbmv(cl::sycl::queue &queue, onemkl::uplo upplo, onemkl::transpose trans, onemkl::diag diag, std::int64_t n, std::int64_t k, cl::sycl::buffer &a, std::int64_t lda, cl::sycl::buffer &x, std::int64_t incx); + void tbmv(cl::sycl::queue &queue, onemkl::uplo upplo, onemkl::transpose trans, onemkl::diag diag, std::int64_t n, std::int64_t k, cl::sycl::buffer, 1> &a, std::int64_t lda, cl::sycl::buffer, 1> &x, std::int64_t incx); + void tbmv(cl::sycl::queue &queue, onemkl::uplo upplo, onemkl::transpose trans, onemkl::diag diag, std::int64_t n, std::int64_t k, cl::sycl::buffer, 1> &a, std::int64_t lda, cl::sycl::buffer, 1> &x, std::int64_t incx); + void tbsv(cl::sycl::queue &queue, onemkl::uplo upplo, onemkl::transpose trans, onemkl::diag diag, std::int64_t n, std::int64_t k, cl::sycl::buffer &a, std::int64_t lda, cl::sycl::buffer &x, std::int64_t incx); + void tbsv(cl::sycl::queue &queue, onemkl::uplo upplo, onemkl::transpose trans, onemkl::diag diag, std::int64_t n, std::int64_t k, cl::sycl::buffer &a, std::int64_t lda, cl::sycl::buffer &x, std::int64_t incx); + void tbsv(cl::sycl::queue &queue, onemkl::uplo upplo, onemkl::transpose trans, onemkl::diag diag, std::int64_t n, std::int64_t k, cl::sycl::buffer, 1> &a, std::int64_t lda, cl::sycl::buffer, 1> &x, std::int64_t incx); + void tbsv(cl::sycl::queue &queue, onemkl::uplo upplo, onemkl::transpose trans, onemkl::diag diag, std::int64_t n, std::int64_t k, cl::sycl::buffer, 1> &a, std::int64_t lda, cl::sycl::buffer, 1> &x, std::int64_t incx); + void tpmv(cl::sycl::queue &queue, onemkl::uplo upplo, onemkl::transpose trans, onemkl::diag diag, std::int64_t n, cl::sycl::buffer &a, cl::sycl::buffer &x, std::int64_t incx); + void tpmv(cl::sycl::queue &queue, onemkl::uplo upplo, onemkl::transpose trans, onemkl::diag diag, std::int64_t n, cl::sycl::buffer &a, 
cl::sycl::buffer &x, std::int64_t incx); + void tpmv(cl::sycl::queue &queue, onemkl::uplo upplo, onemkl::transpose trans, onemkl::diag diag, std::int64_t n, cl::sycl::buffer, 1> &a, cl::sycl::buffer, 1> &x, std::int64_t incx); + void tpmv(cl::sycl::queue &queue, onemkl::uplo upplo, onemkl::transpose trans, onemkl::diag diag, std::int64_t n, cl::sycl::buffer, 1> &a, cl::sycl::buffer, 1> &x, std::int64_t incx); + void tpsv(cl::sycl::queue &queue, onemkl::uplo upplo, onemkl::transpose trans, onemkl::diag diag, std::int64_t n, cl::sycl::buffer &a, cl::sycl::buffer &x, std::int64_t incx); + void tpsv(cl::sycl::queue &queue, onemkl::uplo upplo, onemkl::transpose trans, onemkl::diag diag, std::int64_t n, cl::sycl::buffer &a, cl::sycl::buffer &x, std::int64_t incx); + void tpsv(cl::sycl::queue &queue, onemkl::uplo upplo, onemkl::transpose trans, onemkl::diag diag, std::int64_t n, cl::sycl::buffer, 1> &a, cl::sycl::buffer, 1> &x, std::int64_t incx); + void tpsv(cl::sycl::queue &queue, onemkl::uplo upplo, onemkl::transpose trans, onemkl::diag diag, std::int64_t n, cl::sycl::buffer, 1> &a, cl::sycl::buffer, 1> &x, std::int64_t incx); + void trmv(cl::sycl::queue &queue, onemkl::uplo upplo, onemkl::transpose trans, onemkl::diag diag, std::int64_t n, cl::sycl::buffer &a, std::int64_t lda, cl::sycl::buffer &x, std::int64_t incx); + void trmv(cl::sycl::queue &queue, onemkl::uplo upplo, onemkl::transpose trans, onemkl::diag diag, std::int64_t n, cl::sycl::buffer &a, std::int64_t lda, cl::sycl::buffer &x, std::int64_t incx); + void trmv(cl::sycl::queue &queue, onemkl::uplo upplo, onemkl::transpose trans, onemkl::diag diag, std::int64_t n, cl::sycl::buffer, 1> &a, std::int64_t lda, cl::sycl::buffer, 1> &x, std::int64_t incx); + void trmv(cl::sycl::queue &queue, onemkl::uplo upplo, onemkl::transpose trans, onemkl::diag diag, std::int64_t n, cl::sycl::buffer, 1> &a, std::int64_t lda, cl::sycl::buffer, 1> &x, std::int64_t incx); + void trsv(cl::sycl::queue &queue, onemkl::uplo upplo, 
onemkl::transpose trans, onemkl::diag diag, std::int64_t n, cl::sycl::buffer &a, std::int64_t lda, cl::sycl::buffer &x, std::int64_t incx); + void trsv(cl::sycl::queue &queue, onemkl::uplo upplo, onemkl::transpose trans, onemkl::diag diag, std::int64_t n, cl::sycl::buffer &a, std::int64_t lda, cl::sycl::buffer &x, std::int64_t incx); + void trsv(cl::sycl::queue &queue, onemkl::uplo upplo, onemkl::transpose trans, onemkl::diag diag, std::int64_t n, cl::sycl::buffer, 1> &a, std::int64_t lda, cl::sycl::buffer, 1> &x, std::int64_t incx); + void trsv(cl::sycl::queue &queue, onemkl::uplo upplo, onemkl::transpose trans, onemkl::diag diag, std::int64_t n, cl::sycl::buffer, 1> &a, std::int64_t lda, cl::sycl::buffer, 1> &x, std::int64_t incx); + void asum(cl::sycl::queue &queue, std::int64_t n, cl::sycl::buffer, 1> &x, std::int64_t incx, cl::sycl::buffer &result); + void asum(cl::sycl::queue &queue, std::int64_t n, cl::sycl::buffer, 1> &x, std::int64_t incx, cl::sycl::buffer &result); + void asum(cl::sycl::queue &queue, std::int64_t n, cl::sycl::buffer &x, std::int64_t incx, cl::sycl::buffer &result); + void asum(cl::sycl::queue &queue, std::int64_t n, cl::sycl::buffer &x, std::int64_t incx, cl::sycl::buffer &result); + void axpy(cl::sycl::queue &queue, std::int64_t n, float alpha, cl::sycl::buffer &x, std::int64_t incx, cl::sycl::buffer &y, std::int64_t incy); + void axpy(cl::sycl::queue &queue, std::int64_t n, double alpha, cl::sycl::buffer &x, std::int64_t incx, cl::sycl::buffer &y, std::int64_t incy); + void axpy(cl::sycl::queue &queue, std::int64_t n, std::complex alpha, cl::sycl::buffer, 1> &x, std::int64_t incx, cl::sycl::buffer, 1> &y, std::int64_t incy); + void axpy(cl::sycl::queue &queue, std::int64_t n, std::complex alpha, cl::sycl::buffer, 1> &x, std::int64_t incx, cl::sycl::buffer, 1> &y, std::int64_t incy); + void copy(cl::sycl::queue &queue, std::int64_t n, cl::sycl::buffer &x, std::int64_t incx, cl::sycl::buffer &y, std::int64_t incy); + void 
copy(cl::sycl::queue &queue, std::int64_t n, cl::sycl::buffer &x, std::int64_t incx, cl::sycl::buffer &y, std::int64_t incy); + void copy(cl::sycl::queue &queue, std::int64_t n, cl::sycl::buffer, 1> &x, std::int64_t incx, cl::sycl::buffer, 1> &y, std::int64_t incy); + void copy(cl::sycl::queue &queue, std::int64_t n, cl::sycl::buffer, 1> &x, std::int64_t incx, cl::sycl::buffer, 1> &y, std::int64_t incy); + void dot(cl::sycl::queue &queue, std::int64_t n, cl::sycl::buffer &x, std::int64_t incx, cl::sycl::buffer &y, std::int64_t incy, cl::sycl::buffer &result); + void dot(cl::sycl::queue &queue, std::int64_t n, cl::sycl::buffer &x, std::int64_t incx, cl::sycl::buffer &y, std::int64_t incy, cl::sycl::buffer &result); + void sdsdot(cl::sycl::queue &queue, std::int64_t n, float sb, cl::sycl::buffer &x, std::int64_t incx, cl::sycl::buffer &y, std::int64_t incy, cl::sycl::buffer &result); + void dot(cl::sycl::queue &queue, std::int64_t n, cl::sycl::buffer &x, std::int64_t incx, cl::sycl::buffer &y, std::int64_t incy, cl::sycl::buffer &result); + void dotc(cl::sycl::queue &queue, std::int64_t n, cl::sycl::buffer, 1> &x, std::int64_t incx, cl::sycl::buffer, 1> &y, std::int64_t incy, cl::sycl::buffer, 1> &result); + void dotc(cl::sycl::queue &queue, std::int64_t n, cl::sycl::buffer, 1> &x, std::int64_t incx, cl::sycl::buffer, 1> &y, std::int64_t incy, cl::sycl::buffer, 1> &result); + void dotu(cl::sycl::queue &queue, std::int64_t n, cl::sycl::buffer, 1> &x, std::int64_t incx, cl::sycl::buffer, 1> &y, std::int64_t incy, cl::sycl::buffer, 1> &result); + void dotu(cl::sycl::queue &queue, std::int64_t n, cl::sycl::buffer, 1> &x, std::int64_t incx, cl::sycl::buffer, 1> &y, std::int64_t incy, cl::sycl::buffer, 1> &result); + void nrm2(cl::sycl::queue &queue, std::int64_t n, cl::sycl::buffer, 1> &x, std::int64_t incx, cl::sycl::buffer &result); + void nrm2(cl::sycl::queue &queue, std::int64_t n, cl::sycl::buffer, 1> &x, std::int64_t incx, cl::sycl::buffer &result); + void 
nrm2(cl::sycl::queue &queue, std::int64_t n, cl::sycl::buffer &x, std::int64_t incx, cl::sycl::buffer &result); + void nrm2(cl::sycl::queue &queue, std::int64_t n, cl::sycl::buffer &x, std::int64_t incx, cl::sycl::buffer &result); + void rot(cl::sycl::queue &queue, std::int64_t n, cl::sycl::buffer, 1> &x, std::int64_t incx, cl::sycl::buffer, 1> &y, std::int64_t incy, float c, float s); + void rot(cl::sycl::queue &queue, std::int64_t n, cl::sycl::buffer, 1> &x, std::int64_t incx, cl::sycl::buffer, 1> &y, std::int64_t incy, double c, double s); + void rot(cl::sycl::queue &queue, std::int64_t n, cl::sycl::buffer &x, std::int64_t incx, cl::sycl::buffer &y, std::int64_t incy, float c, float s); + void rot(cl::sycl::queue &queue, std::int64_t n, cl::sycl::buffer &x, std::int64_t incx, cl::sycl::buffer &y, std::int64_t incy, double c, double s); + void rotg(cl::sycl::queue &queue, cl::sycl::buffer &a, cl::sycl::buffer &b, cl::sycl::buffer &c, cl::sycl::buffer &s); + void rotg(cl::sycl::queue &queue, cl::sycl::buffer &a, cl::sycl::buffer &b, cl::sycl::buffer &c, cl::sycl::buffer &s); + void rotg(cl::sycl::queue &queue, cl::sycl::buffer, 1> &a, cl::sycl::buffer, 1> &b, cl::sycl::buffer &c, cl::sycl::buffer, 1> &s); + void rotg(cl::sycl::queue &queue, cl::sycl::buffer, 1> &a, cl::sycl::buffer, 1> &b, cl::sycl::buffer &c, cl::sycl::buffer, 1> &s); + void rotm(cl::sycl::queue &queue, std::int64_t n, cl::sycl::buffer &x, std::int64_t incx, cl::sycl::buffer &y, std::int64_t incy, cl::sycl::buffer ¶m); + void rotm(cl::sycl::queue &queue, std::int64_t n, cl::sycl::buffer &x, std::int64_t incx, cl::sycl::buffer &y, std::int64_t incy, cl::sycl::buffer ¶m); + void rotmg(cl::sycl::queue &queue, cl::sycl::buffer &d1, cl::sycl::buffer &d2, cl::sycl::buffer &x1, float y1, cl::sycl::buffer ¶m); + void rotmg(cl::sycl::queue &queue, cl::sycl::buffer &d1, cl::sycl::buffer &d2, cl::sycl::buffer &x1, double y1, cl::sycl::buffer ¶m); + void scal(cl::sycl::queue &queue, std::int64_t n, float 
alpha, cl::sycl::buffer &x, std::int64_t incx); + void scal(cl::sycl::queue &queue, std::int64_t n, double alpha, cl::sycl::buffer &x, std::int64_t incx); + void scal(cl::sycl::queue &queue, std::int64_t n, std::complex alpha, cl::sycl::buffer, 1> &x, std::int64_t incx); + void scal(cl::sycl::queue &queue, std::int64_t n, std::complex alpha, cl::sycl::buffer, 1> &x, std::int64_t incx); + void scal(cl::sycl::queue &queue, std::int64_t n, float alpha, cl::sycl::buffer, 1> &x, std::int64_t incx); + void scal(cl::sycl::queue &queue, std::int64_t n, double alpha, cl::sycl::buffer, 1> &x, std::int64_t incx); + void swap(cl::sycl::queue &queue, std::int64_t n, cl::sycl::buffer &x, std::int64_t incx, cl::sycl::buffer &y, std::int64_t incy); + void swap(cl::sycl::queue &queue, std::int64_t n, cl::sycl::buffer &x, std::int64_t incx, cl::sycl::buffer &y, std::int64_t incy); + void swap(cl::sycl::queue &queue, std::int64_t n, cl::sycl::buffer, 1> &x, std::int64_t incx, cl::sycl::buffer, 1> &y, std::int64_t incy); + void swap(cl::sycl::queue &queue, std::int64_t n, cl::sycl::buffer, 1> &x, std::int64_t incx, cl::sycl::buffer, 1> &y, std::int64_t incy); + void iamax(cl::sycl::queue &queue, std::int64_t n, cl::sycl::buffer &x, std::int64_t incx, cl::sycl::buffer &result); + void iamax(cl::sycl::queue &queue, std::int64_t n, cl::sycl::buffer &x, std::int64_t incx, cl::sycl::buffer &result); + void iamax(cl::sycl::queue &queue, std::int64_t n, cl::sycl::buffer, 1> &x, std::int64_t incx, cl::sycl::buffer &result); + void iamax(cl::sycl::queue &queue, std::int64_t n, cl::sycl::buffer, 1> &x, std::int64_t incx, cl::sycl::buffer &result); + void iamin(cl::sycl::queue &queue, std::int64_t n, cl::sycl::buffer &x, std::int64_t incx, cl::sycl::buffer &result); + void iamin(cl::sycl::queue &queue, std::int64_t n, cl::sycl::buffer &x, std::int64_t incx, cl::sycl::buffer &result); + void iamin(cl::sycl::queue &queue, std::int64_t n, cl::sycl::buffer, 1> &x, std::int64_t incx, cl::sycl::buffer 
&result); + void iamin(cl::sycl::queue &queue, std::int64_t n, cl::sycl::buffer, 1> &x, std::int64_t incx, cl::sycl::buffer &result); -void gemm_batch(cl::sycl::queue &queue, cl::sycl::buffer &transa, - cl::sycl::buffer &transb, - cl::sycl::buffer &m, cl::sycl::buffer &n, - cl::sycl::buffer &k, cl::sycl::buffer &alpha, - cl::sycl::buffer &a, cl::sycl::buffer &lda, - cl::sycl::buffer &b, cl::sycl::buffer &ldb, - cl::sycl::buffer &beta, cl::sycl::buffer &c, - cl::sycl::buffer &ldc, std::int64_t group_count, - cl::sycl::buffer &group_size); -void gemm_batch(cl::sycl::queue &queue, cl::sycl::buffer &transa, - cl::sycl::buffer &transb, - cl::sycl::buffer &m, cl::sycl::buffer &n, - cl::sycl::buffer &k, cl::sycl::buffer &alpha, - cl::sycl::buffer &a, cl::sycl::buffer &lda, - cl::sycl::buffer &b, cl::sycl::buffer &ldb, - cl::sycl::buffer &beta, cl::sycl::buffer &c, - cl::sycl::buffer &ldc, std::int64_t group_count, - cl::sycl::buffer &group_size); -void gemm_batch(cl::sycl::queue &queue, cl::sycl::buffer &transa, - cl::sycl::buffer &transb, - cl::sycl::buffer &m, cl::sycl::buffer &n, - cl::sycl::buffer &k, - cl::sycl::buffer, 1> &alpha, - cl::sycl::buffer, 1> &a, cl::sycl::buffer &lda, - cl::sycl::buffer, 1> &b, cl::sycl::buffer &ldb, - cl::sycl::buffer, 1> &beta, - cl::sycl::buffer, 1> &c, cl::sycl::buffer &ldc, - std::int64_t group_count, cl::sycl::buffer &group_size); -void gemm_batch( - cl::sycl::queue &queue, cl::sycl::buffer &transa, - cl::sycl::buffer &transb, cl::sycl::buffer &m, - cl::sycl::buffer &n, cl::sycl::buffer &k, - cl::sycl::buffer, 1> &alpha, cl::sycl::buffer, 1> &a, - cl::sycl::buffer &lda, cl::sycl::buffer, 1> &b, - cl::sycl::buffer &ldb, cl::sycl::buffer, 1> &beta, - cl::sycl::buffer, 1> &c, cl::sycl::buffer &ldc, - std::int64_t group_count, cl::sycl::buffer &group_size); + void gemm_batch(cl::sycl::queue &queue, onemkl::transpose transa, onemkl::transpose transb, std::int64_t m, std::int64_t n, std::int64_t k, float alpha, cl::sycl::buffer &a, 
std::int64_t lda, std::int64_t stride_a, cl::sycl::buffer &b, std::int64_t ldb, std::int64_t stride_b, float beta, cl::sycl::buffer &c, std::int64_t ldc, std::int64_t stride_c, std::int64_t batch_size); + void gemm_batch(cl::sycl::queue &queue, onemkl::transpose transa, onemkl::transpose transb, std::int64_t m, std::int64_t n, std::int64_t k, double alpha, cl::sycl::buffer &a, std::int64_t lda, std::int64_t stride_a, cl::sycl::buffer &b, std::int64_t ldb, std::int64_t stride_b, double beta, cl::sycl::buffer &c, std::int64_t ldc, std::int64_t stride_c, std::int64_t batch_size); + void gemm_batch(cl::sycl::queue &queue, onemkl::transpose transa, onemkl::transpose transb, std::int64_t m, std::int64_t n, std::int64_t k, std::complex alpha, cl::sycl::buffer, 1> &a, std::int64_t lda, @@ -555,6 +673,7 @@ void gemm_batch(cl::sycl::queue &queue, onemkl::transpose transa, onemkl::transp std::int64_t ldb, std::int64_t stride_b, std::complex beta, cl::sycl::buffer, 1> &c, std::int64_t ldc, std::int64_t stride_c, std::int64_t batch_size); + void gemm_batch(cl::sycl::queue &queue, onemkl::transpose transa, onemkl::transpose transb, std::int64_t m, std::int64_t n, std::int64_t k, std::complex alpha, cl::sycl::buffer, 1> &a, std::int64_t lda, @@ -562,84 +681,61 @@ void gemm_batch(cl::sycl::queue &queue, onemkl::transpose transa, onemkl::transp std::int64_t ldb, std::int64_t stride_b, std::complex beta, cl::sycl::buffer, 1> &c, std::int64_t ldc, std::int64_t stride_c, std::int64_t batch_size); -void trsm_batch(cl::sycl::queue &queue, cl::sycl::buffer &left_right, - cl::sycl::buffer &upper_lower, - cl::sycl::buffer &trans, - cl::sycl::buffer &unit_diag, cl::sycl::buffer &m, - cl::sycl::buffer &n, cl::sycl::buffer &alpha, - cl::sycl::buffer &a, cl::sycl::buffer &lda, - cl::sycl::buffer &b, cl::sycl::buffer &ldb, - std::int64_t group_count, cl::sycl::buffer &group_size); -void trsm_batch(cl::sycl::queue &queue, cl::sycl::buffer &left_right, - cl::sycl::buffer &upper_lower, - 
cl::sycl::buffer &trans, - cl::sycl::buffer &unit_diag, cl::sycl::buffer &m, - cl::sycl::buffer &n, cl::sycl::buffer &alpha, - cl::sycl::buffer &a, cl::sycl::buffer &lda, - cl::sycl::buffer &b, cl::sycl::buffer &ldb, - std::int64_t group_count, cl::sycl::buffer &group_size); -void trsm_batch(cl::sycl::queue &queue, cl::sycl::buffer &left_right, - cl::sycl::buffer &upper_lower, - cl::sycl::buffer &trans, - cl::sycl::buffer &unit_diag, cl::sycl::buffer &m, - cl::sycl::buffer &n, - cl::sycl::buffer, 1> &alpha, - cl::sycl::buffer, 1> &a, cl::sycl::buffer &lda, - cl::sycl::buffer, 1> &b, cl::sycl::buffer &ldb, - std::int64_t group_count, cl::sycl::buffer &group_size); -void trsm_batch( - cl::sycl::queue &queue, cl::sycl::buffer &left_right, - cl::sycl::buffer &upper_lower, cl::sycl::buffer &trans, - cl::sycl::buffer &unit_diag, cl::sycl::buffer &m, - cl::sycl::buffer &n, cl::sycl::buffer, 1> &alpha, - cl::sycl::buffer, 1> &a, cl::sycl::buffer &lda, - cl::sycl::buffer, 1> &b, cl::sycl::buffer &ldb, - std::int64_t group_count, cl::sycl::buffer &group_size); + void trsm_batch(cl::sycl::queue &queue, onemkl::side left_right, onemkl::uplo upper_lower, onemkl::transpose trans, onemkl::diag unit_diag, std::int64_t m, std::int64_t n, float alpha, cl::sycl::buffer &a, std::int64_t lda, std::int64_t stride_a, cl::sycl::buffer &b, std::int64_t ldb, std::int64_t stride_b, std::int64_t batch_size); + void trsm_batch(cl::sycl::queue &queue, onemkl::side left_right, onemkl::uplo upper_lower, onemkl::transpose trans, onemkl::diag unit_diag, std::int64_t m, std::int64_t n, double alpha, cl::sycl::buffer &a, std::int64_t lda, std::int64_t stride_a, cl::sycl::buffer &b, std::int64_t ldb, std::int64_t stride_b, std::int64_t batch_size); + void trsm_batch(cl::sycl::queue &queue, onemkl::side left_right, onemkl::uplo upper_lower, onemkl::transpose trans, onemkl::diag unit_diag, std::int64_t m, std::int64_t n, std::complex alpha, cl::sycl::buffer, 1> &a, std::int64_t lda, std::int64_t 
stride_a, cl::sycl::buffer, 1> &b, std::int64_t ldb, std::int64_t stride_b, std::int64_t batch_size); + void trsm_batch(cl::sycl::queue &queue, onemkl::side left_right, onemkl::uplo upper_lower, onemkl::transpose trans, onemkl::diag unit_diag, std::int64_t m, std::int64_t n, std::complex alpha, cl::sycl::buffer, 1> &a, std::int64_t lda, std::int64_t stride_a, cl::sycl::buffer, 1> &b, std::int64_t ldb, std::int64_t stride_b, std::int64_t batch_size); + void gemmt(cl::sycl::queue &queue, onemkl::uplo upper_lower, onemkl::transpose transa, onemkl::transpose transb, std::int64_t n, std::int64_t k, float alpha, cl::sycl::buffer &a, std::int64_t lda, cl::sycl::buffer &b, std::int64_t ldb, float beta, cl::sycl::buffer &c, std::int64_t ldc); + void gemmt(cl::sycl::queue &queue, onemkl::uplo upper_lower, onemkl::transpose transa, onemkl::transpose transb, std::int64_t n, std::int64_t k, double alpha, cl::sycl::buffer &a, std::int64_t lda, cl::sycl::buffer &b, std::int64_t ldb, double beta, cl::sycl::buffer &c, std::int64_t ldc); + void gemmt(cl::sycl::queue &queue, onemkl::uplo upper_lower, onemkl::transpose transa, onemkl::transpose transb, std::int64_t n, std::int64_t k, std::complex alpha, cl::sycl::buffer, 1> &a, std::int64_t lda, cl::sycl::buffer, 1> &b, std::int64_t ldb, std::complex beta, cl::sycl::buffer, 1> &c, std::int64_t ldc); + void gemmt(cl::sycl::queue &queue, onemkl::uplo upper_lower, onemkl::transpose transa, onemkl::transpose transb, std::int64_t n, std::int64_t k, std::complex alpha, cl::sycl::buffer, 1> &a, std::int64_t lda, cl::sycl::buffer, 1> &b, std::int64_t ldb, std::complex beta, cl::sycl::buffer, 1> &c, std::int64_t ldc); + void gemm(cl::sycl::queue &queue, onemkl::transpose transa, onemkl::transpose transb, std::int64_t m, std::int64_t n, std::int64_t k, half alpha, cl::sycl::buffer &a, std::int64_t lda, cl::sycl::buffer &b, std::int64_t ldb, half beta, cl::sycl::buffer &c, std::int64_t ldc); + void gemm_ext(cl::sycl::queue &queue, 
onemkl::transpose transa, onemkl::transpose transb, std::int64_t m, std::int64_t n, std::int64_t k, float alpha, cl::sycl::buffer &a, std::int64_t lda, cl::sycl::buffer &b, @@ -651,6 +747,869 @@ void gemm_ext(cl::sycl::queue &queue, onemkl::transpose transa, onemkl::transpos cl::sycl::buffer &b, std::int64_t ldb, uint8_t bo, float beta, cl::sycl::buffer &c, std::int64_t ldc, cl::sycl::buffer &co); +// USM APIs + +cl::sycl::event gemm(cl::sycl::queue &queue, onemkl::transpose transa, onemkl::transpose transb, + std::int64_t m, std::int64_t n, std::int64_t k, float alpha, const float *a, + std::int64_t lda, const float *b, std::int64_t ldb, float beta, float *c, + std::int64_t ldc, + const cl::sycl::vector_class &dependencies = {}); + +cl::sycl::event gemm(cl::sycl::queue &queue, onemkl::transpose transa, onemkl::transpose transb, + std::int64_t m, std::int64_t n, std::int64_t k, double alpha, const double *a, + std::int64_t lda, const double *b, std::int64_t ldb, double beta, double *c, + std::int64_t ldc, + const cl::sycl::vector_class &dependencies = {}); + +cl::sycl::event gemm(cl::sycl::queue &queue, onemkl::transpose transa, onemkl::transpose transb, + std::int64_t m, std::int64_t n, std::int64_t k, std::complex alpha, + const std::complex *a, std::int64_t lda, const std::complex *b, + std::int64_t ldb, std::complex beta, std::complex *c, + std::int64_t ldc, + const cl::sycl::vector_class &dependencies = {}); + +cl::sycl::event gemm(cl::sycl::queue &queue, onemkl::transpose transa, onemkl::transpose transb, + std::int64_t m, std::int64_t n, std::int64_t k, std::complex alpha, + const std::complex *a, std::int64_t lda, const std::complex *b, + std::int64_t ldb, std::complex beta, std::complex *c, + std::int64_t ldc, + const cl::sycl::vector_class &dependencies = {}); + +cl::sycl::event symm(cl::sycl::queue &queue, onemkl::side left_right, onemkl::uplo upper_lower, + std::int64_t m, std::int64_t n, float alpha, const float *a, std::int64_t lda, + const float *b, 
std::int64_t ldb, float beta, float *c, std::int64_t ldc, + const cl::sycl::vector_class &dependencies = {}); + +cl::sycl::event symm(cl::sycl::queue &queue, onemkl::side left_right, onemkl::uplo upper_lower, + std::int64_t m, std::int64_t n, double alpha, const double *a, + std::int64_t lda, const double *b, std::int64_t ldb, double beta, double *c, + std::int64_t ldc, + const cl::sycl::vector_class &dependencies = {}); + +cl::sycl::event symm(cl::sycl::queue &queue, onemkl::side left_right, onemkl::uplo upper_lower, + std::int64_t m, std::int64_t n, std::complex alpha, + const std::complex *a, std::int64_t lda, const std::complex *b, + std::int64_t ldb, std::complex beta, std::complex *c, + std::int64_t ldc, + const cl::sycl::vector_class &dependencies = {}); + +cl::sycl::event symm(cl::sycl::queue &queue, onemkl::side left_right, onemkl::uplo upper_lower, + std::int64_t m, std::int64_t n, std::complex alpha, + const std::complex *a, std::int64_t lda, const std::complex *b, + std::int64_t ldb, std::complex beta, std::complex *c, + std::int64_t ldc, + const cl::sycl::vector_class &dependencies = {}); + +cl::sycl::event hemm(cl::sycl::queue &queue, onemkl::side left_right, onemkl::uplo upper_lower, + std::int64_t m, std::int64_t n, std::complex alpha, + const std::complex *a, std::int64_t lda, const std::complex *b, + std::int64_t ldb, std::complex beta, std::complex *c, + std::int64_t ldc, + const cl::sycl::vector_class &dependencies = {}); + +cl::sycl::event hemm(cl::sycl::queue &queue, onemkl::side left_right, onemkl::uplo upper_lower, + std::int64_t m, std::int64_t n, std::complex alpha, + const std::complex *a, std::int64_t lda, const std::complex *b, + std::int64_t ldb, std::complex beta, std::complex *c, + std::int64_t ldc, + const cl::sycl::vector_class &dependencies = {}); + +cl::sycl::event syrk(cl::sycl::queue &queue, onemkl::uplo upper_lower, onemkl::transpose trans, + std::int64_t n, std::int64_t k, float alpha, const float *a, std::int64_t lda, + 
float beta, float *c, std::int64_t ldc, + const cl::sycl::vector_class &dependencies = {}); + +cl::sycl::event syrk(cl::sycl::queue &queue, onemkl::uplo upper_lower, onemkl::transpose trans, + std::int64_t n, std::int64_t k, double alpha, const double *a, + std::int64_t lda, double beta, double *c, std::int64_t ldc, + const cl::sycl::vector_class &dependencies = {}); + +cl::sycl::event syrk(cl::sycl::queue &queue, onemkl::uplo upper_lower, onemkl::transpose trans, + std::int64_t n, std::int64_t k, std::complex alpha, + const std::complex *a, std::int64_t lda, std::complex beta, + std::complex *c, std::int64_t ldc, + const cl::sycl::vector_class &dependencies = {}); + +cl::sycl::event syrk(cl::sycl::queue &queue, onemkl::uplo upper_lower, onemkl::transpose trans, + std::int64_t n, std::int64_t k, std::complex alpha, + const std::complex *a, std::int64_t lda, std::complex beta, + std::complex *c, std::int64_t ldc, + const cl::sycl::vector_class &dependencies = {}); + +cl::sycl::event herk(cl::sycl::queue &queue, onemkl::uplo upper_lower, onemkl::transpose trans, + std::int64_t n, std::int64_t k, float alpha, const std::complex *a, + std::int64_t lda, float beta, std::complex *c, std::int64_t ldc, + const cl::sycl::vector_class &dependencies = {}); + +cl::sycl::event herk(cl::sycl::queue &queue, onemkl::uplo upper_lower, onemkl::transpose trans, + std::int64_t n, std::int64_t k, double alpha, const std::complex *a, + std::int64_t lda, double beta, std::complex *c, std::int64_t ldc, + const cl::sycl::vector_class &dependencies = {}); + +cl::sycl::event syr2k(cl::sycl::queue &queue, onemkl::uplo upper_lower, onemkl::transpose trans, + std::int64_t n, std::int64_t k, float alpha, const float *a, std::int64_t lda, + const float *b, std::int64_t ldb, float beta, float *c, std::int64_t ldc, + const cl::sycl::vector_class &dependencies = {}); + +cl::sycl::event syr2k(cl::sycl::queue &queue, onemkl::uplo upper_lower, onemkl::transpose trans, + std::int64_t n, std::int64_t k, 
double alpha, const double *a, + std::int64_t lda, const double *b, std::int64_t ldb, double beta, double *c, + std::int64_t ldc, + const cl::sycl::vector_class &dependencies = {}); + +cl::sycl::event syr2k(cl::sycl::queue &queue, onemkl::uplo upper_lower, onemkl::transpose trans, + std::int64_t n, std::int64_t k, std::complex alpha, + const std::complex *a, std::int64_t lda, const std::complex *b, + std::int64_t ldb, std::complex beta, std::complex *c, + std::int64_t ldc, + const cl::sycl::vector_class &dependencies = {}); + +cl::sycl::event syr2k(cl::sycl::queue &queue, onemkl::uplo upper_lower, onemkl::transpose trans, + std::int64_t n, std::int64_t k, std::complex alpha, + const std::complex *a, std::int64_t lda, + const std::complex *b, std::int64_t ldb, std::complex beta, + std::complex *c, std::int64_t ldc, + const cl::sycl::vector_class &dependencies = {}); + +cl::sycl::event her2k(cl::sycl::queue &queue, onemkl::uplo upper_lower, onemkl::transpose trans, + std::int64_t n, std::int64_t k, std::complex alpha, + const std::complex *a, std::int64_t lda, const std::complex *b, + std::int64_t ldb, float beta, std::complex *c, std::int64_t ldc, + const cl::sycl::vector_class &dependencies = {}); + +cl::sycl::event her2k(cl::sycl::queue &queue, onemkl::uplo upper_lower, onemkl::transpose trans, + std::int64_t n, std::int64_t k, std::complex alpha, + const std::complex *a, std::int64_t lda, + const std::complex *b, std::int64_t ldb, double beta, + std::complex *c, std::int64_t ldc, + const cl::sycl::vector_class &dependencies = {}); + +cl::sycl::event trmm(cl::sycl::queue &queue, onemkl::side left_right, onemkl::uplo upper_lower, + onemkl::transpose transa, onemkl::diag unit_diag, std::int64_t m, + std::int64_t n, float alpha, const float *a, std::int64_t lda, float *b, + std::int64_t ldb, + const cl::sycl::vector_class &dependencies = {}); + +cl::sycl::event trmm(cl::sycl::queue &queue, onemkl::side left_right, onemkl::uplo upper_lower, + onemkl::transpose transa, 
onemkl::diag unit_diag, std::int64_t m, + std::int64_t n, double alpha, const double *a, std::int64_t lda, double *b, + std::int64_t ldb, + const cl::sycl::vector_class &dependencies = {}); + +cl::sycl::event trmm(cl::sycl::queue &queue, onemkl::side left_right, onemkl::uplo upper_lower, + onemkl::transpose transa, onemkl::diag unit_diag, std::int64_t m, + std::int64_t n, std::complex alpha, const std::complex *a, + std::int64_t lda, std::complex *b, std::int64_t ldb, + const cl::sycl::vector_class &dependencies = {}); + +cl::sycl::event trmm(cl::sycl::queue &queue, onemkl::side left_right, onemkl::uplo upper_lower, + onemkl::transpose transa, onemkl::diag unit_diag, std::int64_t m, + std::int64_t n, std::complex alpha, const std::complex *a, + std::int64_t lda, std::complex *b, std::int64_t ldb, + const cl::sycl::vector_class &dependencies = {}); + +cl::sycl::event trsm(cl::sycl::queue &queue, onemkl::side left_right, onemkl::uplo upper_lower, + onemkl::transpose transa, onemkl::diag unit_diag, std::int64_t m, + std::int64_t n, float alpha, const float *a, std::int64_t lda, float *b, + std::int64_t ldb, + const cl::sycl::vector_class &dependencies = {}); + +cl::sycl::event trsm(cl::sycl::queue &queue, onemkl::side left_right, onemkl::uplo upper_lower, + onemkl::transpose transa, onemkl::diag unit_diag, std::int64_t m, + std::int64_t n, double alpha, const double *a, std::int64_t lda, double *b, + std::int64_t ldb, + const cl::sycl::vector_class &dependencies = {}); + +cl::sycl::event trsm(cl::sycl::queue &queue, onemkl::side left_right, onemkl::uplo upper_lower, + onemkl::transpose transa, onemkl::diag unit_diag, std::int64_t m, + std::int64_t n, std::complex alpha, const std::complex *a, + std::int64_t lda, std::complex *b, std::int64_t ldb, + const cl::sycl::vector_class &dependencies = {}); + +cl::sycl::event trsm(cl::sycl::queue &queue, onemkl::side left_right, onemkl::uplo upper_lower, + onemkl::transpose transa, onemkl::diag unit_diag, std::int64_t m, + 
std::int64_t n, std::complex alpha, const std::complex *a, + std::int64_t lda, std::complex *b, std::int64_t ldb, + const cl::sycl::vector_class &dependencies = {}); + +cl::sycl::event gemv(cl::sycl::queue &queue, onemkl::transpose trans, std::int64_t m, + std::int64_t n, float alpha, const float *a, std::int64_t lda, const float *x, + std::int64_t incx, float beta, float *y, std::int64_t incy, + const cl::sycl::vector_class &dependencies = {}); + +cl::sycl::event gemv(cl::sycl::queue &queue, onemkl::transpose trans, std::int64_t m, + std::int64_t n, double alpha, const double *a, std::int64_t lda, + const double *x, std::int64_t incx, double beta, double *y, std::int64_t incy, + const cl::sycl::vector_class &dependencies = {}); + +cl::sycl::event gemv(cl::sycl::queue &queue, onemkl::transpose trans, std::int64_t m, + std::int64_t n, std::complex alpha, const std::complex *a, + std::int64_t lda, const std::complex *x, std::int64_t incx, + std::complex beta, std::complex *y, std::int64_t incy, + const cl::sycl::vector_class &dependencies = {}); + +cl::sycl::event gemv(cl::sycl::queue &queue, onemkl::transpose trans, std::int64_t m, + std::int64_t n, std::complex alpha, const std::complex *a, + std::int64_t lda, const std::complex *x, std::int64_t incx, + std::complex beta, std::complex *y, std::int64_t incy, + const cl::sycl::vector_class &dependencies = {}); + +cl::sycl::event gbmv(cl::sycl::queue &queue, onemkl::transpose trans, std::int64_t m, + std::int64_t n, std::int64_t kl, std::int64_t ku, float alpha, const float *a, + std::int64_t lda, const float *x, std::int64_t incx, float beta, float *y, + std::int64_t incy, + const cl::sycl::vector_class &dependencies = {}); + +cl::sycl::event gbmv(cl::sycl::queue &queue, onemkl::transpose trans, std::int64_t m, + std::int64_t n, std::int64_t kl, std::int64_t ku, double alpha, + const double *a, std::int64_t lda, const double *x, std::int64_t incx, + double beta, double *y, std::int64_t incy, + const 
cl::sycl::vector_class &dependencies = {}); + +cl::sycl::event gbmv(cl::sycl::queue &queue, onemkl::transpose trans, std::int64_t m, + std::int64_t n, std::int64_t kl, std::int64_t ku, std::complex alpha, + const std::complex *a, std::int64_t lda, const std::complex *x, + std::int64_t incx, std::complex beta, std::complex *y, + std::int64_t incy, + const cl::sycl::vector_class &dependencies = {}); + +cl::sycl::event gbmv(cl::sycl::queue &queue, onemkl::transpose trans, std::int64_t m, + std::int64_t n, std::int64_t kl, std::int64_t ku, std::complex alpha, + const std::complex *a, std::int64_t lda, const std::complex *x, + std::int64_t incx, std::complex beta, std::complex *y, + std::int64_t incy, + const cl::sycl::vector_class &dependencies = {}); + +cl::sycl::event ger(cl::sycl::queue &queue, std::int64_t m, std::int64_t n, float alpha, + const float *x, std::int64_t incx, const float *y, std::int64_t incy, float *a, + std::int64_t lda, + const cl::sycl::vector_class &dependencies = {}); + +cl::sycl::event ger(cl::sycl::queue &queue, std::int64_t m, std::int64_t n, double alpha, + const double *x, std::int64_t incx, const double *y, std::int64_t incy, + double *a, std::int64_t lda, + const cl::sycl::vector_class &dependencies = {}); + +cl::sycl::event gerc(cl::sycl::queue &queue, std::int64_t m, std::int64_t n, + std::complex alpha, const std::complex *x, std::int64_t incx, + const std::complex *y, std::int64_t incy, std::complex *a, + std::int64_t lda, + const cl::sycl::vector_class &dependencies = {}); + +cl::sycl::event gerc(cl::sycl::queue &queue, std::int64_t m, std::int64_t n, + std::complex alpha, const std::complex *x, std::int64_t incx, + const std::complex *y, std::int64_t incy, std::complex *a, + std::int64_t lda, + const cl::sycl::vector_class &dependencies = {}); + +cl::sycl::event geru(cl::sycl::queue &queue, std::int64_t m, std::int64_t n, + std::complex alpha, const std::complex *x, std::int64_t incx, + const std::complex *y, std::int64_t incy, 
std::complex *a, + std::int64_t lda, + const cl::sycl::vector_class &dependencies = {}); + +cl::sycl::event geru(cl::sycl::queue &queue, std::int64_t m, std::int64_t n, + std::complex alpha, const std::complex *x, std::int64_t incx, + const std::complex *y, std::int64_t incy, std::complex *a, + std::int64_t lda, + const cl::sycl::vector_class &dependencies = {}); + +cl::sycl::event hbmv(cl::sycl::queue &queue, onemkl::uplo uplo, std::int64_t n, std::int64_t k, + std::complex alpha, const std::complex *a, std::int64_t lda, + const std::complex *x, std::int64_t incx, std::complex beta, + std::complex *y, std::int64_t incy, + const cl::sycl::vector_class &dependencies = {}); + +cl::sycl::event hbmv(cl::sycl::queue &queue, onemkl::uplo uplo, std::int64_t n, std::int64_t k, + std::complex alpha, const std::complex *a, std::int64_t lda, + const std::complex *x, std::int64_t incx, std::complex beta, + std::complex *y, std::int64_t incy, + const cl::sycl::vector_class &dependencies = {}); + +cl::sycl::event hemv(cl::sycl::queue &queue, onemkl::uplo uplo, std::int64_t n, + std::complex alpha, const std::complex *a, std::int64_t lda, + const std::complex *x, std::int64_t incx, std::complex beta, + std::complex *y, std::int64_t incy, + const cl::sycl::vector_class &dependencies = {}); + +cl::sycl::event hemv(cl::sycl::queue &queue, onemkl::uplo uplo, std::int64_t n, + std::complex alpha, const std::complex *a, std::int64_t lda, + const std::complex *x, std::int64_t incx, std::complex beta, + std::complex *y, std::int64_t incy, + const cl::sycl::vector_class &dependencies = {}); + +cl::sycl::event her(cl::sycl::queue &queue, onemkl::uplo upplo, std::int64_t n, float alpha, + const std::complex *x, std::int64_t incx, std::complex *a, + std::int64_t lda, + const cl::sycl::vector_class &dependencies = {}); + +cl::sycl::event her(cl::sycl::queue &queue, onemkl::uplo upplo, std::int64_t n, double alpha, + const std::complex *x, std::int64_t incx, std::complex *a, + std::int64_t 
lda, + const cl::sycl::vector_class &dependencies = {}); + +cl::sycl::event her2(cl::sycl::queue &queue, onemkl::uplo upplo, std::int64_t n, + std::complex alpha, const std::complex *x, std::int64_t incx, + const std::complex *y, std::int64_t incy, std::complex *a, + std::int64_t lda, + const cl::sycl::vector_class &dependencies = {}); + +cl::sycl::event her2(cl::sycl::queue &queue, onemkl::uplo upplo, std::int64_t n, + std::complex alpha, const std::complex *x, std::int64_t incx, + const std::complex *y, std::int64_t incy, std::complex *a, + std::int64_t lda, + const cl::sycl::vector_class &dependencies = {}); + +cl::sycl::event hpmv(cl::sycl::queue &queue, onemkl::uplo uplo, std::int64_t n, + std::complex alpha, const std::complex *a, + const std::complex *x, std::int64_t incx, std::complex beta, + std::complex *y, std::int64_t incy, + const cl::sycl::vector_class &dependencies = {}); + +cl::sycl::event hpmv(cl::sycl::queue &queue, onemkl::uplo uplo, std::int64_t n, + std::complex alpha, const std::complex *a, + const std::complex *x, std::int64_t incx, std::complex beta, + std::complex *y, std::int64_t incy, + const cl::sycl::vector_class &dependencies = {}); + +cl::sycl::event hpr(cl::sycl::queue &queue, onemkl::uplo upplo, std::int64_t n, float alpha, + const std::complex *x, std::int64_t incx, std::complex *a, + const cl::sycl::vector_class &dependencies = {}); + +cl::sycl::event hpr(cl::sycl::queue &queue, onemkl::uplo upplo, std::int64_t n, double alpha, + const std::complex *x, std::int64_t incx, std::complex *a, + const cl::sycl::vector_class &dependencies = {}); + +cl::sycl::event hpr2(cl::sycl::queue &queue, onemkl::uplo upplo, std::int64_t n, + std::complex alpha, const std::complex *x, std::int64_t incx, + const std::complex *y, std::int64_t incy, std::complex *a, + const cl::sycl::vector_class &dependencies = {}); + +cl::sycl::event hpr2(cl::sycl::queue &queue, onemkl::uplo upplo, std::int64_t n, + std::complex alpha, const std::complex *x, 
std::int64_t incx, + const std::complex *y, std::int64_t incy, std::complex *a, + const cl::sycl::vector_class &dependencies = {}); + +cl::sycl::event sbmv(cl::sycl::queue &queue, onemkl::uplo uplo, std::int64_t n, std::int64_t k, + float alpha, const float *a, std::int64_t lda, const float *x, + std::int64_t incx, float beta, float *y, std::int64_t incy, + const cl::sycl::vector_class &dependencies = {}); + +cl::sycl::event sbmv(cl::sycl::queue &queue, onemkl::uplo uplo, std::int64_t n, std::int64_t k, + double alpha, const double *a, std::int64_t lda, const double *x, + std::int64_t incx, double beta, double *y, std::int64_t incy, + const cl::sycl::vector_class &dependencies = {}); + +cl::sycl::event spmv(cl::sycl::queue &queue, onemkl::uplo uplo, std::int64_t n, float alpha, + const float *a, const float *x, std::int64_t incx, float beta, float *y, + std::int64_t incy, + const cl::sycl::vector_class &dependencies = {}); + +cl::sycl::event spmv(cl::sycl::queue &queue, onemkl::uplo uplo, std::int64_t n, double alpha, + const double *a, const double *x, std::int64_t incx, double beta, double *y, + std::int64_t incy, + const cl::sycl::vector_class &dependencies = {}); + +cl::sycl::event spr(cl::sycl::queue &queue, onemkl::uplo upplo, std::int64_t n, float alpha, + const float *x, std::int64_t incx, float *a, + const cl::sycl::vector_class &dependencies = {}); + +cl::sycl::event spr(cl::sycl::queue &queue, onemkl::uplo upplo, std::int64_t n, double alpha, + const double *x, std::int64_t incx, double *a, + const cl::sycl::vector_class &dependencies = {}); + +cl::sycl::event spr2(cl::sycl::queue &queue, onemkl::uplo upplo, std::int64_t n, float alpha, + const float *x, std::int64_t incx, const float *y, std::int64_t incy, float *a, + const cl::sycl::vector_class &dependencies = {}); + +cl::sycl::event spr2(cl::sycl::queue &queue, onemkl::uplo upplo, std::int64_t n, double alpha, + const double *x, std::int64_t incx, const double *y, std::int64_t incy, + double *a, 
const cl::sycl::vector_class &dependencies = {}); + +cl::sycl::event symv(cl::sycl::queue &queue, onemkl::uplo uplo, std::int64_t n, float alpha, + const float *a, std::int64_t lda, const float *x, std::int64_t incx, + float beta, float *y, std::int64_t incy, + const cl::sycl::vector_class &dependencies = {}); + +cl::sycl::event symv(cl::sycl::queue &queue, onemkl::uplo uplo, std::int64_t n, double alpha, + const double *a, std::int64_t lda, const double *x, std::int64_t incx, + double beta, double *y, std::int64_t incy, + const cl::sycl::vector_class &dependencies = {}); + +cl::sycl::event syr(cl::sycl::queue &queue, onemkl::uplo upplo, std::int64_t n, float alpha, + const float *x, std::int64_t incx, float *a, std::int64_t lda, + const cl::sycl::vector_class &dependencies = {}); + +cl::sycl::event syr(cl::sycl::queue &queue, onemkl::uplo upplo, std::int64_t n, double alpha, + const double *x, std::int64_t incx, double *a, std::int64_t lda, + const cl::sycl::vector_class &dependencies = {}); + +cl::sycl::event syr2(cl::sycl::queue &queue, onemkl::uplo upplo, std::int64_t n, float alpha, + const float *x, std::int64_t incx, const float *y, std::int64_t incy, float *a, + std::int64_t lda, + const cl::sycl::vector_class &dependencies = {}); + +cl::sycl::event syr2(cl::sycl::queue &queue, onemkl::uplo upplo, std::int64_t n, double alpha, + const double *x, std::int64_t incx, const double *y, std::int64_t incy, + double *a, std::int64_t lda, + const cl::sycl::vector_class &dependencies = {}); + +cl::sycl::event tbmv(cl::sycl::queue &queue, onemkl::uplo upplo, onemkl::transpose trans, + onemkl::diag diag, std::int64_t n, std::int64_t k, const float *a, + std::int64_t lda, float *x, std::int64_t incx, + const cl::sycl::vector_class &dependencies = {}); + +cl::sycl::event tbmv(cl::sycl::queue &queue, onemkl::uplo upplo, onemkl::transpose trans, + onemkl::diag diag, std::int64_t n, std::int64_t k, const double *a, + std::int64_t lda, double *x, std::int64_t incx, + const 
cl::sycl::vector_class &dependencies = {}); + +cl::sycl::event tbmv(cl::sycl::queue &queue, onemkl::uplo upplo, onemkl::transpose trans, + onemkl::diag diag, std::int64_t n, std::int64_t k, + const std::complex *a, std::int64_t lda, std::complex *x, + std::int64_t incx, + const cl::sycl::vector_class &dependencies = {}); + +cl::sycl::event tbmv(cl::sycl::queue &queue, onemkl::uplo upplo, onemkl::transpose trans, + onemkl::diag diag, std::int64_t n, std::int64_t k, + const std::complex *a, std::int64_t lda, std::complex *x, + std::int64_t incx, + const cl::sycl::vector_class &dependencies = {}); + +cl::sycl::event tbsv(cl::sycl::queue &queue, onemkl::uplo upplo, onemkl::transpose trans, + onemkl::diag diag, std::int64_t n, std::int64_t k, const float *a, + std::int64_t lda, float *x, std::int64_t incx, + const cl::sycl::vector_class &dependencies = {}); + +cl::sycl::event tbsv(cl::sycl::queue &queue, onemkl::uplo upplo, onemkl::transpose trans, + onemkl::diag diag, std::int64_t n, std::int64_t k, const double *a, + std::int64_t lda, double *x, std::int64_t incx, + const cl::sycl::vector_class &dependencies = {}); + +cl::sycl::event tbsv(cl::sycl::queue &queue, onemkl::uplo upplo, onemkl::transpose trans, + onemkl::diag diag, std::int64_t n, std::int64_t k, + const std::complex *a, std::int64_t lda, std::complex *x, + std::int64_t incx, + const cl::sycl::vector_class &dependencies = {}); + +cl::sycl::event tbsv(cl::sycl::queue &queue, onemkl::uplo upplo, onemkl::transpose trans, + onemkl::diag diag, std::int64_t n, std::int64_t k, + const std::complex *a, std::int64_t lda, std::complex *x, + std::int64_t incx, + const cl::sycl::vector_class &dependencies = {}); + +cl::sycl::event tpmv(cl::sycl::queue &queue, onemkl::uplo upplo, onemkl::transpose trans, + onemkl::diag diag, std::int64_t n, const float *a, float *x, std::int64_t incx, + const cl::sycl::vector_class &dependencies = {}); + +cl::sycl::event tpmv(cl::sycl::queue &queue, onemkl::uplo upplo, 
onemkl::transpose trans, + onemkl::diag diag, std::int64_t n, const double *a, double *x, + std::int64_t incx, + const cl::sycl::vector_class &dependencies = {}); + +cl::sycl::event tpmv(cl::sycl::queue &queue, onemkl::uplo upplo, onemkl::transpose trans, + onemkl::diag diag, std::int64_t n, const std::complex *a, + std::complex *x, std::int64_t incx, + const cl::sycl::vector_class &dependencies = {}); + +cl::sycl::event tpmv(cl::sycl::queue &queue, onemkl::uplo upplo, onemkl::transpose trans, + onemkl::diag diag, std::int64_t n, const std::complex *a, + std::complex *x, std::int64_t incx, + const cl::sycl::vector_class &dependencies = {}); + +cl::sycl::event tpsv(cl::sycl::queue &queue, onemkl::uplo upplo, onemkl::transpose trans, + onemkl::diag diag, std::int64_t n, const float *a, float *x, std::int64_t incx, + const cl::sycl::vector_class &dependencies = {}); + +cl::sycl::event tpsv(cl::sycl::queue &queue, onemkl::uplo upplo, onemkl::transpose trans, + onemkl::diag diag, std::int64_t n, const double *a, double *x, + std::int64_t incx, + const cl::sycl::vector_class &dependencies = {}); + +cl::sycl::event tpsv(cl::sycl::queue &queue, onemkl::uplo upplo, onemkl::transpose trans, + onemkl::diag diag, std::int64_t n, const std::complex *a, + std::complex *x, std::int64_t incx, + const cl::sycl::vector_class &dependencies = {}); + +cl::sycl::event tpsv(cl::sycl::queue &queue, onemkl::uplo upplo, onemkl::transpose trans, + onemkl::diag diag, std::int64_t n, const std::complex *a, + std::complex *x, std::int64_t incx, + const cl::sycl::vector_class &dependencies = {}); + +cl::sycl::event trmv(cl::sycl::queue &queue, onemkl::uplo upplo, onemkl::transpose trans, + onemkl::diag diag, std::int64_t n, const float *a, std::int64_t lda, float *x, + std::int64_t incx, + const cl::sycl::vector_class &dependencies = {}); + +cl::sycl::event trmv(cl::sycl::queue &queue, onemkl::uplo upplo, onemkl::transpose trans, + onemkl::diag diag, std::int64_t n, const double *a, std::int64_t 
lda, + double *x, std::int64_t incx, + const cl::sycl::vector_class &dependencies = {}); + +cl::sycl::event trmv(cl::sycl::queue &queue, onemkl::uplo upplo, onemkl::transpose trans, + onemkl::diag diag, std::int64_t n, const std::complex *a, + std::int64_t lda, std::complex *x, std::int64_t incx, + const cl::sycl::vector_class &dependencies = {}); + +cl::sycl::event trmv(cl::sycl::queue &queue, onemkl::uplo upplo, onemkl::transpose trans, + onemkl::diag diag, std::int64_t n, const std::complex *a, + std::int64_t lda, std::complex *x, std::int64_t incx, + const cl::sycl::vector_class &dependencies = {}); + +cl::sycl::event trsv(cl::sycl::queue &queue, onemkl::uplo upplo, onemkl::transpose trans, + onemkl::diag diag, std::int64_t n, const float *a, std::int64_t lda, float *x, + std::int64_t incx, + const cl::sycl::vector_class &dependencies = {}); + +cl::sycl::event trsv(cl::sycl::queue &queue, onemkl::uplo upplo, onemkl::transpose trans, + onemkl::diag diag, std::int64_t n, const double *a, std::int64_t lda, + double *x, std::int64_t incx, + const cl::sycl::vector_class &dependencies = {}); + +cl::sycl::event trsv(cl::sycl::queue &queue, onemkl::uplo upplo, onemkl::transpose trans, + onemkl::diag diag, std::int64_t n, const std::complex *a, + std::int64_t lda, std::complex *x, std::int64_t incx, + const cl::sycl::vector_class &dependencies = {}); + +cl::sycl::event trsv(cl::sycl::queue &queue, onemkl::uplo upplo, onemkl::transpose trans, + onemkl::diag diag, std::int64_t n, const std::complex *a, + std::int64_t lda, std::complex *x, std::int64_t incx, + const cl::sycl::vector_class &dependencies = {}); + +cl::sycl::event asum(cl::sycl::queue &queue, std::int64_t n, const std::complex *x, + std::int64_t incx, float *result, + const cl::sycl::vector_class &dependencies = {}); + +cl::sycl::event asum(cl::sycl::queue &queue, std::int64_t n, const std::complex *x, + std::int64_t incx, double *result, + const cl::sycl::vector_class &dependencies = {}); + +cl::sycl::event 
asum(cl::sycl::queue &queue, std::int64_t n, const float *x, std::int64_t incx, + float *result, + const cl::sycl::vector_class &dependencies = {}); + +cl::sycl::event asum(cl::sycl::queue &queue, std::int64_t n, const double *x, std::int64_t incx, + double *result, + const cl::sycl::vector_class &dependencies = {}); + +cl::sycl::event axpy(cl::sycl::queue &queue, std::int64_t n, float alpha, const float *x, + std::int64_t incx, float *y, std::int64_t incy, + const cl::sycl::vector_class &dependencies = {}); + +cl::sycl::event axpy(cl::sycl::queue &queue, std::int64_t n, double alpha, const double *x, + std::int64_t incx, double *y, std::int64_t incy, + const cl::sycl::vector_class &dependencies = {}); + +cl::sycl::event axpy(cl::sycl::queue &queue, std::int64_t n, std::complex alpha, + const std::complex *x, std::int64_t incx, std::complex *y, + std::int64_t incy, + const cl::sycl::vector_class &dependencies = {}); + +cl::sycl::event axpy(cl::sycl::queue &queue, std::int64_t n, std::complex alpha, + const std::complex *x, std::int64_t incx, std::complex *y, + std::int64_t incy, + const cl::sycl::vector_class &dependencies = {}); + +cl::sycl::event copy(cl::sycl::queue &queue, std::int64_t n, const float *x, std::int64_t incx, + float *y, std::int64_t incy, + const cl::sycl::vector_class &dependencies = {}); + +cl::sycl::event copy(cl::sycl::queue &queue, std::int64_t n, const double *x, std::int64_t incx, + double *y, std::int64_t incy, + const cl::sycl::vector_class &dependencies = {}); + +cl::sycl::event copy(cl::sycl::queue &queue, std::int64_t n, const std::complex *x, + std::int64_t incx, std::complex *y, std::int64_t incy, + const cl::sycl::vector_class &dependencies = {}); + +cl::sycl::event copy(cl::sycl::queue &queue, std::int64_t n, const std::complex *x, + std::int64_t incx, std::complex *y, std::int64_t incy, + const cl::sycl::vector_class &dependencies = {}); + +cl::sycl::event dot(cl::sycl::queue &queue, std::int64_t n, const float *x, std::int64_t 
incx, + const float *y, std::int64_t incy, float *result, + const cl::sycl::vector_class &dependencies = {}); + +cl::sycl::event dot(cl::sycl::queue &queue, std::int64_t n, const double *x, std::int64_t incx, + const double *y, std::int64_t incy, double *result, + const cl::sycl::vector_class &dependencies = {}); + +cl::sycl::event sdsdot(cl::sycl::queue &queue, std::int64_t n, float sb, const float *x, + std::int64_t incx, const float *y, std::int64_t incy, float *result, + const cl::sycl::vector_class &dependencies = {}); + +cl::sycl::event dot(cl::sycl::queue &queue, std::int64_t n, const float *x, std::int64_t incx, + const float *y, std::int64_t incy, double *result, + const cl::sycl::vector_class &dependencies = {}); + +cl::sycl::event dotc(cl::sycl::queue &queue, std::int64_t n, const std::complex *x, + std::int64_t incx, const std::complex *y, std::int64_t incy, + std::complex *result, + const cl::sycl::vector_class &dependencies = {}); + +cl::sycl::event dotc(cl::sycl::queue &queue, std::int64_t n, const std::complex *x, + std::int64_t incx, const std::complex *y, std::int64_t incy, + std::complex *result, + const cl::sycl::vector_class &dependencies = {}); + +cl::sycl::event dotu(cl::sycl::queue &queue, std::int64_t n, const std::complex *x, + std::int64_t incx, const std::complex *y, std::int64_t incy, + std::complex *result, + const cl::sycl::vector_class &dependencies = {}); + +cl::sycl::event dotu(cl::sycl::queue &queue, std::int64_t n, const std::complex *x, + std::int64_t incx, const std::complex *y, std::int64_t incy, + std::complex *result, + const cl::sycl::vector_class &dependencies = {}); + +cl::sycl::event nrm2(cl::sycl::queue &queue, std::int64_t n, const std::complex *x, + std::int64_t incx, float *result, + const cl::sycl::vector_class &dependencies = {}); + +cl::sycl::event nrm2(cl::sycl::queue &queue, std::int64_t n, const std::complex *x, + std::int64_t incx, double *result, + const cl::sycl::vector_class &dependencies = {}); + 
+cl::sycl::event nrm2(cl::sycl::queue &queue, std::int64_t n, const float *x, std::int64_t incx, + float *result, + const cl::sycl::vector_class &dependencies = {}); + +cl::sycl::event nrm2(cl::sycl::queue &queue, std::int64_t n, const double *x, std::int64_t incx, + double *result, + const cl::sycl::vector_class &dependencies = {}); + +cl::sycl::event rot(cl::sycl::queue &queue, std::int64_t n, std::complex *x, + std::int64_t incx, std::complex *y, std::int64_t incy, float c, float s, + const cl::sycl::vector_class &dependencies = {}); + +cl::sycl::event rot(cl::sycl::queue &queue, std::int64_t n, std::complex *x, + std::int64_t incx, std::complex *y, std::int64_t incy, double c, + double s, const cl::sycl::vector_class &dependencies = {}); + +cl::sycl::event rot(cl::sycl::queue &queue, std::int64_t n, float *x, std::int64_t incx, float *y, + std::int64_t incy, float c, float s, + const cl::sycl::vector_class &dependencies = {}); + +cl::sycl::event rot(cl::sycl::queue &queue, std::int64_t n, double *x, std::int64_t incx, double *y, + std::int64_t incy, double c, double s, + const cl::sycl::vector_class &dependencies = {}); + +cl::sycl::event rotg(cl::sycl::queue &queue, float *a, float *b, float *c, float *s, + const cl::sycl::vector_class &dependencies = {}); + +cl::sycl::event rotg(cl::sycl::queue &queue, double *a, double *b, double *c, double *s, + const cl::sycl::vector_class &dependencies = {}); + +cl::sycl::event rotg(cl::sycl::queue &queue, std::complex *a, std::complex *b, + float *c, std::complex *s, + const cl::sycl::vector_class &dependencies = {}); + +cl::sycl::event rotg(cl::sycl::queue &queue, std::complex *a, std::complex *b, + double *c, std::complex *s, + const cl::sycl::vector_class &dependencies = {}); + +cl::sycl::event rotm(cl::sycl::queue &queue, std::int64_t n, float *x, std::int64_t incx, float *y, + std::int64_t incy, float *param, + const cl::sycl::vector_class &dependencies = {}); + +cl::sycl::event rotm(cl::sycl::queue &queue, 
std::int64_t n, double *x, std::int64_t incx, + double *y, std::int64_t incy, double *param, + const cl::sycl::vector_class &dependencies = {}); + +cl::sycl::event rotmg(cl::sycl::queue &queue, float *d1, float *d2, float *x1, float y1, + float *param, + const cl::sycl::vector_class &dependencies = {}); + +cl::sycl::event rotmg(cl::sycl::queue &queue, double *d1, double *d2, double *x1, double y1, + double *param, + const cl::sycl::vector_class &dependencies = {}); + +cl::sycl::event scal(cl::sycl::queue &queue, std::int64_t n, float alpha, float *x, + std::int64_t incx, + const cl::sycl::vector_class &dependencies = {}); + +cl::sycl::event scal(cl::sycl::queue &queue, std::int64_t n, double alpha, double *x, + std::int64_t incx, + const cl::sycl::vector_class &dependencies = {}); + +cl::sycl::event scal(cl::sycl::queue &queue, std::int64_t n, std::complex alpha, + std::complex *x, std::int64_t incx, + const cl::sycl::vector_class &dependencies = {}); + +cl::sycl::event scal(cl::sycl::queue &queue, std::int64_t n, std::complex alpha, + std::complex *x, std::int64_t incx, + const cl::sycl::vector_class &dependencies = {}); + +cl::sycl::event scal(cl::sycl::queue &queue, std::int64_t n, float alpha, std::complex *x, + std::int64_t incx, + const cl::sycl::vector_class &dependencies = {}); + +cl::sycl::event scal(cl::sycl::queue &queue, std::int64_t n, double alpha, std::complex *x, + std::int64_t incx, + const cl::sycl::vector_class &dependencies = {}); + +cl::sycl::event swap(cl::sycl::queue &queue, std::int64_t n, float *x, std::int64_t incx, float *y, + std::int64_t incy, + const cl::sycl::vector_class &dependencies = {}); + +cl::sycl::event swap(cl::sycl::queue &queue, std::int64_t n, double *x, std::int64_t incx, + double *y, std::int64_t incy, + const cl::sycl::vector_class &dependencies = {}); + +cl::sycl::event swap(cl::sycl::queue &queue, std::int64_t n, std::complex *x, + std::int64_t incx, std::complex *y, std::int64_t incy, + const cl::sycl::vector_class 
&dependencies = {}); + +cl::sycl::event swap(cl::sycl::queue &queue, std::int64_t n, std::complex *x, + std::int64_t incx, std::complex *y, std::int64_t incy, + const cl::sycl::vector_class &dependencies = {}); + +cl::sycl::event iamax(cl::sycl::queue &queue, std::int64_t n, const float *x, std::int64_t incx, + std::int64_t *result, + const cl::sycl::vector_class &dependencies = {}); + +cl::sycl::event iamax(cl::sycl::queue &queue, std::int64_t n, const double *x, std::int64_t incx, + std::int64_t *result, + const cl::sycl::vector_class &dependencies = {}); + +cl::sycl::event iamax(cl::sycl::queue &queue, std::int64_t n, const std::complex *x, + std::int64_t incx, std::int64_t *result, + const cl::sycl::vector_class &dependencies = {}); + +cl::sycl::event iamax(cl::sycl::queue &queue, std::int64_t n, const std::complex *x, + std::int64_t incx, std::int64_t *result, + const cl::sycl::vector_class &dependencies = {}); + +cl::sycl::event iamin(cl::sycl::queue &queue, std::int64_t n, const float *x, std::int64_t incx, + std::int64_t *result, + const cl::sycl::vector_class &dependencies = {}); + +cl::sycl::event iamin(cl::sycl::queue &queue, std::int64_t n, const double *x, std::int64_t incx, + std::int64_t *result, + const cl::sycl::vector_class &dependencies = {}); + +cl::sycl::event iamin(cl::sycl::queue &queue, std::int64_t n, const std::complex *x, + std::int64_t incx, std::int64_t *result, + const cl::sycl::vector_class &dependencies = {}); + +cl::sycl::event iamin(cl::sycl::queue &queue, std::int64_t n, const std::complex *x, + std::int64_t incx, std::int64_t *result, + const cl::sycl::vector_class &dependencies = {}); + +cl::sycl::event gemm_batch(cl::sycl::queue &queue, onemkl::transpose *transa, + onemkl::transpose *transb, std::int64_t *m, std::int64_t *n, + std::int64_t *k, float *alpha, const float **a, std::int64_t *lda, + const float **b, std::int64_t *ldb, float *beta, float **c, + std::int64_t *ldc, std::int64_t group_count, std::int64_t *group_size, + 
const cl::sycl::vector_class &dependencies = {}); + +cl::sycl::event gemm_batch(cl::sycl::queue &queue, onemkl::transpose *transa, + onemkl::transpose *transb, std::int64_t *m, std::int64_t *n, + std::int64_t *k, double *alpha, const double **a, std::int64_t *lda, + const double **b, std::int64_t *ldb, double *beta, double **c, + std::int64_t *ldc, std::int64_t group_count, std::int64_t *group_size, + const cl::sycl::vector_class &dependencies = {}); + +cl::sycl::event gemm_batch(cl::sycl::queue &queue, onemkl::transpose *transa, + onemkl::transpose *transb, std::int64_t *m, std::int64_t *n, + std::int64_t *k, std::complex *alpha, + const std::complex **a, std::int64_t *lda, + const std::complex **b, std::int64_t *ldb, + std::complex *beta, std::complex **c, std::int64_t *ldc, + std::int64_t group_count, std::int64_t *group_size, + const cl::sycl::vector_class &dependencies = {}); + +cl::sycl::event gemm_batch(cl::sycl::queue &queue, onemkl::transpose *transa, + onemkl::transpose *transb, std::int64_t *m, std::int64_t *n, + std::int64_t *k, std::complex *alpha, + const std::complex **a, std::int64_t *lda, + const std::complex **b, std::int64_t *ldb, + std::complex *beta, std::complex **c, std::int64_t *ldc, + std::int64_t group_count, std::int64_t *group_size, + const cl::sycl::vector_class &dependencies = {}); + +cl::sycl::event gemm_batch(cl::sycl::queue &queue, onemkl::transpose transa, + onemkl::transpose transb, std::int64_t m, std::int64_t n, std::int64_t k, + float alpha, const float *a, std::int64_t lda, std::int64_t stride_a, + const float *b, std::int64_t ldb, std::int64_t stride_b, float beta, + float *c, std::int64_t ldc, std::int64_t stride_c, + std::int64_t batch_size, + const cl::sycl::vector_class &dependencies = {}); + +cl::sycl::event gemm_batch(cl::sycl::queue &queue, onemkl::transpose transa, + onemkl::transpose transb, std::int64_t m, std::int64_t n, std::int64_t k, + double alpha, const double *a, std::int64_t lda, std::int64_t stride_a, + 
const double *b, std::int64_t ldb, std::int64_t stride_b, double beta, + double *c, std::int64_t ldc, std::int64_t stride_c, + std::int64_t batch_size, + const cl::sycl::vector_class &dependencies = {}); + +cl::sycl::event gemm_batch(cl::sycl::queue &queue, onemkl::transpose transa, + onemkl::transpose transb, std::int64_t m, std::int64_t n, std::int64_t k, + std::complex alpha, const std::complex *a, + std::int64_t lda, std::int64_t stride_a, const std::complex *b, + std::int64_t ldb, std::int64_t stride_b, std::complex beta, + std::complex *c, std::int64_t ldc, std::int64_t stride_c, + std::int64_t batch_size, + const cl::sycl::vector_class &dependencies = {}); + +cl::sycl::event gemm_batch(cl::sycl::queue &queue, onemkl::transpose transa, + onemkl::transpose transb, std::int64_t m, std::int64_t n, std::int64_t k, + std::complex alpha, const std::complex *a, + std::int64_t lda, std::int64_t stride_a, const std::complex *b, + std::int64_t ldb, std::int64_t stride_b, std::complex beta, + std::complex *c, std::int64_t ldc, std::int64_t stride_c, + std::int64_t batch_size, + const cl::sycl::vector_class &dependencies = {}); + +cl::sycl::event axpy_batch(cl::sycl::queue &queue, std::int64_t *n, float *alpha, const float **x, + std::int64_t *incx, float **y, std::int64_t *incy, + std::int64_t group_count, std::int64_t *group_size, + const cl::sycl::vector_class &dependencies = {}); + +cl::sycl::event axpy_batch(cl::sycl::queue &queue, std::int64_t *n, double *alpha, const double **x, + std::int64_t *incx, double **y, std::int64_t *incy, + std::int64_t group_count, std::int64_t *group_size, + const cl::sycl::vector_class &dependencies = {}); + +cl::sycl::event axpy_batch(cl::sycl::queue &queue, std::int64_t *n, std::complex *alpha, + const std::complex **x, std::int64_t *incx, + std::complex **y, std::int64_t *incy, std::int64_t group_count, + std::int64_t *group_size, + const cl::sycl::vector_class &dependencies = {}); + +cl::sycl::event axpy_batch(cl::sycl::queue 
&queue, std::int64_t *n, std::complex *alpha, + const std::complex **x, std::int64_t *incx, + std::complex **y, std::int64_t *incy, std::int64_t group_count, + std::int64_t *group_size, + const cl::sycl::vector_class &dependencies = {}); + +cl::sycl::event gemmt(cl::sycl::queue &queue, onemkl::uplo upper_lower, onemkl::transpose transa, + onemkl::transpose transb, std::int64_t n, std::int64_t k, float alpha, + const float *a, std::int64_t lda, const float *b, std::int64_t ldb, + float beta, float *c, std::int64_t ldc, + const cl::sycl::vector_class &dependencies = {}); + +cl::sycl::event gemmt(cl::sycl::queue &queue, onemkl::uplo upper_lower, onemkl::transpose transa, + onemkl::transpose transb, std::int64_t n, std::int64_t k, double alpha, + const double *a, std::int64_t lda, const double *b, std::int64_t ldb, + double beta, double *c, std::int64_t ldc, + const cl::sycl::vector_class &dependencies = {}); + +cl::sycl::event gemmt(cl::sycl::queue &queue, onemkl::uplo upper_lower, onemkl::transpose transa, + onemkl::transpose transb, std::int64_t n, std::int64_t k, + std::complex alpha, const std::complex *a, std::int64_t lda, + const std::complex *b, std::int64_t ldb, std::complex beta, + std::complex *c, std::int64_t ldc, + const cl::sycl::vector_class &dependencies = {}); + +cl::sycl::event gemmt(cl::sycl::queue &queue, onemkl::uplo upper_lower, onemkl::transpose transa, + onemkl::transpose transb, std::int64_t n, std::int64_t k, + std::complex alpha, const std::complex *a, std::int64_t lda, + const std::complex *b, std::int64_t ldb, std::complex beta, + std::complex *c, std::int64_t ldc, + const cl::sycl::vector_class &dependencies = {}); + } //namespace internal } //namespace mklgpu } //namespace onemkl diff --git a/src/blas/backends/mklgpu/mkl_internal_blas_sycl_gpu.hpp b/src/blas/backends/mklgpu/mkl_internal_blas_sycl_gpu.hpp index 1c4e54c46..6a68a73a0 100644 --- a/src/blas/backends/mklgpu/mkl_internal_blas_sycl_gpu.hpp +++ 
b/src/blas/backends/mklgpu/mkl_internal_blas_sycl_gpu.hpp @@ -33,12 +33,6 @@ typedef enum { MKL_NONUNIT = 131, MKL_UNIT = 132 } MKL_DIAG; typedef enum { MKL_LEFT = 141, MKL_RIGHT = 142 } MKL_SIDE; -typedef enum { - MKL_COMPACT_SSE = 181, - MKL_COMPACT_AVX = 182, - MKL_COMPACT_AVX512 = 183 -} MKL_COMPACT_PACK; - enum CBLAS_OFFSET { CblasRowOffset = 171, CblasColOffset = 172, CblasFixOffset = 173 }; typedef enum CBLAS_OFFSET CBLAS_OFFSET; @@ -88,7 +82,7 @@ inline CBLAS_OFFSET cblas_convert(onemkl::offset o) { namespace gpu { -// gemm +// Buffer APIs void sgemm(cl::sycl::queue &queue, MKL_TRANSPOSE transa, MKL_TRANSPOSE transb, int64_t m, int64_t n, int64_t k, float alpha, cl::sycl::buffer &a, int64_t lda, @@ -110,8 +104,6 @@ void zgemm(cl::sycl::queue &queue, MKL_TRANSPOSE transa, MKL_TRANSPOSE transb, i int64_t lda, cl::sycl::buffer, 1> &b, int64_t ldb, std::complex beta, cl::sycl::buffer, 1> &c, int64_t ldc); -// symm - void ssymm(cl::sycl::queue &queue, MKL_SIDE left_right, MKL_UPLO upper_lower, int64_t m, int64_t n, float alpha, cl::sycl::buffer &a, int64_t lda, cl::sycl::buffer &b, int64_t ldb, float beta, cl::sycl::buffer &c, int64_t ldc); @@ -131,8 +123,6 @@ void zsymm(cl::sycl::queue &queue, MKL_SIDE left_right, MKL_UPLO upper_lower, in cl::sycl::buffer, 1> &b, int64_t ldb, std::complex beta, cl::sycl::buffer, 1> &c, int64_t ldc); -// hemm - void chemm(cl::sycl::queue &queue, MKL_SIDE left_right, MKL_UPLO upper_lower, int64_t m, int64_t n, std::complex alpha, cl::sycl::buffer, 1> &a, int64_t lda, cl::sycl::buffer, 1> &b, int64_t ldb, std::complex beta, @@ -143,7 +133,6 @@ void zhemm(cl::sycl::queue &queue, MKL_SIDE left_right, MKL_UPLO upper_lower, in cl::sycl::buffer, 1> &b, int64_t ldb, std::complex beta, cl::sycl::buffer, 1> &c, int64_t ldc); -// syrk void ssyrk(cl::sycl::queue &queue, MKL_UPLO upper_lower, MKL_TRANSPOSE trans, int64_t n, int64_t k, float alpha, cl::sycl::buffer &a, int64_t lda, float beta, cl::sycl::buffer &c, int64_t ldc); @@ -160,8 
+149,6 @@ void zsyrk(cl::sycl::queue &queue, MKL_UPLO upper_lower, MKL_TRANSPOSE trans, in std::complex alpha, cl::sycl::buffer, 1> &a, int64_t lda, std::complex beta, cl::sycl::buffer, 1> &c, int64_t ldc); -// herk - void cherk(cl::sycl::queue &queue, MKL_UPLO upper_lower, MKL_TRANSPOSE trans, int64_t n, int64_t k, float alpha, cl::sycl::buffer, 1> &a, int64_t lda, float beta, cl::sycl::buffer, 1> &c, int64_t ldc); @@ -170,8 +157,6 @@ void zherk(cl::sycl::queue &queue, MKL_UPLO upper_lower, MKL_TRANSPOSE trans, in double alpha, cl::sycl::buffer, 1> &a, int64_t lda, double beta, cl::sycl::buffer, 1> &c, int64_t ldc); -// syr2k - void ssyr2k(cl::sycl::queue &queue, MKL_UPLO upper_lower, MKL_TRANSPOSE trans, int64_t n, int64_t k, float alpha, cl::sycl::buffer &a, int64_t lda, cl::sycl::buffer &b, int64_t ldb, float beta, cl::sycl::buffer &c, int64_t ldc); @@ -191,8 +176,6 @@ void zsyr2k(cl::sycl::queue &queue, MKL_UPLO upper_lower, MKL_TRANSPOSE trans, i cl::sycl::buffer, 1> &b, int64_t ldb, std::complex beta, cl::sycl::buffer, 1> &c, int64_t ldc); -// her2k - void cher2k(cl::sycl::queue &queue, MKL_UPLO upper_lower, MKL_TRANSPOSE trans, int64_t n, int64_t k, std::complex alpha, cl::sycl::buffer, 1> &a, int64_t lda, cl::sycl::buffer, 1> &b, int64_t ldb, float beta, @@ -203,8 +186,6 @@ void zher2k(cl::sycl::queue &queue, MKL_UPLO upper_lower, MKL_TRANSPOSE trans, i cl::sycl::buffer, 1> &b, int64_t ldb, double beta, cl::sycl::buffer, 1> &c, int64_t ldc); -// trmm - void strmm(cl::sycl::queue &queue, MKL_SIDE left_right, MKL_UPLO upper_lower, MKL_TRANSPOSE transa, MKL_DIAG unit_diag, int64_t m, int64_t n, float alpha, cl::sycl::buffer &a, int64_t lda, cl::sycl::buffer &b, int64_t ldb); @@ -223,7 +204,6 @@ void ztrmm(cl::sycl::queue &queue, MKL_SIDE left_right, MKL_UPLO upper_lower, MK cl::sycl::buffer, 1> &a, int64_t lda, cl::sycl::buffer, 1> &b, int64_t ldb); -// trsm void strsm(cl::sycl::queue &queue, MKL_SIDE left_right, MKL_UPLO upper_lower, MKL_TRANSPOSE transa, 
MKL_DIAG unit_diag, int64_t m, int64_t n, float alpha, cl::sycl::buffer &a, int64_t lda, cl::sycl::buffer &b, int64_t ldb); @@ -242,8 +222,6 @@ void ztrsm(cl::sycl::queue &queue, MKL_SIDE left_right, MKL_UPLO upper_lower, MK cl::sycl::buffer, 1> &a, int64_t lda, cl::sycl::buffer, 1> &b, int64_t ldb); -// Level2 - void sgemv(cl::sycl::queue &queue, MKL_TRANSPOSE trans, int64_t m, int64_t n, float alpha, cl::sycl::buffer &a, int64_t lda, cl::sycl::buffer &x, int64_t incx, float beta, cl::sycl::buffer &y, int64_t incy); @@ -519,8 +497,6 @@ void ztrsv(cl::sycl::queue &queue, MKL_UPLO upplo, MKL_TRANSPOSE trans, MKL_DIAG cl::sycl::buffer, 1> &a, int64_t lda, cl::sycl::buffer, 1> &x, int64_t incx); -// Level1 - void scasum(cl::sycl::queue &queue, int64_t n, cl::sycl::buffer, 1> &x, int64_t incx, cl::sycl::buffer &result); @@ -680,89 +656,6 @@ void idamax(cl::sycl::queue &queue, int64_t n, cl::sycl::buffer &x, i void icamax(cl::sycl::queue &queue, int64_t n, cl::sycl::buffer, 1> &x, int64_t incx, cl::sycl::buffer &result); -void dnrm2(cl::sycl::queue &queue, int64_t n, cl::sycl::buffer &x, int64_t incx, - cl::sycl::buffer &result); - -void csrot(cl::sycl::queue &queue, int64_t n, cl::sycl::buffer, 1> &x, - int64_t incx, cl::sycl::buffer, 1> &y, int64_t incy, float c, - float s); - -void zdrot(cl::sycl::queue &queue, int64_t n, cl::sycl::buffer, 1> &x, - int64_t incx, cl::sycl::buffer, 1> &y, int64_t incy, double c, - double s); - -void srot(cl::sycl::queue &queue, int64_t n, cl::sycl::buffer &x, int64_t incx, - cl::sycl::buffer &y, int64_t incy, float c, float s); - -void drot(cl::sycl::queue &queue, int64_t n, cl::sycl::buffer &x, int64_t incx, - cl::sycl::buffer &y, int64_t incy, double c, double s); - -void srotg(cl::sycl::queue &queue, cl::sycl::buffer &a, cl::sycl::buffer &b, - cl::sycl::buffer &c, cl::sycl::buffer &s); - -void drotg(cl::sycl::queue &queue, cl::sycl::buffer &a, cl::sycl::buffer &b, - cl::sycl::buffer &c, cl::sycl::buffer &s); - -void 
crotg(cl::sycl::queue &queue, cl::sycl::buffer, 1> &a, - cl::sycl::buffer, 1> &b, cl::sycl::buffer &c, - cl::sycl::buffer, 1> &s); - -void zrotg(cl::sycl::queue &queue, cl::sycl::buffer, 1> &a, - cl::sycl::buffer, 1> &b, cl::sycl::buffer &c, - cl::sycl::buffer, 1> &s); - -void srotm(cl::sycl::queue &queue, int64_t n, cl::sycl::buffer &x, int64_t incx, - cl::sycl::buffer &y, int64_t incy, cl::sycl::buffer &param); - -void drotm(cl::sycl::queue &queue, int64_t n, cl::sycl::buffer &x, int64_t incx, - cl::sycl::buffer &y, int64_t incy, cl::sycl::buffer &param); - -void srotmg(cl::sycl::queue &queue, cl::sycl::buffer &d1, cl::sycl::buffer &d2, - cl::sycl::buffer &x1, float y1, cl::sycl::buffer &param); - -void drotmg(cl::sycl::queue &queue, cl::sycl::buffer &d1, - cl::sycl::buffer &d2, cl::sycl::buffer &x1, double y1, - cl::sycl::buffer &param); - -void sscal(cl::sycl::queue &queue, int64_t n, float alpha, cl::sycl::buffer &x, - int64_t incx); - -void dscal(cl::sycl::queue &queue, int64_t n, double alpha, cl::sycl::buffer &x, - int64_t incx); - -void cscal(cl::sycl::queue &queue, int64_t n, std::complex alpha, - cl::sycl::buffer, 1> &x, int64_t incx); - -void zscal(cl::sycl::queue &queue, int64_t n, std::complex alpha, - cl::sycl::buffer, 1> &x, int64_t incx); - -void csscal(cl::sycl::queue &queue, int64_t n, float alpha, - cl::sycl::buffer, 1> &x, int64_t incx); - -void zdscal(cl::sycl::queue &queue, int64_t n, double alpha, - cl::sycl::buffer, 1> &x, int64_t incx); - -void sswap(cl::sycl::queue &queue, int64_t n, cl::sycl::buffer &x, int64_t incx, - cl::sycl::buffer &y, int64_t incy); - -void dswap(cl::sycl::queue &queue, int64_t n, cl::sycl::buffer &x, int64_t incx, - cl::sycl::buffer &y, int64_t incy); - -void cswap(cl::sycl::queue &queue, int64_t n, cl::sycl::buffer, 1> &x, - int64_t incx, cl::sycl::buffer, 1> &y, int64_t incy); - -void zswap(cl::sycl::queue &queue, int64_t n, cl::sycl::buffer, 1> &x, - int64_t incx, cl::sycl::buffer, 1> &y, int64_t incy); - -void 
isamax(cl::sycl::queue &queue, int64_t n, cl::sycl::buffer &x, int64_t incx, - cl::sycl::buffer &result); - -void idamax(cl::sycl::queue &queue, int64_t n, cl::sycl::buffer &x, int64_t incx, - cl::sycl::buffer &result); - -void icamax(cl::sycl::queue &queue, int64_t n, cl::sycl::buffer, 1> &x, - int64_t incx, cl::sycl::buffer &result); - void izamax(cl::sycl::queue &queue, int64_t n, cl::sycl::buffer, 1> &x, int64_t incx, cl::sycl::buffer &result); @@ -778,8 +671,6 @@ void icamin(cl::sycl::queue &queue, int64_t n, cl::sycl::buffer, 1> &x, int64_t incx, cl::sycl::buffer &result); -// batch api - void sgemm_batch(cl::sycl::queue &queue, MKL_TRANSPOSE transa, MKL_TRANSPOSE transb, int64_t m, int64_t n, int64_t k, float alpha, cl::sycl::buffer &a, int64_t lda, int64_t stride_a, cl::sycl::buffer &b, int64_t ldb, int64_t stride_b, @@ -836,8 +727,6 @@ void ztrsm_batch(cl::sycl::queue &queue, MKL_SIDE left_right, MKL_UPLO upper_low int64_t ldb, int64_t stride_b, int64_t batch_size, int64_t offset_a = 0, int64_t offset_b = 0); -// BLAS like extension - void sgemmt(cl::sycl::queue &queue, MKL_UPLO upper_lower, MKL_TRANSPOSE transa, MKL_TRANSPOSE transb, int64_t n, int64_t k, float alpha, cl::sycl::buffer &a, int64_t lda, cl::sycl::buffer &b, int64_t ldb, float beta, @@ -875,6 +764,869 @@ void gemm_s8u8s32(cl::sycl::queue &queue, MKL_TRANSPOSE transa, MKL_TRANSPOSE tr cl::sycl::buffer &a, int64_t lda, int8_t ao, cl::sycl::buffer &b, int64_t ldb, uint8_t bo, float beta, cl::sycl::buffer &c, int64_t ldc, cl::sycl::buffer &co); + +// USM APIs + +cl::sycl::event sgemm_sycl(cl::sycl::queue *queue, MKL_TRANSPOSE transa, MKL_TRANSPOSE transb, + int64_t m, int64_t n, int64_t k, float alpha, const float *a, + int64_t lda, const float *b, int64_t ldb, float beta, float *c, + int64_t ldc, const cl::sycl::vector_class &dependencies, + int64_t offset_a = 0, int64_t offset_b = 0, int64_t offset_c = 0); + +cl::sycl::event dgemm_sycl(cl::sycl::queue *queue, MKL_TRANSPOSE transa, 
MKL_TRANSPOSE transb, + int64_t m, int64_t n, int64_t k, double alpha, const double *a, + int64_t lda, const double *b, int64_t ldb, double beta, double *c, + int64_t ldc, const cl::sycl::vector_class &dependencies, + int64_t offset_a = 0, int64_t offset_b = 0, int64_t offset_c = 0); + +cl::sycl::event cgemm_sycl(cl::sycl::queue *queue, MKL_TRANSPOSE transa, MKL_TRANSPOSE transb, + int64_t m, int64_t n, int64_t k, std::complex alpha, + const std::complex *a, int64_t lda, const std::complex *b, + int64_t ldb, std::complex beta, std::complex *c, + int64_t ldc, const cl::sycl::vector_class &dependencies, + int64_t offset_a = 0, int64_t offset_b = 0, int64_t offset_c = 0); + +cl::sycl::event zgemm_sycl(cl::sycl::queue *queue, MKL_TRANSPOSE transa, MKL_TRANSPOSE transb, + int64_t m, int64_t n, int64_t k, std::complex alpha, + const std::complex *a, int64_t lda, + const std::complex *b, int64_t ldb, std::complex beta, + std::complex *c, int64_t ldc, + const cl::sycl::vector_class &dependencies, + int64_t offset_a = 0, int64_t offset_b = 0, int64_t offset_c = 0); + +cl::sycl::event ssymm_sycl(cl::sycl::queue *queue, MKL_SIDE left_right, MKL_UPLO upper_lower, + int64_t m, int64_t n, float alpha, const float *a, int64_t lda, + const float *b, int64_t ldb, float beta, float *c, int64_t ldc, + const cl::sycl::vector_class &dependencies, + int64_t offset_a = 0, int64_t offset_b = 0, int64_t offset_c = 0); + +cl::sycl::event dsymm_sycl(cl::sycl::queue *queue, MKL_SIDE left_right, MKL_UPLO upper_lower, + int64_t m, int64_t n, double alpha, const double *a, int64_t lda, + const double *b, int64_t ldb, double beta, double *c, int64_t ldc, + const cl::sycl::vector_class &dependencies, + int64_t offset_a = 0, int64_t offset_b = 0, int64_t offset_c = 0); + +cl::sycl::event csymm_sycl(cl::sycl::queue *queue, MKL_SIDE left_right, MKL_UPLO upper_lower, + int64_t m, int64_t n, std::complex alpha, + const std::complex *a, int64_t lda, const std::complex *b, + int64_t ldb, std::complex 
beta, std::complex *c, + int64_t ldc, const cl::sycl::vector_class &dependencies, + int64_t offset_a = 0, int64_t offset_b = 0, int64_t offset_c = 0); + +cl::sycl::event zsymm_sycl(cl::sycl::queue *queue, MKL_SIDE left_right, MKL_UPLO upper_lower, + int64_t m, int64_t n, std::complex alpha, + const std::complex *a, int64_t lda, + const std::complex *b, int64_t ldb, std::complex beta, + std::complex *c, int64_t ldc, + const cl::sycl::vector_class &dependencies, + int64_t offset_a = 0, int64_t offset_b = 0, int64_t offset_c = 0); + +cl::sycl::event chemm_sycl(cl::sycl::queue *queue, MKL_SIDE left_right, MKL_UPLO upper_lower, + int64_t m, int64_t n, std::complex alpha, + const std::complex *a, int64_t lda, const std::complex *b, + int64_t ldb, std::complex beta, std::complex *c, + int64_t ldc, const cl::sycl::vector_class &dependencies, + int64_t offset_a = 0, int64_t offset_b = 0, int64_t offset_c = 0); + +cl::sycl::event zhemm_sycl(cl::sycl::queue *queue, MKL_SIDE left_right, MKL_UPLO upper_lower, + int64_t m, int64_t n, std::complex alpha, + const std::complex *a, int64_t lda, + const std::complex *b, int64_t ldb, std::complex beta, + std::complex *c, int64_t ldc, + const cl::sycl::vector_class &dependencies, + int64_t offset_a = 0, int64_t offset_b = 0, int64_t offset_c = 0); + +cl::sycl::event ssyrk_sycl(cl::sycl::queue *queue, MKL_UPLO upper_lower, MKL_TRANSPOSE trans, + int64_t n, int64_t k, float alpha, const float *a, int64_t lda, + float beta, float *c, int64_t ldc, + const cl::sycl::vector_class &dependencies, + int64_t offset_a = 0, int64_t offset_c = 0); + +cl::sycl::event dsyrk_sycl(cl::sycl::queue *queue, MKL_UPLO upper_lower, MKL_TRANSPOSE trans, + int64_t n, int64_t k, double alpha, const double *a, int64_t lda, + double beta, double *c, int64_t ldc, + const cl::sycl::vector_class &dependencies, + int64_t offset_a = 0, int64_t offset_c = 0); + +cl::sycl::event csyrk_sycl(cl::sycl::queue *queue, MKL_UPLO upper_lower, MKL_TRANSPOSE trans, + int64_t n, 
int64_t k, std::complex alpha, + const std::complex *a, int64_t lda, std::complex beta, + std::complex *c, int64_t ldc, + const cl::sycl::vector_class &dependencies, + int64_t offset_a = 0, int64_t offset_c = 0); + +cl::sycl::event zsyrk_sycl(cl::sycl::queue *queue, MKL_UPLO upper_lower, MKL_TRANSPOSE trans, + int64_t n, int64_t k, std::complex alpha, + const std::complex *a, int64_t lda, std::complex beta, + std::complex *c, int64_t ldc, + const cl::sycl::vector_class &dependencies, + int64_t offset_a = 0, int64_t offset_c = 0); + +cl::sycl::event cherk_sycl(cl::sycl::queue *queue, MKL_UPLO upper_lower, MKL_TRANSPOSE trans, + int64_t n, int64_t k, float alpha, const std::complex *a, + int64_t lda, float beta, std::complex *c, int64_t ldc, + const cl::sycl::vector_class &dependencies, + int64_t offset_a = 0, int64_t offset_c = 0); + +cl::sycl::event zherk_sycl(cl::sycl::queue *queue, MKL_UPLO upper_lower, MKL_TRANSPOSE trans, + int64_t n, int64_t k, double alpha, const std::complex *a, + int64_t lda, double beta, std::complex *c, int64_t ldc, + const cl::sycl::vector_class &dependencies, + int64_t offset_a = 0, int64_t offset_c = 0); + +cl::sycl::event ssyr2k_sycl(cl::sycl::queue *queue, MKL_UPLO upper_lower, MKL_TRANSPOSE trans, + int64_t n, int64_t k, float alpha, const float *a, int64_t lda, + const float *b, int64_t ldb, float beta, float *c, int64_t ldc, + const cl::sycl::vector_class &dependencies, + int64_t offset_a = 0, int64_t offset_b = 0, int64_t offset_c = 0); + +cl::sycl::event dsyr2k_sycl(cl::sycl::queue *queue, MKL_UPLO upper_lower, MKL_TRANSPOSE trans, + int64_t n, int64_t k, double alpha, const double *a, int64_t lda, + const double *b, int64_t ldb, double beta, double *c, int64_t ldc, + const cl::sycl::vector_class &dependencies, + int64_t offset_a = 0, int64_t offset_b = 0, int64_t offset_c = 0); + +cl::sycl::event csyr2k_sycl(cl::sycl::queue *queue, MKL_UPLO upper_lower, MKL_TRANSPOSE trans, + int64_t n, int64_t k, std::complex alpha, + const 
std::complex *a, int64_t lda, const std::complex *b, + int64_t ldb, std::complex beta, std::complex *c, + int64_t ldc, + const cl::sycl::vector_class &dependencies, + int64_t offset_a = 0, int64_t offset_b = 0, int64_t offset_c = 0); + +cl::sycl::event zsyr2k_sycl(cl::sycl::queue *queue, MKL_UPLO upper_lower, MKL_TRANSPOSE trans, + int64_t n, int64_t k, std::complex alpha, + const std::complex *a, int64_t lda, + const std::complex *b, int64_t ldb, std::complex beta, + std::complex *c, int64_t ldc, + const cl::sycl::vector_class &dependencies, + int64_t offset_a = 0, int64_t offset_b = 0, int64_t offset_c = 0); + +cl::sycl::event cher2k_sycl(cl::sycl::queue *queue, MKL_UPLO upper_lower, MKL_TRANSPOSE trans, + int64_t n, int64_t k, std::complex alpha, + const std::complex *a, int64_t lda, const std::complex *b, + int64_t ldb, float beta, std::complex *c, int64_t ldc, + const cl::sycl::vector_class &dependencies, + int64_t offset_a = 0, int64_t offset_b = 0, int64_t offset_c = 0); + +cl::sycl::event zher2k_sycl(cl::sycl::queue *queue, MKL_UPLO upper_lower, MKL_TRANSPOSE trans, + int64_t n, int64_t k, std::complex alpha, + const std::complex *a, int64_t lda, + const std::complex *b, int64_t ldb, double beta, + std::complex *c, int64_t ldc, + const cl::sycl::vector_class &dependencies, + int64_t offset_a = 0, int64_t offset_b = 0, int64_t offset_c = 0); + +cl::sycl::event strmm_sycl(cl::sycl::queue *queue, MKL_SIDE left_right, MKL_UPLO upper_lower, + MKL_TRANSPOSE transa, MKL_DIAG unit_diag, int64_t m, int64_t n, + float alpha, const float *a, int64_t lda, float *b, int64_t ldb, + const cl::sycl::vector_class &dependencies, + int64_t offset_a = 0, int64_t offset_b = 0); + +cl::sycl::event dtrmm_sycl(cl::sycl::queue *queue, MKL_SIDE left_right, MKL_UPLO upper_lower, + MKL_TRANSPOSE transa, MKL_DIAG unit_diag, int64_t m, int64_t n, + double alpha, const double *a, int64_t lda, double *b, int64_t ldb, + const cl::sycl::vector_class &dependencies, + int64_t offset_a = 0, 
int64_t offset_b = 0); + +cl::sycl::event ctrmm_sycl(cl::sycl::queue *queue, MKL_SIDE left_right, MKL_UPLO upper_lower, + MKL_TRANSPOSE transa, MKL_DIAG unit_diag, int64_t m, int64_t n, + std::complex alpha, const std::complex *a, int64_t lda, + std::complex *b, int64_t ldb, + const cl::sycl::vector_class &dependencies, + int64_t offset_a = 0, int64_t offset_b = 0); + +cl::sycl::event ztrmm_sycl(cl::sycl::queue *queue, MKL_SIDE left_right, MKL_UPLO upper_lower, + MKL_TRANSPOSE transa, MKL_DIAG unit_diag, int64_t m, int64_t n, + std::complex alpha, const std::complex *a, int64_t lda, + std::complex *b, int64_t ldb, + const cl::sycl::vector_class &dependencies, + int64_t offset_a = 0, int64_t offset_b = 0); + +cl::sycl::event strsm_sycl(cl::sycl::queue *queue, MKL_SIDE left_right, MKL_UPLO upper_lower, + MKL_TRANSPOSE transa, MKL_DIAG unit_diag, int64_t m, int64_t n, + float alpha, const float *a, int64_t lda, float *b, int64_t ldb, + const cl::sycl::vector_class &dependencies, + int64_t offset_a = 0, int64_t offset_b = 0); + +cl::sycl::event dtrsm_sycl(cl::sycl::queue *queue, MKL_SIDE left_right, MKL_UPLO upper_lower, + MKL_TRANSPOSE transa, MKL_DIAG unit_diag, int64_t m, int64_t n, + double alpha, const double *a, int64_t lda, double *b, int64_t ldb, + const cl::sycl::vector_class &dependencies, + int64_t offset_a = 0, int64_t offset_b = 0); + +cl::sycl::event ctrsm_sycl(cl::sycl::queue *queue, MKL_SIDE left_right, MKL_UPLO upper_lower, + MKL_TRANSPOSE transa, MKL_DIAG unit_diag, int64_t m, int64_t n, + std::complex alpha, const std::complex *a, int64_t lda, + std::complex *b, int64_t ldb, + const cl::sycl::vector_class &dependencies, + int64_t offset_a = 0, int64_t offset_b = 0); + +cl::sycl::event ztrsm_sycl(cl::sycl::queue *queue, MKL_SIDE left_right, MKL_UPLO upper_lower, + MKL_TRANSPOSE transa, MKL_DIAG unit_diag, int64_t m, int64_t n, + std::complex alpha, const std::complex *a, int64_t lda, + std::complex *b, int64_t ldb, + const cl::sycl::vector_class 
&dependencies, + int64_t offset_a = 0, int64_t offset_b = 0); + +cl::sycl::event sgemv_sycl(cl::sycl::queue *queue, MKL_TRANSPOSE trans, int64_t m, int64_t n, + float alpha, const float *a, int64_t lda, const float *x, int64_t incx, + float beta, float *y, int64_t incy, + const cl::sycl::vector_class &dependencies); + +cl::sycl::event dgemv_sycl(cl::sycl::queue *queue, MKL_TRANSPOSE trans, int64_t m, int64_t n, + double alpha, const double *a, int64_t lda, const double *x, + int64_t incx, double beta, double *y, int64_t incy, + const cl::sycl::vector_class &dependencies); + +cl::sycl::event cgemv_sycl(cl::sycl::queue *queue, MKL_TRANSPOSE trans, int64_t m, int64_t n, + std::complex alpha, const std::complex *a, int64_t lda, + const std::complex *x, int64_t incx, std::complex beta, + std::complex *y, int64_t incy, + const cl::sycl::vector_class &dependencies); + +cl::sycl::event zgemv_sycl(cl::sycl::queue *queue, MKL_TRANSPOSE trans, int64_t m, int64_t n, + std::complex alpha, const std::complex *a, int64_t lda, + const std::complex *x, int64_t incx, std::complex beta, + std::complex *y, int64_t incy, + const cl::sycl::vector_class &dependencies); + +cl::sycl::event sgbmv_sycl(cl::sycl::queue *queue, MKL_TRANSPOSE trans, int64_t m, int64_t n, + int64_t kl, int64_t ku, float alpha, const float *a, int64_t lda, + const float *x, int64_t incx, float beta, float *y, int64_t incy, + const cl::sycl::vector_class &dependencies); + +cl::sycl::event dgbmv_sycl(cl::sycl::queue *queue, MKL_TRANSPOSE trans, int64_t m, int64_t n, + int64_t kl, int64_t ku, double alpha, const double *a, int64_t lda, + const double *x, int64_t incx, double beta, double *y, int64_t incy, + const cl::sycl::vector_class &dependencies); + +cl::sycl::event cgbmv_sycl(cl::sycl::queue *queue, MKL_TRANSPOSE trans, int64_t m, int64_t n, + int64_t kl, int64_t ku, std::complex alpha, + const std::complex *a, int64_t lda, const std::complex *x, + int64_t incx, std::complex beta, std::complex *y, + int64_t 
incy, + const cl::sycl::vector_class &dependencies); + +cl::sycl::event zgbmv_sycl(cl::sycl::queue *queue, MKL_TRANSPOSE trans, int64_t m, int64_t n, + int64_t kl, int64_t ku, std::complex alpha, + const std::complex *a, int64_t lda, + const std::complex *x, int64_t incx, std::complex beta, + std::complex *y, int64_t incy, + const cl::sycl::vector_class &dependencies); + +cl::sycl::event sger_sycl(cl::sycl::queue *queue, int64_t m, int64_t n, float alpha, const float *x, + int64_t incx, const float *y, int64_t incy, float *a, int64_t lda, + const cl::sycl::vector_class &dependencies); + +cl::sycl::event dger_sycl(cl::sycl::queue *queue, int64_t m, int64_t n, double alpha, + const double *x, int64_t incx, const double *y, int64_t incy, double *a, + int64_t lda, const cl::sycl::vector_class &dependencies); + +cl::sycl::event cgerc_sycl(cl::sycl::queue *queue, int64_t m, int64_t n, std::complex alpha, + const std::complex *x, int64_t incx, const std::complex *y, + int64_t incy, std::complex *a, int64_t lda, + const cl::sycl::vector_class &dependencies); + +cl::sycl::event zgerc_sycl(cl::sycl::queue *queue, int64_t m, int64_t n, std::complex alpha, + const std::complex *x, int64_t incx, + const std::complex *y, int64_t incy, std::complex *a, + int64_t lda, + const cl::sycl::vector_class &dependencies); + +cl::sycl::event cgeru_sycl(cl::sycl::queue *queue, int64_t m, int64_t n, std::complex alpha, + const std::complex *x, int64_t incx, const std::complex *y, + int64_t incy, std::complex *a, int64_t lda, + const cl::sycl::vector_class &dependencies); + +cl::sycl::event zgeru_sycl(cl::sycl::queue *queue, int64_t m, int64_t n, std::complex alpha, + const std::complex *x, int64_t incx, + const std::complex *y, int64_t incy, std::complex *a, + int64_t lda, + const cl::sycl::vector_class &dependencies); + +cl::sycl::event chbmv_sycl(cl::sycl::queue *queue, MKL_UPLO uplo, int64_t n, int64_t k, + std::complex alpha, const std::complex *a, int64_t lda, + const std::complex *x, 
int64_t incx, std::complex beta, + std::complex *y, int64_t incy, + const cl::sycl::vector_class &dependencies); + +cl::sycl::event zhbmv_sycl(cl::sycl::queue *queue, MKL_UPLO uplo, int64_t n, int64_t k, + std::complex alpha, const std::complex *a, int64_t lda, + const std::complex *x, int64_t incx, std::complex beta, + std::complex *y, int64_t incy, + const cl::sycl::vector_class &dependencies); + +cl::sycl::event chemv_sycl(cl::sycl::queue *queue, MKL_UPLO uplo, int64_t n, + std::complex alpha, const std::complex *a, int64_t lda, + const std::complex *x, int64_t incx, std::complex beta, + std::complex *y, int64_t incy, + const cl::sycl::vector_class &dependencies); + +cl::sycl::event zhemv_sycl(cl::sycl::queue *queue, MKL_UPLO uplo, int64_t n, + std::complex alpha, const std::complex *a, int64_t lda, + const std::complex *x, int64_t incx, std::complex beta, + std::complex *y, int64_t incy, + const cl::sycl::vector_class &dependencies); + +cl::sycl::event cher_sycl(cl::sycl::queue *queue, MKL_UPLO upplo, int64_t n, float alpha, + const std::complex *x, int64_t incx, std::complex *a, + int64_t lda, const cl::sycl::vector_class &dependencies); + +cl::sycl::event zher_sycl(cl::sycl::queue *queue, MKL_UPLO upplo, int64_t n, double alpha, + const std::complex *x, int64_t incx, std::complex *a, + int64_t lda, const cl::sycl::vector_class &dependencies); + +cl::sycl::event cher2_sycl(cl::sycl::queue *queue, MKL_UPLO upplo, int64_t n, + std::complex alpha, const std::complex *x, int64_t incx, + const std::complex *y, int64_t incy, std::complex *a, + int64_t lda, + const cl::sycl::vector_class &dependencies); + +cl::sycl::event zher2_sycl(cl::sycl::queue *queue, MKL_UPLO upplo, int64_t n, + std::complex alpha, const std::complex *x, int64_t incx, + const std::complex *y, int64_t incy, std::complex *a, + int64_t lda, + const cl::sycl::vector_class &dependencies); + +cl::sycl::event chpmv_sycl(cl::sycl::queue *queue, MKL_UPLO uplo, int64_t n, + std::complex alpha, const 
std::complex *a, + const std::complex *x, int64_t incx, std::complex beta, + std::complex *y, int64_t incy, + const cl::sycl::vector_class &dependencies); + +cl::sycl::event zhpmv_sycl(cl::sycl::queue *queue, MKL_UPLO uplo, int64_t n, + std::complex alpha, const std::complex *a, + const std::complex *x, int64_t incx, std::complex beta, + std::complex *y, int64_t incy, + const cl::sycl::vector_class &dependencies); + +cl::sycl::event chpr_sycl(cl::sycl::queue *queue, MKL_UPLO upplo, int64_t n, float alpha, + const std::complex *x, int64_t incx, std::complex *a, + const cl::sycl::vector_class &dependencies); + +cl::sycl::event zhpr_sycl(cl::sycl::queue *queue, MKL_UPLO upplo, int64_t n, double alpha, + const std::complex *x, int64_t incx, std::complex *a, + const cl::sycl::vector_class &dependencies); + +cl::sycl::event chpr2_sycl(cl::sycl::queue *queue, MKL_UPLO upplo, int64_t n, + std::complex alpha, const std::complex *x, int64_t incx, + const std::complex *y, int64_t incy, std::complex *a, + const cl::sycl::vector_class &dependencies); + +cl::sycl::event zhpr2_sycl(cl::sycl::queue *queue, MKL_UPLO upplo, int64_t n, + std::complex alpha, const std::complex *x, int64_t incx, + const std::complex *y, int64_t incy, std::complex *a, + const cl::sycl::vector_class &dependencies); + +cl::sycl::event ssbmv_sycl(cl::sycl::queue *queue, MKL_UPLO uplo, int64_t n, int64_t k, float alpha, + const float *a, int64_t lda, const float *x, int64_t incx, float beta, + float *y, int64_t incy, + const cl::sycl::vector_class &dependencies); + +cl::sycl::event dsbmv_sycl(cl::sycl::queue *queue, MKL_UPLO uplo, int64_t n, int64_t k, + double alpha, const double *a, int64_t lda, const double *x, + int64_t incx, double beta, double *y, int64_t incy, + const cl::sycl::vector_class &dependencies); + +cl::sycl::event sspmv_sycl(cl::sycl::queue *queue, MKL_UPLO uplo, int64_t n, float alpha, + const float *a, const float *x, int64_t incx, float beta, float *y, + int64_t incy, + const 
cl::sycl::vector_class &dependencies); + +cl::sycl::event dspmv_sycl(cl::sycl::queue *queue, MKL_UPLO uplo, int64_t n, double alpha, + const double *a, const double *x, int64_t incx, double beta, double *y, + int64_t incy, + const cl::sycl::vector_class &dependencies); + +cl::sycl::event sspr_sycl(cl::sycl::queue *queue, MKL_UPLO upplo, int64_t n, float alpha, + const float *x, int64_t incx, float *a, + const cl::sycl::vector_class &dependencies); + +cl::sycl::event dspr_sycl(cl::sycl::queue *queue, MKL_UPLO upplo, int64_t n, double alpha, + const double *x, int64_t incx, double *a, + const cl::sycl::vector_class &dependencies); + +cl::sycl::event sspr2_sycl(cl::sycl::queue *queue, MKL_UPLO upplo, int64_t n, float alpha, + const float *x, int64_t incx, const float *y, int64_t incy, float *a, + const cl::sycl::vector_class &dependencies); + +cl::sycl::event dspr2_sycl(cl::sycl::queue *queue, MKL_UPLO upplo, int64_t n, double alpha, + const double *x, int64_t incx, const double *y, int64_t incy, double *a, + const cl::sycl::vector_class &dependencies); + +cl::sycl::event ssymv_sycl(cl::sycl::queue *queue, MKL_UPLO uplo, int64_t n, float alpha, + const float *a, int64_t lda, const float *x, int64_t incx, float beta, + float *y, int64_t incy, + const cl::sycl::vector_class &dependencies); + +cl::sycl::event dsymv_sycl(cl::sycl::queue *queue, MKL_UPLO uplo, int64_t n, double alpha, + const double *a, int64_t lda, const double *x, int64_t incx, double beta, + double *y, int64_t incy, + const cl::sycl::vector_class &dependencies); + +cl::sycl::event ssyr_sycl(cl::sycl::queue *queue, MKL_UPLO upplo, int64_t n, float alpha, + const float *x, int64_t incx, float *a, int64_t lda, + const cl::sycl::vector_class &dependencies); + +cl::sycl::event dsyr_sycl(cl::sycl::queue *queue, MKL_UPLO upplo, int64_t n, double alpha, + const double *x, int64_t incx, double *a, int64_t lda, + const cl::sycl::vector_class &dependencies); + +cl::sycl::event ssyr2_sycl(cl::sycl::queue *queue, 
MKL_UPLO upplo, int64_t n, float alpha, + const float *x, int64_t incx, const float *y, int64_t incy, float *a, + int64_t lda, + const cl::sycl::vector_class &dependencies); + +cl::sycl::event dsyr2_sycl(cl::sycl::queue *queue, MKL_UPLO upplo, int64_t n, double alpha, + const double *x, int64_t incx, const double *y, int64_t incy, double *a, + int64_t lda, + const cl::sycl::vector_class &dependencies); + +cl::sycl::event stbmv_sycl(cl::sycl::queue *queue, MKL_UPLO upplo, MKL_TRANSPOSE trans, + MKL_DIAG diag, int64_t n, int64_t k, const float *a, int64_t lda, + float *x, int64_t incx, + const cl::sycl::vector_class &dependencies); + +cl::sycl::event dtbmv_sycl(cl::sycl::queue *queue, MKL_UPLO upplo, MKL_TRANSPOSE trans, + MKL_DIAG diag, int64_t n, int64_t k, const double *a, int64_t lda, + double *x, int64_t incx, + const cl::sycl::vector_class &dependencies); + +cl::sycl::event ctbmv_sycl(cl::sycl::queue *queue, MKL_UPLO upplo, MKL_TRANSPOSE trans, + MKL_DIAG diag, int64_t n, int64_t k, const std::complex *a, + int64_t lda, std::complex *x, int64_t incx, + const cl::sycl::vector_class &dependencies); + +cl::sycl::event ztbmv_sycl(cl::sycl::queue *queue, MKL_UPLO upplo, MKL_TRANSPOSE trans, + MKL_DIAG diag, int64_t n, int64_t k, const std::complex *a, + int64_t lda, std::complex *x, int64_t incx, + const cl::sycl::vector_class &dependencies); + +cl::sycl::event stbsv_sycl(cl::sycl::queue *queue, MKL_UPLO upplo, MKL_TRANSPOSE trans, + MKL_DIAG diag, int64_t n, int64_t k, const float *a, int64_t lda, + float *x, int64_t incx, + const cl::sycl::vector_class &dependencies); + +cl::sycl::event dtbsv_sycl(cl::sycl::queue *queue, MKL_UPLO upplo, MKL_TRANSPOSE trans, + MKL_DIAG diag, int64_t n, int64_t k, const double *a, int64_t lda, + double *x, int64_t incx, + const cl::sycl::vector_class &dependencies); + +cl::sycl::event ctbsv_sycl(cl::sycl::queue *queue, MKL_UPLO upplo, MKL_TRANSPOSE trans, + MKL_DIAG diag, int64_t n, int64_t k, const std::complex *a, + int64_t lda, 
std::complex *x, int64_t incx, + const cl::sycl::vector_class &dependencies); + +cl::sycl::event ztbsv_sycl(cl::sycl::queue *queue, MKL_UPLO upplo, MKL_TRANSPOSE trans, + MKL_DIAG diag, int64_t n, int64_t k, const std::complex *a, + int64_t lda, std::complex *x, int64_t incx, + const cl::sycl::vector_class &dependencies); + +cl::sycl::event stpmv_sycl(cl::sycl::queue *queue, MKL_UPLO upplo, MKL_TRANSPOSE trans, + MKL_DIAG diag, int64_t n, const float *a, float *x, int64_t incx, + const cl::sycl::vector_class &dependencies); + +cl::sycl::event dtpmv_sycl(cl::sycl::queue *queue, MKL_UPLO upplo, MKL_TRANSPOSE trans, + MKL_DIAG diag, int64_t n, const double *a, double *x, int64_t incx, + const cl::sycl::vector_class &dependencies); + +cl::sycl::event ctpmv_sycl(cl::sycl::queue *queue, MKL_UPLO upplo, MKL_TRANSPOSE trans, + MKL_DIAG diag, int64_t n, const std::complex *a, + std::complex *x, int64_t incx, + const cl::sycl::vector_class &dependencies); + +cl::sycl::event ztpmv_sycl(cl::sycl::queue *queue, MKL_UPLO upplo, MKL_TRANSPOSE trans, + MKL_DIAG diag, int64_t n, const std::complex *a, + std::complex *x, int64_t incx, + const cl::sycl::vector_class &dependencies); + +cl::sycl::event stpsv_sycl(cl::sycl::queue *queue, MKL_UPLO upplo, MKL_TRANSPOSE trans, + MKL_DIAG diag, int64_t n, const float *a, float *x, int64_t incx, + const cl::sycl::vector_class &dependencies); + +cl::sycl::event dtpsv_sycl(cl::sycl::queue *queue, MKL_UPLO upplo, MKL_TRANSPOSE trans, + MKL_DIAG diag, int64_t n, const double *a, double *x, int64_t incx, + const cl::sycl::vector_class &dependencies); + +cl::sycl::event ctpsv_sycl(cl::sycl::queue *queue, MKL_UPLO upplo, MKL_TRANSPOSE trans, + MKL_DIAG diag, int64_t n, const std::complex *a, + std::complex *x, int64_t incx, + const cl::sycl::vector_class &dependencies); + +cl::sycl::event ztpsv_sycl(cl::sycl::queue *queue, MKL_UPLO upplo, MKL_TRANSPOSE trans, + MKL_DIAG diag, int64_t n, const std::complex *a, + std::complex *x, int64_t incx, + 
const cl::sycl::vector_class &dependencies); + +cl::sycl::event strmv_sycl(cl::sycl::queue *queue, MKL_UPLO upplo, MKL_TRANSPOSE trans, + MKL_DIAG diag, int64_t n, const float *a, int64_t lda, float *x, + int64_t incx, + const cl::sycl::vector_class &dependencies); + +cl::sycl::event dtrmv_sycl(cl::sycl::queue *queue, MKL_UPLO upplo, MKL_TRANSPOSE trans, + MKL_DIAG diag, int64_t n, const double *a, int64_t lda, double *x, + int64_t incx, + const cl::sycl::vector_class &dependencies); + +cl::sycl::event ctrmv_sycl(cl::sycl::queue *queue, MKL_UPLO upplo, MKL_TRANSPOSE trans, + MKL_DIAG diag, int64_t n, const std::complex *a, int64_t lda, + std::complex *x, int64_t incx, + const cl::sycl::vector_class &dependencies); + +cl::sycl::event ztrmv_sycl(cl::sycl::queue *queue, MKL_UPLO upplo, MKL_TRANSPOSE trans, + MKL_DIAG diag, int64_t n, const std::complex *a, int64_t lda, + std::complex *x, int64_t incx, + const cl::sycl::vector_class &dependencies); + +cl::sycl::event strsv_sycl(cl::sycl::queue *queue, MKL_UPLO upplo, MKL_TRANSPOSE trans, + MKL_DIAG diag, int64_t n, const float *a, int64_t lda, float *x, + int64_t incx, + const cl::sycl::vector_class &dependencies); + +cl::sycl::event dtrsv_sycl(cl::sycl::queue *queue, MKL_UPLO upplo, MKL_TRANSPOSE trans, + MKL_DIAG diag, int64_t n, const double *a, int64_t lda, double *x, + int64_t incx, + const cl::sycl::vector_class &dependencies); + +cl::sycl::event ctrsv_sycl(cl::sycl::queue *queue, MKL_UPLO upplo, MKL_TRANSPOSE trans, + MKL_DIAG diag, int64_t n, const std::complex *a, int64_t lda, + std::complex *x, int64_t incx, + const cl::sycl::vector_class &dependencies); + +cl::sycl::event ztrsv_sycl(cl::sycl::queue *queue, MKL_UPLO upplo, MKL_TRANSPOSE trans, + MKL_DIAG diag, int64_t n, const std::complex *a, int64_t lda, + std::complex *x, int64_t incx, + const cl::sycl::vector_class &dependencies); + +cl::sycl::event scasum_sycl(cl::sycl::queue *queue, int64_t n, const std::complex *x, + int64_t incx, float *result, + 
const cl::sycl::vector_class &dependencies); + +cl::sycl::event dzasum_sycl(cl::sycl::queue *queue, int64_t n, const std::complex *x, + int64_t incx, double *result, + const cl::sycl::vector_class &dependencies); + +cl::sycl::event sasum_sycl(cl::sycl::queue *queue, int64_t n, const float *x, int64_t incx, + float *result, + const cl::sycl::vector_class &dependencies); + +cl::sycl::event dasum_sycl(cl::sycl::queue *queue, int64_t n, const double *x, int64_t incx, + double *result, + const cl::sycl::vector_class &dependencies); + +cl::sycl::event saxpy_sycl(cl::sycl::queue *queue, int64_t n, float alpha, const float *x, + int64_t incx, float *y, int64_t incy, + const cl::sycl::vector_class &dependencies); + +cl::sycl::event daxpy_sycl(cl::sycl::queue *queue, int64_t n, double alpha, const double *x, + int64_t incx, double *y, int64_t incy, + const cl::sycl::vector_class &dependencies); + +cl::sycl::event caxpy_sycl(cl::sycl::queue *queue, int64_t n, std::complex alpha, + const std::complex *x, int64_t incx, std::complex *y, + int64_t incy, + const cl::sycl::vector_class &dependencies); + +cl::sycl::event zaxpy_sycl(cl::sycl::queue *queue, int64_t n, std::complex alpha, + const std::complex *x, int64_t incx, std::complex *y, + int64_t incy, + const cl::sycl::vector_class &dependencies); + +cl::sycl::event scopy_sycl(cl::sycl::queue *queue, int64_t n, const float *x, int64_t incx, + float *y, int64_t incy, + const cl::sycl::vector_class &dependencies); + +cl::sycl::event dcopy_sycl(cl::sycl::queue *queue, int64_t n, const double *x, int64_t incx, + double *y, int64_t incy, + const cl::sycl::vector_class &dependencies); + +cl::sycl::event ccopy_sycl(cl::sycl::queue *queue, int64_t n, const std::complex *x, + int64_t incx, std::complex *y, int64_t incy, + const cl::sycl::vector_class &dependencies); + +cl::sycl::event zcopy_sycl(cl::sycl::queue *queue, int64_t n, const std::complex *x, + int64_t incx, std::complex *y, int64_t incy, + const cl::sycl::vector_class 
&dependencies); + +cl::sycl::event sdot_sycl(cl::sycl::queue *queue, int64_t n, const float *x, int64_t incx, + const float *y, int64_t incy, float *result, + const cl::sycl::vector_class &dependencies); + +cl::sycl::event ddot_sycl(cl::sycl::queue *queue, int64_t n, const double *x, int64_t incx, + const double *y, int64_t incy, double *result, + const cl::sycl::vector_class &dependencies); + +cl::sycl::event sdsdot_sycl(cl::sycl::queue *queue, int64_t n, float sb, const float *x, + int64_t incx, const float *y, int64_t incy, float *result, + const cl::sycl::vector_class &dependencies); + +cl::sycl::event dsdot_sycl(cl::sycl::queue *queue, int64_t n, const float *x, int64_t incx, + const float *y, int64_t incy, double *result, + const cl::sycl::vector_class &dependencies); + +cl::sycl::event cdotc_sycl(cl::sycl::queue *queue, int64_t n, const std::complex *x, + int64_t incx, const std::complex *y, int64_t incy, + std::complex *result, + const cl::sycl::vector_class &dependencies); + +cl::sycl::event zdotc_sycl(cl::sycl::queue *queue, int64_t n, const std::complex *x, + int64_t incx, const std::complex *y, int64_t incy, + std::complex *result, + const cl::sycl::vector_class &dependencies); + +cl::sycl::event cdotu_sycl(cl::sycl::queue *queue, int64_t n, const std::complex *x, + int64_t incx, const std::complex *y, int64_t incy, + std::complex *result, + const cl::sycl::vector_class &dependencies); + +cl::sycl::event zdotu_sycl(cl::sycl::queue *queue, int64_t n, const std::complex *x, + int64_t incx, const std::complex *y, int64_t incy, + std::complex *result, + const cl::sycl::vector_class &dependencies); + +cl::sycl::event scnrm2_sycl(cl::sycl::queue *queue, int64_t n, const std::complex *x, + int64_t incx, float *result, + const cl::sycl::vector_class &dependencies); + +cl::sycl::event dznrm2_sycl(cl::sycl::queue *queue, int64_t n, const std::complex *x, + int64_t incx, double *result, + const cl::sycl::vector_class &dependencies); + +cl::sycl::event 
snrm2_sycl(cl::sycl::queue *queue, int64_t n, const float *x, int64_t incx, + float *result, + const cl::sycl::vector_class &dependencies); + +cl::sycl::event dnrm2_sycl(cl::sycl::queue *queue, int64_t n, const double *x, int64_t incx, + double *result, + const cl::sycl::vector_class &dependencies); + +cl::sycl::event csrot_sycl(cl::sycl::queue *queue, int64_t n, std::complex *x, int64_t incx, + std::complex *y, int64_t incy, float c, float s, + const cl::sycl::vector_class &dependencies); + +cl::sycl::event zdrot_sycl(cl::sycl::queue *queue, int64_t n, std::complex *x, int64_t incx, + std::complex *y, int64_t incy, double c, double s, + const cl::sycl::vector_class &dependencies); + +cl::sycl::event srot_sycl(cl::sycl::queue *queue, int64_t n, float *x, int64_t incx, float *y, + int64_t incy, float c, float s, + const cl::sycl::vector_class &dependencies); + +cl::sycl::event drot_sycl(cl::sycl::queue *queue, int64_t n, double *x, int64_t incx, double *y, + int64_t incy, double c, double s, + const cl::sycl::vector_class &dependencies); + +cl::sycl::event srotg_sycl(cl::sycl::queue *queue, float *a, float *b, float *c, float *s, + const cl::sycl::vector_class &dependencies); + +cl::sycl::event drotg_sycl(cl::sycl::queue *queue, double *a, double *b, double *c, double *s, + const cl::sycl::vector_class &dependencies); + +cl::sycl::event crotg_sycl(cl::sycl::queue *queue, std::complex *a, std::complex *b, + float *c, std::complex *s, + const cl::sycl::vector_class &dependencies); + +cl::sycl::event zrotg_sycl(cl::sycl::queue *queue, std::complex *a, std::complex *b, + double *c, std::complex *s, + const cl::sycl::vector_class &dependencies); + +cl::sycl::event srotm_sycl(cl::sycl::queue *queue, int64_t n, float *x, int64_t incx, float *y, + int64_t incy, float *param, + const cl::sycl::vector_class &dependencies); + +cl::sycl::event drotm_sycl(cl::sycl::queue *queue, int64_t n, double *x, int64_t incx, double *y, + int64_t incy, double *param, + const 
cl::sycl::vector_class &dependencies); + +cl::sycl::event srotmg_sycl(cl::sycl::queue *queue, float *d1, float *d2, float *x1, float y1, + float *param, + const cl::sycl::vector_class &dependencies); + +cl::sycl::event drotmg_sycl(cl::sycl::queue *queue, double *d1, double *d2, double *x1, double y1, + double *param, + const cl::sycl::vector_class &dependencies); + +cl::sycl::event sscal_sycl(cl::sycl::queue *queue, int64_t n, float alpha, float *x, int64_t incx, + const cl::sycl::vector_class &dependencies); + +cl::sycl::event dscal_sycl(cl::sycl::queue *queue, int64_t n, double alpha, double *x, int64_t incx, + const cl::sycl::vector_class &dependencies); + +cl::sycl::event cscal_sycl(cl::sycl::queue *queue, int64_t n, std::complex alpha, + std::complex *x, int64_t incx, + const cl::sycl::vector_class &dependencies); + +cl::sycl::event zscal_sycl(cl::sycl::queue *queue, int64_t n, std::complex alpha, + std::complex *x, int64_t incx, + const cl::sycl::vector_class &dependencies); + +cl::sycl::event csscal_sycl(cl::sycl::queue *queue, int64_t n, float alpha, std::complex *x, + int64_t incx, + const cl::sycl::vector_class &dependencies); + +cl::sycl::event zdscal_sycl(cl::sycl::queue *queue, int64_t n, double alpha, + std::complex *x, int64_t incx, + const cl::sycl::vector_class &dependencies); + +cl::sycl::event sswap_sycl(cl::sycl::queue *queue, int64_t n, float *x, int64_t incx, float *y, + int64_t incy, + const cl::sycl::vector_class &dependencies); + +cl::sycl::event dswap_sycl(cl::sycl::queue *queue, int64_t n, double *x, int64_t incx, double *y, + int64_t incy, + const cl::sycl::vector_class &dependencies); + +cl::sycl::event cswap_sycl(cl::sycl::queue *queue, int64_t n, std::complex *x, int64_t incx, + std::complex *y, int64_t incy, + const cl::sycl::vector_class &dependencies); + +cl::sycl::event zswap_sycl(cl::sycl::queue *queue, int64_t n, std::complex *x, int64_t incx, + std::complex *y, int64_t incy, + const cl::sycl::vector_class &dependencies); + 
+cl::sycl::event isamax_sycl(cl::sycl::queue *queue, int64_t n, const float *x, int64_t incx, + int64_t *result, + const cl::sycl::vector_class &dependencies); + +cl::sycl::event idamax_sycl(cl::sycl::queue *queue, int64_t n, const double *x, int64_t incx, + int64_t *result, + const cl::sycl::vector_class &dependencies); + +cl::sycl::event icamax_sycl(cl::sycl::queue *queue, int64_t n, const std::complex *x, + int64_t incx, int64_t *result, + const cl::sycl::vector_class &dependencies); + +cl::sycl::event izamax_sycl(cl::sycl::queue *queue, int64_t n, const std::complex *x, + int64_t incx, int64_t *result, + const cl::sycl::vector_class &dependencies); + +cl::sycl::event isamin_sycl(cl::sycl::queue *queue, int64_t n, const float *x, int64_t incx, + int64_t *result, + const cl::sycl::vector_class &dependencies); + +cl::sycl::event idamin_sycl(cl::sycl::queue *queue, int64_t n, const double *x, int64_t incx, + int64_t *result, + const cl::sycl::vector_class &dependencies); + +cl::sycl::event icamin_sycl(cl::sycl::queue *queue, int64_t n, const std::complex *x, + int64_t incx, int64_t *result, + const cl::sycl::vector_class &dependencies); + +cl::sycl::event izamin_sycl(cl::sycl::queue *queue, int64_t n, const std::complex *x, + int64_t incx, int64_t *result, + const cl::sycl::vector_class &dependencies); + +cl::sycl::event sgemm_batch(cl::sycl::queue &queue, MKL_TRANSPOSE transa, MKL_TRANSPOSE transb, + int64_t m, int64_t n, int64_t k, float alpha, const float *a, + int64_t lda, int64_t strideA, const float *b, int64_t ldb, + int64_t strideB, float beta, float *c, int64_t ldc, int64_t strideC, + int64_t group_size, + const cl::sycl::vector_class &dependencies); + +cl::sycl::event dgemm_batch(cl::sycl::queue &queue, MKL_TRANSPOSE transa, MKL_TRANSPOSE transb, + int64_t m, int64_t n, int64_t k, double alpha, const double *a, + int64_t lda, int64_t strideA, const double *b, int64_t ldb, + int64_t strideB, double beta, double *c, int64_t ldc, int64_t strideC, + int64_t 
group_size, + const cl::sycl::vector_class &dependencies); + +cl::sycl::event cgemm_batch(cl::sycl::queue &queue, MKL_TRANSPOSE transa, MKL_TRANSPOSE transb, + int64_t m, int64_t n, int64_t k, std::complex alpha, + const std::complex *a, int64_t lda, int64_t strideA, + const std::complex *b, int64_t ldb, int64_t strideB, + std::complex beta, std::complex *c, int64_t ldc, + int64_t strideC, int64_t group_size, + const cl::sycl::vector_class &dependencies); + +cl::sycl::event zgemm_batch(cl::sycl::queue &queue, MKL_TRANSPOSE transa, MKL_TRANSPOSE transb, + int64_t m, int64_t n, int64_t k, std::complex alpha, + const std::complex *a, int64_t lda, int64_t strideA, + const std::complex *b, int64_t ldb, int64_t strideB, + std::complex beta, std::complex *c, int64_t ldc, + int64_t strideC, int64_t group_size, + const cl::sycl::vector_class &dependencies); + +cl::sycl::event sgemm_batch(cl::sycl::queue &queue, MKL_TRANSPOSE transa, MKL_TRANSPOSE transb, + int64_t m, int64_t n, int64_t k, float alpha, const float **a, + int64_t lda, const float **b, int64_t ldb, float beta, float **c, + int64_t ldc, int64_t offset_batch, int64_t group_size, + const cl::sycl::vector_class &dependencies); + +cl::sycl::event dgemm_batch(cl::sycl::queue &queue, MKL_TRANSPOSE transa, MKL_TRANSPOSE transb, + int64_t m, int64_t n, int64_t k, double alpha, const double **a, + int64_t lda, const double **b, int64_t ldb, double beta, double **c, + int64_t ldc, int64_t offset_batch, int64_t group_size, + const cl::sycl::vector_class &dependencies); + +cl::sycl::event cgemm_batch(cl::sycl::queue &queue, MKL_TRANSPOSE transa, MKL_TRANSPOSE transb, + int64_t m, int64_t n, int64_t k, std::complex alpha, + const std::complex **a, int64_t lda, + const std::complex **b, int64_t ldb, std::complex beta, + std::complex **c, int64_t ldc, int64_t offset_batch, + int64_t group_size, + const cl::sycl::vector_class &dependencies); + +cl::sycl::event zgemm_batch(cl::sycl::queue &queue, MKL_TRANSPOSE transa, 
MKL_TRANSPOSE transb, + int64_t m, int64_t n, int64_t k, std::complex alpha, + const std::complex **a, int64_t lda, + const std::complex **b, int64_t ldb, std::complex beta, + std::complex **c, int64_t ldc, int64_t offset_batch, + int64_t group_size, + const cl::sycl::vector_class &dependencies); + +cl::sycl::event saxpy_batch(cl::sycl::queue &queue, std::int64_t n, float alpha, const float **x, + std::int64_t incx, float **y, std::int64_t incy, + std::int64_t batch_size, std::int64_t offset, + const cl::sycl::vector_class &dependencies); + +cl::sycl::event daxpy_batch(cl::sycl::queue &queue, std::int64_t n, double alpha, const double **x, + std::int64_t incx, double **y, std::int64_t incy, + std::int64_t batch_size, std::int64_t offset, + const cl::sycl::vector_class &dependencies); + +cl::sycl::event caxpy_batch(cl::sycl::queue &queue, std::int64_t n, std::complex alpha, + const std::complex **x, std::int64_t incx, + std::complex **y, std::int64_t incy, std::int64_t batch_size, + std::int64_t offset, + const cl::sycl::vector_class &dependencies); + +cl::sycl::event zaxpy_batch(cl::sycl::queue &queue, std::int64_t n, std::complex alpha, + const std::complex **x, std::int64_t incx, + std::complex **y, std::int64_t incy, std::int64_t batch_size, + std::int64_t offset, + const cl::sycl::vector_class &dependencies); + +cl::sycl::event sgemmt_sycl(cl::sycl::queue *queue, MKL_UPLO upper_lower, MKL_TRANSPOSE transa, + MKL_TRANSPOSE transb, int64_t n, int64_t k, float alpha, const float *a, + int64_t lda, const float *b, int64_t ldb, float beta, float *c, + int64_t ldc, + const cl::sycl::vector_class &dependencies, + int64_t offset_a = 0, int64_t offset_b = 0, int64_t offset_c = 0); + +cl::sycl::event dgemmt_sycl(cl::sycl::queue *queue, MKL_UPLO upper_lower, MKL_TRANSPOSE transa, + MKL_TRANSPOSE transb, int64_t n, int64_t k, double alpha, + const double *a, int64_t lda, const double *b, int64_t ldb, double beta, + double *c, int64_t ldc, + const cl::sycl::vector_class 
&dependencies, + int64_t offset_a = 0, int64_t offset_b = 0, int64_t offset_c = 0); + +cl::sycl::event zgemmt_sycl(cl::sycl::queue *queue, MKL_UPLO upper_lower, MKL_TRANSPOSE transa, + MKL_TRANSPOSE transb, int64_t n, int64_t k, std::complex alpha, + const std::complex *a, int64_t lda, + const std::complex *b, int64_t ldb, std::complex beta, + std::complex *c, int64_t ldc, + const cl::sycl::vector_class &dependencies, + int64_t offset_a = 0, int64_t offset_b = 0, int64_t offset_c = 0); + +cl::sycl::event cgemmt_sycl(cl::sycl::queue *queue, MKL_UPLO upper_lower, MKL_TRANSPOSE transa, + MKL_TRANSPOSE transb, int64_t n, int64_t k, std::complex alpha, + const std::complex *a, int64_t lda, const std::complex *b, + int64_t ldb, std::complex beta, std::complex *c, + int64_t ldc, + const cl::sycl::vector_class &dependencies, + int64_t offset_a = 0, int64_t offset_b = 0, int64_t offset_c = 0); + } // namespace gpu } // namespace mkl #endif //_MKL_INTERNAL_BLAS_SYCL_GPU_HPP_ diff --git a/src/blas/blas_loader.cpp b/src/blas/blas_loader.cpp index e1dadc8be..045f43a27 100644 --- a/src/blas/blas_loader.cpp +++ b/src/blas/blas_loader.cpp @@ -24,6 +24,8 @@ namespace onemkl { namespace blas { namespace detail { +// Buffer APIs + void asum(char *libname, cl::sycl::queue &queue, std::int64_t n, cl::sycl::buffer, 1> &x, std::int64_t incx, cl::sycl::buffer &result) { @@ -1029,56 +1031,6 @@ void trsm(char *libname, cl::sycl::queue &queue, side left_right, uplo upper_low alpha, a, lda, b, ldb); } -void gemm_batch(char *libname, cl::sycl::queue &queue, cl::sycl::buffer &transa, - cl::sycl::buffer &transb, cl::sycl::buffer &m, - cl::sycl::buffer &n, cl::sycl::buffer &k, - cl::sycl::buffer &alpha, cl::sycl::buffer &a, - cl::sycl::buffer &lda, cl::sycl::buffer &b, - cl::sycl::buffer &ldb, cl::sycl::buffer &beta, - cl::sycl::buffer &c, cl::sycl::buffer &ldc, - std::int64_t group_count, cl::sycl::buffer &group_size) { - function_tables[libname].sgemm_batch_group_sycl(queue, transa, transb, m, 
n, k, alpha, a, lda, - b, ldb, beta, c, ldc, group_count, group_size); -} - -void gemm_batch(char *libname, cl::sycl::queue &queue, cl::sycl::buffer &transa, - cl::sycl::buffer &transb, cl::sycl::buffer &m, - cl::sycl::buffer &n, cl::sycl::buffer &k, - cl::sycl::buffer &alpha, cl::sycl::buffer &a, - cl::sycl::buffer &lda, cl::sycl::buffer &b, - cl::sycl::buffer &ldb, cl::sycl::buffer &beta, - cl::sycl::buffer &c, cl::sycl::buffer &ldc, - std::int64_t group_count, cl::sycl::buffer &group_size) { - function_tables[libname].dgemm_batch_group_sycl(queue, transa, transb, m, n, k, alpha, a, lda, - b, ldb, beta, c, ldc, group_count, group_size); -} - -void gemm_batch(char *libname, cl::sycl::queue &queue, cl::sycl::buffer &transa, - cl::sycl::buffer &transb, cl::sycl::buffer &m, - cl::sycl::buffer &n, cl::sycl::buffer &k, - cl::sycl::buffer, 1> &alpha, - cl::sycl::buffer, 1> &a, cl::sycl::buffer &lda, - cl::sycl::buffer, 1> &b, cl::sycl::buffer &ldb, - cl::sycl::buffer, 1> &beta, - cl::sycl::buffer, 1> &c, cl::sycl::buffer &ldc, - std::int64_t group_count, cl::sycl::buffer &group_size) { - function_tables[libname].cgemm_batch_group_sycl(queue, transa, transb, m, n, k, alpha, a, lda, - b, ldb, beta, c, ldc, group_count, group_size); -} - -void gemm_batch( - char *libname, cl::sycl::queue &queue, cl::sycl::buffer &transa, - cl::sycl::buffer &transb, cl::sycl::buffer &m, - cl::sycl::buffer &n, cl::sycl::buffer &k, - cl::sycl::buffer, 1> &alpha, cl::sycl::buffer, 1> &a, - cl::sycl::buffer &lda, cl::sycl::buffer, 1> &b, - cl::sycl::buffer &ldb, cl::sycl::buffer, 1> &beta, - cl::sycl::buffer, 1> &c, cl::sycl::buffer &ldc, - std::int64_t group_count, cl::sycl::buffer &group_size) { - function_tables[libname].zgemm_batch_group_sycl(queue, transa, transb, m, n, k, alpha, a, lda, - b, ldb, beta, c, ldc, group_count, group_size); -} - void gemm_batch(char *libname, cl::sycl::queue &queue, transpose transa, transpose transb, std::int64_t m, std::int64_t n, std::int64_t k, float 
alpha, cl::sycl::buffer &a, std::int64_t lda, std::int64_t stride_a, @@ -1125,58 +1077,6 @@ void gemm_batch(char *libname, cl::sycl::queue &queue, transpose transa, transpo stride_c, batch_size); } -void trsm_batch(char *libname, cl::sycl::queue &queue, cl::sycl::buffer &left_right, - cl::sycl::buffer &upper_lower, cl::sycl::buffer &trans, - cl::sycl::buffer &unit_diag, cl::sycl::buffer &m, - cl::sycl::buffer &n, cl::sycl::buffer &alpha, - cl::sycl::buffer &a, cl::sycl::buffer &lda, - cl::sycl::buffer &b, cl::sycl::buffer &ldb, - std::int64_t group_count, cl::sycl::buffer &group_size) { - function_tables[libname].strsm_batch_group_sycl(queue, left_right, upper_lower, trans, - unit_diag, m, n, alpha, a, lda, b, ldb, - group_count, group_size); -} - -void trsm_batch(char *libname, cl::sycl::queue &queue, cl::sycl::buffer &left_right, - cl::sycl::buffer &upper_lower, cl::sycl::buffer &trans, - cl::sycl::buffer &unit_diag, cl::sycl::buffer &m, - cl::sycl::buffer &n, cl::sycl::buffer &alpha, - cl::sycl::buffer &a, cl::sycl::buffer &lda, - cl::sycl::buffer &b, cl::sycl::buffer &ldb, - std::int64_t group_count, cl::sycl::buffer &group_size) { - function_tables[libname].dtrsm_batch_group_sycl(queue, left_right, upper_lower, trans, - unit_diag, m, n, alpha, a, lda, b, ldb, - group_count, group_size); -} - -void trsm_batch(char *libname, cl::sycl::queue &queue, cl::sycl::buffer &left_right, - cl::sycl::buffer &upper_lower, cl::sycl::buffer &trans, - cl::sycl::buffer &unit_diag, cl::sycl::buffer &m, - cl::sycl::buffer &n, - cl::sycl::buffer, 1> &alpha, - cl::sycl::buffer, 1> &a, cl::sycl::buffer &lda, - cl::sycl::buffer, 1> &b, cl::sycl::buffer &ldb, - std::int64_t group_count, cl::sycl::buffer &group_size) { - function_tables[libname].ctrsm_batch_group_sycl(queue, left_right, upper_lower, trans, - unit_diag, m, n, alpha, a, lda, b, ldb, - group_count, group_size); -} - -void trsm_batch(char *libname, cl::sycl::queue &queue, cl::sycl::buffer &left_right, - cl::sycl::buffer 
&upper_lower, cl::sycl::buffer &trans, - cl::sycl::buffer &unit_diag, cl::sycl::buffer &m, - cl::sycl::buffer &n, - cl::sycl::buffer, 1> &alpha, - cl::sycl::buffer, 1> &a, - cl::sycl::buffer &lda, - cl::sycl::buffer, 1> &b, - cl::sycl::buffer &ldb, std::int64_t group_count, - cl::sycl::buffer &group_size) { - function_tables[libname].ztrsm_batch_group_sycl(queue, left_right, upper_lower, trans, - unit_diag, m, n, alpha, a, lda, b, ldb, - group_count, group_size); -} - void trsm_batch(char *libname, cl::sycl::queue &queue, side left_right, uplo upper_lower, transpose trans, diag unit_diag, std::int64_t m, std::int64_t n, float alpha, cl::sycl::buffer &a, std::int64_t lda, std::int64_t stride_a, @@ -1315,6 +1215,1321 @@ void gemm_ext(char *libname, cl::sycl::queue &queue, transpose transa, transpose beta, c, ldc); } +// USM APIs + +cl::sycl::event asum(char *libname, cl::sycl::queue &queue, std::int64_t n, + const std::complex *x, std::int64_t incx, float *result, + const cl::sycl::vector_class &dependencies) { + return function_tables[libname].scasum_usm_sycl(queue, n, x, incx, result, dependencies); +} + +cl::sycl::event asum(char *libname, cl::sycl::queue &queue, std::int64_t n, + const std::complex *x, std::int64_t incx, double *result, + const cl::sycl::vector_class &dependencies) { + return function_tables[libname].dzasum_usm_sycl(queue, n, x, incx, result, dependencies); +} + +cl::sycl::event asum(char *libname, cl::sycl::queue &queue, std::int64_t n, const float *x, + std::int64_t incx, float *result, + const cl::sycl::vector_class &dependencies) { + return function_tables[libname].sasum_usm_sycl(queue, n, x, incx, result, dependencies); +} + +cl::sycl::event asum(char *libname, cl::sycl::queue &queue, std::int64_t n, const double *x, + std::int64_t incx, double *result, + const cl::sycl::vector_class &dependencies) { + return function_tables[libname].dasum_usm_sycl(queue, n, x, incx, result, dependencies); +} + +cl::sycl::event axpy(char *libname, 
cl::sycl::queue &queue, std::int64_t n, float alpha, + const float *x, std::int64_t incx, float *y, std::int64_t incy, + const cl::sycl::vector_class &dependencies) { + return function_tables[libname].saxpy_usm_sycl(queue, n, alpha, x, incx, y, incy, dependencies); +} + +cl::sycl::event axpy(char *libname, cl::sycl::queue &queue, std::int64_t n, double alpha, + const double *x, std::int64_t incx, double *y, std::int64_t incy, + const cl::sycl::vector_class &dependencies) { + return function_tables[libname].daxpy_usm_sycl(queue, n, alpha, x, incx, y, incy, dependencies); +} + +cl::sycl::event axpy(char *libname, cl::sycl::queue &queue, std::int64_t n, + std::complex alpha, const std::complex *x, std::int64_t incx, + std::complex *y, std::int64_t incy, + const cl::sycl::vector_class &dependencies) { + return function_tables[libname].caxpy_usm_sycl(queue, n, alpha, x, incx, y, incy, dependencies); +} + +cl::sycl::event axpy(char *libname, cl::sycl::queue &queue, std::int64_t n, + std::complex alpha, const std::complex *x, std::int64_t incx, + std::complex *y, std::int64_t incy, + const cl::sycl::vector_class &dependencies) { + return function_tables[libname].zaxpy_usm_sycl(queue, n, alpha, x, incx, y, incy, dependencies); +} + +cl::sycl::event axpy_batch(char *libname, cl::sycl::queue &queue, std::int64_t *n, float *alpha, + const float **x, std::int64_t *incx, float **y, std::int64_t *incy, + std::int64_t group_count, std::int64_t *group_size, + const cl::sycl::vector_class &dependencies) { + return function_tables[libname].saxpy_batch_group_usm_sycl( + queue, n, alpha, x, incx, y, incy, group_count, group_size, dependencies); +} + +cl::sycl::event axpy_batch(char *libname, cl::sycl::queue &queue, std::int64_t *n, double *alpha, + const double **x, std::int64_t *incx, double **y, std::int64_t *incy, + std::int64_t group_count, std::int64_t *group_size, + const cl::sycl::vector_class &dependencies) { + return function_tables[libname].daxpy_batch_group_usm_sycl( + 
queue, n, alpha, x, incx, y, incy, group_count, group_size, dependencies); +} + +cl::sycl::event axpy_batch(char *libname, cl::sycl::queue &queue, std::int64_t *n, + std::complex *alpha, const std::complex **x, + std::int64_t *incx, std::complex **y, std::int64_t *incy, + std::int64_t group_count, std::int64_t *group_size, + const cl::sycl::vector_class &dependencies) { + return function_tables[libname].caxpy_batch_group_usm_sycl( + queue, n, alpha, x, incx, y, incy, group_count, group_size, dependencies); +} + +cl::sycl::event axpy_batch(char *libname, cl::sycl::queue &queue, std::int64_t *n, + std::complex *alpha, const std::complex **x, + std::int64_t *incx, std::complex **y, std::int64_t *incy, + std::int64_t group_count, std::int64_t *group_size, + const cl::sycl::vector_class &dependencies) { + return function_tables[libname].zaxpy_batch_group_usm_sycl( + queue, n, alpha, x, incx, y, incy, group_count, group_size, dependencies); +} + +cl::sycl::event copy(char *libname, cl::sycl::queue &queue, std::int64_t n, const float *x, + std::int64_t incx, float *y, std::int64_t incy, + const cl::sycl::vector_class &dependencies) { + return function_tables[libname].scopy_usm_sycl(queue, n, x, incx, y, incy, dependencies); +} + +cl::sycl::event copy(char *libname, cl::sycl::queue &queue, std::int64_t n, const double *x, + std::int64_t incx, double *y, std::int64_t incy, + const cl::sycl::vector_class &dependencies) { + return function_tables[libname].dcopy_usm_sycl(queue, n, x, incx, y, incy, dependencies); +} + +cl::sycl::event copy(char *libname, cl::sycl::queue &queue, std::int64_t n, + const std::complex *x, std::int64_t incx, std::complex *y, + std::int64_t incy, + const cl::sycl::vector_class &dependencies) { + return function_tables[libname].ccopy_usm_sycl(queue, n, x, incx, y, incy, dependencies); +} + +cl::sycl::event copy(char *libname, cl::sycl::queue &queue, std::int64_t n, + const std::complex *x, std::int64_t incx, std::complex *y, + std::int64_t incy, + 
const cl::sycl::vector_class &dependencies) { + return function_tables[libname].zcopy_usm_sycl(queue, n, x, incx, y, incy, dependencies); +} + +cl::sycl::event dot(char *libname, cl::sycl::queue &queue, std::int64_t n, const float *x, + std::int64_t incx, const float *y, std::int64_t incy, float *result, + const cl::sycl::vector_class &dependencies) { + return function_tables[libname].sdot_usm_sycl(queue, n, x, incx, y, incy, result, dependencies); +} + +cl::sycl::event dot(char *libname, cl::sycl::queue &queue, std::int64_t n, const double *x, + std::int64_t incx, const double *y, std::int64_t incy, double *result, + const cl::sycl::vector_class &dependencies) { + return function_tables[libname].ddot_usm_sycl(queue, n, x, incx, y, incy, result, dependencies); +} + +cl::sycl::event dot(char *libname, cl::sycl::queue &queue, std::int64_t n, const float *x, + std::int64_t incx, const float *y, std::int64_t incy, double *result, + const cl::sycl::vector_class &dependencies) { + return function_tables[libname].dsdot_usm_sycl(queue, n, x, incx, y, incy, result, + dependencies); +} + +cl::sycl::event dotc(char *libname, cl::sycl::queue &queue, std::int64_t n, + const std::complex *x, std::int64_t incx, const std::complex *y, + std::int64_t incy, std::complex *result, + const cl::sycl::vector_class &dependencies) { + return function_tables[libname].cdotc_usm_sycl(queue, n, x, incx, y, incy, result, + dependencies); +} + +cl::sycl::event dotc(char *libname, cl::sycl::queue &queue, std::int64_t n, + const std::complex *x, std::int64_t incx, + const std::complex *y, std::int64_t incy, std::complex *result, + const cl::sycl::vector_class &dependencies) { + return function_tables[libname].zdotc_usm_sycl(queue, n, x, incx, y, incy, result, + dependencies); +} + +cl::sycl::event dotu(char *libname, cl::sycl::queue &queue, std::int64_t n, + const std::complex *x, std::int64_t incx, const std::complex *y, + std::int64_t incy, std::complex *result, + const cl::sycl::vector_class 
&dependencies) { + return function_tables[libname].cdotu_usm_sycl(queue, n, x, incx, y, incy, result, + dependencies); +} + +cl::sycl::event dotu(char *libname, cl::sycl::queue &queue, std::int64_t n, + const std::complex *x, std::int64_t incx, + const std::complex *y, std::int64_t incy, std::complex *result, + const cl::sycl::vector_class &dependencies) { + return function_tables[libname].zdotu_usm_sycl(queue, n, x, incx, y, incy, result, + dependencies); +} + +cl::sycl::event iamin(char *libname, cl::sycl::queue &queue, std::int64_t n, const float *x, + std::int64_t incx, std::int64_t *result, + const cl::sycl::vector_class &dependencies) { + return function_tables[libname].isamin_usm_sycl(queue, n, x, incx, result, dependencies); +} + +cl::sycl::event iamin(char *libname, cl::sycl::queue &queue, std::int64_t n, const double *x, + std::int64_t incx, std::int64_t *result, + const cl::sycl::vector_class &dependencies) { + return function_tables[libname].idamin_usm_sycl(queue, n, x, incx, result, dependencies); +} + +cl::sycl::event iamin(char *libname, cl::sycl::queue &queue, std::int64_t n, + const std::complex *x, std::int64_t incx, std::int64_t *result, + const cl::sycl::vector_class &dependencies) { + return function_tables[libname].icamin_usm_sycl(queue, n, x, incx, result, dependencies); +} + +cl::sycl::event iamin(char *libname, cl::sycl::queue &queue, std::int64_t n, + const std::complex *x, std::int64_t incx, std::int64_t *result, + const cl::sycl::vector_class &dependencies) { + return function_tables[libname].izamin_usm_sycl(queue, n, x, incx, result, dependencies); +} + +cl::sycl::event iamax(char *libname, cl::sycl::queue &queue, std::int64_t n, const float *x, + std::int64_t incx, std::int64_t *result, + const cl::sycl::vector_class &dependencies) { + return function_tables[libname].isamax_usm_sycl(queue, n, x, incx, result, dependencies); +} + +cl::sycl::event iamax(char *libname, cl::sycl::queue &queue, std::int64_t n, const double *x, + 
std::int64_t incx, std::int64_t *result, + const cl::sycl::vector_class &dependencies) { + return function_tables[libname].idamax_usm_sycl(queue, n, x, incx, result, dependencies); +} + +cl::sycl::event iamax(char *libname, cl::sycl::queue &queue, std::int64_t n, + const std::complex *x, std::int64_t incx, std::int64_t *result, + const cl::sycl::vector_class &dependencies) { + return function_tables[libname].icamax_usm_sycl(queue, n, x, incx, result, dependencies); +} + +cl::sycl::event iamax(char *libname, cl::sycl::queue &queue, std::int64_t n, + const std::complex *x, std::int64_t incx, std::int64_t *result, + const cl::sycl::vector_class &dependencies) { + return function_tables[libname].izamax_usm_sycl(queue, n, x, incx, result, dependencies); +} + +cl::sycl::event nrm2(char *libname, cl::sycl::queue &queue, std::int64_t n, + const std::complex *x, std::int64_t incx, float *result, + const cl::sycl::vector_class &dependencies) { + return function_tables[libname].snrm2_usm_sycl(queue, n, x, incx, result, dependencies); +} + +cl::sycl::event nrm2(char *libname, cl::sycl::queue &queue, std::int64_t n, + const std::complex *x, std::int64_t incx, double *result, + const cl::sycl::vector_class &dependencies) { + return function_tables[libname].dnrm2_usm_sycl(queue, n, x, incx, result, dependencies); +} + +cl::sycl::event nrm2(char *libname, cl::sycl::queue &queue, std::int64_t n, const float *x, + std::int64_t incx, float *result, + const cl::sycl::vector_class &dependencies) { + return function_tables[libname].scnrm2_usm_sycl(queue, n, x, incx, result, dependencies); +} + +cl::sycl::event nrm2(char *libname, cl::sycl::queue &queue, std::int64_t n, const double *x, + std::int64_t incx, double *result, + const cl::sycl::vector_class &dependencies) { + return function_tables[libname].dznrm2_usm_sycl(queue, n, x, incx, result, dependencies); +} + +cl::sycl::event rot(char *libname, cl::sycl::queue &queue, std::int64_t n, std::complex *x, + std::int64_t incx, 
std::complex *y, std::int64_t incy, float c, float s, + const cl::sycl::vector_class &dependencies) { + return function_tables[libname].srot_usm_sycl(queue, n, x, incx, y, incy, c, s, dependencies); +} + +cl::sycl::event rot(char *libname, cl::sycl::queue &queue, std::int64_t n, std::complex *x, + std::int64_t incx, std::complex *y, std::int64_t incy, double c, + double s, const cl::sycl::vector_class &dependencies) { + return function_tables[libname].drot_usm_sycl(queue, n, x, incx, y, incy, c, s, dependencies); +} + +cl::sycl::event rot(char *libname, cl::sycl::queue &queue, std::int64_t n, float *x, + std::int64_t incx, float *y, std::int64_t incy, float c, float s, + const cl::sycl::vector_class &dependencies) { + return function_tables[libname].csrot_usm_sycl(queue, n, x, incx, y, incy, c, s, dependencies); +} + +cl::sycl::event rot(char *libname, cl::sycl::queue &queue, std::int64_t n, double *x, + std::int64_t incx, double *y, std::int64_t incy, double c, double s, + const cl::sycl::vector_class &dependencies) { + return function_tables[libname].zdrot_usm_sycl(queue, n, x, incx, y, incy, c, s, dependencies); +} + +cl::sycl::event rotg(char *libname, cl::sycl::queue &queue, float *a, float *b, float *c, float *s, + const cl::sycl::vector_class &dependencies) { + return function_tables[libname].srotg_usm_sycl(queue, a, b, c, s, dependencies); +} + +cl::sycl::event rotg(char *libname, cl::sycl::queue &queue, double *a, double *b, double *c, + double *s, const cl::sycl::vector_class &dependencies) { + return function_tables[libname].drotg_usm_sycl(queue, a, b, c, s, dependencies); +} + +cl::sycl::event rotg(char *libname, cl::sycl::queue &queue, std::complex *a, + std::complex *b, float *c, std::complex *s, + const cl::sycl::vector_class &dependencies) { + return function_tables[libname].crotg_usm_sycl(queue, a, b, c, s, dependencies); +} + +cl::sycl::event rotg(char *libname, cl::sycl::queue &queue, std::complex *a, + std::complex *b, double *c, std::complex 
*s, + const cl::sycl::vector_class &dependencies) { + return function_tables[libname].zrotg_usm_sycl(queue, a, b, c, s, dependencies); +} + +cl::sycl::event rotm(char *libname, cl::sycl::queue &queue, std::int64_t n, float *x, + std::int64_t incx, float *y, std::int64_t incy, float *param, + const cl::sycl::vector_class &dependencies) { + return function_tables[libname].srotm_usm_sycl(queue, n, x, incx, y, incy, param, dependencies); +} + +cl::sycl::event rotm(char *libname, cl::sycl::queue &queue, std::int64_t n, double *x, + std::int64_t incx, double *y, std::int64_t incy, double *param, + const cl::sycl::vector_class &dependencies) { + return function_tables[libname].drotm_usm_sycl(queue, n, x, incx, y, incy, param, dependencies); +} + +cl::sycl::event rotmg(char *libname, cl::sycl::queue &queue, float *d1, float *d2, float *x1, + float y1, float *param, + const cl::sycl::vector_class &dependencies) { + return function_tables[libname].srotmg_usm_sycl(queue, d1, d2, x1, y1, param, dependencies); +} + +cl::sycl::event rotmg(char *libname, cl::sycl::queue &queue, double *d1, double *d2, double *x1, + double y1, double *param, + const cl::sycl::vector_class &dependencies) { + return function_tables[libname].drotmg_usm_sycl(queue, d1, d2, x1, y1, param, dependencies); +} + +cl::sycl::event scal(char *libname, cl::sycl::queue &queue, std::int64_t n, float alpha, float *x, + std::int64_t incx, + const cl::sycl::vector_class &dependencies) { + return function_tables[libname].sscal_usm_sycl(queue, n, alpha, x, incx, dependencies); +} + +cl::sycl::event scal(char *libname, cl::sycl::queue &queue, std::int64_t n, double alpha, double *x, + std::int64_t incx, + const cl::sycl::vector_class &dependencies) { + return function_tables[libname].dscal_usm_sycl(queue, n, alpha, x, incx, dependencies); +} + +cl::sycl::event scal(char *libname, cl::sycl::queue &queue, std::int64_t n, + std::complex alpha, std::complex *x, std::int64_t incx, + const cl::sycl::vector_class 
&dependencies) { + return function_tables[libname].cscal_usm_sycl(queue, n, alpha, x, incx, dependencies); +} + +cl::sycl::event scal(char *libname, cl::sycl::queue &queue, std::int64_t n, + std::complex alpha, std::complex *x, std::int64_t incx, + const cl::sycl::vector_class &dependencies) { + return function_tables[libname].csscal_usm_sycl(queue, n, alpha, x, incx, dependencies); +} + +cl::sycl::event scal(char *libname, cl::sycl::queue &queue, std::int64_t n, float alpha, + std::complex *x, std::int64_t incx, + const cl::sycl::vector_class &dependencies) { + return function_tables[libname].zscal_usm_sycl(queue, n, alpha, x, incx, dependencies); +} + +cl::sycl::event scal(char *libname, cl::sycl::queue &queue, std::int64_t n, double alpha, + std::complex *x, std::int64_t incx, + const cl::sycl::vector_class &dependencies) { + return function_tables[libname].zdscal_usm_sycl(queue, n, alpha, x, incx, dependencies); +} + +cl::sycl::event sdsdot(char *libname, cl::sycl::queue &queue, std::int64_t n, float sb, + const float *x, std::int64_t incx, const float *y, std::int64_t incy, + float *result, const cl::sycl::vector_class &dependencies) { + return function_tables[libname].sdsdot_usm_sycl(queue, n, sb, x, incx, y, incy, result, + dependencies); +} + +cl::sycl::event swap(char *libname, cl::sycl::queue &queue, std::int64_t n, float *x, + std::int64_t incx, float *y, std::int64_t incy, + const cl::sycl::vector_class &dependencies) { + return function_tables[libname].sswap_usm_sycl(queue, n, x, incx, y, incy, dependencies); +} + +cl::sycl::event swap(char *libname, cl::sycl::queue &queue, std::int64_t n, double *x, + std::int64_t incx, double *y, std::int64_t incy, + const cl::sycl::vector_class &dependencies) { + return function_tables[libname].dswap_usm_sycl(queue, n, x, incx, y, incy, dependencies); +} + +cl::sycl::event swap(char *libname, cl::sycl::queue &queue, std::int64_t n, std::complex *x, + std::int64_t incx, std::complex *y, std::int64_t incy, + const 
cl::sycl::vector_class &dependencies) { + return function_tables[libname].cswap_usm_sycl(queue, n, x, incx, y, incy, dependencies); +} + +cl::sycl::event swap(char *libname, cl::sycl::queue &queue, std::int64_t n, std::complex *x, + std::int64_t incx, std::complex *y, std::int64_t incy, + const cl::sycl::vector_class &dependencies) { + return function_tables[libname].zswap_usm_sycl(queue, n, x, incx, y, incy, dependencies); +} + +cl::sycl::event gbmv(char *libname, cl::sycl::queue &queue, transpose trans, std::int64_t m, + std::int64_t n, std::int64_t kl, std::int64_t ku, float alpha, const float *a, + std::int64_t lda, const float *x, std::int64_t incx, float beta, float *y, + std::int64_t incy, + const cl::sycl::vector_class &dependencies) { + return function_tables[libname].sgbmv_usm_sycl(queue, trans, m, n, kl, ku, alpha, a, lda, x, + incx, beta, y, incy, dependencies); +} + +cl::sycl::event gbmv(char *libname, cl::sycl::queue &queue, transpose trans, std::int64_t m, + std::int64_t n, std::int64_t kl, std::int64_t ku, double alpha, + const double *a, std::int64_t lda, const double *x, std::int64_t incx, + double beta, double *y, std::int64_t incy, + const cl::sycl::vector_class &dependencies) { + return function_tables[libname].dgbmv_usm_sycl(queue, trans, m, n, kl, ku, alpha, a, lda, x, + incx, beta, y, incy, dependencies); +} + +cl::sycl::event gbmv(char *libname, cl::sycl::queue &queue, transpose trans, std::int64_t m, + std::int64_t n, std::int64_t kl, std::int64_t ku, std::complex alpha, + const std::complex *a, std::int64_t lda, const std::complex *x, + std::int64_t incx, std::complex beta, std::complex *y, + std::int64_t incy, + const cl::sycl::vector_class &dependencies) { + return function_tables[libname].cgbmv_usm_sycl(queue, trans, m, n, kl, ku, alpha, a, lda, x, + incx, beta, y, incy, dependencies); +} + +cl::sycl::event gbmv(char *libname, cl::sycl::queue &queue, transpose trans, std::int64_t m, + std::int64_t n, std::int64_t kl, std::int64_t ku, 
std::complex alpha, + const std::complex *a, std::int64_t lda, const std::complex *x, + std::int64_t incx, std::complex beta, std::complex *y, + std::int64_t incy, + const cl::sycl::vector_class &dependencies) { + return function_tables[libname].zgbmv_usm_sycl(queue, trans, m, n, kl, ku, alpha, a, lda, x, + incx, beta, y, incy, dependencies); +} + +cl::sycl::event gemv(char *libname, cl::sycl::queue &queue, transpose trans, std::int64_t m, + std::int64_t n, float alpha, const float *a, std::int64_t lda, const float *x, + std::int64_t incx, float beta, float *y, std::int64_t incy, + const cl::sycl::vector_class &dependencies) { + return function_tables[libname].sgemv_usm_sycl(queue, trans, m, n, alpha, a, lda, x, incx, beta, + y, incy, dependencies); +} + +cl::sycl::event gemv(char *libname, cl::sycl::queue &queue, transpose trans, std::int64_t m, + std::int64_t n, double alpha, const double *a, std::int64_t lda, + const double *x, std::int64_t incx, double beta, double *y, std::int64_t incy, + const cl::sycl::vector_class &dependencies) { + return function_tables[libname].dgemv_usm_sycl(queue, trans, m, n, alpha, a, lda, x, incx, beta, + y, incy, dependencies); +} + +cl::sycl::event gemv(char *libname, cl::sycl::queue &queue, transpose trans, std::int64_t m, + std::int64_t n, std::complex alpha, const std::complex *a, + std::int64_t lda, const std::complex *x, std::int64_t incx, + std::complex beta, std::complex *y, std::int64_t incy, + const cl::sycl::vector_class &dependencies) { + return function_tables[libname].cgemv_usm_sycl(queue, trans, m, n, alpha, a, lda, x, incx, beta, + y, incy, dependencies); +} + +cl::sycl::event gemv(char *libname, cl::sycl::queue &queue, transpose trans, std::int64_t m, + std::int64_t n, std::complex alpha, const std::complex *a, + std::int64_t lda, const std::complex *x, std::int64_t incx, + std::complex beta, std::complex *y, std::int64_t incy, + const cl::sycl::vector_class &dependencies) { + return 
function_tables[libname].zgemv_usm_sycl(queue, trans, m, n, alpha, a, lda, x, incx, beta, + y, incy, dependencies); +} + +cl::sycl::event ger(char *libname, cl::sycl::queue &queue, std::int64_t m, std::int64_t n, + float alpha, const float *x, std::int64_t incx, const float *y, + std::int64_t incy, float *a, std::int64_t lda, + const cl::sycl::vector_class &dependencies) { + return function_tables[libname].sger_usm_sycl(queue, m, n, alpha, x, incx, y, incy, a, lda, + dependencies); +} + +cl::sycl::event ger(char *libname, cl::sycl::queue &queue, std::int64_t m, std::int64_t n, + double alpha, const double *x, std::int64_t incx, const double *y, + std::int64_t incy, double *a, std::int64_t lda, + const cl::sycl::vector_class &dependencies) { + return function_tables[libname].dger_usm_sycl(queue, m, n, alpha, x, incx, y, incy, a, lda, + dependencies); +} + +cl::sycl::event gerc(char *libname, cl::sycl::queue &queue, std::int64_t m, std::int64_t n, + std::complex alpha, const std::complex *x, std::int64_t incx, + const std::complex *y, std::int64_t incy, std::complex *a, + std::int64_t lda, + const cl::sycl::vector_class &dependencies) { + return function_tables[libname].cgerc_usm_sycl(queue, m, n, alpha, x, incx, y, incy, a, lda, + dependencies); +} + +cl::sycl::event gerc(char *libname, cl::sycl::queue &queue, std::int64_t m, std::int64_t n, + std::complex alpha, const std::complex *x, std::int64_t incx, + const std::complex *y, std::int64_t incy, std::complex *a, + std::int64_t lda, + const cl::sycl::vector_class &dependencies) { + return function_tables[libname].zgerc_usm_sycl(queue, m, n, alpha, x, incx, y, incy, a, lda, + dependencies); +} + +cl::sycl::event geru(char *libname, cl::sycl::queue &queue, std::int64_t m, std::int64_t n, + std::complex alpha, const std::complex *x, std::int64_t incx, + const std::complex *y, std::int64_t incy, std::complex *a, + std::int64_t lda, + const cl::sycl::vector_class &dependencies) { + return 
function_tables[libname].cgeru_usm_sycl(queue, m, n, alpha, x, incx, y, incy, a, lda, + dependencies); +} + +cl::sycl::event geru(char *libname, cl::sycl::queue &queue, std::int64_t m, std::int64_t n, + std::complex alpha, const std::complex *x, std::int64_t incx, + const std::complex *y, std::int64_t incy, std::complex *a, + std::int64_t lda, + const cl::sycl::vector_class &dependencies) { + return function_tables[libname].zgeru_usm_sycl(queue, m, n, alpha, x, incx, y, incy, a, lda, + dependencies); +} + +cl::sycl::event hbmv(char *libname, cl::sycl::queue &queue, uplo upper_lower, std::int64_t n, + std::int64_t k, std::complex alpha, const std::complex *a, + std::int64_t lda, const std::complex *x, std::int64_t incx, + std::complex beta, std::complex *y, std::int64_t incy, + const cl::sycl::vector_class &dependencies) { + return function_tables[libname].chbmv_usm_sycl(queue, upper_lower, n, k, alpha, a, lda, x, incx, + beta, y, incy, dependencies); +} + +cl::sycl::event hbmv(char *libname, cl::sycl::queue &queue, uplo upper_lower, std::int64_t n, + std::int64_t k, std::complex alpha, const std::complex *a, + std::int64_t lda, const std::complex *x, std::int64_t incx, + std::complex beta, std::complex *y, std::int64_t incy, + const cl::sycl::vector_class &dependencies) { + return function_tables[libname].zhbmv_usm_sycl(queue, upper_lower, n, k, alpha, a, lda, x, incx, + beta, y, incy, dependencies); +} + +cl::sycl::event hemv(char *libname, cl::sycl::queue &queue, uplo upper_lower, std::int64_t n, + std::complex alpha, const std::complex *a, std::int64_t lda, + const std::complex *x, std::int64_t incx, std::complex beta, + std::complex *y, std::int64_t incy, + const cl::sycl::vector_class &dependencies) { + return function_tables[libname].chemv_usm_sycl(queue, upper_lower, n, alpha, a, lda, x, incx, + beta, y, incy, dependencies); +} + +cl::sycl::event hemv(char *libname, cl::sycl::queue &queue, uplo upper_lower, std::int64_t n, + std::complex alpha, const 
std::complex *a, std::int64_t lda, + const std::complex *x, std::int64_t incx, std::complex beta, + std::complex *y, std::int64_t incy, + const cl::sycl::vector_class &dependencies) { + return function_tables[libname].zhemv_usm_sycl(queue, upper_lower, n, alpha, a, lda, x, incx, + beta, y, incy, dependencies); +} + +cl::sycl::event her(char *libname, cl::sycl::queue &queue, uplo upper_lower, std::int64_t n, + float alpha, const std::complex *x, std::int64_t incx, + std::complex *a, std::int64_t lda, + const cl::sycl::vector_class &dependencies) { + return function_tables[libname].cher_usm_sycl(queue, upper_lower, n, alpha, x, incx, a, lda, + dependencies); +} + +cl::sycl::event her(char *libname, cl::sycl::queue &queue, uplo upper_lower, std::int64_t n, + double alpha, const std::complex *x, std::int64_t incx, + std::complex *a, std::int64_t lda, + const cl::sycl::vector_class &dependencies) { + return function_tables[libname].zher_usm_sycl(queue, upper_lower, n, alpha, x, incx, a, lda, + dependencies); +} + +cl::sycl::event her2(char *libname, cl::sycl::queue &queue, uplo upper_lower, std::int64_t n, + std::complex alpha, const std::complex *x, std::int64_t incx, + const std::complex *y, std::int64_t incy, std::complex *a, + std::int64_t lda, + const cl::sycl::vector_class &dependencies) { + return function_tables[libname].cher2_usm_sycl(queue, upper_lower, n, alpha, x, incx, y, incy, + a, lda, dependencies); +} + +cl::sycl::event her2(char *libname, cl::sycl::queue &queue, uplo upper_lower, std::int64_t n, + std::complex alpha, const std::complex *x, std::int64_t incx, + const std::complex *y, std::int64_t incy, std::complex *a, + std::int64_t lda, + const cl::sycl::vector_class &dependencies) { + return function_tables[libname].zher2_usm_sycl(queue, upper_lower, n, alpha, x, incx, y, incy, + a, lda, dependencies); +} + +cl::sycl::event hpmv(char *libname, cl::sycl::queue &queue, uplo upper_lower, std::int64_t n, + std::complex alpha, const std::complex *a, + 
const std::complex *x, std::int64_t incx, std::complex beta, + std::complex *y, std::int64_t incy, + const cl::sycl::vector_class &dependencies) { + return function_tables[libname].chpmv_usm_sycl(queue, upper_lower, n, alpha, a, x, incx, beta, + y, incy, dependencies); +} + +cl::sycl::event hpmv(char *libname, cl::sycl::queue &queue, uplo upper_lower, std::int64_t n, + std::complex alpha, const std::complex *a, + const std::complex *x, std::int64_t incx, std::complex beta, + std::complex *y, std::int64_t incy, + const cl::sycl::vector_class &dependencies) { + return function_tables[libname].zhpmv_usm_sycl(queue, upper_lower, n, alpha, a, x, incx, beta, + y, incy, dependencies); +} + +cl::sycl::event hpr(char *libname, cl::sycl::queue &queue, uplo upper_lower, std::int64_t n, + float alpha, const std::complex *x, std::int64_t incx, + std::complex *a, + const cl::sycl::vector_class &dependencies) { + return function_tables[libname].chpr_usm_sycl(queue, upper_lower, n, alpha, x, incx, a, + dependencies); +} + +cl::sycl::event hpr(char *libname, cl::sycl::queue &queue, uplo upper_lower, std::int64_t n, + double alpha, const std::complex *x, std::int64_t incx, + std::complex *a, + const cl::sycl::vector_class &dependencies) { + return function_tables[libname].zhpr_usm_sycl(queue, upper_lower, n, alpha, x, incx, a, + dependencies); +} + +cl::sycl::event hpr2(char *libname, cl::sycl::queue &queue, uplo upper_lower, std::int64_t n, + std::complex alpha, const std::complex *x, std::int64_t incx, + const std::complex *y, std::int64_t incy, std::complex *a, + const cl::sycl::vector_class &dependencies) { + return function_tables[libname].chpr2_usm_sycl(queue, upper_lower, n, alpha, x, incx, y, incy, + a, dependencies); +} + +cl::sycl::event hpr2(char *libname, cl::sycl::queue &queue, uplo upper_lower, std::int64_t n, + std::complex alpha, const std::complex *x, std::int64_t incx, + const std::complex *y, std::int64_t incy, std::complex *a, + const cl::sycl::vector_class 
&dependencies) { + return function_tables[libname].zhpr2_usm_sycl(queue, upper_lower, n, alpha, x, incx, y, incy, + a, dependencies); +} + +cl::sycl::event sbmv(char *libname, cl::sycl::queue &queue, uplo upper_lower, std::int64_t n, + std::int64_t k, float alpha, const float *a, std::int64_t lda, const float *x, + std::int64_t incx, float beta, float *y, std::int64_t incy, + const cl::sycl::vector_class &dependencies) { + return function_tables[libname].ssbmv_usm_sycl(queue, upper_lower, n, k, alpha, a, lda, x, incx, + beta, y, incy, dependencies); +} + +cl::sycl::event sbmv(char *libname, cl::sycl::queue &queue, uplo upper_lower, std::int64_t n, + std::int64_t k, double alpha, const double *a, std::int64_t lda, + const double *x, std::int64_t incx, double beta, double *y, std::int64_t incy, + const cl::sycl::vector_class &dependencies) { + return function_tables[libname].dsbmv_usm_sycl(queue, upper_lower, n, k, alpha, a, lda, x, incx, + beta, y, incy, dependencies); +} + +cl::sycl::event spmv(char *libname, cl::sycl::queue &queue, uplo upper_lower, std::int64_t n, + float alpha, const float *a, const float *x, std::int64_t incx, float beta, + float *y, std::int64_t incy, + const cl::sycl::vector_class &dependencies) { + return function_tables[libname].sspmv_usm_sycl(queue, upper_lower, n, alpha, a, x, incx, beta, + y, incy, dependencies); +} + +cl::sycl::event spmv(char *libname, cl::sycl::queue &queue, uplo upper_lower, std::int64_t n, + double alpha, const double *a, const double *x, std::int64_t incx, double beta, + double *y, std::int64_t incy, + const cl::sycl::vector_class &dependencies) { + return function_tables[libname].dspmv_usm_sycl(queue, upper_lower, n, alpha, a, x, incx, beta, + y, incy, dependencies); +} + +cl::sycl::event spr(char *libname, cl::sycl::queue &queue, uplo upper_lower, std::int64_t n, + float alpha, const float *x, std::int64_t incx, float *a, + const cl::sycl::vector_class &dependencies) { + return 
function_tables[libname].sspr_usm_sycl(queue, upper_lower, n, alpha, x, incx, a, + dependencies); +} + +cl::sycl::event spr(char *libname, cl::sycl::queue &queue, uplo upper_lower, std::int64_t n, + double alpha, const double *x, std::int64_t incx, double *a, + const cl::sycl::vector_class &dependencies) { + return function_tables[libname].dspr_usm_sycl(queue, upper_lower, n, alpha, x, incx, a, + dependencies); +} + +cl::sycl::event spr2(char *libname, cl::sycl::queue &queue, uplo upper_lower, std::int64_t n, + float alpha, const float *x, std::int64_t incx, const float *y, + std::int64_t incy, float *a, + const cl::sycl::vector_class &dependencies) { + return function_tables[libname].sspr2_usm_sycl(queue, upper_lower, n, alpha, x, incx, y, incy, + a, dependencies); +} + +cl::sycl::event spr2(char *libname, cl::sycl::queue &queue, uplo upper_lower, std::int64_t n, + double alpha, const double *x, std::int64_t incx, const double *y, + std::int64_t incy, double *a, + const cl::sycl::vector_class &dependencies) { + return function_tables[libname].dspr2_usm_sycl(queue, upper_lower, n, alpha, x, incx, y, incy, + a, dependencies); +} + +cl::sycl::event symv(char *libname, cl::sycl::queue &queue, uplo upper_lower, std::int64_t n, + float alpha, const float *a, std::int64_t lda, const float *x, + std::int64_t incx, float beta, float *y, std::int64_t incy, + const cl::sycl::vector_class &dependencies) { + return function_tables[libname].ssymv_usm_sycl(queue, upper_lower, n, alpha, a, lda, x, incx, + beta, y, incy, dependencies); +} + +cl::sycl::event symv(char *libname, cl::sycl::queue &queue, uplo upper_lower, std::int64_t n, + double alpha, const double *a, std::int64_t lda, const double *x, + std::int64_t incx, double beta, double *y, std::int64_t incy, + const cl::sycl::vector_class &dependencies) { + return function_tables[libname].dsymv_usm_sycl(queue, upper_lower, n, alpha, a, lda, x, incx, + beta, y, incy, dependencies); +} + +cl::sycl::event syr(char *libname, 
cl::sycl::queue &queue, uplo upper_lower, std::int64_t n, + float alpha, const float *x, std::int64_t incx, float *a, std::int64_t lda, + const cl::sycl::vector_class &dependencies) { + return function_tables[libname].ssyr_usm_sycl(queue, upper_lower, n, alpha, x, incx, a, lda, + dependencies); +} + +cl::sycl::event syr(char *libname, cl::sycl::queue &queue, uplo upper_lower, std::int64_t n, + double alpha, const double *x, std::int64_t incx, double *a, std::int64_t lda, + const cl::sycl::vector_class &dependencies) { + return function_tables[libname].dsyr_usm_sycl(queue, upper_lower, n, alpha, x, incx, a, lda, + dependencies); +} + +cl::sycl::event syr2(char *libname, cl::sycl::queue &queue, uplo upper_lower, std::int64_t n, + float alpha, const float *x, std::int64_t incx, const float *y, + std::int64_t incy, float *a, std::int64_t lda, + const cl::sycl::vector_class &dependencies) { + return function_tables[libname].ssyr2_usm_sycl(queue, upper_lower, n, alpha, x, incx, y, incy, + a, lda, dependencies); +} + +cl::sycl::event syr2(char *libname, cl::sycl::queue &queue, uplo upper_lower, std::int64_t n, + double alpha, const double *x, std::int64_t incx, const double *y, + std::int64_t incy, double *a, std::int64_t lda, + const cl::sycl::vector_class &dependencies) { + return function_tables[libname].dsyr2_usm_sycl(queue, upper_lower, n, alpha, x, incx, y, incy, + a, lda, dependencies); +} + +cl::sycl::event tbmv(char *libname, cl::sycl::queue &queue, uplo upper_lower, transpose trans, + diag unit_diag, std::int64_t n, std::int64_t k, const float *a, + std::int64_t lda, float *x, std::int64_t incx, + const cl::sycl::vector_class &dependencies) { + return function_tables[libname].stbmv_usm_sycl(queue, upper_lower, trans, unit_diag, n, k, a, + lda, x, incx, dependencies); +} + +cl::sycl::event tbmv(char *libname, cl::sycl::queue &queue, uplo upper_lower, transpose trans, + diag unit_diag, std::int64_t n, std::int64_t k, const double *a, + std::int64_t lda, double *x, 
std::int64_t incx, + const cl::sycl::vector_class &dependencies) { + return function_tables[libname].dtbmv_usm_sycl(queue, upper_lower, trans, unit_diag, n, k, a, + lda, x, incx, dependencies); +} + +cl::sycl::event tbmv(char *libname, cl::sycl::queue &queue, uplo upper_lower, transpose trans, + diag unit_diag, std::int64_t n, std::int64_t k, const std::complex *a, + std::int64_t lda, std::complex *x, std::int64_t incx, + const cl::sycl::vector_class &dependencies) { + return function_tables[libname].ctbmv_usm_sycl(queue, upper_lower, trans, unit_diag, n, k, a, + lda, x, incx, dependencies); +} + +cl::sycl::event tbmv(char *libname, cl::sycl::queue &queue, uplo upper_lower, transpose trans, + diag unit_diag, std::int64_t n, std::int64_t k, const std::complex *a, + std::int64_t lda, std::complex *x, std::int64_t incx, + const cl::sycl::vector_class &dependencies) { + return function_tables[libname].ztbmv_usm_sycl(queue, upper_lower, trans, unit_diag, n, k, a, + lda, x, incx, dependencies); +} + +cl::sycl::event tbsv(char *libname, cl::sycl::queue &queue, uplo upper_lower, transpose trans, + diag unit_diag, std::int64_t n, std::int64_t k, const float *a, + std::int64_t lda, float *x, std::int64_t incx, + const cl::sycl::vector_class &dependencies) { + return function_tables[libname].stbsv_usm_sycl(queue, upper_lower, trans, unit_diag, n, k, a, + lda, x, incx, dependencies); +} + +cl::sycl::event tbsv(char *libname, cl::sycl::queue &queue, uplo upper_lower, transpose trans, + diag unit_diag, std::int64_t n, std::int64_t k, const double *a, + std::int64_t lda, double *x, std::int64_t incx, + const cl::sycl::vector_class &dependencies) { + return function_tables[libname].dtbsv_usm_sycl(queue, upper_lower, trans, unit_diag, n, k, a, + lda, x, incx, dependencies); +} + +cl::sycl::event tbsv(char *libname, cl::sycl::queue &queue, uplo upper_lower, transpose trans, + diag unit_diag, std::int64_t n, std::int64_t k, const std::complex *a, + std::int64_t lda, std::complex *x, 
std::int64_t incx, + const cl::sycl::vector_class &dependencies) { + return function_tables[libname].ctbsv_usm_sycl(queue, upper_lower, trans, unit_diag, n, k, a, + lda, x, incx, dependencies); +} + +cl::sycl::event tbsv(char *libname, cl::sycl::queue &queue, uplo upper_lower, transpose trans, + diag unit_diag, std::int64_t n, std::int64_t k, const std::complex *a, + std::int64_t lda, std::complex *x, std::int64_t incx, + const cl::sycl::vector_class &dependencies) { + return function_tables[libname].ztbsv_usm_sycl(queue, upper_lower, trans, unit_diag, n, k, a, + lda, x, incx, dependencies); +} + +cl::sycl::event tpmv(char *libname, cl::sycl::queue &queue, uplo upper_lower, transpose trans, + diag unit_diag, std::int64_t n, const float *a, float *x, std::int64_t incx, + const cl::sycl::vector_class &dependencies) { + return function_tables[libname].stpmv_usm_sycl(queue, upper_lower, trans, unit_diag, n, a, x, + incx, dependencies); +} + +cl::sycl::event tpmv(char *libname, cl::sycl::queue &queue, uplo upper_lower, transpose trans, + diag unit_diag, std::int64_t n, const double *a, double *x, std::int64_t incx, + const cl::sycl::vector_class &dependencies) { + return function_tables[libname].dtpmv_usm_sycl(queue, upper_lower, trans, unit_diag, n, a, x, + incx, dependencies); +} + +cl::sycl::event tpmv(char *libname, cl::sycl::queue &queue, uplo upper_lower, transpose trans, + diag unit_diag, std::int64_t n, const std::complex *a, + std::complex *x, std::int64_t incx, + const cl::sycl::vector_class &dependencies) { + return function_tables[libname].ctpmv_usm_sycl(queue, upper_lower, trans, unit_diag, n, a, x, + incx, dependencies); +} + +cl::sycl::event tpmv(char *libname, cl::sycl::queue &queue, uplo upper_lower, transpose trans, + diag unit_diag, std::int64_t n, const std::complex *a, + std::complex *x, std::int64_t incx, + const cl::sycl::vector_class &dependencies) { + return function_tables[libname].ztpmv_usm_sycl(queue, upper_lower, trans, unit_diag, n, a, x, + 
incx, dependencies); +} + +cl::sycl::event tpsv(char *libname, cl::sycl::queue &queue, uplo upper_lower, transpose trans, + diag unit_diag, std::int64_t n, const float *a, float *x, std::int64_t incx, + const cl::sycl::vector_class &dependencies) { + return function_tables[libname].stpsv_usm_sycl(queue, upper_lower, trans, unit_diag, n, a, x, + incx, dependencies); +} + +cl::sycl::event tpsv(char *libname, cl::sycl::queue &queue, uplo upper_lower, transpose trans, + diag unit_diag, std::int64_t n, const double *a, double *x, std::int64_t incx, + const cl::sycl::vector_class &dependencies) { + return function_tables[libname].dtpsv_usm_sycl(queue, upper_lower, trans, unit_diag, n, a, x, + incx, dependencies); +} + +cl::sycl::event tpsv(char *libname, cl::sycl::queue &queue, uplo upper_lower, transpose trans, + diag unit_diag, std::int64_t n, const std::complex *a, + std::complex *x, std::int64_t incx, + const cl::sycl::vector_class &dependencies) { + return function_tables[libname].ctpsv_usm_sycl(queue, upper_lower, trans, unit_diag, n, a, x, + incx, dependencies); +} + +cl::sycl::event tpsv(char *libname, cl::sycl::queue &queue, uplo upper_lower, transpose trans, + diag unit_diag, std::int64_t n, const std::complex *a, + std::complex *x, std::int64_t incx, + const cl::sycl::vector_class &dependencies) { + return function_tables[libname].ztpsv_usm_sycl(queue, upper_lower, trans, unit_diag, n, a, x, + incx, dependencies); +} + +cl::sycl::event trmv(char *libname, cl::sycl::queue &queue, uplo upper_lower, transpose trans, + diag unit_diag, std::int64_t n, const float *a, std::int64_t lda, float *x, + std::int64_t incx, + const cl::sycl::vector_class &dependencies) { + return function_tables[libname].strmv_usm_sycl(queue, upper_lower, trans, unit_diag, n, a, lda, + x, incx, dependencies); +} + +cl::sycl::event trmv(char *libname, cl::sycl::queue &queue, uplo upper_lower, transpose trans, + diag unit_diag, std::int64_t n, const double *a, std::int64_t lda, double *x, + 
std::int64_t incx, + const cl::sycl::vector_class &dependencies) { + return function_tables[libname].dtrmv_usm_sycl(queue, upper_lower, trans, unit_diag, n, a, lda, + x, incx, dependencies); +} + +cl::sycl::event trmv(char *libname, cl::sycl::queue &queue, uplo upper_lower, transpose trans, + diag unit_diag, std::int64_t n, const std::complex *a, std::int64_t lda, + std::complex *x, std::int64_t incx, + const cl::sycl::vector_class &dependencies) { + return function_tables[libname].ctrmv_usm_sycl(queue, upper_lower, trans, unit_diag, n, a, lda, + x, incx, dependencies); +} + +cl::sycl::event trmv(char *libname, cl::sycl::queue &queue, uplo upper_lower, transpose trans, + diag unit_diag, std::int64_t n, const std::complex *a, + std::int64_t lda, std::complex *x, std::int64_t incx, + const cl::sycl::vector_class &dependencies) { + return function_tables[libname].ztrmv_usm_sycl(queue, upper_lower, trans, unit_diag, n, a, lda, + x, incx, dependencies); +} + +cl::sycl::event trsv(char *libname, cl::sycl::queue &queue, uplo upper_lower, transpose trans, + diag unit_diag, std::int64_t n, const float *a, std::int64_t lda, float *x, + std::int64_t incx, + const cl::sycl::vector_class &dependencies) { + return function_tables[libname].strsv_usm_sycl(queue, upper_lower, trans, unit_diag, n, a, lda, + x, incx, dependencies); +} + +cl::sycl::event trsv(char *libname, cl::sycl::queue &queue, uplo upper_lower, transpose trans, + diag unit_diag, std::int64_t n, const double *a, std::int64_t lda, double *x, + std::int64_t incx, + const cl::sycl::vector_class &dependencies) { + return function_tables[libname].dtrsv_usm_sycl(queue, upper_lower, trans, unit_diag, n, a, lda, + x, incx, dependencies); +} + +cl::sycl::event trsv(char *libname, cl::sycl::queue &queue, uplo upper_lower, transpose trans, + diag unit_diag, std::int64_t n, const std::complex *a, std::int64_t lda, + std::complex *x, std::int64_t incx, + const cl::sycl::vector_class &dependencies) { + return 
function_tables[libname].ctrsv_usm_sycl(queue, upper_lower, trans, unit_diag, n, a, lda, + x, incx, dependencies); +} + +cl::sycl::event trsv(char *libname, cl::sycl::queue &queue, uplo upper_lower, transpose trans, + diag unit_diag, std::int64_t n, const std::complex *a, + std::int64_t lda, std::complex *x, std::int64_t incx, + const cl::sycl::vector_class &dependencies) { + return function_tables[libname].ztrsv_usm_sycl(queue, upper_lower, trans, unit_diag, n, a, lda, + x, incx, dependencies); +} + +cl::sycl::event gemm(char *libname, cl::sycl::queue &queue, transpose transa, transpose transb, + std::int64_t m, std::int64_t n, std::int64_t k, float alpha, const float *a, + std::int64_t lda, const float *b, std::int64_t ldb, float beta, float *c, + std::int64_t ldc, + const cl::sycl::vector_class &dependencies) { + return function_tables[libname].sgemm_usm_sycl(queue, transa, transb, m, n, k, alpha, a, lda, b, + ldb, beta, c, ldc, dependencies); +} + +cl::sycl::event gemm(char *libname, cl::sycl::queue &queue, transpose transa, transpose transb, + std::int64_t m, std::int64_t n, std::int64_t k, double alpha, const double *a, + std::int64_t lda, const double *b, std::int64_t ldb, double beta, double *c, + std::int64_t ldc, + const cl::sycl::vector_class &dependencies) { + return function_tables[libname].dgemm_usm_sycl(queue, transa, transb, m, n, k, alpha, a, lda, b, + ldb, beta, c, ldc, dependencies); +} + +cl::sycl::event gemm(char *libname, cl::sycl::queue &queue, transpose transa, transpose transb, + std::int64_t m, std::int64_t n, std::int64_t k, std::complex alpha, + const std::complex *a, std::int64_t lda, const std::complex *b, + std::int64_t ldb, std::complex beta, std::complex *c, + std::int64_t ldc, + const cl::sycl::vector_class &dependencies) { + return function_tables[libname].cgemm_usm_sycl(queue, transa, transb, m, n, k, alpha, a, lda, b, + ldb, beta, c, ldc, dependencies); +} + +cl::sycl::event gemm(char *libname, cl::sycl::queue &queue, transpose 
transa, transpose transb, + std::int64_t m, std::int64_t n, std::int64_t k, std::complex alpha, + const std::complex *a, std::int64_t lda, const std::complex *b, + std::int64_t ldb, std::complex beta, std::complex *c, + std::int64_t ldc, + const cl::sycl::vector_class &dependencies) { + return function_tables[libname].zgemm_usm_sycl(queue, transa, transb, m, n, k, alpha, a, lda, b, + ldb, beta, c, ldc, dependencies); +} + +cl::sycl::event hemm(char *libname, cl::sycl::queue &queue, side left_right, uplo upper_lower, + std::int64_t m, std::int64_t n, std::complex alpha, + const std::complex *a, std::int64_t lda, const std::complex *b, + std::int64_t ldb, std::complex beta, std::complex *c, + std::int64_t ldc, + const cl::sycl::vector_class &dependencies) { + return function_tables[libname].chemm_usm_sycl(queue, left_right, upper_lower, m, n, alpha, a, + lda, b, ldb, beta, c, ldc, dependencies); +} + +cl::sycl::event hemm(char *libname, cl::sycl::queue &queue, side left_right, uplo upper_lower, + std::int64_t m, std::int64_t n, std::complex alpha, + const std::complex *a, std::int64_t lda, const std::complex *b, + std::int64_t ldb, std::complex beta, std::complex *c, + std::int64_t ldc, + const cl::sycl::vector_class &dependencies) { + return function_tables[libname].zhemm_usm_sycl(queue, left_right, upper_lower, m, n, alpha, a, + lda, b, ldb, beta, c, ldc, dependencies); +} + +cl::sycl::event herk(char *libname, cl::sycl::queue &queue, uplo upper_lower, transpose trans, + std::int64_t n, std::int64_t k, float alpha, const std::complex *a, + std::int64_t lda, float beta, std::complex *c, std::int64_t ldc, + const cl::sycl::vector_class &dependencies) { + return function_tables[libname].cherk_usm_sycl(queue, upper_lower, trans, n, k, alpha, a, lda, + beta, c, ldc, dependencies); +} + +cl::sycl::event herk(char *libname, cl::sycl::queue &queue, uplo upper_lower, transpose trans, + std::int64_t n, std::int64_t k, double alpha, const std::complex *a, + std::int64_t lda, 
double beta, std::complex *c, std::int64_t ldc, + const cl::sycl::vector_class &dependencies) { + return function_tables[libname].zherk_usm_sycl(queue, upper_lower, trans, n, k, alpha, a, lda, + beta, c, ldc, dependencies); +} + +cl::sycl::event her2k(char *libname, cl::sycl::queue &queue, uplo upper_lower, transpose trans, + std::int64_t n, std::int64_t k, std::complex alpha, + const std::complex *a, std::int64_t lda, const std::complex *b, + std::int64_t ldb, float beta, std::complex *c, std::int64_t ldc, + const cl::sycl::vector_class &dependencies) { + return function_tables[libname].cher2k_usm_sycl(queue, upper_lower, trans, n, k, alpha, a, lda, + b, ldb, beta, c, ldc, dependencies); +} + +cl::sycl::event her2k(char *libname, cl::sycl::queue &queue, uplo upper_lower, transpose trans, + std::int64_t n, std::int64_t k, std::complex alpha, + const std::complex *a, std::int64_t lda, + const std::complex *b, std::int64_t ldb, double beta, + std::complex *c, std::int64_t ldc, + const cl::sycl::vector_class &dependencies) { + return function_tables[libname].zher2k_usm_sycl(queue, upper_lower, trans, n, k, alpha, a, lda, + b, ldb, beta, c, ldc, dependencies); +} + +cl::sycl::event symm(char *libname, cl::sycl::queue &queue, side left_right, uplo upper_lower, + std::int64_t m, std::int64_t n, float alpha, const float *a, std::int64_t lda, + const float *b, std::int64_t ldb, float beta, float *c, std::int64_t ldc, + const cl::sycl::vector_class &dependencies) { + return function_tables[libname].ssymm_usm_sycl(queue, left_right, upper_lower, m, n, alpha, a, + lda, b, ldb, beta, c, ldc, dependencies); +} + +cl::sycl::event symm(char *libname, cl::sycl::queue &queue, side left_right, uplo upper_lower, + std::int64_t m, std::int64_t n, double alpha, const double *a, + std::int64_t lda, const double *b, std::int64_t ldb, double beta, double *c, + std::int64_t ldc, + const cl::sycl::vector_class &dependencies) { + return function_tables[libname].dsymm_usm_sycl(queue, 
left_right, upper_lower, m, n, alpha, a, + lda, b, ldb, beta, c, ldc, dependencies); +} + +cl::sycl::event symm(char *libname, cl::sycl::queue &queue, side left_right, uplo upper_lower, + std::int64_t m, std::int64_t n, std::complex alpha, + const std::complex *a, std::int64_t lda, const std::complex *b, + std::int64_t ldb, std::complex beta, std::complex *c, + std::int64_t ldc, + const cl::sycl::vector_class &dependencies) { + return function_tables[libname].csymm_usm_sycl(queue, left_right, upper_lower, m, n, alpha, a, + lda, b, ldb, beta, c, ldc, dependencies); +} + +cl::sycl::event symm(char *libname, cl::sycl::queue &queue, side left_right, uplo upper_lower, + std::int64_t m, std::int64_t n, std::complex alpha, + const std::complex *a, std::int64_t lda, const std::complex *b, + std::int64_t ldb, std::complex beta, std::complex *c, + std::int64_t ldc, + const cl::sycl::vector_class &dependencies) { + return function_tables[libname].zsymm_usm_sycl(queue, left_right, upper_lower, m, n, alpha, a, + lda, b, ldb, beta, c, ldc, dependencies); +} + +cl::sycl::event syrk(char *libname, cl::sycl::queue &queue, uplo upper_lower, transpose trans, + std::int64_t n, std::int64_t k, float alpha, const float *a, std::int64_t lda, + float beta, float *c, std::int64_t ldc, + const cl::sycl::vector_class &dependencies) { + return function_tables[libname].ssyrk_usm_sycl(queue, upper_lower, trans, n, k, alpha, a, lda, + beta, c, ldc, dependencies); +} + +cl::sycl::event syrk(char *libname, cl::sycl::queue &queue, uplo upper_lower, transpose trans, + std::int64_t n, std::int64_t k, double alpha, const double *a, + std::int64_t lda, double beta, double *c, std::int64_t ldc, + const cl::sycl::vector_class &dependencies) { + return function_tables[libname].dsyrk_usm_sycl(queue, upper_lower, trans, n, k, alpha, a, lda, + beta, c, ldc, dependencies); +} + +cl::sycl::event syrk(char *libname, cl::sycl::queue &queue, uplo upper_lower, transpose trans, + std::int64_t n, std::int64_t k, 
std::complex alpha, + const std::complex *a, std::int64_t lda, std::complex beta, + std::complex *c, std::int64_t ldc, + const cl::sycl::vector_class &dependencies) { + return function_tables[libname].csyrk_usm_sycl(queue, upper_lower, trans, n, k, alpha, a, lda, + beta, c, ldc, dependencies); +} + +cl::sycl::event syrk(char *libname, cl::sycl::queue &queue, uplo upper_lower, transpose trans, + std::int64_t n, std::int64_t k, std::complex alpha, + const std::complex *a, std::int64_t lda, std::complex beta, + std::complex *c, std::int64_t ldc, + const cl::sycl::vector_class &dependencies) { + return function_tables[libname].zsyrk_usm_sycl(queue, upper_lower, trans, n, k, alpha, a, lda, + beta, c, ldc, dependencies); +} + +cl::sycl::event syr2k(char *libname, cl::sycl::queue &queue, uplo upper_lower, transpose trans, + std::int64_t n, std::int64_t k, float alpha, const float *a, std::int64_t lda, + const float *b, std::int64_t ldb, float beta, float *c, std::int64_t ldc, + const cl::sycl::vector_class &dependencies) { + return function_tables[libname].ssyr2k_usm_sycl(queue, upper_lower, trans, n, k, alpha, a, lda, + b, ldb, beta, c, ldc, dependencies); +} + +cl::sycl::event syr2k(char *libname, cl::sycl::queue &queue, uplo upper_lower, transpose trans, + std::int64_t n, std::int64_t k, double alpha, const double *a, + std::int64_t lda, const double *b, std::int64_t ldb, double beta, double *c, + std::int64_t ldc, + const cl::sycl::vector_class &dependencies) { + return function_tables[libname].dsyr2k_usm_sycl(queue, upper_lower, trans, n, k, alpha, a, lda, + b, ldb, beta, c, ldc, dependencies); +} + +cl::sycl::event syr2k(char *libname, cl::sycl::queue &queue, uplo upper_lower, transpose trans, + std::int64_t n, std::int64_t k, std::complex alpha, + const std::complex *a, std::int64_t lda, const std::complex *b, + std::int64_t ldb, std::complex beta, std::complex *c, + std::int64_t ldc, + const cl::sycl::vector_class &dependencies) { + return 
function_tables[libname].csyr2k_usm_sycl(queue, upper_lower, trans, n, k, alpha, a, lda, + b, ldb, beta, c, ldc, dependencies); +} + +cl::sycl::event syr2k(char *libname, cl::sycl::queue &queue, uplo upper_lower, transpose trans, + std::int64_t n, std::int64_t k, std::complex alpha, + const std::complex *a, std::int64_t lda, + const std::complex *b, std::int64_t ldb, std::complex beta, + std::complex *c, std::int64_t ldc, + const cl::sycl::vector_class &dependencies) { + return function_tables[libname].zsyr2k_usm_sycl(queue, upper_lower, trans, n, k, alpha, a, lda, + b, ldb, beta, c, ldc, dependencies); +} + +cl::sycl::event trmm(char *libname, cl::sycl::queue &queue, side left_right, uplo upper_lower, + transpose trans, diag unit_diag, std::int64_t m, std::int64_t n, float alpha, + const float *a, std::int64_t lda, float *b, std::int64_t ldb, + const cl::sycl::vector_class &dependencies) { + return function_tables[libname].strmm_usm_sycl(queue, left_right, upper_lower, trans, unit_diag, + m, n, alpha, a, lda, b, ldb, dependencies); +} + +cl::sycl::event trmm(char *libname, cl::sycl::queue &queue, side left_right, uplo upper_lower, + transpose trans, diag unit_diag, std::int64_t m, std::int64_t n, double alpha, + const double *a, std::int64_t lda, double *b, std::int64_t ldb, + const cl::sycl::vector_class &dependencies) { + return function_tables[libname].dtrmm_usm_sycl(queue, left_right, upper_lower, trans, unit_diag, + m, n, alpha, a, lda, b, ldb, dependencies); +} + +cl::sycl::event trmm(char *libname, cl::sycl::queue &queue, side left_right, uplo upper_lower, + transpose trans, diag unit_diag, std::int64_t m, std::int64_t n, + std::complex alpha, const std::complex *a, std::int64_t lda, + std::complex *b, std::int64_t ldb, + const cl::sycl::vector_class &dependencies) { + return function_tables[libname].ctrmm_usm_sycl(queue, left_right, upper_lower, trans, unit_diag, + m, n, alpha, a, lda, b, ldb, dependencies); +} + +cl::sycl::event trmm(char *libname, 
cl::sycl::queue &queue, side left_right, uplo upper_lower, + transpose trans, diag unit_diag, std::int64_t m, std::int64_t n, + std::complex alpha, const std::complex *a, std::int64_t lda, + std::complex *b, std::int64_t ldb, + const cl::sycl::vector_class &dependencies) { + return function_tables[libname].ztrmm_usm_sycl(queue, left_right, upper_lower, trans, unit_diag, + m, n, alpha, a, lda, b, ldb, dependencies); +} + +cl::sycl::event trsm(char *libname, cl::sycl::queue &queue, side left_right, uplo upper_lower, + transpose trans, diag unit_diag, std::int64_t m, std::int64_t n, float alpha, + const float *a, std::int64_t lda, float *b, std::int64_t ldb, + const cl::sycl::vector_class &dependencies) { + return function_tables[libname].strsm_usm_sycl(queue, left_right, upper_lower, trans, unit_diag, + m, n, alpha, a, lda, b, ldb, dependencies); +} + +cl::sycl::event trsm(char *libname, cl::sycl::queue &queue, side left_right, uplo upper_lower, + transpose trans, diag unit_diag, std::int64_t m, std::int64_t n, double alpha, + const double *a, std::int64_t lda, double *b, std::int64_t ldb, + const cl::sycl::vector_class &dependencies) { + return function_tables[libname].dtrsm_usm_sycl(queue, left_right, upper_lower, trans, unit_diag, + m, n, alpha, a, lda, b, ldb, dependencies); +} + +cl::sycl::event trsm(char *libname, cl::sycl::queue &queue, side left_right, uplo upper_lower, + transpose trans, diag unit_diag, std::int64_t m, std::int64_t n, + std::complex alpha, const std::complex *a, std::int64_t lda, + std::complex *b, std::int64_t ldb, + const cl::sycl::vector_class &dependencies) { + return function_tables[libname].ctrsm_usm_sycl(queue, left_right, upper_lower, trans, unit_diag, + m, n, alpha, a, lda, b, ldb, dependencies); +} + +cl::sycl::event trsm(char *libname, cl::sycl::queue &queue, side left_right, uplo upper_lower, + transpose trans, diag unit_diag, std::int64_t m, std::int64_t n, + std::complex alpha, const std::complex *a, std::int64_t lda, + 
std::complex *b, std::int64_t ldb, + const cl::sycl::vector_class &dependencies) { + return function_tables[libname].ztrsm_usm_sycl(queue, left_right, upper_lower, trans, unit_diag, + m, n, alpha, a, lda, b, ldb, dependencies); +} + +cl::sycl::event gemm_batch(char *libname, cl::sycl::queue &queue, transpose *transa, + transpose *transb, std::int64_t *m, std::int64_t *n, std::int64_t *k, + float *alpha, const float **a, std::int64_t *lda, const float **b, + std::int64_t *ldb, float *beta, float **c, std::int64_t *ldc, + std::int64_t group_count, std::int64_t *group_size, + const cl::sycl::vector_class &dependencies) { + return function_tables[libname].sgemm_batch_group_usm_sycl( + queue, transa, transb, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc, group_count, + group_size, dependencies); +} + +cl::sycl::event gemm_batch(char *libname, cl::sycl::queue &queue, transpose *transa, + transpose *transb, std::int64_t *m, std::int64_t *n, std::int64_t *k, + double *alpha, const double **a, std::int64_t *lda, const double **b, + std::int64_t *ldb, double *beta, double **c, std::int64_t *ldc, + std::int64_t group_count, std::int64_t *group_size, + const cl::sycl::vector_class &dependencies) { + return function_tables[libname].dgemm_batch_group_usm_sycl( + queue, transa, transb, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc, group_count, + group_size, dependencies); +} + +cl::sycl::event gemm_batch(char *libname, cl::sycl::queue &queue, transpose *transa, + transpose *transb, std::int64_t *m, std::int64_t *n, std::int64_t *k, + std::complex *alpha, const std::complex **a, + std::int64_t *lda, const std::complex **b, std::int64_t *ldb, + std::complex *beta, std::complex **c, std::int64_t *ldc, + std::int64_t group_count, std::int64_t *group_size, + const cl::sycl::vector_class &dependencies) { + return function_tables[libname].cgemm_batch_group_usm_sycl( + queue, transa, transb, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc, group_count, + group_size, dependencies); +} + 
+cl::sycl::event gemm_batch(char *libname, cl::sycl::queue &queue, transpose *transa, + transpose *transb, std::int64_t *m, std::int64_t *n, std::int64_t *k, + std::complex *alpha, const std::complex **a, + std::int64_t *lda, const std::complex **b, std::int64_t *ldb, + std::complex *beta, std::complex **c, std::int64_t *ldc, + std::int64_t group_count, std::int64_t *group_size, + const cl::sycl::vector_class &dependencies) { + return function_tables[libname].zgemm_batch_group_usm_sycl( + queue, transa, transb, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc, group_count, + group_size, dependencies); +} + +cl::sycl::event gemm_batch(char *libname, cl::sycl::queue &queue, transpose transa, + transpose transb, std::int64_t m, std::int64_t n, std::int64_t k, + float alpha, const float *a, std::int64_t lda, std::int64_t stride_a, + const float *b, std::int64_t ldb, std::int64_t stride_b, float beta, + float *c, std::int64_t ldc, std::int64_t stride_c, + std::int64_t batch_size, + const cl::sycl::vector_class &dependencies) { + return function_tables[libname].sgemm_batch_strided_usm_sycl( + queue, transa, transb, m, n, k, alpha, a, lda, stride_a, b, ldb, stride_b, beta, c, ldc, + stride_c, batch_size, dependencies); +} + +cl::sycl::event gemm_batch(char *libname, cl::sycl::queue &queue, transpose transa, + transpose transb, std::int64_t m, std::int64_t n, std::int64_t k, + double alpha, const double *a, std::int64_t lda, std::int64_t stride_a, + const double *b, std::int64_t ldb, std::int64_t stride_b, double beta, + double *c, std::int64_t ldc, std::int64_t stride_c, + std::int64_t batch_size, + const cl::sycl::vector_class &dependencies) { + return function_tables[libname].dgemm_batch_strided_usm_sycl( + queue, transa, transb, m, n, k, alpha, a, lda, stride_a, b, ldb, stride_b, beta, c, ldc, + stride_c, batch_size, dependencies); +} + +cl::sycl::event gemm_batch(char *libname, cl::sycl::queue &queue, transpose transa, + transpose transb, std::int64_t m, std::int64_t n, 
std::int64_t k, + std::complex alpha, const std::complex *a, + std::int64_t lda, std::int64_t stride_a, const std::complex *b, + std::int64_t ldb, std::int64_t stride_b, std::complex beta, + std::complex *c, std::int64_t ldc, std::int64_t stride_c, + std::int64_t batch_size, + const cl::sycl::vector_class &dependencies) { + return function_tables[libname].cgemm_batch_strided_usm_sycl( + queue, transa, transb, m, n, k, alpha, a, lda, stride_a, b, ldb, stride_b, beta, c, ldc, + stride_c, batch_size, dependencies); +} + +cl::sycl::event gemm_batch(char *libname, cl::sycl::queue &queue, transpose transa, + transpose transb, std::int64_t m, std::int64_t n, std::int64_t k, + std::complex alpha, const std::complex *a, + std::int64_t lda, std::int64_t stride_a, const std::complex *b, + std::int64_t ldb, std::int64_t stride_b, std::complex beta, + std::complex *c, std::int64_t ldc, std::int64_t stride_c, + std::int64_t batch_size, + const cl::sycl::vector_class &dependencies) { + return function_tables[libname].zgemm_batch_strided_usm_sycl( + queue, transa, transb, m, n, k, alpha, a, lda, stride_a, b, ldb, stride_b, beta, c, ldc, + stride_c, batch_size, dependencies); +} + +cl::sycl::event gemmt(char *libname, cl::sycl::queue &queue, uplo upper_lower, transpose transa, + transpose transb, std::int64_t n, std::int64_t k, float alpha, const float *a, + std::int64_t lda, const float *b, std::int64_t ldb, float beta, float *c, + std::int64_t ldc, + const cl::sycl::vector_class &dependencies) { + return function_tables[libname].sgemmt_usm_sycl(queue, upper_lower, transa, transb, n, k, alpha, + a, lda, b, ldb, beta, c, ldc, dependencies); +} + +cl::sycl::event gemmt(char *libname, cl::sycl::queue &queue, uplo upper_lower, transpose transa, + transpose transb, std::int64_t n, std::int64_t k, double alpha, + const double *a, std::int64_t lda, const double *b, std::int64_t ldb, + double beta, double *c, std::int64_t ldc, + const cl::sycl::vector_class &dependencies) { + return 
function_tables[libname].dgemmt_usm_sycl(queue, upper_lower, transa, transb, n, k, alpha, + a, lda, b, ldb, beta, c, ldc, dependencies); +} + +cl::sycl::event gemmt(char *libname, cl::sycl::queue &queue, uplo upper_lower, transpose transa, + transpose transb, std::int64_t n, std::int64_t k, std::complex alpha, + const std::complex *a, std::int64_t lda, const std::complex *b, + std::int64_t ldb, std::complex beta, std::complex *c, + std::int64_t ldc, + const cl::sycl::vector_class &dependencies) { + return function_tables[libname].cgemmt_usm_sycl(queue, upper_lower, transa, transb, n, k, alpha, + a, lda, b, ldb, beta, c, ldc, dependencies); +} + +cl::sycl::event gemmt(char *libname, cl::sycl::queue &queue, uplo upper_lower, transpose transa, + transpose transb, std::int64_t n, std::int64_t k, std::complex alpha, + const std::complex *a, std::int64_t lda, + const std::complex *b, std::int64_t ldb, std::complex beta, + std::complex *c, std::int64_t ldc, + const cl::sycl::vector_class &dependencies) { + return function_tables[libname].zgemmt_usm_sycl(queue, upper_lower, transa, transb, n, k, alpha, + a, lda, b, ldb, beta, c, ldc, dependencies); +} + } /*namespace detail */ } /* namespace blas */ } /* namespace onemkl */ diff --git a/src/blas/function_table.hpp b/src/blas/function_table.hpp index 69d9128b7..b6fa9ad66 100644 --- a/src/blas/function_table.hpp +++ b/src/blas/function_table.hpp @@ -27,6 +27,9 @@ typedef struct { int version; + + // Buffer APIs + void (*scasum_sycl)(cl::sycl::queue &queue, std::int64_t n, cl::sycl::buffer, 1> &x, std::int64_t incx, cl::sycl::buffer &result); @@ -601,44 +604,6 @@ typedef struct { std::int64_t n, std::complex alpha, cl::sycl::buffer, 1> &a, std::int64_t lda, cl::sycl::buffer, 1> &b, std::int64_t ldb); - void (*sgemm_batch_group_sycl)( - cl::sycl::queue &queue, cl::sycl::buffer &transa, - cl::sycl::buffer &transb, cl::sycl::buffer &m, - cl::sycl::buffer &n, cl::sycl::buffer &k, - cl::sycl::buffer &alpha, cl::sycl::buffer &a, - 
cl::sycl::buffer &lda, cl::sycl::buffer &b, - cl::sycl::buffer &ldb, cl::sycl::buffer &beta, - cl::sycl::buffer &c, cl::sycl::buffer &ldc, - std::int64_t group_count, cl::sycl::buffer &group_size); - void (*dgemm_batch_group_sycl)( - cl::sycl::queue &queue, cl::sycl::buffer &transa, - cl::sycl::buffer &transb, cl::sycl::buffer &m, - cl::sycl::buffer &n, cl::sycl::buffer &k, - cl::sycl::buffer &alpha, cl::sycl::buffer &a, - cl::sycl::buffer &lda, cl::sycl::buffer &b, - cl::sycl::buffer &ldb, cl::sycl::buffer &beta, - cl::sycl::buffer &c, cl::sycl::buffer &ldc, - std::int64_t group_count, cl::sycl::buffer &group_size); - void (*cgemm_batch_group_sycl)( - cl::sycl::queue &queue, cl::sycl::buffer &transa, - cl::sycl::buffer &transb, cl::sycl::buffer &m, - cl::sycl::buffer &n, cl::sycl::buffer &k, - cl::sycl::buffer, 1> &alpha, - cl::sycl::buffer, 1> &a, cl::sycl::buffer &lda, - cl::sycl::buffer, 1> &b, cl::sycl::buffer &ldb, - cl::sycl::buffer, 1> &beta, cl::sycl::buffer, 1> &c, - cl::sycl::buffer &ldc, std::int64_t group_count, - cl::sycl::buffer &group_size); - void (*zgemm_batch_group_sycl)( - cl::sycl::queue &queue, cl::sycl::buffer &transa, - cl::sycl::buffer &transb, cl::sycl::buffer &m, - cl::sycl::buffer &n, cl::sycl::buffer &k, - cl::sycl::buffer, 1> &alpha, - cl::sycl::buffer, 1> &a, cl::sycl::buffer &lda, - cl::sycl::buffer, 1> &b, cl::sycl::buffer &ldb, - cl::sycl::buffer, 1> &beta, - cl::sycl::buffer, 1> &c, cl::sycl::buffer &ldc, - std::int64_t group_count, cl::sycl::buffer &group_size); void (*sgemm_batch_strided_sycl)(cl::sycl::queue &queue, onemkl::transpose transa, onemkl::transpose transb, std::int64_t m, std::int64_t n, std::int64_t k, float alpha, cl::sycl::buffer &a, @@ -673,42 +638,6 @@ typedef struct { std::int64_t stride_b, std::complex beta, cl::sycl::buffer, 1> &c, std::int64_t ldc, std::int64_t stride_c, std::int64_t batch_size); - void (*strsm_batch_group_sycl)( - cl::sycl::queue &queue, cl::sycl::buffer &left_right, - cl::sycl::buffer 
&upper_lower, - cl::sycl::buffer &trans, cl::sycl::buffer &unit_diag, - cl::sycl::buffer &m, cl::sycl::buffer &n, - cl::sycl::buffer &alpha, cl::sycl::buffer &a, - cl::sycl::buffer &lda, cl::sycl::buffer &b, - cl::sycl::buffer &ldb, std::int64_t group_count, - cl::sycl::buffer &group_size); - void (*dtrsm_batch_group_sycl)( - cl::sycl::queue &queue, cl::sycl::buffer &left_right, - cl::sycl::buffer &upper_lower, - cl::sycl::buffer &trans, cl::sycl::buffer &unit_diag, - cl::sycl::buffer &m, cl::sycl::buffer &n, - cl::sycl::buffer &alpha, cl::sycl::buffer &a, - cl::sycl::buffer &lda, cl::sycl::buffer &b, - cl::sycl::buffer &ldb, std::int64_t group_count, - cl::sycl::buffer &group_size); - void (*ctrsm_batch_group_sycl)( - cl::sycl::queue &queue, cl::sycl::buffer &left_right, - cl::sycl::buffer &upper_lower, - cl::sycl::buffer &trans, cl::sycl::buffer &unit_diag, - cl::sycl::buffer &m, cl::sycl::buffer &n, - cl::sycl::buffer, 1> &alpha, - cl::sycl::buffer, 1> &a, cl::sycl::buffer &lda, - cl::sycl::buffer, 1> &b, cl::sycl::buffer &ldb, - std::int64_t group_count, cl::sycl::buffer &group_size); - void (*ztrsm_batch_group_sycl)( - cl::sycl::queue &queue, cl::sycl::buffer &left_right, - cl::sycl::buffer &upper_lower, - cl::sycl::buffer &trans, cl::sycl::buffer &unit_diag, - cl::sycl::buffer &m, cl::sycl::buffer &n, - cl::sycl::buffer, 1> &alpha, - cl::sycl::buffer, 1> &a, cl::sycl::buffer &lda, - cl::sycl::buffer, 1> &b, cl::sycl::buffer &ldb, - std::int64_t group_count, cl::sycl::buffer &group_size); void (*strsm_batch_strided_sycl)(cl::sycl::queue &queue, onemkl::side left_right, onemkl::uplo upper_lower, onemkl::transpose trans, onemkl::diag unit_diag, std::int64_t m, std::int64_t n, @@ -801,6 +730,839 @@ typedef struct { half alpha, cl::sycl::buffer &a, std::int64_t lda, cl::sycl::buffer &b, std::int64_t ldb, half beta, cl::sycl::buffer &c, std::int64_t ldc); + + // USM APIs + + cl::sycl::event (*scasum_usm_sycl)(cl::sycl::queue &queue, std::int64_t n, + const 
std::complex *x, std::int64_t incx, + float *result, + const cl::sycl::vector_class &dependencies); + cl::sycl::event (*dzasum_usm_sycl)(cl::sycl::queue &queue, std::int64_t n, + const std::complex *x, std::int64_t incx, + double *result, + const cl::sycl::vector_class &dependencies); + cl::sycl::event (*sasum_usm_sycl)(cl::sycl::queue &queue, std::int64_t n, const float *x, + std::int64_t incx, float *result, + const cl::sycl::vector_class &dependencies); + cl::sycl::event (*dasum_usm_sycl)(cl::sycl::queue &queue, std::int64_t n, const double *x, + std::int64_t incx, double *result, + const cl::sycl::vector_class &dependencies); + cl::sycl::event (*saxpy_usm_sycl)(cl::sycl::queue &queue, std::int64_t n, float alpha, + const float *x, std::int64_t incx, float *y, + std::int64_t incy, + const cl::sycl::vector_class &dependencies); + cl::sycl::event (*daxpy_usm_sycl)(cl::sycl::queue &queue, std::int64_t n, double alpha, + const double *x, std::int64_t incx, double *y, + std::int64_t incy, + const cl::sycl::vector_class &dependencies); + cl::sycl::event (*caxpy_usm_sycl)(cl::sycl::queue &queue, std::int64_t n, + std::complex alpha, const std::complex *x, + std::int64_t incx, std::complex *y, std::int64_t incy, + const cl::sycl::vector_class &dependencies); + cl::sycl::event (*zaxpy_usm_sycl)(cl::sycl::queue &queue, std::int64_t n, + std::complex alpha, const std::complex *x, + std::int64_t incx, std::complex *y, std::int64_t incy, + const cl::sycl::vector_class &dependencies); + + cl::sycl::event (*saxpy_batch_group_usm_sycl)( + cl::sycl::queue &queue, std::int64_t *n, float *alpha, const float **x, std::int64_t *incx, + float **y, std::int64_t *incy, std::int64_t group_count, std::int64_t *group_size, + const cl::sycl::vector_class &dependencies); + + cl::sycl::event (*daxpy_batch_group_usm_sycl)( + cl::sycl::queue &queue, std::int64_t *n, double *alpha, const double **x, + std::int64_t *incx, double **y, std::int64_t *incy, std::int64_t group_count, + std::int64_t 
*group_size, const cl::sycl::vector_class &dependencies); + + cl::sycl::event (*caxpy_batch_group_usm_sycl)( + cl::sycl::queue &queue, std::int64_t *n, std::complex *alpha, + const std::complex **x, std::int64_t *incx, std::complex **y, + std::int64_t *incy, std::int64_t group_count, std::int64_t *group_size, + const cl::sycl::vector_class &dependencies); + + cl::sycl::event (*zaxpy_batch_group_usm_sycl)( + cl::sycl::queue &queue, std::int64_t *n, std::complex *alpha, + const std::complex **x, std::int64_t *incx, std::complex **y, + std::int64_t *incy, std::int64_t group_count, std::int64_t *group_size, + const cl::sycl::vector_class &dependencies); + + cl::sycl::event (*scopy_usm_sycl)(cl::sycl::queue &queue, std::int64_t n, const float *x, + std::int64_t incx, float *y, std::int64_t incy, + const cl::sycl::vector_class &dependencies); + cl::sycl::event (*dcopy_usm_sycl)(cl::sycl::queue &queue, std::int64_t n, const double *x, + std::int64_t incx, double *y, std::int64_t incy, + const cl::sycl::vector_class &dependencies); + cl::sycl::event (*ccopy_usm_sycl)(cl::sycl::queue &queue, std::int64_t n, + const std::complex *x, std::int64_t incx, + std::complex *y, std::int64_t incy, + const cl::sycl::vector_class &dependencies); + cl::sycl::event (*zcopy_usm_sycl)(cl::sycl::queue &queue, std::int64_t n, + const std::complex *x, std::int64_t incx, + std::complex *y, std::int64_t incy, + const cl::sycl::vector_class &dependencies); + cl::sycl::event (*sdot_usm_sycl)(cl::sycl::queue &queue, std::int64_t n, const float *x, + std::int64_t incx, const float *y, std::int64_t incy, + float *result, + const cl::sycl::vector_class &dependencies); + cl::sycl::event (*ddot_usm_sycl)(cl::sycl::queue &queue, std::int64_t n, const double *x, + std::int64_t incx, const double *y, std::int64_t incy, + double *result, + const cl::sycl::vector_class &dependencies); + cl::sycl::event (*dsdot_usm_sycl)(cl::sycl::queue &queue, std::int64_t n, const float *x, + std::int64_t incx, const float 
*y, std::int64_t incy, + double *result, + const cl::sycl::vector_class &dependencies); + cl::sycl::event (*cdotc_usm_sycl)(cl::sycl::queue &queue, std::int64_t n, + const std::complex *x, std::int64_t incx, + const std::complex *y, std::int64_t incy, + std::complex *result, + const cl::sycl::vector_class &dependencies); + cl::sycl::event (*zdotc_usm_sycl)(cl::sycl::queue &queue, std::int64_t n, + const std::complex *x, std::int64_t incx, + const std::complex *y, std::int64_t incy, + std::complex *result, + const cl::sycl::vector_class &dependencies); + cl::sycl::event (*cdotu_usm_sycl)(cl::sycl::queue &queue, std::int64_t n, + const std::complex *x, std::int64_t incx, + const std::complex *y, std::int64_t incy, + std::complex *result, + const cl::sycl::vector_class &dependencies); + cl::sycl::event (*zdotu_usm_sycl)(cl::sycl::queue &queue, std::int64_t n, + const std::complex *x, std::int64_t incx, + const std::complex *y, std::int64_t incy, + std::complex *result, + const cl::sycl::vector_class &dependencies); + cl::sycl::event (*isamin_usm_sycl)(cl::sycl::queue &queue, std::int64_t n, const float *x, + std::int64_t incx, std::int64_t *result, + const cl::sycl::vector_class &dependencies); + cl::sycl::event (*idamin_usm_sycl)(cl::sycl::queue &queue, std::int64_t n, const double *x, + std::int64_t incx, std::int64_t *result, + const cl::sycl::vector_class &dependencies); + cl::sycl::event (*icamin_usm_sycl)(cl::sycl::queue &queue, std::int64_t n, + const std::complex *x, std::int64_t incx, + std::int64_t *result, + const cl::sycl::vector_class &dependencies); + cl::sycl::event (*izamin_usm_sycl)(cl::sycl::queue &queue, std::int64_t n, + const std::complex *x, std::int64_t incx, + std::int64_t *result, + const cl::sycl::vector_class &dependencies); + cl::sycl::event (*isamax_usm_sycl)(cl::sycl::queue &queue, std::int64_t n, const float *x, + std::int64_t incx, std::int64_t *result, + const cl::sycl::vector_class &dependencies); + cl::sycl::event 
(*idamax_usm_sycl)(cl::sycl::queue &queue, std::int64_t n, const double *x, + std::int64_t incx, std::int64_t *result, + const cl::sycl::vector_class &dependencies); + cl::sycl::event (*icamax_usm_sycl)(cl::sycl::queue &queue, std::int64_t n, + const std::complex *x, std::int64_t incx, + std::int64_t *result, + const cl::sycl::vector_class &dependencies); + cl::sycl::event (*izamax_usm_sycl)(cl::sycl::queue &queue, std::int64_t n, + const std::complex *x, std::int64_t incx, + std::int64_t *result, + const cl::sycl::vector_class &dependencies); + cl::sycl::event (*snrm2_usm_sycl)(cl::sycl::queue &queue, std::int64_t n, + const std::complex *x, std::int64_t incx, + float *result, + const cl::sycl::vector_class &dependencies); + cl::sycl::event (*dnrm2_usm_sycl)(cl::sycl::queue &queue, std::int64_t n, + const std::complex *x, std::int64_t incx, + double *result, + const cl::sycl::vector_class &dependencies); + cl::sycl::event (*scnrm2_usm_sycl)(cl::sycl::queue &queue, std::int64_t n, const float *x, + std::int64_t incx, float *result, + const cl::sycl::vector_class &dependencies); + cl::sycl::event (*dznrm2_usm_sycl)(cl::sycl::queue &queue, std::int64_t n, const double *x, + std::int64_t incx, double *result, + const cl::sycl::vector_class &dependencies); + cl::sycl::event (*srot_usm_sycl)(cl::sycl::queue &queue, std::int64_t n, std::complex *x, + std::int64_t incx, std::complex *y, std::int64_t incy, + float c, float s, + const cl::sycl::vector_class &dependencies); + cl::sycl::event (*drot_usm_sycl)(cl::sycl::queue &queue, std::int64_t n, + std::complex *x, std::int64_t incx, + std::complex *y, std::int64_t incy, double c, double s, + const cl::sycl::vector_class &dependencies); + cl::sycl::event (*csrot_usm_sycl)(cl::sycl::queue &queue, std::int64_t n, float *x, + std::int64_t incx, float *y, std::int64_t incy, float c, + float s, + const cl::sycl::vector_class &dependencies); + cl::sycl::event (*zdrot_usm_sycl)(cl::sycl::queue &queue, std::int64_t n, double *x, + 
std::int64_t incx, double *y, std::int64_t incy, double c, + double s, + const cl::sycl::vector_class &dependencies); + cl::sycl::event (*srotg_usm_sycl)(cl::sycl::queue &queue, float *a, float *b, float *c, + float *s, + const cl::sycl::vector_class &dependencies); + cl::sycl::event (*drotg_usm_sycl)(cl::sycl::queue &queue, double *a, double *b, double *c, + double *s, + const cl::sycl::vector_class &dependencies); + cl::sycl::event (*crotg_usm_sycl)(cl::sycl::queue &queue, std::complex *a, + std::complex *b, float *c, std::complex *s, + const cl::sycl::vector_class &dependencies); + cl::sycl::event (*zrotg_usm_sycl)(cl::sycl::queue &queue, std::complex *a, + std::complex *b, double *c, std::complex *s, + const cl::sycl::vector_class &dependencies); + cl::sycl::event (*srotm_usm_sycl)(cl::sycl::queue &queue, std::int64_t n, float *x, + std::int64_t incx, float *y, std::int64_t incy, float *param, + const cl::sycl::vector_class &dependencies); + cl::sycl::event (*drotm_usm_sycl)(cl::sycl::queue &queue, std::int64_t n, double *x, + std::int64_t incx, double *y, std::int64_t incy, + double *param, + const cl::sycl::vector_class &dependencies); + cl::sycl::event (*srotmg_usm_sycl)(cl::sycl::queue &queue, float *d1, float *d2, float *x1, + float y1, float *param, + const cl::sycl::vector_class &dependencies); + cl::sycl::event (*drotmg_usm_sycl)(cl::sycl::queue &queue, double *d1, double *d2, double *x1, + double y1, double *param, + const cl::sycl::vector_class &dependencies); + cl::sycl::event (*sscal_usm_sycl)(cl::sycl::queue &queue, std::int64_t n, float alpha, float *x, + std::int64_t incx, + const cl::sycl::vector_class &dependencies); + cl::sycl::event (*dscal_usm_sycl)(cl::sycl::queue &queue, std::int64_t n, double alpha, + double *x, std::int64_t incx, + const cl::sycl::vector_class &dependencies); + cl::sycl::event (*cscal_usm_sycl)(cl::sycl::queue &queue, std::int64_t n, + std::complex alpha, std::complex *x, + std::int64_t incx, + const 
cl::sycl::vector_class &dependencies); + cl::sycl::event (*csscal_usm_sycl)(cl::sycl::queue &queue, std::int64_t n, + std::complex alpha, std::complex *x, + std::int64_t incx, + const cl::sycl::vector_class &dependencies); + cl::sycl::event (*zscal_usm_sycl)(cl::sycl::queue &queue, std::int64_t n, float alpha, + std::complex *x, std::int64_t incx, + const cl::sycl::vector_class &dependencies); + cl::sycl::event (*zdscal_usm_sycl)(cl::sycl::queue &queue, std::int64_t n, double alpha, + std::complex *x, std::int64_t incx, + const cl::sycl::vector_class &dependencies); + cl::sycl::event (*sdsdot_usm_sycl)(cl::sycl::queue &queue, std::int64_t n, float sb, + const float *x, std::int64_t incx, const float *y, + std::int64_t incy, float *result, + const cl::sycl::vector_class &dependencies); + cl::sycl::event (*sswap_usm_sycl)(cl::sycl::queue &queue, std::int64_t n, float *x, + std::int64_t incx, float *y, std::int64_t incy, + const cl::sycl::vector_class &dependencies); + cl::sycl::event (*dswap_usm_sycl)(cl::sycl::queue &queue, std::int64_t n, double *x, + std::int64_t incx, double *y, std::int64_t incy, + const cl::sycl::vector_class &dependencies); + cl::sycl::event (*cswap_usm_sycl)(cl::sycl::queue &queue, std::int64_t n, + std::complex *x, std::int64_t incx, + std::complex *y, std::int64_t incy, + const cl::sycl::vector_class &dependencies); + cl::sycl::event (*zswap_usm_sycl)(cl::sycl::queue &queue, std::int64_t n, + std::complex *x, std::int64_t incx, + std::complex *y, std::int64_t incy, + const cl::sycl::vector_class &dependencies); + cl::sycl::event (*sgbmv_usm_sycl)(cl::sycl::queue &queue, onemkl::transpose trans, + std::int64_t m, std::int64_t n, std::int64_t kl, + std::int64_t ku, float alpha, const float *a, + std::int64_t lda, const float *x, std::int64_t incx, + float beta, float *y, std::int64_t incy, + const cl::sycl::vector_class &dependencies); + cl::sycl::event (*dgbmv_usm_sycl)(cl::sycl::queue &queue, onemkl::transpose trans, + std::int64_t m, 
std::int64_t n, std::int64_t kl, + std::int64_t ku, double alpha, const double *a, + std::int64_t lda, const double *x, std::int64_t incx, + double beta, double *y, std::int64_t incy, + const cl::sycl::vector_class &dependencies); + cl::sycl::event (*cgbmv_usm_sycl)(cl::sycl::queue &queue, onemkl::transpose trans, + std::int64_t m, std::int64_t n, std::int64_t kl, + std::int64_t ku, std::complex alpha, + const std::complex *a, std::int64_t lda, + const std::complex *x, std::int64_t incx, + std::complex beta, std::complex *y, + std::int64_t incy, + const cl::sycl::vector_class &dependencies); + cl::sycl::event (*zgbmv_usm_sycl)(cl::sycl::queue &queue, onemkl::transpose trans, + std::int64_t m, std::int64_t n, std::int64_t kl, + std::int64_t ku, std::complex alpha, + const std::complex *a, std::int64_t lda, + const std::complex *x, std::int64_t incx, + std::complex beta, std::complex *y, + std::int64_t incy, + const cl::sycl::vector_class &dependencies); + cl::sycl::event (*sgemv_usm_sycl)(cl::sycl::queue &queue, onemkl::transpose trans, + std::int64_t m, std::int64_t n, float alpha, const float *a, + std::int64_t lda, const float *x, std::int64_t incx, + float beta, float *y, std::int64_t incy, + const cl::sycl::vector_class &dependencies); + cl::sycl::event (*dgemv_usm_sycl)(cl::sycl::queue &queue, onemkl::transpose trans, + std::int64_t m, std::int64_t n, double alpha, const double *a, + std::int64_t lda, const double *x, std::int64_t incx, + double beta, double *y, std::int64_t incy, + const cl::sycl::vector_class &dependencies); + cl::sycl::event (*cgemv_usm_sycl)(cl::sycl::queue &queue, onemkl::transpose trans, + std::int64_t m, std::int64_t n, std::complex alpha, + const std::complex *a, std::int64_t lda, + const std::complex *x, std::int64_t incx, + std::complex beta, std::complex *y, + std::int64_t incy, + const cl::sycl::vector_class &dependencies); + cl::sycl::event (*zgemv_usm_sycl)(cl::sycl::queue &queue, onemkl::transpose trans, + std::int64_t m, 
std::int64_t n, std::complex alpha, + const std::complex *a, std::int64_t lda, + const std::complex *x, std::int64_t incx, + std::complex beta, std::complex *y, + std::int64_t incy, + const cl::sycl::vector_class &dependencies); + cl::sycl::event (*sger_usm_sycl)(cl::sycl::queue &queue, std::int64_t m, std::int64_t n, + float alpha, const float *x, std::int64_t incx, const float *y, + std::int64_t incy, float *a, std::int64_t lda, + const cl::sycl::vector_class &dependencies); + cl::sycl::event (*dger_usm_sycl)(cl::sycl::queue &queue, std::int64_t m, std::int64_t n, + double alpha, const double *x, std::int64_t incx, + const double *y, std::int64_t incy, double *a, + std::int64_t lda, + const cl::sycl::vector_class &dependencies); + cl::sycl::event (*cgerc_usm_sycl)(cl::sycl::queue &queue, std::int64_t m, std::int64_t n, + std::complex alpha, const std::complex *x, + std::int64_t incx, const std::complex *y, + std::int64_t incy, std::complex *a, std::int64_t lda, + const cl::sycl::vector_class &dependencies); + cl::sycl::event (*zgerc_usm_sycl)(cl::sycl::queue &queue, std::int64_t m, std::int64_t n, + std::complex alpha, const std::complex *x, + std::int64_t incx, const std::complex *y, + std::int64_t incy, std::complex *a, std::int64_t lda, + const cl::sycl::vector_class &dependencies); + cl::sycl::event (*cgeru_usm_sycl)(cl::sycl::queue &queue, std::int64_t m, std::int64_t n, + std::complex alpha, const std::complex *x, + std::int64_t incx, const std::complex *y, + std::int64_t incy, std::complex *a, std::int64_t lda, + const cl::sycl::vector_class &dependencies); + cl::sycl::event (*zgeru_usm_sycl)(cl::sycl::queue &queue, std::int64_t m, std::int64_t n, + std::complex alpha, const std::complex *x, + std::int64_t incx, const std::complex *y, + std::int64_t incy, std::complex *a, std::int64_t lda, + const cl::sycl::vector_class &dependencies); + cl::sycl::event (*chbmv_usm_sycl)(cl::sycl::queue &queue, onemkl::uplo upper_lower, + std::int64_t n, std::int64_t k, 
std::complex alpha, + const std::complex *a, std::int64_t lda, + const std::complex *x, std::int64_t incx, + std::complex beta, std::complex *y, + std::int64_t incy, + const cl::sycl::vector_class &dependencies); + cl::sycl::event (*zhbmv_usm_sycl)(cl::sycl::queue &queue, onemkl::uplo upper_lower, + std::int64_t n, std::int64_t k, std::complex alpha, + const std::complex *a, std::int64_t lda, + const std::complex *x, std::int64_t incx, + std::complex beta, std::complex *y, + std::int64_t incy, + const cl::sycl::vector_class &dependencies); + cl::sycl::event (*chemv_usm_sycl)(cl::sycl::queue &queue, onemkl::uplo upper_lower, + std::int64_t n, std::complex alpha, + const std::complex *a, std::int64_t lda, + const std::complex *x, std::int64_t incx, + std::complex beta, std::complex *y, + std::int64_t incy, + const cl::sycl::vector_class &dependencies); + cl::sycl::event (*zhemv_usm_sycl)(cl::sycl::queue &queue, onemkl::uplo upper_lower, + std::int64_t n, std::complex alpha, + const std::complex *a, std::int64_t lda, + const std::complex *x, std::int64_t incx, + std::complex beta, std::complex *y, + std::int64_t incy, + const cl::sycl::vector_class &dependencies); + cl::sycl::event (*cher_usm_sycl)(cl::sycl::queue &queue, onemkl::uplo upper_lower, + std::int64_t n, float alpha, const std::complex *x, + std::int64_t incx, std::complex *a, std::int64_t lda, + const cl::sycl::vector_class &dependencies); + cl::sycl::event (*zher_usm_sycl)(cl::sycl::queue &queue, onemkl::uplo upper_lower, + std::int64_t n, double alpha, const std::complex *x, + std::int64_t incx, std::complex *a, std::int64_t lda, + const cl::sycl::vector_class &dependencies); + cl::sycl::event (*cher2_usm_sycl)(cl::sycl::queue &queue, onemkl::uplo upper_lower, + std::int64_t n, std::complex alpha, + const std::complex *x, std::int64_t incx, + const std::complex *y, std::int64_t incy, + std::complex *a, std::int64_t lda, + const cl::sycl::vector_class &dependencies); + cl::sycl::event 
(*zher2_usm_sycl)(cl::sycl::queue &queue, onemkl::uplo upper_lower, + std::int64_t n, std::complex alpha, + const std::complex *x, std::int64_t incx, + const std::complex *y, std::int64_t incy, + std::complex *a, std::int64_t lda, + const cl::sycl::vector_class &dependencies); + cl::sycl::event (*chpmv_usm_sycl)(cl::sycl::queue &queue, onemkl::uplo upper_lower, + std::int64_t n, std::complex alpha, + const std::complex *a, const std::complex *x, + std::int64_t incx, std::complex beta, + std::complex *y, std::int64_t incy, + const cl::sycl::vector_class &dependencies); + cl::sycl::event (*zhpmv_usm_sycl)(cl::sycl::queue &queue, onemkl::uplo upper_lower, + std::int64_t n, std::complex alpha, + const std::complex *a, const std::complex *x, + std::int64_t incx, std::complex beta, + std::complex *y, std::int64_t incy, + const cl::sycl::vector_class &dependencies); + cl::sycl::event (*chpr_usm_sycl)(cl::sycl::queue &queue, onemkl::uplo upper_lower, + std::int64_t n, float alpha, const std::complex *x, + std::int64_t incx, std::complex *a, + const cl::sycl::vector_class &dependencies); + cl::sycl::event (*zhpr_usm_sycl)(cl::sycl::queue &queue, onemkl::uplo upper_lower, + std::int64_t n, double alpha, const std::complex *x, + std::int64_t incx, std::complex *a, + const cl::sycl::vector_class &dependencies); + cl::sycl::event (*chpr2_usm_sycl)(cl::sycl::queue &queue, onemkl::uplo upper_lower, + std::int64_t n, std::complex alpha, + const std::complex *x, std::int64_t incx, + const std::complex *y, std::int64_t incy, + std::complex *a, + const cl::sycl::vector_class &dependencies); + cl::sycl::event (*zhpr2_usm_sycl)(cl::sycl::queue &queue, onemkl::uplo upper_lower, + std::int64_t n, std::complex alpha, + const std::complex *x, std::int64_t incx, + const std::complex *y, std::int64_t incy, + std::complex *a, + const cl::sycl::vector_class &dependencies); + cl::sycl::event (*ssbmv_usm_sycl)(cl::sycl::queue &queue, onemkl::uplo upper_lower, + std::int64_t n, std::int64_t k, 
float alpha, const float *a, + std::int64_t lda, const float *x, std::int64_t incx, + float beta, float *y, std::int64_t incy, + const cl::sycl::vector_class &dependencies); + cl::sycl::event (*dsbmv_usm_sycl)(cl::sycl::queue &queue, onemkl::uplo upper_lower, + std::int64_t n, std::int64_t k, double alpha, const double *a, + std::int64_t lda, const double *x, std::int64_t incx, + double beta, double *y, std::int64_t incy, + const cl::sycl::vector_class &dependencies); + cl::sycl::event (*sspmv_usm_sycl)(cl::sycl::queue &queue, onemkl::uplo upper_lower, + std::int64_t n, float alpha, const float *a, const float *x, + std::int64_t incx, float beta, float *y, std::int64_t incy, + const cl::sycl::vector_class &dependencies); + cl::sycl::event (*dspmv_usm_sycl)(cl::sycl::queue &queue, onemkl::uplo upper_lower, + std::int64_t n, double alpha, const double *a, + const double *x, std::int64_t incx, double beta, double *y, + std::int64_t incy, + const cl::sycl::vector_class &dependencies); + cl::sycl::event (*sspr_usm_sycl)(cl::sycl::queue &queue, onemkl::uplo upper_lower, + std::int64_t n, float alpha, const float *x, std::int64_t incx, + float *a, + const cl::sycl::vector_class &dependencies); + cl::sycl::event (*dspr_usm_sycl)(cl::sycl::queue &queue, onemkl::uplo upper_lower, + std::int64_t n, double alpha, const double *x, + std::int64_t incx, double *a, + const cl::sycl::vector_class &dependencies); + cl::sycl::event (*sspr2_usm_sycl)(cl::sycl::queue &queue, onemkl::uplo upper_lower, + std::int64_t n, float alpha, const float *x, + std::int64_t incx, const float *y, std::int64_t incy, + float *a, + const cl::sycl::vector_class &dependencies); + cl::sycl::event (*dspr2_usm_sycl)(cl::sycl::queue &queue, onemkl::uplo upper_lower, + std::int64_t n, double alpha, const double *x, + std::int64_t incx, const double *y, std::int64_t incy, + double *a, + const cl::sycl::vector_class &dependencies); + cl::sycl::event (*ssymv_usm_sycl)(cl::sycl::queue &queue, onemkl::uplo 
upper_lower, + std::int64_t n, float alpha, const float *a, std::int64_t lda, + const float *x, std::int64_t incx, float beta, float *y, + std::int64_t incy, + const cl::sycl::vector_class &dependencies); + cl::sycl::event (*dsymv_usm_sycl)(cl::sycl::queue &queue, onemkl::uplo upper_lower, + std::int64_t n, double alpha, const double *a, + std::int64_t lda, const double *x, std::int64_t incx, + double beta, double *y, std::int64_t incy, + const cl::sycl::vector_class &dependencies); + cl::sycl::event (*ssyr_usm_sycl)(cl::sycl::queue &queue, onemkl::uplo upper_lower, + std::int64_t n, float alpha, const float *x, std::int64_t incx, + float *a, std::int64_t lda, + const cl::sycl::vector_class &dependencies); + cl::sycl::event (*dsyr_usm_sycl)(cl::sycl::queue &queue, onemkl::uplo upper_lower, + std::int64_t n, double alpha, const double *x, + std::int64_t incx, double *a, std::int64_t lda, + const cl::sycl::vector_class &dependencies); + cl::sycl::event (*ssyr2_usm_sycl)(cl::sycl::queue &queue, onemkl::uplo upper_lower, + std::int64_t n, float alpha, const float *x, + std::int64_t incx, const float *y, std::int64_t incy, + float *a, std::int64_t lda, + const cl::sycl::vector_class &dependencies); + cl::sycl::event (*dsyr2_usm_sycl)(cl::sycl::queue &queue, onemkl::uplo upper_lower, + std::int64_t n, double alpha, const double *x, + std::int64_t incx, const double *y, std::int64_t incy, + double *a, std::int64_t lda, + const cl::sycl::vector_class &dependencies); + cl::sycl::event (*stbmv_usm_sycl)(cl::sycl::queue &queue, onemkl::uplo upper_lower, + onemkl::transpose trans, onemkl::diag unit_diag, + std::int64_t n, std::int64_t k, const float *a, + std::int64_t lda, float *x, std::int64_t incx, + const cl::sycl::vector_class &dependencies); + cl::sycl::event (*dtbmv_usm_sycl)(cl::sycl::queue &queue, onemkl::uplo upper_lower, + onemkl::transpose trans, onemkl::diag unit_diag, + std::int64_t n, std::int64_t k, const double *a, + std::int64_t lda, double *x, std::int64_t 
incx, + const cl::sycl::vector_class &dependencies); + cl::sycl::event (*ctbmv_usm_sycl)(cl::sycl::queue &queue, onemkl::uplo upper_lower, + onemkl::transpose trans, onemkl::diag unit_diag, + std::int64_t n, std::int64_t k, const std::complex *a, + std::int64_t lda, std::complex *x, std::int64_t incx, + const cl::sycl::vector_class &dependencies); + cl::sycl::event (*ztbmv_usm_sycl)(cl::sycl::queue &queue, onemkl::uplo upper_lower, + onemkl::transpose trans, onemkl::diag unit_diag, + std::int64_t n, std::int64_t k, const std::complex *a, + std::int64_t lda, std::complex *x, std::int64_t incx, + const cl::sycl::vector_class &dependencies); + cl::sycl::event (*stbsv_usm_sycl)(cl::sycl::queue &queue, onemkl::uplo upper_lower, + onemkl::transpose trans, onemkl::diag unit_diag, + std::int64_t n, std::int64_t k, const float *a, + std::int64_t lda, float *x, std::int64_t incx, + const cl::sycl::vector_class &dependencies); + cl::sycl::event (*dtbsv_usm_sycl)(cl::sycl::queue &queue, onemkl::uplo upper_lower, + onemkl::transpose trans, onemkl::diag unit_diag, + std::int64_t n, std::int64_t k, const double *a, + std::int64_t lda, double *x, std::int64_t incx, + const cl::sycl::vector_class &dependencies); + cl::sycl::event (*ctbsv_usm_sycl)(cl::sycl::queue &queue, onemkl::uplo upper_lower, + onemkl::transpose trans, onemkl::diag unit_diag, + std::int64_t n, std::int64_t k, const std::complex *a, + std::int64_t lda, std::complex *x, std::int64_t incx, + const cl::sycl::vector_class &dependencies); + cl::sycl::event (*ztbsv_usm_sycl)(cl::sycl::queue &queue, onemkl::uplo upper_lower, + onemkl::transpose trans, onemkl::diag unit_diag, + std::int64_t n, std::int64_t k, const std::complex *a, + std::int64_t lda, std::complex *x, std::int64_t incx, + const cl::sycl::vector_class &dependencies); + cl::sycl::event (*stpmv_usm_sycl)(cl::sycl::queue &queue, onemkl::uplo upper_lower, + onemkl::transpose trans, onemkl::diag unit_diag, + std::int64_t n, const float *a, float *x, 
std::int64_t incx, + const cl::sycl::vector_class &dependencies); + cl::sycl::event (*dtpmv_usm_sycl)(cl::sycl::queue &queue, onemkl::uplo upper_lower, + onemkl::transpose trans, onemkl::diag unit_diag, + std::int64_t n, const double *a, double *x, std::int64_t incx, + const cl::sycl::vector_class &dependencies); + cl::sycl::event (*ctpmv_usm_sycl)(cl::sycl::queue &queue, onemkl::uplo upper_lower, + onemkl::transpose trans, onemkl::diag unit_diag, + std::int64_t n, const std::complex *a, + std::complex *x, std::int64_t incx, + const cl::sycl::vector_class &dependencies); + cl::sycl::event (*ztpmv_usm_sycl)(cl::sycl::queue &queue, onemkl::uplo upper_lower, + onemkl::transpose trans, onemkl::diag unit_diag, + std::int64_t n, const std::complex *a, + std::complex *x, std::int64_t incx, + const cl::sycl::vector_class &dependencies); + cl::sycl::event (*stpsv_usm_sycl)(cl::sycl::queue &queue, onemkl::uplo upper_lower, + onemkl::transpose trans, onemkl::diag unit_diag, + std::int64_t n, const float *a, float *x, std::int64_t incx, + const cl::sycl::vector_class &dependencies); + cl::sycl::event (*dtpsv_usm_sycl)(cl::sycl::queue &queue, onemkl::uplo upper_lower, + onemkl::transpose trans, onemkl::diag unit_diag, + std::int64_t n, const double *a, double *x, std::int64_t incx, + const cl::sycl::vector_class &dependencies); + cl::sycl::event (*ctpsv_usm_sycl)(cl::sycl::queue &queue, onemkl::uplo upper_lower, + onemkl::transpose trans, onemkl::diag unit_diag, + std::int64_t n, const std::complex *a, + std::complex *x, std::int64_t incx, + const cl::sycl::vector_class &dependencies); + cl::sycl::event (*ztpsv_usm_sycl)(cl::sycl::queue &queue, onemkl::uplo upper_lower, + onemkl::transpose trans, onemkl::diag unit_diag, + std::int64_t n, const std::complex *a, + std::complex *x, std::int64_t incx, + const cl::sycl::vector_class &dependencies); + cl::sycl::event (*strmv_usm_sycl)(cl::sycl::queue &queue, onemkl::uplo upper_lower, + onemkl::transpose trans, onemkl::diag unit_diag, 
+ std::int64_t n, const float *a, std::int64_t lda, float *x, + std::int64_t incx, + const cl::sycl::vector_class &dependencies); + cl::sycl::event (*dtrmv_usm_sycl)(cl::sycl::queue &queue, onemkl::uplo upper_lower, + onemkl::transpose trans, onemkl::diag unit_diag, + std::int64_t n, const double *a, std::int64_t lda, double *x, + std::int64_t incx, + const cl::sycl::vector_class &dependencies); + cl::sycl::event (*ctrmv_usm_sycl)(cl::sycl::queue &queue, onemkl::uplo upper_lower, + onemkl::transpose trans, onemkl::diag unit_diag, + std::int64_t n, const std::complex *a, + std::int64_t lda, std::complex *x, std::int64_t incx, + const cl::sycl::vector_class &dependencies); + cl::sycl::event (*ztrmv_usm_sycl)(cl::sycl::queue &queue, onemkl::uplo upper_lower, + onemkl::transpose trans, onemkl::diag unit_diag, + std::int64_t n, const std::complex *a, + std::int64_t lda, std::complex *x, std::int64_t incx, + const cl::sycl::vector_class &dependencies); + cl::sycl::event (*strsv_usm_sycl)(cl::sycl::queue &queue, onemkl::uplo upper_lower, + onemkl::transpose trans, onemkl::diag unit_diag, + std::int64_t n, const float *a, std::int64_t lda, float *x, + std::int64_t incx, + const cl::sycl::vector_class &dependencies); + cl::sycl::event (*dtrsv_usm_sycl)(cl::sycl::queue &queue, onemkl::uplo upper_lower, + onemkl::transpose trans, onemkl::diag unit_diag, + std::int64_t n, const double *a, std::int64_t lda, double *x, + std::int64_t incx, + const cl::sycl::vector_class &dependencies); + cl::sycl::event (*ctrsv_usm_sycl)(cl::sycl::queue &queue, onemkl::uplo upper_lower, + onemkl::transpose trans, onemkl::diag unit_diag, + std::int64_t n, const std::complex *a, + std::int64_t lda, std::complex *x, std::int64_t incx, + const cl::sycl::vector_class &dependencies); + cl::sycl::event (*ztrsv_usm_sycl)(cl::sycl::queue &queue, onemkl::uplo upper_lower, + onemkl::transpose trans, onemkl::diag unit_diag, + std::int64_t n, const std::complex *a, + std::int64_t lda, std::complex *x, 
std::int64_t incx, + const cl::sycl::vector_class &dependencies); + cl::sycl::event (*sgemm_usm_sycl)(cl::sycl::queue &queue, onemkl::transpose transa, + onemkl::transpose transb, std::int64_t m, std::int64_t n, + std::int64_t k, float alpha, const float *a, std::int64_t lda, + const float *b, std::int64_t ldb, float beta, float *c, + std::int64_t ldc, + const cl::sycl::vector_class &dependencies); + cl::sycl::event (*dgemm_usm_sycl)(cl::sycl::queue &queue, onemkl::transpose transa, + onemkl::transpose transb, std::int64_t m, std::int64_t n, + std::int64_t k, double alpha, const double *a, + std::int64_t lda, const double *b, std::int64_t ldb, + double beta, double *c, std::int64_t ldc, + const cl::sycl::vector_class &dependencies); + cl::sycl::event (*cgemm_usm_sycl)(cl::sycl::queue &queue, onemkl::transpose transa, + onemkl::transpose transb, std::int64_t m, std::int64_t n, + std::int64_t k, std::complex alpha, + const std::complex *a, std::int64_t lda, + const std::complex *b, std::int64_t ldb, + std::complex beta, std::complex *c, + std::int64_t ldc, + const cl::sycl::vector_class &dependencies); + cl::sycl::event (*zgemm_usm_sycl)(cl::sycl::queue &queue, onemkl::transpose transa, + onemkl::transpose transb, std::int64_t m, std::int64_t n, + std::int64_t k, std::complex alpha, + const std::complex *a, std::int64_t lda, + const std::complex *b, std::int64_t ldb, + std::complex beta, std::complex *c, + std::int64_t ldc, + const cl::sycl::vector_class &dependencies); + cl::sycl::event (*chemm_usm_sycl)(cl::sycl::queue &queue, onemkl::side left_right, + onemkl::uplo upper_lower, std::int64_t m, std::int64_t n, + std::complex alpha, const std::complex *a, + std::int64_t lda, const std::complex *b, + std::int64_t ldb, std::complex beta, + std::complex *c, std::int64_t ldc, + const cl::sycl::vector_class &dependencies); + cl::sycl::event (*zhemm_usm_sycl)(cl::sycl::queue &queue, onemkl::side left_right, + onemkl::uplo upper_lower, std::int64_t m, std::int64_t n, + 
std::complex alpha, const std::complex *a, + std::int64_t lda, const std::complex *b, + std::int64_t ldb, std::complex beta, + std::complex *c, std::int64_t ldc, + const cl::sycl::vector_class &dependencies); + cl::sycl::event (*cherk_usm_sycl)(cl::sycl::queue &queue, onemkl::uplo upper_lower, + onemkl::transpose trans, std::int64_t n, std::int64_t k, + float alpha, const std::complex *a, std::int64_t lda, + float beta, std::complex *c, std::int64_t ldc, + const cl::sycl::vector_class &dependencies); + cl::sycl::event (*zherk_usm_sycl)(cl::sycl::queue &queue, onemkl::uplo upper_lower, + onemkl::transpose trans, std::int64_t n, std::int64_t k, + double alpha, const std::complex *a, std::int64_t lda, + double beta, std::complex *c, std::int64_t ldc, + const cl::sycl::vector_class &dependencies); + cl::sycl::event (*cher2k_usm_sycl)(cl::sycl::queue &queue, onemkl::uplo upper_lower, + onemkl::transpose trans, std::int64_t n, std::int64_t k, + std::complex alpha, const std::complex *a, + std::int64_t lda, const std::complex *b, + std::int64_t ldb, float beta, std::complex *c, + std::int64_t ldc, + const cl::sycl::vector_class &dependencies); + cl::sycl::event (*zher2k_usm_sycl)(cl::sycl::queue &queue, onemkl::uplo upper_lower, + onemkl::transpose trans, std::int64_t n, std::int64_t k, + std::complex alpha, const std::complex *a, + std::int64_t lda, const std::complex *b, + std::int64_t ldb, double beta, std::complex *c, + std::int64_t ldc, + const cl::sycl::vector_class &dependencies); + cl::sycl::event (*ssymm_usm_sycl)(cl::sycl::queue &queue, onemkl::side left_right, + onemkl::uplo upper_lower, std::int64_t m, std::int64_t n, + float alpha, const float *a, std::int64_t lda, const float *b, + std::int64_t ldb, float beta, float *c, std::int64_t ldc, + const cl::sycl::vector_class &dependencies); + cl::sycl::event (*dsymm_usm_sycl)(cl::sycl::queue &queue, onemkl::side left_right, + onemkl::uplo upper_lower, std::int64_t m, std::int64_t n, + double alpha, const double 
*a, std::int64_t lda, + const double *b, std::int64_t ldb, double beta, double *c, + std::int64_t ldc, + const cl::sycl::vector_class &dependencies); + cl::sycl::event (*csymm_usm_sycl)(cl::sycl::queue &queue, onemkl::side left_right, + onemkl::uplo upper_lower, std::int64_t m, std::int64_t n, + std::complex alpha, const std::complex *a, + std::int64_t lda, const std::complex *b, + std::int64_t ldb, std::complex beta, + std::complex *c, std::int64_t ldc, + const cl::sycl::vector_class &dependencies); + cl::sycl::event (*zsymm_usm_sycl)(cl::sycl::queue &queue, onemkl::side left_right, + onemkl::uplo upper_lower, std::int64_t m, std::int64_t n, + std::complex alpha, const std::complex *a, + std::int64_t lda, const std::complex *b, + std::int64_t ldb, std::complex beta, + std::complex *c, std::int64_t ldc, + const cl::sycl::vector_class &dependencies); + cl::sycl::event (*ssyrk_usm_sycl)(cl::sycl::queue &queue, onemkl::uplo upper_lower, + onemkl::transpose trans, std::int64_t n, std::int64_t k, + float alpha, const float *a, std::int64_t lda, float beta, + float *c, std::int64_t ldc, + const cl::sycl::vector_class &dependencies); + cl::sycl::event (*dsyrk_usm_sycl)(cl::sycl::queue &queue, onemkl::uplo upper_lower, + onemkl::transpose trans, std::int64_t n, std::int64_t k, + double alpha, const double *a, std::int64_t lda, double beta, + double *c, std::int64_t ldc, + const cl::sycl::vector_class &dependencies); + cl::sycl::event (*csyrk_usm_sycl)(cl::sycl::queue &queue, onemkl::uplo upper_lower, + onemkl::transpose trans, std::int64_t n, std::int64_t k, + std::complex alpha, const std::complex *a, + std::int64_t lda, std::complex beta, + std::complex *c, std::int64_t ldc, + const cl::sycl::vector_class &dependencies); + cl::sycl::event (*zsyrk_usm_sycl)(cl::sycl::queue &queue, onemkl::uplo upper_lower, + onemkl::transpose trans, std::int64_t n, std::int64_t k, + std::complex alpha, const std::complex *a, + std::int64_t lda, std::complex beta, + std::complex *c, 
std::int64_t ldc, + const cl::sycl::vector_class &dependencies); + cl::sycl::event (*ssyr2k_usm_sycl)(cl::sycl::queue &queue, onemkl::uplo upper_lower, + onemkl::transpose trans, std::int64_t n, std::int64_t k, + float alpha, const float *a, std::int64_t lda, + const float *b, std::int64_t ldb, float beta, float *c, + std::int64_t ldc, + const cl::sycl::vector_class &dependencies); + cl::sycl::event (*dsyr2k_usm_sycl)(cl::sycl::queue &queue, onemkl::uplo upper_lower, + onemkl::transpose trans, std::int64_t n, std::int64_t k, + double alpha, const double *a, std::int64_t lda, + const double *b, std::int64_t ldb, double beta, double *c, + std::int64_t ldc, + const cl::sycl::vector_class &dependencies); + cl::sycl::event (*csyr2k_usm_sycl)(cl::sycl::queue &queue, onemkl::uplo upper_lower, + onemkl::transpose trans, std::int64_t n, std::int64_t k, + std::complex alpha, const std::complex *a, + std::int64_t lda, const std::complex *b, + std::int64_t ldb, std::complex beta, + std::complex *c, std::int64_t ldc, + const cl::sycl::vector_class &dependencies); + cl::sycl::event (*zsyr2k_usm_sycl)(cl::sycl::queue &queue, onemkl::uplo upper_lower, + onemkl::transpose trans, std::int64_t n, std::int64_t k, + std::complex alpha, const std::complex *a, + std::int64_t lda, const std::complex *b, + std::int64_t ldb, std::complex beta, + std::complex *c, std::int64_t ldc, + const cl::sycl::vector_class &dependencies); + cl::sycl::event (*strmm_usm_sycl)(cl::sycl::queue &queue, onemkl::side left_right, + onemkl::uplo upper_lower, onemkl::transpose trans, + onemkl::diag unit_diag, std::int64_t m, std::int64_t n, + float alpha, const float *a, std::int64_t lda, float *b, + std::int64_t ldb, + const cl::sycl::vector_class &dependencies); + cl::sycl::event (*dtrmm_usm_sycl)(cl::sycl::queue &queue, onemkl::side left_right, + onemkl::uplo upper_lower, onemkl::transpose trans, + onemkl::diag unit_diag, std::int64_t m, std::int64_t n, + double alpha, const double *a, std::int64_t lda, double 
*b, + std::int64_t ldb, + const cl::sycl::vector_class &dependencies); + cl::sycl::event (*ctrmm_usm_sycl)(cl::sycl::queue &queue, onemkl::side left_right, + onemkl::uplo upper_lower, onemkl::transpose trans, + onemkl::diag unit_diag, std::int64_t m, std::int64_t n, + std::complex alpha, const std::complex *a, + std::int64_t lda, std::complex *b, std::int64_t ldb, + const cl::sycl::vector_class &dependencies); + cl::sycl::event (*ztrmm_usm_sycl)(cl::sycl::queue &queue, onemkl::side left_right, + onemkl::uplo upper_lower, onemkl::transpose trans, + onemkl::diag unit_diag, std::int64_t m, std::int64_t n, + std::complex alpha, const std::complex *a, + std::int64_t lda, std::complex *b, std::int64_t ldb, + const cl::sycl::vector_class &dependencies); + cl::sycl::event (*strsm_usm_sycl)(cl::sycl::queue &queue, onemkl::side left_right, + onemkl::uplo upper_lower, onemkl::transpose trans, + onemkl::diag unit_diag, std::int64_t m, std::int64_t n, + float alpha, const float *a, std::int64_t lda, float *b, + std::int64_t ldb, + const cl::sycl::vector_class &dependencies); + cl::sycl::event (*dtrsm_usm_sycl)(cl::sycl::queue &queue, onemkl::side left_right, + onemkl::uplo upper_lower, onemkl::transpose trans, + onemkl::diag unit_diag, std::int64_t m, std::int64_t n, + double alpha, const double *a, std::int64_t lda, double *b, + std::int64_t ldb, + const cl::sycl::vector_class &dependencies); + cl::sycl::event (*ctrsm_usm_sycl)(cl::sycl::queue &queue, onemkl::side left_right, + onemkl::uplo upper_lower, onemkl::transpose trans, + onemkl::diag unit_diag, std::int64_t m, std::int64_t n, + std::complex alpha, const std::complex *a, + std::int64_t lda, std::complex *b, std::int64_t ldb, + const cl::sycl::vector_class &dependencies); + cl::sycl::event (*ztrsm_usm_sycl)(cl::sycl::queue &queue, onemkl::side left_right, + onemkl::uplo upper_lower, onemkl::transpose trans, + onemkl::diag unit_diag, std::int64_t m, std::int64_t n, + std::complex alpha, const std::complex *a, + 
std::int64_t lda, std::complex *b, std::int64_t ldb, + const cl::sycl::vector_class &dependencies); + cl::sycl::event (*sgemm_batch_group_usm_sycl)( + cl::sycl::queue &queue, onemkl::transpose *transa, onemkl::transpose *transb, + std::int64_t *m, std::int64_t *n, std::int64_t *k, float *alpha, const float **a, + std::int64_t *lda, const float **b, std::int64_t *ldb, float *beta, float **c, + std::int64_t *ldc, std::int64_t group_count, std::int64_t *group_size, + const cl::sycl::vector_class &dependencies); + cl::sycl::event (*dgemm_batch_group_usm_sycl)( + cl::sycl::queue &queue, onemkl::transpose *transa, onemkl::transpose *transb, + std::int64_t *m, std::int64_t *n, std::int64_t *k, double *alpha, const double **a, + std::int64_t *lda, const double **b, std::int64_t *ldb, double *beta, double **c, + std::int64_t *ldc, std::int64_t group_count, std::int64_t *group_size, + const cl::sycl::vector_class &dependencies); + cl::sycl::event (*cgemm_batch_group_usm_sycl)( + cl::sycl::queue &queue, onemkl::transpose *transa, onemkl::transpose *transb, + std::int64_t *m, std::int64_t *n, std::int64_t *k, std::complex *alpha, + const std::complex **a, std::int64_t *lda, const std::complex **b, + std::int64_t *ldb, std::complex *beta, std::complex **c, std::int64_t *ldc, + std::int64_t group_count, std::int64_t *group_size, + const cl::sycl::vector_class &dependencies); + cl::sycl::event (*zgemm_batch_group_usm_sycl)( + cl::sycl::queue &queue, onemkl::transpose *transa, onemkl::transpose *transb, + std::int64_t *m, std::int64_t *n, std::int64_t *k, std::complex *alpha, + const std::complex **a, std::int64_t *lda, const std::complex **b, + std::int64_t *ldb, std::complex *beta, std::complex **c, std::int64_t *ldc, + std::int64_t group_count, std::int64_t *group_size, + const cl::sycl::vector_class &dependencies); + cl::sycl::event (*sgemm_batch_strided_usm_sycl)( + cl::sycl::queue &queue, onemkl::transpose transa, onemkl::transpose transb, std::int64_t m, + std::int64_t n, 
std::int64_t k, float alpha, const float *a, std::int64_t lda, + std::int64_t stride_a, const float *b, std::int64_t ldb, std::int64_t stride_b, float beta, + float *c, std::int64_t ldc, std::int64_t stride_c, std::int64_t batch_size, + const cl::sycl::vector_class &dependencies); + cl::sycl::event (*dgemm_batch_strided_usm_sycl)( + cl::sycl::queue &queue, onemkl::transpose transa, onemkl::transpose transb, std::int64_t m, + std::int64_t n, std::int64_t k, double alpha, const double *a, std::int64_t lda, + std::int64_t stride_a, const double *b, std::int64_t ldb, std::int64_t stride_b, + double beta, double *c, std::int64_t ldc, std::int64_t stride_c, std::int64_t batch_size, + const cl::sycl::vector_class &dependencies); + cl::sycl::event (*cgemm_batch_strided_usm_sycl)( + cl::sycl::queue &queue, onemkl::transpose transa, onemkl::transpose transb, std::int64_t m, + std::int64_t n, std::int64_t k, std::complex alpha, const std::complex *a, + std::int64_t lda, std::int64_t stride_a, const std::complex *b, std::int64_t ldb, + std::int64_t stride_b, std::complex beta, std::complex *c, std::int64_t ldc, + std::int64_t stride_c, std::int64_t batch_size, + const cl::sycl::vector_class &dependencies); + cl::sycl::event (*zgemm_batch_strided_usm_sycl)( + cl::sycl::queue &queue, onemkl::transpose transa, onemkl::transpose transb, std::int64_t m, + std::int64_t n, std::int64_t k, std::complex alpha, const std::complex *a, + std::int64_t lda, std::int64_t stride_a, const std::complex *b, std::int64_t ldb, + std::int64_t stride_b, std::complex beta, std::complex *c, std::int64_t ldc, + std::int64_t stride_c, std::int64_t batch_size, + const cl::sycl::vector_class &dependencies); + cl::sycl::event (*sgemmt_usm_sycl)(cl::sycl::queue &queue, onemkl::uplo upper_lower, + onemkl::transpose transa, onemkl::transpose transb, + std::int64_t n, std::int64_t k, float alpha, const float *a, + std::int64_t lda, const float *b, std::int64_t ldb, + float beta, float *c, std::int64_t ldc, + 
const cl::sycl::vector_class &dependencies); + cl::sycl::event (*dgemmt_usm_sycl)(cl::sycl::queue &queue, onemkl::uplo upper_lower, + onemkl::transpose transa, onemkl::transpose transb, + std::int64_t n, std::int64_t k, double alpha, + const double *a, std::int64_t lda, const double *b, + std::int64_t ldb, double beta, double *c, std::int64_t ldc, + const cl::sycl::vector_class &dependencies); + cl::sycl::event (*cgemmt_usm_sycl)(cl::sycl::queue &queue, onemkl::uplo upper_lower, + onemkl::transpose transa, onemkl::transpose transb, + std::int64_t n, std::int64_t k, std::complex alpha, + const std::complex *a, std::int64_t lda, + const std::complex *b, std::int64_t ldb, + std::complex beta, std::complex *c, + std::int64_t ldc, + const cl::sycl::vector_class &dependencies); + cl::sycl::event (*zgemmt_usm_sycl)(cl::sycl::queue &queue, onemkl::uplo upper_lower, + onemkl::transpose transa, onemkl::transpose transb, + std::int64_t n, std::int64_t k, std::complex alpha, + const std::complex *a, std::int64_t lda, + const std::complex *b, std::int64_t ldb, + std::complex beta, std::complex *c, + std::int64_t ldc, + const cl::sycl::vector_class &dependencies); + } function_table_t; #endif //_BLAS_FUNCTION_TABLE_HPP_ diff --git a/src/include/exceptions_helper.hpp b/src/include/exceptions_helper.hpp new file mode 100644 index 000000000..80d1ddca9 --- /dev/null +++ b/src/include/exceptions_helper.hpp @@ -0,0 +1,34 @@ +/******************************************************************************* +* Copyright 2020 Intel Corporation +* +* Licensed under the Apache License, Version 2.0 (the "License"); +* you may not use this file except in compliance with the License. +* You may obtain a copy of the License at +* +* http://www.apache.org/licenses/LICENSE-2.0 +* +* Unless required by applicable law or agreed to in writing, +* software distributed under the License is distributed on an "AS IS" BASIS, +* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+* See the License for the specific language governing permissions
+* and limitations under the License.
+*
+*
+* SPDX-License-Identifier: Apache-2.0
+*******************************************************************************/
+
+#ifndef __EXCEPTIONS_HELPER_HPP
+#define __EXCEPTIONS_HELPER_HPP
+
+#include <stdexcept>
+
+namespace onemkl {
+// Exception reporting that an operation is not yet supported for the current backend.
+class backend_unsupported_exception : public std::runtime_error {
+public:
+    backend_unsupported_exception() : std::runtime_error("Not yet supported for this backend") {}
+};
+
+} // namespace onemkl
+
+#endif // __EXCEPTIONS_HELPER_HPP
diff --git a/tests/unit_tests/CMakeLists.txt b/tests/unit_tests/CMakeLists.txt
index 4d364d1fa..1a3a00f0b 100644
--- a/tests/unit_tests/CMakeLists.txt
+++ b/tests/unit_tests/CMakeLists.txt
@@ -86,10 +86,12 @@ if(BUILD_SHARED_LIBS)
   gtest_discover_tests(test_main_rt
     PROPERTIES BUILD_RPATH ${CMAKE_BINARY_DIR}/lib
     PROPERTIES ENVIRONMENT LD_LIBRARY_PATH=${CMAKE_BINARY_DIR}/lib:$ENV{LD_LIBRARY_PATH}
+    DISCOVERY_TIMEOUT 30
   )
 endif()
 
 gtest_discover_tests(test_main_ct
   PROPERTIES BUILD_RPATH ${CMAKE_BINARY_DIR}/lib
   PROPERTIES ENVIRONMENT LD_LIBRARY_PATH=${CMAKE_BINARY_DIR}/lib:$ENV{LD_LIBRARY_PATH}
+  DISCOVERY_TIMEOUT 30
 )
diff --git a/tests/unit_tests/blas/batch/CMakeLists.txt b/tests/unit_tests/blas/batch/CMakeLists.txt
index 321cea698..f14c50b8a 100644
--- a/tests/unit_tests/blas/batch/CMakeLists.txt
+++ b/tests/unit_tests/blas/batch/CMakeLists.txt
@@ -18,7 +18,7 @@
 #===============================================================================
 
 # Build object from all test sources
-set(BATCH_SOURCES "gemm_batch.cpp" "gemm_batch_stride.cpp" "trsm_batch.cpp" "trsm_batch_stride.cpp")
+set(BATCH_SOURCES "gemm_batch_stride.cpp" "trsm_batch_stride.cpp" "gemm_batch_usm.cpp" "gemm_batch_stride_usm.cpp" "axpy_batch_usm.cpp")
 
 if(BUILD_SHARED_LIBS)
   add_library(blas_batch_rt OBJECT ${BATCH_SOURCES})
diff --git a/tests/unit_tests/blas/batch/axpy_batch_usm.cpp b/tests/unit_tests/blas/batch/axpy_batch_usm.cpp
new file mode
100644
index 000000000..c5d6d5963
--- /dev/null
+++ b/tests/unit_tests/blas/batch/axpy_batch_usm.cpp
@@ -0,0 +1,232 @@
+/*******************************************************************************
+* Copyright 2020 Intel Corporation
+*
+* Licensed under the Apache License, Version 2.0 (the "License");
+* you may not use this file except in compliance with the License.
+* You may obtain a copy of the License at
+*
+* http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing,
+* software distributed under the License is distributed on an "AS IS" BASIS,
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+* See the License for the specific language governing permissions
+* and limitations under the License.
+*
+*
+* SPDX-License-Identifier: Apache-2.0
+*******************************************************************************/
+
+#include <algorithm>
+#include <complex>
+#include <cstdint>
+#include <cstdlib>
+#include <iostream>
+#include <vector>
+
+#include <CL/sycl.hpp>
+#include "allocator_helper.hpp"
+#include "cblas.h"
+#include "onemkl/detail/config.hpp"
+#include "onemkl/onemkl.hpp"
+#include "onemkl_blas_helper.hpp"
+#include "reference_blas_templates.hpp"
+#include "test_common.hpp"
+#include "test_helper.hpp"
+
+#include <gtest/gtest.h>
+
+using namespace cl::sycl;
+using std::vector;
+
+extern std::vector<cl::sycl::device> devices;
+
+namespace {
+
+// Runs one randomized group-API AXPY_BATCH test on `dev` with `group_count` groups, using USM
+// shared allocations, and compares the result against the reference axpy.
+// Returns nonzero when results match, 0 on mismatch or allocation failure, and test_skipped
+// when the call throws onemkl::backend_unsupported_exception.
+template <typename fp>
+int test(const device &dev, int64_t group_count) {
+    // Catch asynchronous exceptions.
+    auto exception_handler = [](exception_list exceptions) {
+        for (std::exception_ptr const &e : exceptions) {
+            try {
+                std::rethrow_exception(e);
+            }
+            catch (exception const &e) {
+                std::cout << "Caught asynchronous SYCL exception during AXPY_BATCH:\n"
+                          << e.what() << std::endl
+                          << "OpenCL status: " << e.get_cl_code() << std::endl;
+            }
+        }
+    };
+
+    queue main_queue(dev, exception_handler);
+    context cxt = main_queue.get_context();
+    event done;
+    std::vector<event> dependencies;
+
+    // Prepare data.
+    int64_t *n = (int64_t *)onemkl::malloc_shared(64, sizeof(int64_t) * group_count, dev, cxt);
+    int64_t *incx = (int64_t *)onemkl::malloc_shared(64, sizeof(int64_t) * group_count, dev, cxt);
+    int64_t *incy = (int64_t *)onemkl::malloc_shared(64, sizeof(int64_t) * group_count, dev, cxt);
+    fp *alpha = (fp *)onemkl::malloc_shared(64, sizeof(fp) * group_count, dev, cxt);
+    int64_t *group_size =
+        (int64_t *)onemkl::malloc_shared(64, sizeof(int64_t) * group_count, dev, cxt);
+
+    // Releases the five per-group parameter arrays. Also used on the allocation-failure path,
+    // where some of these pointers may be NULL.
+    auto free_group_params = [&]() {
+        onemkl::free_shared(n, cxt);
+        onemkl::free_shared(incx, cxt);
+        onemkl::free_shared(incy, cxt);
+        onemkl::free_shared(alpha, cxt);
+        onemkl::free_shared(group_size, cxt);
+    };
+
+    if ((n == NULL) || (incx == NULL) || (incy == NULL) || (alpha == NULL) ||
+        (group_size == NULL)) {
+        std::cout << "Error cannot allocate input arrays\n";
+        free_group_params();
+        return false;
+    }
+
+    int64_t i;
+    int64_t j, idx = 0;
+    int64_t total_size_x, total_size_y;
+    int64_t total_batch_count = 0;
+
+    for (i = 0; i < group_count; i++) {
+        group_size[i] = 1 + std::rand() % 100;
+        n[i] = 1 + std::rand() % 500;
+        incx[i] = ((std::rand() % 2) == 0) ? 1 + std::rand() % 2 : -1 - std::rand() % 2;
+        incy[i] = ((std::rand() % 2) == 0) ? 1 + std::rand() % 2 : -1 - std::rand() % 2;
+        alpha[i] = rand_scalar<fp>();
+        total_batch_count += group_size[i];
+    }
+
+    fp **x_array = (fp **)onemkl::malloc_shared(64, sizeof(fp *) * total_batch_count, dev, cxt);
+    fp **y_array = (fp **)onemkl::malloc_shared(64, sizeof(fp *) * total_batch_count, dev, cxt);
+    fp **y_ref_array = (fp **)onemkl::malloc_shared(64, sizeof(fp *) * total_batch_count, dev, cxt);
+
+    if ((x_array == NULL) || (y_array == NULL) || (y_ref_array == NULL)) {
+        std::cout << "Error cannot allocate arrays of pointers\n";
+        onemkl::free_shared(x_array, cxt);
+        onemkl::free_shared(y_array, cxt);
+        onemkl::free_shared(y_ref_array, cxt);
+        // The per-group arrays were allocated successfully above; release them too.
+        free_group_params();
+        return false;
+    }
+    idx = 0;
+    for (i = 0; i < group_count; i++) {
+        for (j = 0; j < group_size[i]; j++) {
+            total_size_x = (1 + (n[i] - 1) * std::abs(incx[i]));
+            total_size_y = (1 + (n[i] - 1) * std::abs(incy[i]));
+            x_array[idx] = (fp *)onemkl::malloc_shared(64, sizeof(fp) * total_size_x, dev, cxt);
+            y_array[idx] = (fp *)onemkl::malloc_shared(64, sizeof(fp) * total_size_y, dev, cxt);
+            y_ref_array[idx] = (fp *)onemkl::malloc_shared(64, sizeof(fp) * total_size_y, dev, cxt);
+            rand_vector(x_array[idx], n[i], incx[i]);
+            rand_vector(y_array[idx], n[i], incy[i]);
+            copy_vector(y_array[idx], n[i], incy[i], y_ref_array[idx]);
+            idx++;
+        }
+    }
+
+    // Releases every per-batch vector, the three pointer arrays and the per-group arrays.
+    auto free_all = [&]() {
+        for (int64_t b = 0; b < total_batch_count; b++) {
+            onemkl::free_shared(x_array[b], cxt);
+            onemkl::free_shared(y_array[b], cxt);
+            onemkl::free_shared(y_ref_array[b], cxt);
+        }
+        onemkl::free_shared(x_array, cxt);
+        onemkl::free_shared(y_array, cxt);
+        onemkl::free_shared(y_ref_array, cxt);
+        free_group_params();
+    };
+
+    // Call reference AXPY_BATCH.
+    using fp_ref = typename ref_type_info<fp>::type;
+    int n_ref, incx_ref, incy_ref;
+
+    idx = 0;
+    for (i = 0; i < group_count; i++) {
+        for (j = 0; j < group_size[i]; j++) {
+            n_ref = (int)n[i];
+            incx_ref = (int)incx[i];
+            incy_ref = (int)incy[i];
+            ::axpy((const int *)&n_ref, (const fp_ref *)&alpha[i], (const fp_ref *)x_array[idx],
+                   (const int *)&incx_ref, (fp_ref *)y_ref_array[idx], (const int *)&incy_ref);
+            idx++;
+        }
+    }
+
+    // Call DPC++ AXPY_BATCH.
+
+    try {
+#ifdef CALL_RT_API
+        done = onemkl::blas::axpy_batch(main_queue, n, alpha, (const fp **)x_array, incx, y_array,
+                                        incy, group_count, group_size, dependencies);
+        done.wait();
+#else
+        TEST_RUN_CT(main_queue, onemkl::blas::axpy_batch,
+                    (main_queue, n, alpha, (const fp **)x_array, incx, y_array, incy, group_count,
+                     group_size, dependencies));
+        main_queue.wait();
+#endif
+    }
+    catch (exception const &e) {
+        std::cout << "Caught synchronous SYCL exception during AXPY_BATCH:\n"
+                  << e.what() << std::endl
+                  << "OpenCL status: " << e.get_cl_code() << std::endl;
+    }
+
+    catch (const onemkl::backend_unsupported_exception &e) {
+        free_all();
+        return test_skipped;
+    }
+
+    catch (const std::runtime_error &error) {
+        std::cout << "Error raised during execution of AXPY_BATCH:\n" << error.what() << std::endl;
+    }
+
+    // Compare the results of reference implementation and DPC++ implementation.
+    bool good = true;
+    idx = 0;
+    for (i = 0; i < group_count; i++) {
+        for (j = 0; j < group_size[i]; j++) {
+            good = good && check_equal_vector(y_array[idx], y_ref_array[idx], n[i], incy[i], n[i],
+                                              std::cout);
+            idx++;
+        }
+    }
+
+    free_all();
+    return (int)good;
+}
+
+class AxpyBatchUsmTests : public ::testing::TestWithParam<cl::sycl::device> {};
+
+TEST_P(AxpyBatchUsmTests, RealSinglePrecision) {
+    EXPECT_TRUEORSKIP(test<float>(GetParam(), 5));
+}
+
+TEST_P(AxpyBatchUsmTests, RealDoublePrecision) {
+    EXPECT_TRUEORSKIP(test<double>(GetParam(), 5));
+}
+
+TEST_P(AxpyBatchUsmTests, ComplexSinglePrecision) {
+    EXPECT_TRUEORSKIP(test<std::complex<float>>(GetParam(), 5));
+}
+
+TEST_P(AxpyBatchUsmTests, ComplexDoublePrecision) {
+    EXPECT_TRUEORSKIP(test<std::complex<double>>(GetParam(), 5));
+}
+
+INSTANTIATE_TEST_SUITE_P(AxpyBatchUsmTestSuite, AxpyBatchUsmTests, ::testing::ValuesIn(devices),
+                         ::DeviceNamePrint());
+
+} // anonymous namespace
diff --git a/tests/unit_tests/blas/batch/gemm_batch.cpp b/tests/unit_tests/blas/batch/gemm_batch.cpp
deleted file mode 100644
index 9dffd0634..000000000
--- a/tests/unit_tests/blas/batch/gemm_batch.cpp
+++ /dev/null
@@ -1,308 +0,0 @@
-/*******************************************************************************
-* Copyright 2020 Intel Corporation
-*
-* Licensed under the Apache License, Version 2.0 (the "License");
-* you may not use this file except in compliance with the License.
-* You may obtain a copy of the License at -* -* http://www.apache.org/licenses/LICENSE-2.0 -* -* Unless required by applicable law or agreed to in writing, -* software distributed under the License is distributed on an "AS IS" BASIS, -* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -* See the License for the specific language governing permissions -* and limitations under the License. -* -* -* SPDX-License-Identifier: Apache-2.0 -*******************************************************************************/ - -#include -#include -#include -#include -#include -#include - -#include -#include "allocator_helper.hpp" -#include "cblas.h" -#include "onemkl/detail/config.hpp" -#include "onemkl/onemkl.hpp" -#include "onemkl_blas_helper.hpp" -#include "reference_blas_templates.hpp" -#include "test_common.hpp" -#include "test_helper.hpp" - -#include - -using namespace cl::sycl; -using std::vector; - -extern std::vector devices; - -namespace { - -template -bool test(const device &dev, int64_t group_count) { - // Prepare data. 
- int64_t *m = (int64_t *)onemkl::aligned_alloc(64, sizeof(int64_t) * group_count); - int64_t *n = (int64_t *)onemkl::aligned_alloc(64, sizeof(int64_t) * group_count); - int64_t *k = (int64_t *)onemkl::aligned_alloc(64, sizeof(int64_t) * group_count); - int64_t *lda = (int64_t *)onemkl::aligned_alloc(64, sizeof(int64_t) * group_count); - int64_t *ldb = (int64_t *)onemkl::aligned_alloc(64, sizeof(int64_t) * group_count); - int64_t *ldc = (int64_t *)onemkl::aligned_alloc(64, sizeof(int64_t) * group_count); - onemkl::transpose *transa = - (onemkl::transpose *)onemkl::aligned_alloc(64, sizeof(onemkl::transpose) * group_count); - onemkl::transpose *transb = - (onemkl::transpose *)onemkl::aligned_alloc(64, sizeof(onemkl::transpose) * group_count); - fp *alpha = (fp *)onemkl::aligned_alloc(64, sizeof(fp) * group_count); - fp *beta = (fp *)onemkl::aligned_alloc(64, sizeof(fp) * group_count); - int64_t *group_size = (int64_t *)onemkl::aligned_alloc(64, sizeof(int64_t) * group_count); - - if ((m == NULL) || (n == NULL) || (k == NULL) || (lda == NULL) || (ldb == NULL) || - (ldc == NULL) || (transa == NULL) || (transb == NULL) || (alpha == NULL) || - (beta == NULL) || (group_size == NULL)) { - std::cout << "Error cannot allocate input arrays\n"; - onemkl::aligned_free(m); - onemkl::aligned_free(n); - onemkl::aligned_free(k); - onemkl::aligned_free(lda); - onemkl::aligned_free(ldb); - onemkl::aligned_free(ldc); - onemkl::aligned_free(transa); - onemkl::aligned_free(transb); - onemkl::aligned_free(alpha); - onemkl::aligned_free(beta); - onemkl::aligned_free(group_size); - return false; - } - - int64_t i, tmp; - int64_t j, idx = 0, max_k = 0; - int64_t total_size_a = 0, total_size_b = 0, total_size_c = 0, total_batch_count = 0; - int64_t size_a = 0, size_b = 0, size_c = 0; - int64_t off_a = 0, off_b = 0, off_c = 0; - - for (i = 0; i < group_count; i++) { - group_size[i] = 1 + std::rand() % 20; - m[i] = 1 + std::rand() % 500; - n[i] = 1 + std::rand() % 500; - k[i] = 1 + 
std::rand() % 500; - lda[i] = std::max(m[i], k[i]); - ldb[i] = std::max(n[i], k[i]); - ldc[i] = std::max(m[i], n[i]); - alpha[i] = rand_scalar(); - beta[i] = rand_scalar(); - if ((std::is_same::value) || (std::is_same::value)) { - transa[i] = (onemkl::transpose)(std::rand() % 2); - transb[i] = (onemkl::transpose)(std::rand() % 2); - } - else { - tmp = std::rand() % 3; - if (tmp == 2) - transa[i] = onemkl::transpose::conjtrans; - else - transa[i] = (onemkl::transpose)tmp; - tmp = std::rand() % 3; - if (tmp == 2) - transb[i] = onemkl::transpose::conjtrans; - else - transb[i] = (onemkl::transpose)tmp; - } - total_size_a += - lda[i] * group_size[i] * ((transa[i] == onemkl::transpose::nontrans) ? k[i] : m[i]); - total_size_b += - ldb[i] * group_size[i] * ((transb[i] == onemkl::transpose::nontrans) ? n[i] : k[i]); - total_size_c += ldc[i] * n[i] * group_size[i]; - total_batch_count += group_size[i]; - } - - fp **a_array = (fp **)onemkl::aligned_alloc(64, sizeof(fp *) * total_batch_count); - fp **b_array = (fp **)onemkl::aligned_alloc(64, sizeof(fp *) * total_batch_count); - fp **c_array = (fp **)onemkl::aligned_alloc(64, sizeof(fp *) * total_batch_count); - fp **c_ref_array = (fp **)onemkl::aligned_alloc(64, sizeof(fp *) * total_batch_count); - - if ((a_array == NULL) || (b_array == NULL) || (c_array == NULL) || (c_ref_array == NULL)) { - std::cout << "Error cannot allocate arrays of pointers\n"; - onemkl::aligned_free(a_array); - onemkl::aligned_free(b_array); - onemkl::aligned_free(c_array); - onemkl::aligned_free(c_ref_array); - return false; - } - - vector> A(total_size_a), B(total_size_b), C(total_size_c), - C_ref(total_size_c); - - for (i = 0; i < group_count; i++) { - max_k = std::max(max_k, k[i]); - size_a = (transa[i] == onemkl::transpose::nontrans) ? k[i] * lda[i] : m[i] * lda[i]; - size_b = (transb[i] == onemkl::transpose::nontrans) ? 
n[i] * ldb[i] : k[i] * ldb[i]; - size_c = n[i] * ldc[i]; - for (j = 0; j < group_size[i]; j++) { - a_array[idx] = A.data() + off_a; - b_array[idx] = B.data() + off_b; - c_array[idx] = C.data() + off_c; - c_ref_array[idx] = C_ref.data() + off_c; - rand_matrix(a_array[idx], transa[i], m[i], k[i], lda[i]); - rand_matrix(b_array[idx], transb[i], k[i], n[i], ldb[i]); - rand_matrix(c_array[idx], onemkl::transpose::nontrans, m[i], n[i], ldc[i]); - off_a += size_a; - off_b += size_b; - off_c += size_c; - idx++; - } - } - C_ref = C; - - // Call reference GEMM_BATCH. - using fp_ref = typename ref_type_info::type; - int m_ref, n_ref, k_ref, lda_ref, ldb_ref, ldc_ref, group_size_ref; - CBLAS_TRANSPOSE transa_ref, transb_ref; - idx = 0; - for (i = 0; i < group_count; i++) { - m_ref = (int)m[i]; - n_ref = (int)n[i]; - k_ref = (int)k[i]; - lda_ref = (int)lda[i]; - ldb_ref = (int)ldb[i]; - ldc_ref = (int)ldc[i]; - group_size_ref = (int)group_size[i]; - transa_ref = convert_to_cblas_trans(transa[i]); - transb_ref = convert_to_cblas_trans(transb[i]); - for (j = 0; j < group_size_ref; j++) { - ::gemm(transa_ref, transb_ref, (const int *)&m_ref, (const int *)&n_ref, - (const int *)&k_ref, (const fp_ref *)&alpha[i], (const fp_ref *)a_array[idx], - (const int *)&lda_ref, (const fp_ref *)b_array[idx], (const int *)&ldb_ref, - (const fp_ref *)&beta[i], (fp_ref *)c_ref_array[idx], (const int *)&ldc_ref); - idx++; - } - } - - // Call DPC++ GEMM_BATCH. - - // Catch asynchronous exceptions. 
- auto exception_handler = [](exception_list exceptions) { - for (std::exception_ptr const &e : exceptions) { - try { - std::rethrow_exception(e); - } - catch (exception const &e) { - std::cout << "Caught asynchronous SYCL exception during GEMM_BATCH:\n" - << e.what() << std::endl - << "OpenCL status: " << e.get_cl_code() << std::endl; - } - } - }; - - queue main_queue(dev, exception_handler); - - buffer transa_buffer(transa, range<1>(group_count)); - buffer transb_buffer(transb, range<1>(group_count)); - buffer m_buffer(m, range<1>(group_count)); - buffer n_buffer(n, range<1>(group_count)); - buffer k_buffer(k, range<1>(group_count)); - buffer lda_buffer(lda, range<1>(group_count)); - buffer ldb_buffer(ldb, range<1>(group_count)); - buffer ldc_buffer(ldc, range<1>(group_count)); - buffer group_size_buffer(group_size, range<1>(group_count)); - buffer alpha_buffer(alpha, range<1>(group_count)); - buffer beta_buffer(beta, range<1>(group_count)); - buffer A_buffer(A.data(), range<1>(A.size())); - buffer B_buffer(B.data(), range<1>(B.size())); - buffer C_buffer(C.data(), range<1>(C.size())); - - try { -#ifdef CALL_RT_API - onemkl::blas::gemm_batch(main_queue, transa_buffer, transb_buffer, m_buffer, n_buffer, - k_buffer, alpha_buffer, A_buffer, lda_buffer, B_buffer, ldb_buffer, - beta_buffer, C_buffer, ldc_buffer, group_count, group_size_buffer); -#else - TEST_RUN_CT(main_queue, onemkl::blas::gemm_batch, - (main_queue, transa_buffer, transb_buffer, m_buffer, n_buffer, k_buffer, - alpha_buffer, A_buffer, lda_buffer, B_buffer, ldb_buffer, beta_buffer, - C_buffer, ldc_buffer, group_count, group_size_buffer)); -#endif - } - catch (exception const &e) { - std::cout << "Caught synchronous SYCL exception during GEMM_BATCH:\n" - << e.what() << std::endl - << "OpenCL status: " << e.get_cl_code() << std::endl; - } - - catch (const std::runtime_error &error) { - std::cout << "Error raised during execution of GEMM_BATCH:\n" << error.what() << std::endl; -#ifdef 
ENABLE_CUBLAS_BACKEND - // GEMM_BATCH currently not supported with CUBLAS backend. - std::string error_msg(error.what()); - if (error_msg.compare("Not implemented for cublas") == 0) { - onemkl::aligned_free(m); - onemkl::aligned_free(n); - onemkl::aligned_free(k); - onemkl::aligned_free(lda); - onemkl::aligned_free(ldb); - onemkl::aligned_free(ldc); - onemkl::aligned_free(transa); - onemkl::aligned_free(transb); - onemkl::aligned_free(alpha); - onemkl::aligned_free(beta); - onemkl::aligned_free(group_size); - onemkl::aligned_free(a_array); - onemkl::aligned_free(b_array); - onemkl::aligned_free(c_array); - onemkl::aligned_free(c_ref_array); - return true; - } -#endif - } - - // Compare the results of reference implementation and DPC++ implementation. - bool good; - { - auto C_accessor = C_buffer.template get_access(); - good = check_equal_matrix(C_accessor, C_ref, total_size_c, 1, total_size_c, 10 * max_k, - std::cout); - } - - onemkl::aligned_free(m); - onemkl::aligned_free(n); - onemkl::aligned_free(k); - onemkl::aligned_free(lda); - onemkl::aligned_free(ldb); - onemkl::aligned_free(ldc); - onemkl::aligned_free(transa); - onemkl::aligned_free(transb); - onemkl::aligned_free(alpha); - onemkl::aligned_free(beta); - onemkl::aligned_free(group_size); - onemkl::aligned_free(a_array); - onemkl::aligned_free(b_array); - onemkl::aligned_free(c_array); - onemkl::aligned_free(c_ref_array); - - return good; -} - -class GemmBatchTests : public ::testing::TestWithParam {}; - -TEST_P(GemmBatchTests, RealSinglePrecision) { - EXPECT_TRUE(test(GetParam(), 5)); -} - -TEST_P(GemmBatchTests, RealDoublePrecision) { - EXPECT_TRUE(test(GetParam(), 5)); -} - -TEST_P(GemmBatchTests, ComplexSinglePrecision) { - EXPECT_TRUE(test>(GetParam(), 5)); -} - -TEST_P(GemmBatchTests, ComplexDoublePrecision) { - EXPECT_TRUE(test>(GetParam(), 5)); -} - -INSTANTIATE_TEST_SUITE_P(GemmBatchTestSuite, GemmBatchTests, ::testing::ValuesIn(devices), - ::DeviceNamePrint()); - -} // anonymous namespace diff 
--git a/tests/unit_tests/blas/batch/gemm_batch_stride.cpp b/tests/unit_tests/blas/batch/gemm_batch_stride.cpp index a7f4e32d8..2e2bc14b1 100644 --- a/tests/unit_tests/blas/batch/gemm_batch_stride.cpp +++ b/tests/unit_tests/blas/batch/gemm_batch_stride.cpp @@ -44,7 +44,7 @@ extern std::vector devices; namespace { template -bool test(const device &dev, int64_t batch_size) { +int test(const device &dev, int64_t batch_size) { // Prepare data. int64_t m, n, k; int64_t lda, ldb, ldc; @@ -153,16 +153,13 @@ bool test(const device &dev, int64_t batch_size) { << "OpenCL status: " << e.get_cl_code() << std::endl; } + catch (const onemkl::backend_unsupported_exception &e) { + return test_skipped; + } + catch (const std::runtime_error &error) { std::cout << "Error raised during execution of GEMM_BATCH_STRIDE:\n" << error.what() << std::endl; -#ifdef ENABLE_CUBLAS_BACKEND - // GEMM_BATCH_STRIDE currently not supported with CUBLAS backend - std::string error_msg(error.what()); - if (error_msg.compare("Not implemented for cublas") == 0) { - return true; - } -#endif } // Compare the results of reference implementation and DPC++ implementation. 
@@ -173,25 +170,25 @@ bool test(const device &dev, int64_t batch_size) { stride_c * batch_size, 10 * k, std::cout); } - return good; + return (int)good; } class GemmBatchStrideTests : public ::testing::TestWithParam {}; TEST_P(GemmBatchStrideTests, RealSinglePrecision) { - EXPECT_TRUE(test(GetParam(), 5)); + EXPECT_TRUEORSKIP(test(GetParam(), 5)); } TEST_P(GemmBatchStrideTests, RealDoublePrecision) { - EXPECT_TRUE(test(GetParam(), 5)); + EXPECT_TRUEORSKIP(test(GetParam(), 5)); } TEST_P(GemmBatchStrideTests, ComplexSinglePrecision) { - EXPECT_TRUE(test>(GetParam(), 5)); + EXPECT_TRUEORSKIP(test>(GetParam(), 5)); } TEST_P(GemmBatchStrideTests, ComplexDoublePrecision) { - EXPECT_TRUE(test>(GetParam(), 5)); + EXPECT_TRUEORSKIP(test>(GetParam(), 5)); } INSTANTIATE_TEST_SUITE_P(GemmBatchStrideTestSuite, GemmBatchStrideTests, diff --git a/tests/unit_tests/blas/batch/gemm_batch_stride_usm.cpp b/tests/unit_tests/blas/batch/gemm_batch_stride_usm.cpp new file mode 100644 index 000000000..602b883e1 --- /dev/null +++ b/tests/unit_tests/blas/batch/gemm_batch_stride_usm.cpp @@ -0,0 +1,228 @@ +/******************************************************************************* +* Copyright 2020 Intel Corporation +* +* Licensed under the Apache License, Version 2.0 (the "License"); +* you may not use this file except in compliance with the License. +* You may obtain a copy of the License at +* +* http://www.apache.org/licenses/LICENSE-2.0 +* +* Unless required by applicable law or agreed to in writing, +* software distributed under the License is distributed on an "AS IS" BASIS, +* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +* See the License for the specific language governing permissions +* and limitations under the License. 
+* +* +* SPDX-License-Identifier: Apache-2.0 +*******************************************************************************/ + +#include +#include +#include +#include +#include +#include + +#include +#include "allocator_helper.hpp" +#include "cblas.h" +#include "onemkl/detail/config.hpp" +#include "onemkl/onemkl.hpp" +#include "onemkl_blas_helper.hpp" +#include "reference_blas_templates.hpp" +#include "test_common.hpp" +#include "test_helper.hpp" + +#include + +using namespace cl::sycl; +using std::vector; + +extern std::vector devices; + +namespace { + +template +int test(const device &dev, int64_t batch_size) { + // Catch asynchronous exceptions. + auto exception_handler = [](exception_list exceptions) { + for (std::exception_ptr const &e : exceptions) { + try { + std::rethrow_exception(e); + } + catch (exception const &e) { + std::cout << "Caught asynchronous SYCL exception during GEMM_BATCH_STRIDE:\n" + << e.what() << std::endl + << "OpenCL status: " << e.get_cl_code() << std::endl; + } + } + }; + + queue main_queue(dev, exception_handler); + context cxt = main_queue.get_context(); + event done; + std::vector dependencies; + + // Prepare data. 
+ int64_t m, n, k; + int64_t lda, ldb, ldc; + onemkl::transpose transa, transb; + fp alpha, beta; + + int64_t i, tmp; + + batch_size = 1 + std::rand() % 20; + m = 1 + std::rand() % 500; + n = 1 + std::rand() % 500; + k = 1 + std::rand() % 500; + lda = std::max(m, k); + ldb = std::max(n, k); + ldc = std::max(m, n); + alpha = rand_scalar(); + beta = rand_scalar(); + if ((std::is_same::value) || (std::is_same::value)) { + transa = (onemkl::transpose)(std::rand() % 2); + transb = (onemkl::transpose)(std::rand() % 2); + } + else { + tmp = std::rand() % 3; + if (tmp == 2) + transa = onemkl::transpose::conjtrans; + else + transa = (onemkl::transpose)tmp; + tmp = std::rand() % 3; + if (tmp == 2) + transb = onemkl::transpose::conjtrans; + else + transb = (onemkl::transpose)tmp; + } + + int64_t stride_a, stride_b, stride_c; + + stride_a = (transa == onemkl::transpose::nontrans) ? lda * k : lda * m; + stride_b = (transb == onemkl::transpose::nontrans) ? ldb * n : ldb * k; + stride_c = ldc * n; + + auto ua = usm_allocator(cxt, dev); + vector A(ua), B(ua), C(ua), C_ref(ua); + + A.resize(stride_a * batch_size); + B.resize(stride_b * batch_size); + C.resize(stride_c * batch_size); + C_ref.resize(stride_c * batch_size); + + fp **a_array = (fp **)onemkl::malloc_shared(64, sizeof(fp *) * batch_size, dev, cxt); + fp **b_array = (fp **)onemkl::malloc_shared(64, sizeof(fp *) * batch_size, dev, cxt); + fp **c_array = (fp **)onemkl::malloc_shared(64, sizeof(fp *) * batch_size, dev, cxt); + fp **c_ref_array = (fp **)onemkl::malloc_shared(64, sizeof(fp *) * batch_size, dev, cxt); + + if ((a_array == NULL) || (b_array == NULL) || (c_array == NULL) || (c_ref_array == NULL)) { + std::cout << "Error cannot allocate arrays of pointers\n"; + onemkl::free_shared(a_array, cxt); + onemkl::free_shared(b_array, cxt); + onemkl::free_shared(c_array, cxt); + onemkl::free_shared(c_ref_array, cxt); + return false; + } + + for (i = 0; i < batch_size; i++) { + a_array[i] = &A[i * stride_a]; + b_array[i] = 
&B[i * stride_b]; + c_array[i] = &C[i * stride_c]; + c_ref_array[i] = &C_ref[i * stride_c]; + } + + rand_matrix(A, onemkl::transpose::nontrans, stride_a * batch_size, 1, stride_a * batch_size); + rand_matrix(B, onemkl::transpose::nontrans, stride_b * batch_size, 1, stride_b * batch_size); + rand_matrix(C, onemkl::transpose::nontrans, stride_c * batch_size, 1, stride_c * batch_size); + copy_matrix(C, onemkl::transpose::nontrans, stride_c * batch_size, 1, stride_c * batch_size, + C_ref); + + // Call reference GEMM_BATCH_STRIDE. + using fp_ref = typename ref_type_info::type; + int m_ref = (int)m; + int n_ref = (int)n; + int k_ref = (int)k; + int lda_ref = (int)lda; + int ldb_ref = (int)ldb; + int ldc_ref = (int)ldc; + int batch_size_ref = (int)batch_size; + for (i = 0; i < batch_size_ref; i++) { + ::gemm(convert_to_cblas_trans(transa), convert_to_cblas_trans(transb), (const int *)&m_ref, + (const int *)&n_ref, (const int *)&k_ref, (const fp_ref *)&alpha, + (const fp_ref *)a_array[i], (const int *)&lda_ref, (const fp_ref *)b_array[i], + (const int *)&ldb_ref, (const fp_ref *)&beta, (fp_ref *)c_ref_array[i], + (const int *)&ldc_ref); + } + + // Call DPC++ GEMM_BATCH_STRIDE. 
+ + try { +#ifdef CALL_RT_API + done = onemkl::blas::gemm_batch(main_queue, transa, transb, m, n, k, alpha, &A[0], lda, + stride_a, &B[0], ldb, stride_b, beta, &C[0], ldc, stride_c, + batch_size, dependencies); + done.wait(); +#else + TEST_RUN_CT(main_queue, onemkl::blas::gemm_batch, + (main_queue, transa, transb, m, n, k, alpha, &A[0], lda, stride_a, &B[0], ldb, + stride_b, beta, &C[0], ldc, stride_c, batch_size, dependencies)); + main_queue.wait(); +#endif + } + catch (exception const &e) { + std::cout << "Caught synchronous SYCL exception during GEMM_BATCH_STRIDE:\n" + << e.what() << std::endl + << "OpenCL status: " << e.get_cl_code() << std::endl; + } + + catch (const onemkl::backend_unsupported_exception &e) { + onemkl::free_shared(a_array, cxt); + onemkl::free_shared(b_array, cxt); + onemkl::free_shared(c_array, cxt); + onemkl::free_shared(c_ref_array, cxt); + return test_skipped; + } + + catch (const std::runtime_error &error) { + std::cout << "Error raised during execution of GEMM_BATCH_STRIDE:\n" + << error.what() << std::endl; + } + + // Compare the results of reference implementation and DPC++ implementation. 
+ bool good = true; + { + good = check_equal_matrix(C, C_ref, stride_c * batch_size, 1, stride_c * batch_size, 10 * k, + std::cout); + } + + onemkl::free_shared(a_array, cxt); + onemkl::free_shared(b_array, cxt); + onemkl::free_shared(c_array, cxt); + onemkl::free_shared(c_ref_array, cxt); + return (int)good; +} + +class GemmBatchStrideUsmTests : public ::testing::TestWithParam {}; + +TEST_P(GemmBatchStrideUsmTests, RealSinglePrecision) { + EXPECT_TRUEORSKIP(test(GetParam(), 5)); +} + +TEST_P(GemmBatchStrideUsmTests, RealDoublePrecision) { + EXPECT_TRUEORSKIP(test(GetParam(), 5)); +} + +TEST_P(GemmBatchStrideUsmTests, ComplexSinglePrecision) { + EXPECT_TRUEORSKIP(test>(GetParam(), 5)); +} + +TEST_P(GemmBatchStrideUsmTests, ComplexDoublePrecision) { + EXPECT_TRUEORSKIP(test>(GetParam(), 5)); +} + +INSTANTIATE_TEST_SUITE_P(GemmBatchStrideUsmTestSuite, GemmBatchStrideUsmTests, + ::testing::ValuesIn(devices), ::DeviceNamePrint()); + +} // anonymous namespace diff --git a/tests/unit_tests/blas/batch/gemm_batch_usm.cpp b/tests/unit_tests/blas/batch/gemm_batch_usm.cpp new file mode 100644 index 000000000..6d9918798 --- /dev/null +++ b/tests/unit_tests/blas/batch/gemm_batch_usm.cpp @@ -0,0 +1,370 @@ +/******************************************************************************* +* Copyright 2020 Intel Corporation +* +* Licensed under the Apache License, Version 2.0 (the "License"); +* you may not use this file except in compliance with the License. +* You may obtain a copy of the License at +* +* http://www.apache.org/licenses/LICENSE-2.0 +* +* Unless required by applicable law or agreed to in writing, +* software distributed under the License is distributed on an "AS IS" BASIS, +* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +* See the License for the specific language governing permissions +* and limitations under the License. 
+* +* +* SPDX-License-Identifier: Apache-2.0 +*******************************************************************************/ + +#include +#include +#include +#include +#include +#include + +#include +#include "allocator_helper.hpp" +#include "cblas.h" +#include "onemkl/detail/config.hpp" +#include "onemkl/onemkl.hpp" +#include "onemkl_blas_helper.hpp" +#include "reference_blas_templates.hpp" +#include "test_common.hpp" +#include "test_helper.hpp" + +#include + +using namespace cl::sycl; +using std::vector; + +extern std::vector devices; + +namespace { + +template +int test(const device &dev, int64_t group_count) { + // Catch asynchronous exceptions. + auto exception_handler = [](exception_list exceptions) { + for (std::exception_ptr const &e : exceptions) { + try { + std::rethrow_exception(e); + } + catch (exception const &e) { + std::cout << "Caught asynchronous SYCL exception during GEMM_BATCH:\n" + << e.what() << std::endl + << "OpenCL status: " << e.get_cl_code() << std::endl; + } + } + }; + + queue main_queue(dev, exception_handler); + context cxt = main_queue.get_context(); + event done; + std::vector dependencies; + + // Prepare data. 
+ int64_t *m = (int64_t *)onemkl::malloc_shared(64, sizeof(int64_t) * group_count, dev, cxt); + int64_t *n = (int64_t *)onemkl::malloc_shared(64, sizeof(int64_t) * group_count, dev, cxt); + int64_t *k = (int64_t *)onemkl::malloc_shared(64, sizeof(int64_t) * group_count, dev, cxt); + int64_t *lda = (int64_t *)onemkl::malloc_shared(64, sizeof(int64_t) * group_count, dev, cxt); + int64_t *ldb = (int64_t *)onemkl::malloc_shared(64, sizeof(int64_t) * group_count, dev, cxt); + int64_t *ldc = (int64_t *)onemkl::malloc_shared(64, sizeof(int64_t) * group_count, dev, cxt); + onemkl::transpose *transa = (onemkl::transpose *)onemkl::malloc_shared( + 64, sizeof(onemkl::transpose) * group_count, dev, cxt); + onemkl::transpose *transb = (onemkl::transpose *)onemkl::malloc_shared( + 64, sizeof(onemkl::transpose) * group_count, dev, cxt); + fp *alpha = (fp *)onemkl::malloc_shared(64, sizeof(fp) * group_count, dev, cxt); + fp *beta = (fp *)onemkl::malloc_shared(64, sizeof(fp) * group_count, dev, cxt); + int64_t *group_size = + (int64_t *)onemkl::malloc_shared(64, sizeof(int64_t) * group_count, dev, cxt); + + if ((m == NULL) || (n == NULL) || (k == NULL) || (lda == NULL) || (ldb == NULL) || + (ldc == NULL) || (transa == NULL) || (transb == NULL) || (alpha == NULL) || + (beta == NULL) || (group_size == NULL)) { + std::cout << "Error cannot allocate input arrays\n"; + onemkl::free_shared(m, cxt); + onemkl::free_shared(n, cxt); + onemkl::free_shared(k, cxt); + onemkl::free_shared(lda, cxt); + onemkl::free_shared(ldb, cxt); + onemkl::free_shared(ldc, cxt); + onemkl::free_shared(transa, cxt); + onemkl::free_shared(transb, cxt); + onemkl::free_shared(alpha, cxt); + onemkl::free_shared(beta, cxt); + onemkl::free_shared(group_size, cxt); + return false; + } + + int64_t i, tmp; + int64_t j, idx = 0; + int64_t total_batch_count = 0; + + int64_t *total_size_a = (int64_t *)onemkl::aligned_alloc(64, sizeof(int64_t) * group_count); + int64_t *total_size_b = (int64_t *)onemkl::aligned_alloc(64, 
sizeof(int64_t) * group_count); + int64_t *total_size_c = (int64_t *)onemkl::aligned_alloc(64, sizeof(int64_t) * group_count); + if ((total_size_a == NULL) || (total_size_b == NULL) || (total_size_c == NULL)) { + std::cout << "Error cannot allocate input arrays\n"; + onemkl::aligned_free(total_size_a); + onemkl::aligned_free(total_size_b); + onemkl::aligned_free(total_size_c); + return false; + } + + for (i = 0; i < group_count; i++) { + group_size[i] = 1 + std::rand() % 20; + m[i] = 1 + std::rand() % 500; + n[i] = 1 + std::rand() % 500; + k[i] = 1 + std::rand() % 500; + lda[i] = std::max(m[i], k[i]); + ldb[i] = std::max(n[i], k[i]); + ldc[i] = std::max(m[i], n[i]); + alpha[i] = rand_scalar(); + beta[i] = rand_scalar(); + if ((std::is_same::value) || (std::is_same::value)) { + transa[i] = (onemkl::transpose)(std::rand() % 2); + transb[i] = (onemkl::transpose)(std::rand() % 2); + } + else { + tmp = std::rand() % 3; + if (tmp == 2) + transa[i] = onemkl::transpose::conjtrans; + else + transa[i] = (onemkl::transpose)tmp; + tmp = std::rand() % 3; + if (tmp == 2) + transb[i] = onemkl::transpose::conjtrans; + else + transb[i] = (onemkl::transpose)tmp; + } + total_size_a[i] = lda[i] * ((transa[i] == onemkl::transpose::nontrans) ? k[i] : m[i]); + total_size_b[i] = ldb[i] * ((transb[i] == onemkl::transpose::nontrans) ? 
n[i] : k[i]); + total_size_c[i] = ldc[i] * n[i]; + total_batch_count += group_size[i]; + } + + fp **a_array = (fp **)onemkl::malloc_shared(64, sizeof(fp *) * total_batch_count, dev, cxt); + fp **b_array = (fp **)onemkl::malloc_shared(64, sizeof(fp *) * total_batch_count, dev, cxt); + fp **c_array = (fp **)onemkl::malloc_shared(64, sizeof(fp *) * total_batch_count, dev, cxt); + fp **c_ref_array = (fp **)onemkl::malloc_shared(64, sizeof(fp *) * total_batch_count, dev, cxt); + + if ((a_array == NULL) || (b_array == NULL) || (c_array == NULL) || (c_ref_array == NULL)) { + std::cout << "Error cannot allocate arrays of pointers\n"; + onemkl::free_shared(a_array, cxt); + onemkl::free_shared(b_array, cxt); + onemkl::free_shared(c_array, cxt); + onemkl::free_shared(c_ref_array, cxt); + return false; + } + idx = 0; + for (i = 0; i < group_count; i++) { + for (j = 0; j < group_size[i]; j++) { + a_array[idx] = (fp *)onemkl::malloc_shared(64, sizeof(fp) * total_size_a[i], dev, cxt); + b_array[idx] = (fp *)onemkl::malloc_shared(64, sizeof(fp) * total_size_b[i], dev, cxt); + c_array[idx] = (fp *)onemkl::malloc_shared(64, sizeof(fp) * total_size_c[i], dev, cxt); + c_ref_array[idx] = + (fp *)onemkl::malloc_shared(64, sizeof(fp) * total_size_c[i], dev, cxt); + + rand_matrix(a_array[idx], transa[i], m[i], k[i], lda[i]); + rand_matrix(b_array[idx], transb[i], k[i], n[i], ldb[i]); + rand_matrix(c_array[idx], onemkl::transpose::nontrans, m[i], n[i], ldc[i]); + copy_matrix(c_array[idx], onemkl::transpose::nontrans, m[i], n[i], ldc[i], + c_ref_array[idx]); + idx++; + } + } + + // Call reference GEMM_BATCH. 
+ using fp_ref = typename ref_type_info::type; + int *m_ref = (int *)onemkl::aligned_alloc(64, sizeof(int) * group_count); + int *n_ref = (int *)onemkl::aligned_alloc(64, sizeof(int) * group_count); + int *k_ref = (int *)onemkl::aligned_alloc(64, sizeof(int) * group_count); + int *lda_ref = (int *)onemkl::aligned_alloc(64, sizeof(int) * group_count); + int *ldb_ref = (int *)onemkl::aligned_alloc(64, sizeof(int) * group_count); + int *ldc_ref = (int *)onemkl::aligned_alloc(64, sizeof(int) * group_count); + int *group_size_ref = (int *)onemkl::aligned_alloc(64, sizeof(int) * group_count); + + CBLAS_TRANSPOSE *transa_ref = + (CBLAS_TRANSPOSE *)onemkl::aligned_alloc(64, sizeof(CBLAS_TRANSPOSE) * group_count); + CBLAS_TRANSPOSE *transb_ref = + (CBLAS_TRANSPOSE *)onemkl::aligned_alloc(64, sizeof(CBLAS_TRANSPOSE) * group_count); + + if ((m_ref == NULL) || (n_ref == NULL) || (k_ref == NULL) || (lda_ref == NULL) || + (ldb_ref == NULL) || (ldc_ref == NULL) || (transa_ref == NULL) || (transb_ref == NULL) || + (group_size_ref == NULL)) { + std::cout << "Error cannot allocate input arrays\n"; + onemkl::aligned_free(m_ref); + onemkl::aligned_free(n_ref); + onemkl::aligned_free(k_ref); + onemkl::aligned_free(lda_ref); + onemkl::aligned_free(ldb_ref); + onemkl::aligned_free(ldc_ref); + onemkl::aligned_free(transa_ref); + onemkl::aligned_free(transb_ref); + onemkl::aligned_free(group_size_ref); + return false; + } + idx = 0; + for (i = 0; i < group_count; i++) { + transa_ref[i] = convert_to_cblas_trans(transa[i]); + transb_ref[i] = convert_to_cblas_trans(transb[i]); + m_ref[i] = (int)m[i]; + n_ref[i] = (int)n[i]; + k_ref[i] = (int)k[i]; + lda_ref[i] = (int)lda[i]; + ldb_ref[i] = (int)ldb[i]; + ldc_ref[i] = (int)ldc[i]; + group_size_ref[i] = (int)group_size[i]; + for (j = 0; j < group_size_ref[i]; j++) { + ::gemm(transa_ref[i], transb_ref[i], (const int *)&m_ref[i], (const int *)&n_ref[i], + (const int *)&k_ref[i], (const fp_ref *)&alpha[i], (const fp_ref *)a_array[idx], + (const 
int *)&lda_ref[i], (const fp_ref *)b_array[idx], (const int *)&ldb_ref[i], + (const fp_ref *)&beta[i], (fp_ref *)c_ref_array[idx], (const int *)&ldc_ref[i]); + idx++; + } + } + + // Call DPC++ GEMM_BATCH. + + try { +#ifdef CALL_RT_API + done = onemkl::blas::gemm_batch(main_queue, transa, transb, m, n, k, alpha, + (const fp **)a_array, lda, (const fp **)b_array, ldb, beta, + c_array, ldc, group_count, group_size, dependencies); + done.wait(); +#else + TEST_RUN_CT( + main_queue, onemkl::blas::gemm_batch, + (main_queue, transa, transb, m, n, k, alpha, (const fp **)a_array, lda, + (const fp **)b_array, ldb, beta, c_array, ldc, group_count, group_size, dependencies)); + main_queue.wait(); +#endif + } + catch (exception const &e) { + std::cout << "Caught synchronous SYCL exception during GEMM_BATCH:\n" + << e.what() << std::endl + << "OpenCL status: " << e.get_cl_code() << std::endl; + } + + catch (const onemkl::backend_unsupported_exception &e) { + onemkl::aligned_free(total_size_a); + onemkl::aligned_free(total_size_b); + onemkl::aligned_free(total_size_c); + onemkl::aligned_free(m_ref); + onemkl::aligned_free(n_ref); + onemkl::aligned_free(k_ref); + onemkl::aligned_free(lda_ref); + onemkl::aligned_free(ldb_ref); + onemkl::aligned_free(ldc_ref); + onemkl::aligned_free(transa_ref); + onemkl::aligned_free(transb_ref); + onemkl::aligned_free(group_size_ref); + idx = 0; + for (i = 0; i < group_count; i++) { + for (j = 0; j < group_size[i]; j++) { + onemkl::free_shared(a_array[idx], cxt); + onemkl::free_shared(b_array[idx], cxt); + onemkl::free_shared(c_array[idx], cxt); + onemkl::free_shared(c_ref_array[idx], cxt); + idx++; + } + } + onemkl::free_shared(m, cxt); + onemkl::free_shared(n, cxt); + onemkl::free_shared(k, cxt); + onemkl::free_shared(lda, cxt); + onemkl::free_shared(ldb, cxt); + onemkl::free_shared(ldc, cxt); + onemkl::free_shared(transa, cxt); + onemkl::free_shared(transb, cxt); + onemkl::free_shared(alpha, cxt); + onemkl::free_shared(beta, cxt); + 
onemkl::free_shared(group_size, cxt); + onemkl::free_shared(a_array, cxt); + onemkl::free_shared(b_array, cxt); + onemkl::free_shared(c_array, cxt); + onemkl::free_shared(c_ref_array, cxt); + return test_skipped; + } + + catch (const std::runtime_error &error) { + std::cout << "Error raised during execution of GEMM_BATCH:\n" << error.what() << std::endl; + } + + // Compare the results of reference implementation and DPC++ implementation. + bool good = true; + { + idx = 0; + for (i = 0; i < group_count; i++) { + for (j = 0; j < group_size[i]; j++) { + good = good && check_equal_matrix(c_array[idx], c_ref_array[idx], m[i], n[i], + ldc[i], 10 * k[i], std::cout); + idx++; + } + } + } + + onemkl::aligned_free(total_size_a); + onemkl::aligned_free(total_size_b); + onemkl::aligned_free(total_size_c); + onemkl::aligned_free(m_ref); + onemkl::aligned_free(n_ref); + onemkl::aligned_free(k_ref); + onemkl::aligned_free(lda_ref); + onemkl::aligned_free(ldb_ref); + onemkl::aligned_free(ldc_ref); + onemkl::aligned_free(transa_ref); + onemkl::aligned_free(transb_ref); + onemkl::aligned_free(group_size_ref); + idx = 0; + for (i = 0; i < group_count; i++) { + for (j = 0; j < group_size[i]; j++) { + onemkl::free_shared(a_array[idx], cxt); + onemkl::free_shared(b_array[idx], cxt); + onemkl::free_shared(c_array[idx], cxt); + onemkl::free_shared(c_ref_array[idx], cxt); + idx++; + } + } + onemkl::free_shared(m, cxt); + onemkl::free_shared(n, cxt); + onemkl::free_shared(k, cxt); + onemkl::free_shared(lda, cxt); + onemkl::free_shared(ldb, cxt); + onemkl::free_shared(ldc, cxt); + onemkl::free_shared(transa, cxt); + onemkl::free_shared(transb, cxt); + onemkl::free_shared(alpha, cxt); + onemkl::free_shared(beta, cxt); + onemkl::free_shared(group_size, cxt); + onemkl::free_shared(a_array, cxt); + onemkl::free_shared(b_array, cxt); + onemkl::free_shared(c_array, cxt); + onemkl::free_shared(c_ref_array, cxt); + return (int)good; +} + +class GemmBatchUsmTests : public ::testing::TestWithParam {}; 
+ +TEST_P(GemmBatchUsmTests, RealSinglePrecision) { + EXPECT_TRUEORSKIP(test(GetParam(), 5)); +} + +TEST_P(GemmBatchUsmTests, RealDoublePrecision) { + EXPECT_TRUEORSKIP(test(GetParam(), 5)); +} + +TEST_P(GemmBatchUsmTests, ComplexSinglePrecision) { + EXPECT_TRUEORSKIP(test>(GetParam(), 5)); +} + +TEST_P(GemmBatchUsmTests, ComplexDoublePrecision) { + EXPECT_TRUEORSKIP(test>(GetParam(), 5)); +} + +INSTANTIATE_TEST_SUITE_P(GemmBatchUsmTestSuite, GemmBatchUsmTests, ::testing::ValuesIn(devices), + ::DeviceNamePrint()); + +} // anonymous namespace diff --git a/tests/unit_tests/blas/batch/trsm_batch.cpp b/tests/unit_tests/blas/batch/trsm_batch.cpp deleted file mode 100644 index 758303833..000000000 --- a/tests/unit_tests/blas/batch/trsm_batch.cpp +++ /dev/null @@ -1,297 +0,0 @@ -/******************************************************************************* -* Copyright 2020 Intel Corporation -* -* Licensed under the Apache License, Version 2.0 (the "License"); -* you may not use this file except in compliance with the License. -* You may obtain a copy of the License at -* -* http://www.apache.org/licenses/LICENSE-2.0 -* -* Unless required by applicable law or agreed to in writing, -* software distributed under the License is distributed on an "AS IS" BASIS, -* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -* See the License for the specific language governing permissions -* and limitations under the License. 
-* -* -* SPDX-License-Identifier: Apache-2.0 -*******************************************************************************/ - -#include -#include -#include -#include -#include -#include - -#include -#include "allocator_helper.hpp" -#include "cblas.h" -#include "onemkl/detail/config.hpp" -#include "onemkl/onemkl.hpp" -#include "onemkl_blas_helper.hpp" -#include "reference_blas_templates.hpp" -#include "test_common.hpp" -#include "test_helper.hpp" - -#include - -using namespace cl::sycl; -using std::vector; - -extern std::vector devices; - -namespace { - -template -bool test(const device &dev, int64_t group_count) { - // Prepare data. - int64_t *m = (int64_t *)onemkl::aligned_alloc(64, sizeof(int64_t) * group_count); - int64_t *n = (int64_t *)onemkl::aligned_alloc(64, sizeof(int64_t) * group_count); - int64_t *lda = (int64_t *)onemkl::aligned_alloc(64, sizeof(int64_t) * group_count); - int64_t *ldb = (int64_t *)onemkl::aligned_alloc(64, sizeof(int64_t) * group_count); - onemkl::transpose *trans = - (onemkl::transpose *)onemkl::aligned_alloc(64, sizeof(onemkl::transpose) * group_count); - onemkl::side *left_right = - (onemkl::side *)onemkl::aligned_alloc(64, sizeof(onemkl::side) * group_count); - onemkl::uplo *upper_lower = - (onemkl::uplo *)onemkl::aligned_alloc(64, sizeof(onemkl::uplo) * group_count); - onemkl::diag *unit_diag = - (onemkl::diag *)onemkl::aligned_alloc(64, sizeof(onemkl::diag) * group_count); - fp *alpha = (fp *)onemkl::aligned_alloc(64, sizeof(fp) * group_count); - int64_t *group_size = (int64_t *)onemkl::aligned_alloc(64, sizeof(int64_t) * group_count); - - if ((m == NULL) || (n == NULL) || (lda == NULL) || (ldb == NULL) || (trans == NULL) || - (left_right == NULL) || (upper_lower == NULL) || (unit_diag == NULL) || (alpha == NULL) || - (group_size == NULL)) { - std::cout << "Error cannot allocate input arrays\n"; - onemkl::aligned_free(m); - onemkl::aligned_free(n); - onemkl::aligned_free(lda); - onemkl::aligned_free(ldb); - 
onemkl::aligned_free(trans); - onemkl::aligned_free(left_right); - onemkl::aligned_free(upper_lower); - onemkl::aligned_free(unit_diag); - onemkl::aligned_free(alpha); - onemkl::aligned_free(group_size); - return false; - } - - int64_t i, tmp; - int64_t j, idx = 0, max = 0; - int64_t total_size_a = 0, total_size_b = 0, total_batch_count = 0; - int64_t size_a = 0, size_b = 0; - int64_t off_a = 0, off_b = 0; - - for (i = 0; i < group_count; i++) { - group_size[i] = 1 + std::rand() % 20; - m[i] = 1 + std::rand() % 50; - n[i] = 1 + std::rand() % 50; - lda[i] = std::max(m[i], n[i]); - ldb[i] = std::max(n[i], m[i]); - alpha[i] = rand_scalar(); - if ((std::is_same::value) || (std::is_same::value)) { - trans[i] = (onemkl::transpose)(std::rand() % 2); - } - else { - tmp = std::rand() % 3; - if (tmp == 2) - trans[i] = onemkl::transpose::conjtrans; - else - trans[i] = (onemkl::transpose)tmp; - } - left_right[i] = (onemkl::side)(std::rand() % 2); - upper_lower[i] = (onemkl::uplo)(std::rand() % 2); - unit_diag[i] = (onemkl::diag)(std::rand() % 2); - } - - for (i = 0; i < group_count; i++) { - total_size_a += - lda[i] * group_size[i] * ((left_right[i] == onemkl::side::left) ? 
m[i] : n[i]); - total_size_b += ldb[i] * group_size[i] * n[i]; - total_batch_count += group_size[i]; - } - - fp **a_array = (fp **)onemkl::aligned_alloc(64, sizeof(fp *) * total_batch_count); - fp **b_array = (fp **)onemkl::aligned_alloc(64, sizeof(fp *) * total_batch_count); - fp **b_ref_array = (fp **)onemkl::aligned_alloc(64, sizeof(fp *) * total_batch_count); - - if ((a_array == NULL) || (b_array == NULL) || (b_ref_array == NULL)) { - std::cout << "Error cannot allocate arrays of pointers\n"; - onemkl::aligned_free(a_array); - onemkl::aligned_free(b_array); - onemkl::aligned_free(b_ref_array); - return false; - } - - vector> A(total_size_a), B(total_size_b), B_ref(total_size_b); - - for (i = 0; i < group_count; i++) { - max = std::max(max, m[i]); - max = std::max(max, n[i]); - size_a = (left_right[i] == onemkl::side::left) ? m[i] * lda[i] : n[i] * lda[i]; - size_b = ldb[i] * n[i]; - for (j = 0; j < group_size[i]; j++) { - a_array[idx] = A.data() + off_a; - b_array[idx] = B.data() + off_b; - b_ref_array[idx] = B_ref.data() + off_b; - if (left_right[i] == onemkl::side::left) - rand_trsm_matrix(a_array[idx], trans[i], m[i], m[i], lda[i]); - else - rand_trsm_matrix(a_array[idx], trans[i], n[i], n[i], lda[i]); - rand_matrix(b_array[idx], onemkl::transpose::nontrans, m[i], n[i], ldb[i]); - off_a += size_a; - off_b += size_b; - idx++; - } - } - - B_ref = B; - - // Call reference TRSM_BATCH. 
- using fp_ref = typename ref_type_info::type; - int m_ref, n_ref, lda_ref, ldb_ref, group_size_ref; - CBLAS_TRANSPOSE trans_ref; - CBLAS_SIDE side_ref; - CBLAS_DIAG diag_ref; - CBLAS_UPLO uplo_ref; - idx = 0; - for (i = 0; i < group_count; i++) { - m_ref = (int)m[i]; - n_ref = (int)n[i]; - lda_ref = (int)lda[i]; - ldb_ref = (int)ldb[i]; - group_size_ref = (int)group_size[i]; - trans_ref = convert_to_cblas_trans(trans[i]); - side_ref = convert_to_cblas_side(left_right[i]); - diag_ref = convert_to_cblas_diag(unit_diag[i]); - uplo_ref = convert_to_cblas_uplo(upper_lower[i]); - for (j = 0; j < group_size_ref; j++) { - ::trsm(side_ref, uplo_ref, trans_ref, diag_ref, (const int *)&m_ref, - (const int *)&n_ref, (const fp_ref *)&alpha[i], (const fp_ref *)a_array[idx], - (const int *)&lda_ref, (fp_ref *)b_ref_array[idx], (const int *)&ldb_ref); - idx++; - } - } - - // Call DPC++ TRSM_BATCH. - - // Catch asynchronous exceptions. - auto exception_handler = [](exception_list exceptions) { - for (std::exception_ptr const &e : exceptions) { - try { - std::rethrow_exception(e); - } - catch (exception const &e) { - std::cout << "Caught asynchronous SYCL exception during TRSM_BATCH:\n" - << e.what() << std::endl - << "OpenCL status: " << e.get_cl_code() << std::endl; - } - } - }; - - queue main_queue(dev, exception_handler); - - buffer side_buffer(left_right, range<1>(group_count)); - buffer uplo_buffer(upper_lower, range<1>(group_count)); - buffer trans_buffer(trans, range<1>(group_count)); - buffer diag_buffer(unit_diag, range<1>(group_count)); - buffer m_buffer(m, range<1>(group_count)); - buffer n_buffer(n, range<1>(group_count)); - buffer lda_buffer(lda, range<1>(group_count)); - buffer ldb_buffer(ldb, range<1>(group_count)); - buffer group_size_buffer(group_size, range<1>(group_count)); - buffer alpha_buffer(alpha, range<1>(group_count)); - buffer A_buffer(A.data(), range<1>(A.size())); - buffer B_buffer(B.data(), range<1>(B.size())); - - try { -#ifdef CALL_RT_API - 
onemkl::blas::trsm_batch(main_queue, side_buffer, uplo_buffer, trans_buffer, diag_buffer, - m_buffer, n_buffer, alpha_buffer, A_buffer, lda_buffer, B_buffer, - ldb_buffer, group_count, group_size_buffer); -#else - TEST_RUN_CT(main_queue, onemkl::blas::trsm_batch, - (main_queue, side_buffer, uplo_buffer, trans_buffer, diag_buffer, m_buffer, - n_buffer, alpha_buffer, A_buffer, lda_buffer, B_buffer, ldb_buffer, - group_count, group_size_buffer)); -#endif - } - catch (exception const &e) { - std::cout << "Caught synchronous SYCL exception during TRSM_BATCH:\n" - << e.what() << std::endl - << "OpenCL status: " << e.get_cl_code() << std::endl; - } - - catch (const std::runtime_error &error) { - std::cout << "Error raised during execution of TRSM_BATCH:\n" << error.what() << std::endl; -#ifdef ENABLE_CUBLAS_BACKEND - // TRSM_BATCH currently not supported with CUBLAS backend. - std::string error_msg(error.what()); - if (error_msg.compare("Not implemented for cublas") == 0) { - onemkl::aligned_free(m); - onemkl::aligned_free(n); - onemkl::aligned_free(lda); - onemkl::aligned_free(ldb); - onemkl::aligned_free(trans); - onemkl::aligned_free(left_right); - onemkl::aligned_free(upper_lower); - onemkl::aligned_free(unit_diag); - onemkl::aligned_free(alpha); - onemkl::aligned_free(group_size); - onemkl::aligned_free(a_array); - onemkl::aligned_free(b_array); - onemkl::aligned_free(b_ref_array); - return true; - } -#endif - } - - // Compare the results of reference implementation and DPC++ implementation. 
- bool good; - { - auto B_accessor = B_buffer.template get_access(); - good = check_equal_trsm_matrix(B_accessor, B_ref, total_size_b, 1, total_size_b, 10 * max, - std::cout); - } - - onemkl::aligned_free(m); - onemkl::aligned_free(n); - onemkl::aligned_free(lda); - onemkl::aligned_free(ldb); - onemkl::aligned_free(trans); - onemkl::aligned_free(left_right); - onemkl::aligned_free(upper_lower); - onemkl::aligned_free(unit_diag); - onemkl::aligned_free(alpha); - onemkl::aligned_free(group_size); - onemkl::aligned_free(a_array); - onemkl::aligned_free(b_array); - onemkl::aligned_free(b_ref_array); - - return good; -} - -class TrsmBatchTests : public ::testing::TestWithParam {}; - -TEST_P(TrsmBatchTests, RealSinglePrecision) { - EXPECT_TRUE(test(GetParam(), 5)); -} - -TEST_P(TrsmBatchTests, RealDoublePrecision) { - EXPECT_TRUE(test(GetParam(), 5)); -} - -TEST_P(TrsmBatchTests, ComplexSinglePrecision) { - EXPECT_TRUE(test>(GetParam(), 5)); -} - -TEST_P(TrsmBatchTests, ComplexDoublePrecision) { - EXPECT_TRUE(test>(GetParam(), 5)); -} - -INSTANTIATE_TEST_SUITE_P(TrsmBatchTestSuite, TrsmBatchTests, ::testing::ValuesIn(devices), - ::DeviceNamePrint()); - -} // anonymous namespace diff --git a/tests/unit_tests/blas/batch/trsm_batch_stride.cpp b/tests/unit_tests/blas/batch/trsm_batch_stride.cpp index 6b1bd5fe0..5b461d198 100644 --- a/tests/unit_tests/blas/batch/trsm_batch_stride.cpp +++ b/tests/unit_tests/blas/batch/trsm_batch_stride.cpp @@ -44,7 +44,7 @@ extern std::vector devices; namespace { template -bool test(const device &dev) { +int test(const device &dev) { // Prepare data. 
int64_t m, n; int64_t lda, ldb; @@ -151,16 +151,13 @@ bool test(const device &dev) { << "OpenCL status: " << e.get_cl_code() << std::endl; } + catch (const onemkl::backend_unsupported_exception &e) { + return test_skipped; + } + catch (const std::runtime_error &error) { std::cout << "Error raised during execution of TRSM_BATCH_STRIDE:\n" << error.what() << std::endl; -#ifdef ENABLE_CUBLAS_BACKEND - // TRSM_BATCH_STRIDE currently not supported with CUBLAS backend. - std::string error_msg(error.what()); - if (error_msg.compare("Not implemented for cublas") == 0) { - return true; - } -#endif } // Compare the results of reference implementation and DPC++ implementation. @@ -171,25 +168,25 @@ bool test(const device &dev) { 10 * std::max(m, n), std::cout); } - return good; + return (int)good; } class TrsmBatchStrideTests : public ::testing::TestWithParam {}; TEST_P(TrsmBatchStrideTests, RealSinglePrecision) { - EXPECT_TRUE(test(GetParam())); + EXPECT_TRUEORSKIP(test(GetParam())); } TEST_P(TrsmBatchStrideTests, RealDoublePrecision) { - EXPECT_TRUE(test(GetParam())); + EXPECT_TRUEORSKIP(test(GetParam())); } TEST_P(TrsmBatchStrideTests, ComplexSinglePrecision) { - EXPECT_TRUE(test>(GetParam())); + EXPECT_TRUEORSKIP(test>(GetParam())); } TEST_P(TrsmBatchStrideTests, ComplexDoublePrecision) { - EXPECT_TRUE(test>(GetParam())); + EXPECT_TRUEORSKIP(test>(GetParam())); } INSTANTIATE_TEST_SUITE_P(TrsmBatchStrideTestSuite, TrsmBatchStrideTests, diff --git a/tests/unit_tests/blas/extensions/CMakeLists.txt b/tests/unit_tests/blas/extensions/CMakeLists.txt index 1a248e5ec..6401a55ac 100644 --- a/tests/unit_tests/blas/extensions/CMakeLists.txt +++ b/tests/unit_tests/blas/extensions/CMakeLists.txt @@ -18,7 +18,7 @@ #=============================================================================== # Build object from all test sources -set(EXTENSIONS_SOURCES "gemm_ext.cpp" "gemm_ext_off.cpp" "gemmt.cpp") +set(EXTENSIONS_SOURCES "gemm_ext.cpp" "gemm_ext_off.cpp" "gemmt.cpp" "gemmt_usm.cpp") 
if(BUILD_SHARED_LIBS) add_library(blas_extensions_rt OBJECT ${EXTENSIONS_SOURCES}) diff --git a/tests/unit_tests/blas/extensions/gemm_ext.cpp b/tests/unit_tests/blas/extensions/gemm_ext.cpp index b951fed30..d1633431b 100644 --- a/tests/unit_tests/blas/extensions/gemm_ext.cpp +++ b/tests/unit_tests/blas/extensions/gemm_ext.cpp @@ -44,8 +44,8 @@ extern std::vector devices; namespace { template -bool test(const device& dev, onemkl::transpose transa, onemkl::transpose transb, int m, int n, - int k, int lda, int ldb, int ldc, Tc alpha, Tc beta) { +int test(const device& dev, onemkl::transpose transa, onemkl::transpose transb, int m, int n, int k, + int lda, int ldb, int ldc, Tc alpha, Tc beta) { // Prepare data. vector> A, B; vector> C, C_ref; @@ -103,22 +103,19 @@ bool test(const device& dev, onemkl::transpose transa, onemkl::transpose transb, << "OpenCL status: " << e.get_cl_code() << std::endl; } + catch (const onemkl::backend_unsupported_exception& e) { + return test_skipped; + } + catch (const std::runtime_error& error) { std::cout << "Error raised during execution of GEMM_EXT:\n" << error.what() << std::endl; -#ifdef ENABLE_CUBLAS_BACKEND - // GEMM_EXT currently not supported with CUBLAS backend. - std::string error_msg(error.what()); - if (error_msg.compare("Not implemented for cublas") == 0) { - return true; - } -#endif } // Compare the results of reference implementation and DPC++ implementation. 
auto C_accessor = C_buffer.template get_access(); bool good = check_equal_matrix(C_accessor, C_ref, m, n, ldc, 10 * k, std::cout); - return good; + return (int)good; } class GemmExtTests : public ::testing::TestWithParam {}; @@ -126,62 +123,67 @@ class GemmExtTests : public ::testing::TestWithParam {}; TEST_P(GemmExtTests, HalfHalfFloatPrecision) { float alpha(2.0); float beta(3.0); - EXPECT_TRUE( + EXPECT_TRUEORSKIP( (test(GetParam(), onemkl::transpose::nontrans, onemkl::transpose::nontrans, 79, 83, 91, 103, 105, 106, alpha, beta))); - EXPECT_TRUE( + EXPECT_TRUEORSKIP( (test(GetParam(), onemkl::transpose::nontrans, onemkl::transpose::trans, 79, 83, 91, 103, 105, 106, alpha, beta))); - EXPECT_TRUE( + EXPECT_TRUEORSKIP( (test(GetParam(), onemkl::transpose::trans, onemkl::transpose::nontrans, 79, 83, 91, 103, 105, 106, alpha, beta))); - EXPECT_TRUE((test(GetParam(), onemkl::transpose::trans, onemkl::transpose::trans, - 79, 83, 91, 103, 105, 106, alpha, beta))); + EXPECT_TRUEORSKIP( + (test(GetParam(), onemkl::transpose::trans, onemkl::transpose::trans, 79, 83, + 91, 103, 105, 106, alpha, beta))); } TEST_P(GemmExtTests, RealHalfPrecision) { half alpha(2.0); half beta(3.0); - EXPECT_TRUE( + EXPECT_TRUEORSKIP( (test(GetParam(), onemkl::transpose::nontrans, onemkl::transpose::nontrans, 79, 83, 91, 103, 105, 106, alpha, beta))); - EXPECT_TRUE((test(GetParam(), onemkl::transpose::nontrans, onemkl::transpose::trans, - 79, 83, 91, 103, 105, 106, alpha, beta))); - EXPECT_TRUE((test(GetParam(), onemkl::transpose::trans, onemkl::transpose::nontrans, - 79, 83, 91, 103, 105, 106, alpha, beta))); - EXPECT_TRUE((test(GetParam(), onemkl::transpose::trans, onemkl::transpose::trans, - 79, 83, 91, 103, 105, 106, alpha, beta))); + EXPECT_TRUEORSKIP( + (test(GetParam(), onemkl::transpose::nontrans, onemkl::transpose::trans, 79, 83, + 91, 103, 105, 106, alpha, beta))); + EXPECT_TRUEORSKIP( + (test(GetParam(), onemkl::transpose::trans, onemkl::transpose::nontrans, 79, 83, + 91, 103, 105, 
106, alpha, beta))); + EXPECT_TRUEORSKIP( + (test(GetParam(), onemkl::transpose::trans, onemkl::transpose::trans, 79, 83, + 91, 103, 105, 106, alpha, beta))); } TEST_P(GemmExtTests, RealSinglePrecision) { float alpha(2.0); float beta(3.0); - EXPECT_TRUE( + EXPECT_TRUEORSKIP( (test(GetParam(), onemkl::transpose::nontrans, onemkl::transpose::nontrans, 79, 83, 91, 103, 105, 106, alpha, beta))); - EXPECT_TRUE( + EXPECT_TRUEORSKIP( (test(GetParam(), onemkl::transpose::nontrans, onemkl::transpose::trans, 79, 83, 91, 103, 105, 106, alpha, beta))); - EXPECT_TRUE( + EXPECT_TRUEORSKIP( (test(GetParam(), onemkl::transpose::trans, onemkl::transpose::nontrans, 79, 83, 91, 103, 105, 106, alpha, beta))); - EXPECT_TRUE((test(GetParam(), onemkl::transpose::trans, onemkl::transpose::trans, - 79, 83, 91, 103, 105, 106, alpha, beta))); + EXPECT_TRUEORSKIP( + (test(GetParam(), onemkl::transpose::trans, onemkl::transpose::trans, 79, 83, + 91, 103, 105, 106, alpha, beta))); } TEST_P(GemmExtTests, RealDoublePrecision) { double alpha(2.0); double beta(3.0); - EXPECT_TRUE( + EXPECT_TRUEORSKIP( (test(GetParam(), onemkl::transpose::nontrans, onemkl::transpose::nontrans, 79, 83, 91, 103, 105, 106, alpha, beta))); - EXPECT_TRUE( + EXPECT_TRUEORSKIP( (test(GetParam(), onemkl::transpose::nontrans, onemkl::transpose::trans, 79, 83, 91, 103, 105, 106, alpha, beta))); - EXPECT_TRUE( + EXPECT_TRUEORSKIP( (test(GetParam(), onemkl::transpose::trans, onemkl::transpose::nontrans, 79, 83, 91, 103, 105, 106, alpha, beta))); - EXPECT_TRUE( + EXPECT_TRUEORSKIP( (test(GetParam(), onemkl::transpose::trans, onemkl::transpose::trans, 79, 83, 91, 103, 105, 106, alpha, beta))); } @@ -189,31 +191,31 @@ TEST_P(GemmExtTests, RealDoublePrecision) { TEST_P(GemmExtTests, ComplexSinglePrecision) { std::complex alpha(2.0, -0.5); std::complex beta(3.0, -1.5); - EXPECT_TRUE((test, std::complex>( + EXPECT_TRUEORSKIP((test, std::complex>( GetParam(), onemkl::transpose::nontrans, onemkl::transpose::nontrans, 79, 83, 91, 103, 
105, 106, alpha, beta))); - EXPECT_TRUE((test, std::complex>( + EXPECT_TRUEORSKIP((test, std::complex>( GetParam(), onemkl::transpose::nontrans, onemkl::transpose::trans, 79, 83, 91, 103, 105, 106, alpha, beta))); - EXPECT_TRUE((test, std::complex>( + EXPECT_TRUEORSKIP((test, std::complex>( GetParam(), onemkl::transpose::trans, onemkl::transpose::nontrans, 79, 83, 91, 103, 105, 106, alpha, beta))); - EXPECT_TRUE((test, std::complex>( + EXPECT_TRUEORSKIP((test, std::complex>( GetParam(), onemkl::transpose::trans, onemkl::transpose::trans, 79, 83, 91, 103, 105, 106, alpha, beta))); - EXPECT_TRUE((test, std::complex>( + EXPECT_TRUEORSKIP((test, std::complex>( GetParam(), onemkl::transpose::nontrans, onemkl::transpose::conjtrans, 79, 83, 91, 103, 105, 106, alpha, beta))); - EXPECT_TRUE((test, std::complex>( + EXPECT_TRUEORSKIP((test, std::complex>( GetParam(), onemkl::transpose::trans, onemkl::transpose::conjtrans, 79, 83, 91, 103, 105, 106, alpha, beta))); - EXPECT_TRUE((test, std::complex>( + EXPECT_TRUEORSKIP((test, std::complex>( GetParam(), onemkl::transpose::conjtrans, onemkl::transpose::nontrans, 79, 83, 91, 103, 105, 106, alpha, beta))); - EXPECT_TRUE((test, std::complex>( + EXPECT_TRUEORSKIP((test, std::complex>( GetParam(), onemkl::transpose::conjtrans, onemkl::transpose::trans, 79, 83, 91, 103, 105, 106, alpha, beta))); - EXPECT_TRUE((test, std::complex>( + EXPECT_TRUEORSKIP((test, std::complex>( GetParam(), onemkl::transpose::conjtrans, onemkl::transpose::conjtrans, 79, 83, 91, 103, 105, 106, alpha, beta))); } @@ -221,31 +223,31 @@ TEST_P(GemmExtTests, ComplexSinglePrecision) { TEST_P(GemmExtTests, ComplexDoublePrecision) { std::complex alpha(2.0, -0.5); std::complex beta(3.0, -1.5); - EXPECT_TRUE((test, std::complex>( + EXPECT_TRUEORSKIP((test, std::complex>( GetParam(), onemkl::transpose::nontrans, onemkl::transpose::nontrans, 79, 83, 91, 103, 105, 106, alpha, beta))); - EXPECT_TRUE((test, std::complex>( + EXPECT_TRUEORSKIP((test, std::complex>( 
GetParam(), onemkl::transpose::nontrans, onemkl::transpose::trans, 79, 83, 91, 103, 105, 106, alpha, beta))); - EXPECT_TRUE((test, std::complex>( + EXPECT_TRUEORSKIP((test, std::complex>( GetParam(), onemkl::transpose::trans, onemkl::transpose::nontrans, 79, 83, 91, 103, 105, 106, alpha, beta))); - EXPECT_TRUE((test, std::complex>( + EXPECT_TRUEORSKIP((test, std::complex>( GetParam(), onemkl::transpose::trans, onemkl::transpose::trans, 79, 83, 91, 103, 105, 106, alpha, beta))); - EXPECT_TRUE((test, std::complex>( + EXPECT_TRUEORSKIP((test, std::complex>( GetParam(), onemkl::transpose::nontrans, onemkl::transpose::conjtrans, 79, 83, 91, 103, 105, 106, alpha, beta))); - EXPECT_TRUE((test, std::complex>( + EXPECT_TRUEORSKIP((test, std::complex>( GetParam(), onemkl::transpose::trans, onemkl::transpose::conjtrans, 79, 83, 91, 103, 105, 106, alpha, beta))); - EXPECT_TRUE((test, std::complex>( + EXPECT_TRUEORSKIP((test, std::complex>( GetParam(), onemkl::transpose::conjtrans, onemkl::transpose::nontrans, 79, 83, 91, 103, 105, 106, alpha, beta))); - EXPECT_TRUE((test, std::complex>( + EXPECT_TRUEORSKIP((test, std::complex>( GetParam(), onemkl::transpose::conjtrans, onemkl::transpose::trans, 79, 83, 91, 103, 105, 106, alpha, beta))); - EXPECT_TRUE((test, std::complex>( + EXPECT_TRUEORSKIP((test, std::complex>( GetParam(), onemkl::transpose::conjtrans, onemkl::transpose::conjtrans, 79, 83, 91, 103, 105, 106, alpha, beta))); } diff --git a/tests/unit_tests/blas/extensions/gemm_ext_off.cpp b/tests/unit_tests/blas/extensions/gemm_ext_off.cpp index 46c4ef477..9e28bc57f 100644 --- a/tests/unit_tests/blas/extensions/gemm_ext_off.cpp +++ b/tests/unit_tests/blas/extensions/gemm_ext_off.cpp @@ -44,9 +44,9 @@ extern std::vector devices; namespace { template -bool test(const device& dev, onemkl::transpose transa, onemkl::transpose transb, - onemkl::offset offsetc, int m, int n, int k, int lda, int ldb, int ldc, Ts alpha, - Ts beta) { +int test(const device& dev, onemkl::transpose 
transa, onemkl::transpose transb, + onemkl::offset offsetc, int m, int n, int k, int lda, int ldb, int ldc, Ts alpha, + Ts beta) { // Prepare data. vector> A; vector> B; @@ -67,7 +67,7 @@ bool test(const device& dev, onemkl::transpose transa, onemkl::transpose transb, C_ref = C; - // Call Reference GEMM_EXT. + // Call Reference GEMM_EXT_OFF. const int m_ref = m, n_ref = n, k_ref = k; const int lda_ref = lda, ldb_ref = ldb, ldc_ref = ldc; @@ -81,7 +81,7 @@ bool test(const device& dev, onemkl::transpose transa, onemkl::transpose transb, (Ta_ref*)A.data(), &lda_ref, (Ta_ref*)&ao, (Tb_ref*)B.data(), &ldb_ref, (Tb_ref*)&bo, (Ts_ref*)&beta, (Tc_ref*)C_ref.data(), &ldc_ref, (Tc_ref*)co.data()); - // Call DPC++ GEMM_EXT. + // Call DPC++ GEMM_EXT_OFF. // Catch asynchronous exceptions. auto exception_handler = [](exception_list exceptions) { @@ -90,7 +90,7 @@ bool test(const device& dev, onemkl::transpose transa, onemkl::transpose transb, std::rethrow_exception(e); } catch (exception const& e) { - std::cout << "Caught asynchronous SYCL exception during GEMM_EXT:\n" + std::cout << "Caught asynchronous SYCL exception during GEMM_EXT_OFF:\n" << e.what() << std::endl << "OpenCL status: " << e.get_cl_code() << std::endl; } @@ -115,27 +115,25 @@ bool test(const device& dev, onemkl::transpose transa, onemkl::transpose transb, #endif } catch (exception const& e) { - std::cout << "Caught synchronous SYCL exception during GEMM_EXT:\n" + std::cout << "Caught synchronous SYCL exception during GEMM_EXT_OFF:\n" << e.what() << std::endl << "OpenCL status: " << e.get_cl_code() << std::endl; } + catch (const onemkl::backend_unsupported_exception& e) { + return test_skipped; + } + catch (const std::runtime_error& error) { - std::cout << "Error raised during execution of GEMM_EXT:\n" << error.what() << std::endl; -#ifdef ENABLE_CUBLAS_BACKEND - // GEMM_EXT currently not supported with CUBLAS backend. 
- std::string error_msg(error.what()); - if (error_msg.compare("Not implemented for cublas") == 0) { - return true; - } -#endif + std::cout << "Error raised during execution of GEMM_EXT_OFF:\n" + << error.what() << std::endl; } // Compare the results of reference implementation and DPC++ implementation. auto C_accessor = C_buffer.template get_access(); bool good = check_equal_matrix(C_accessor, C_ref, m, n, ldc, 10 * k, std::cout); - return good; + return (int)good; } class GemmExtOffTests : public ::testing::TestWithParam {}; @@ -143,40 +141,40 @@ class GemmExtOffTests : public ::testing::TestWithParam {}; TEST_P(GemmExtOffTests, Int8Uint8Int32Precision) { float alpha(2.0); float beta(3.0); - EXPECT_TRUE((test( + EXPECT_TRUEORSKIP((test( GetParam(), onemkl::transpose::nontrans, onemkl::transpose::nontrans, onemkl::offset::fix, 79, 83, 91, 103, 105, 106, alpha, beta))); - EXPECT_TRUE((test( + EXPECT_TRUEORSKIP((test( GetParam(), onemkl::transpose::nontrans, onemkl::transpose::trans, onemkl::offset::fix, 79, 83, 91, 103, 105, 106, alpha, beta))); - EXPECT_TRUE((test( + EXPECT_TRUEORSKIP((test( GetParam(), onemkl::transpose::trans, onemkl::transpose::nontrans, onemkl::offset::fix, 79, 83, 91, 103, 105, 106, alpha, beta))); - EXPECT_TRUE((test( + EXPECT_TRUEORSKIP((test( GetParam(), onemkl::transpose::trans, onemkl::transpose::trans, onemkl::offset::fix, 79, 83, 91, 103, 105, 106, alpha, beta))); - EXPECT_TRUE((test( + EXPECT_TRUEORSKIP((test( GetParam(), onemkl::transpose::nontrans, onemkl::transpose::nontrans, onemkl::offset::column, 79, 83, 91, 103, 105, 106, alpha, beta))); - EXPECT_TRUE((test( + EXPECT_TRUEORSKIP((test( GetParam(), onemkl::transpose::nontrans, onemkl::transpose::trans, onemkl::offset::column, 79, 83, 91, 103, 105, 106, alpha, beta))); - EXPECT_TRUE((test( + EXPECT_TRUEORSKIP((test( GetParam(), onemkl::transpose::trans, onemkl::transpose::nontrans, onemkl::offset::column, 79, 83, 91, 103, 105, 106, alpha, beta))); - EXPECT_TRUE((test( + 
EXPECT_TRUEORSKIP((test( GetParam(), onemkl::transpose::trans, onemkl::transpose::trans, onemkl::offset::column, 79, 83, 91, 103, 105, 106, alpha, beta))); - EXPECT_TRUE((test( + EXPECT_TRUEORSKIP((test( GetParam(), onemkl::transpose::nontrans, onemkl::transpose::nontrans, onemkl::offset::row, 79, 83, 91, 103, 105, 106, alpha, beta))); - EXPECT_TRUE((test( + EXPECT_TRUEORSKIP((test( GetParam(), onemkl::transpose::nontrans, onemkl::transpose::trans, onemkl::offset::row, 79, 83, 91, 103, 105, 106, alpha, beta))); - EXPECT_TRUE((test( + EXPECT_TRUEORSKIP((test( GetParam(), onemkl::transpose::trans, onemkl::transpose::nontrans, onemkl::offset::row, 79, 83, 91, 103, 105, 106, alpha, beta))); - EXPECT_TRUE((test( + EXPECT_TRUEORSKIP((test( GetParam(), onemkl::transpose::trans, onemkl::transpose::trans, onemkl::offset::row, 79, 83, 91, 103, 105, 106, alpha, beta))); } diff --git a/tests/unit_tests/blas/extensions/gemmt.cpp b/tests/unit_tests/blas/extensions/gemmt.cpp index 6c2605a61..6f95bd33e 100644 --- a/tests/unit_tests/blas/extensions/gemmt.cpp +++ b/tests/unit_tests/blas/extensions/gemmt.cpp @@ -44,8 +44,8 @@ extern std::vector devices; namespace { template -bool test(const device& dev, onemkl::uplo upper_lower, onemkl::transpose transa, - onemkl::transpose transb, int n, int k, int lda, int ldb, int ldc, fp alpha, fp beta) { +int test(const device& dev, onemkl::uplo upper_lower, onemkl::transpose transa, + onemkl::transpose transb, int n, int k, int lda, int ldb, int ldc, fp alpha, fp beta) { // Prepare data. 
vector> A, B, C, C_ref; rand_matrix(A, transa, n, k, lda); @@ -101,22 +101,19 @@ bool test(const device& dev, onemkl::uplo upper_lower, onemkl::transpose transa, << "OpenCL status: " << e.get_cl_code() << std::endl; } + catch (const onemkl::backend_unsupported_exception& e) { + return test_skipped; + } + catch (const std::runtime_error& error) { std::cout << "Error raised during execution of GEMMT:\n" << error.what() << std::endl; -#ifdef ENABLE_CUBLAS_BACKEND - // GEMMT currently not supported with CUBLAS backend. - std::string error_msg(error.what()); - if (error_msg.compare("Not implemented for cublas") == 0) { - return true; - } -#endif } // Compare the results of reference implementation and DPC++ implementation. auto C_accessor = C_buffer.template get_access(); bool good = check_equal_matrix(C_accessor, C_ref, upper_lower, n, n, ldc, 10 * k, std::cout); - return good; + return (int)good; } class GemmtTests : public ::testing::TestWithParam {}; @@ -124,100 +121,104 @@ class GemmtTests : public ::testing::TestWithParam {}; TEST_P(GemmtTests, RealSinglePrecision) { float alpha(2.0); float beta(3.0); - EXPECT_TRUE(test(GetParam(), onemkl::uplo::lower, onemkl::transpose::nontrans, - onemkl::transpose::nontrans, 27, 98, 101, 102, 103, alpha, beta)); - EXPECT_TRUE(test(GetParam(), onemkl::uplo::lower, onemkl::transpose::nontrans, - onemkl::transpose::trans, 27, 98, 101, 102, 103, alpha, beta)); - EXPECT_TRUE(test(GetParam(), onemkl::uplo::lower, onemkl::transpose::trans, - onemkl::transpose::nontrans, 27, 98, 101, 102, 103, alpha, beta)); - EXPECT_TRUE(test(GetParam(), onemkl::uplo::lower, onemkl::transpose::trans, - onemkl::transpose::trans, 27, 98, 101, 102, 103, alpha, beta)); - EXPECT_TRUE(test(GetParam(), onemkl::uplo::upper, onemkl::transpose::nontrans, - onemkl::transpose::nontrans, 27, 98, 101, 102, 103, alpha, beta)); - EXPECT_TRUE(test(GetParam(), onemkl::uplo::upper, onemkl::transpose::nontrans, - onemkl::transpose::trans, 27, 98, 101, 102, 103, alpha, 
beta)); - EXPECT_TRUE(test(GetParam(), onemkl::uplo::upper, onemkl::transpose::trans, - onemkl::transpose::nontrans, 27, 98, 101, 102, 103, alpha, beta)); - EXPECT_TRUE(test(GetParam(), onemkl::uplo::upper, onemkl::transpose::trans, - onemkl::transpose::trans, 27, 98, 101, 102, 103, alpha, beta)); + EXPECT_TRUEORSKIP(test(GetParam(), onemkl::uplo::lower, onemkl::transpose::nontrans, + onemkl::transpose::nontrans, 27, 98, 101, 102, 103, alpha, beta)); + EXPECT_TRUEORSKIP(test(GetParam(), onemkl::uplo::lower, onemkl::transpose::nontrans, + onemkl::transpose::trans, 27, 98, 101, 102, 103, alpha, beta)); + EXPECT_TRUEORSKIP(test(GetParam(), onemkl::uplo::lower, onemkl::transpose::trans, + onemkl::transpose::nontrans, 27, 98, 101, 102, 103, alpha, beta)); + EXPECT_TRUEORSKIP(test(GetParam(), onemkl::uplo::lower, onemkl::transpose::trans, + onemkl::transpose::trans, 27, 98, 101, 102, 103, alpha, beta)); + EXPECT_TRUEORSKIP(test(GetParam(), onemkl::uplo::upper, onemkl::transpose::nontrans, + onemkl::transpose::nontrans, 27, 98, 101, 102, 103, alpha, beta)); + EXPECT_TRUEORSKIP(test(GetParam(), onemkl::uplo::upper, onemkl::transpose::nontrans, + onemkl::transpose::trans, 27, 98, 101, 102, 103, alpha, beta)); + EXPECT_TRUEORSKIP(test(GetParam(), onemkl::uplo::upper, onemkl::transpose::trans, + onemkl::transpose::nontrans, 27, 98, 101, 102, 103, alpha, beta)); + EXPECT_TRUEORSKIP(test(GetParam(), onemkl::uplo::upper, onemkl::transpose::trans, + onemkl::transpose::trans, 27, 98, 101, 102, 103, alpha, beta)); } TEST_P(GemmtTests, RealDoublePrecision) { double alpha(2.0); double beta(3.0); - EXPECT_TRUE(test(GetParam(), onemkl::uplo::lower, onemkl::transpose::nontrans, - onemkl::transpose::nontrans, 27, 98, 101, 102, 103, alpha, beta)); - EXPECT_TRUE(test(GetParam(), onemkl::uplo::lower, onemkl::transpose::nontrans, - onemkl::transpose::trans, 27, 98, 101, 102, 103, alpha, beta)); - EXPECT_TRUE(test(GetParam(), onemkl::uplo::lower, onemkl::transpose::trans, - 
onemkl::transpose::nontrans, 27, 98, 101, 102, 103, alpha, beta)); - EXPECT_TRUE(test(GetParam(), onemkl::uplo::lower, onemkl::transpose::trans, - onemkl::transpose::trans, 27, 98, 101, 102, 103, alpha, beta)); - EXPECT_TRUE(test(GetParam(), onemkl::uplo::upper, onemkl::transpose::nontrans, - onemkl::transpose::nontrans, 27, 98, 101, 102, 103, alpha, beta)); - EXPECT_TRUE(test(GetParam(), onemkl::uplo::upper, onemkl::transpose::nontrans, - onemkl::transpose::trans, 27, 98, 101, 102, 103, alpha, beta)); - EXPECT_TRUE(test(GetParam(), onemkl::uplo::upper, onemkl::transpose::trans, - onemkl::transpose::nontrans, 27, 98, 101, 102, 103, alpha, beta)); - EXPECT_TRUE(test(GetParam(), onemkl::uplo::upper, onemkl::transpose::trans, - onemkl::transpose::trans, 27, 98, 101, 102, 103, alpha, beta)); + EXPECT_TRUEORSKIP(test(GetParam(), onemkl::uplo::lower, onemkl::transpose::nontrans, + onemkl::transpose::nontrans, 27, 98, 101, 102, 103, alpha, + beta)); + EXPECT_TRUEORSKIP(test(GetParam(), onemkl::uplo::lower, onemkl::transpose::nontrans, + onemkl::transpose::trans, 27, 98, 101, 102, 103, alpha, beta)); + EXPECT_TRUEORSKIP(test(GetParam(), onemkl::uplo::lower, onemkl::transpose::trans, + onemkl::transpose::nontrans, 27, 98, 101, 102, 103, alpha, + beta)); + EXPECT_TRUEORSKIP(test(GetParam(), onemkl::uplo::lower, onemkl::transpose::trans, + onemkl::transpose::trans, 27, 98, 101, 102, 103, alpha, beta)); + EXPECT_TRUEORSKIP(test(GetParam(), onemkl::uplo::upper, onemkl::transpose::nontrans, + onemkl::transpose::nontrans, 27, 98, 101, 102, 103, alpha, + beta)); + EXPECT_TRUEORSKIP(test(GetParam(), onemkl::uplo::upper, onemkl::transpose::nontrans, + onemkl::transpose::trans, 27, 98, 101, 102, 103, alpha, beta)); + EXPECT_TRUEORSKIP(test(GetParam(), onemkl::uplo::upper, onemkl::transpose::trans, + onemkl::transpose::nontrans, 27, 98, 101, 102, 103, alpha, + beta)); + EXPECT_TRUEORSKIP(test(GetParam(), onemkl::uplo::upper, onemkl::transpose::trans, + onemkl::transpose::trans, 27, 
98, 101, 102, 103, alpha, beta)); } TEST_P(GemmtTests, ComplexSinglePrecision) { std::complex alpha(2.0); std::complex beta(3.0); - EXPECT_TRUE(test>(GetParam(), onemkl::uplo::lower, - onemkl::transpose::nontrans, onemkl::transpose::nontrans, - 27, 98, 101, 102, 103, alpha, beta)); - EXPECT_TRUE(test>(GetParam(), onemkl::uplo::lower, - onemkl::transpose::nontrans, onemkl::transpose::trans, 27, - 98, 101, 102, 103, alpha, beta)); - EXPECT_TRUE(test>(GetParam(), onemkl::uplo::lower, onemkl::transpose::trans, - onemkl::transpose::nontrans, 27, 98, 101, 102, 103, alpha, - beta)); - EXPECT_TRUE(test>(GetParam(), onemkl::uplo::lower, onemkl::transpose::trans, - onemkl::transpose::trans, 27, 98, 101, 102, 103, alpha, - beta)); - EXPECT_TRUE(test>(GetParam(), onemkl::uplo::lower, - onemkl::transpose::nontrans, onemkl::transpose::conjtrans, - 27, 98, 101, 102, 103, alpha, beta)); - EXPECT_TRUE(test>(GetParam(), onemkl::uplo::lower, onemkl::transpose::trans, - onemkl::transpose::conjtrans, 27, 98, 101, 102, 103, - alpha, beta)); - EXPECT_TRUE(test>(GetParam(), onemkl::uplo::lower, - onemkl::transpose::conjtrans, onemkl::transpose::nontrans, - 27, 98, 101, 102, 103, alpha, beta)); - EXPECT_TRUE(test>(GetParam(), onemkl::uplo::lower, - onemkl::transpose::conjtrans, onemkl::transpose::trans, - 27, 98, 101, 102, 103, alpha, beta)); - EXPECT_TRUE(test>( + EXPECT_TRUEORSKIP( + test>(GetParam(), onemkl::uplo::lower, onemkl::transpose::nontrans, + onemkl::transpose::nontrans, 27, 98, 101, 102, 103, alpha, beta)); + EXPECT_TRUEORSKIP( + test>(GetParam(), onemkl::uplo::lower, onemkl::transpose::nontrans, + onemkl::transpose::trans, 27, 98, 101, 102, 103, alpha, beta)); + EXPECT_TRUEORSKIP( + test>(GetParam(), onemkl::uplo::lower, onemkl::transpose::trans, + onemkl::transpose::nontrans, 27, 98, 101, 102, 103, alpha, beta)); + EXPECT_TRUEORSKIP(test>(GetParam(), onemkl::uplo::lower, + onemkl::transpose::trans, onemkl::transpose::trans, + 27, 98, 101, 102, 103, alpha, beta)); + 
EXPECT_TRUEORSKIP(test>( + GetParam(), onemkl::uplo::lower, onemkl::transpose::nontrans, onemkl::transpose::conjtrans, + 27, 98, 101, 102, 103, alpha, beta)); + EXPECT_TRUEORSKIP(test>( + GetParam(), onemkl::uplo::lower, onemkl::transpose::trans, onemkl::transpose::conjtrans, 27, + 98, 101, 102, 103, alpha, beta)); + EXPECT_TRUEORSKIP( + test>(GetParam(), onemkl::uplo::lower, onemkl::transpose::conjtrans, + onemkl::transpose::nontrans, 27, 98, 101, 102, 103, alpha, beta)); + EXPECT_TRUEORSKIP( + test>(GetParam(), onemkl::uplo::lower, onemkl::transpose::conjtrans, + onemkl::transpose::trans, 27, 98, 101, 102, 103, alpha, beta)); + EXPECT_TRUEORSKIP(test>( GetParam(), onemkl::uplo::lower, onemkl::transpose::conjtrans, onemkl::transpose::conjtrans, 27, 98, 101, 102, 103, alpha, beta)); - EXPECT_TRUE(test>(GetParam(), onemkl::uplo::upper, - onemkl::transpose::nontrans, onemkl::transpose::nontrans, - 27, 98, 101, 102, 103, alpha, beta)); - EXPECT_TRUE(test>(GetParam(), onemkl::uplo::upper, - onemkl::transpose::nontrans, onemkl::transpose::trans, 27, - 98, 101, 102, 103, alpha, beta)); - EXPECT_TRUE(test>(GetParam(), onemkl::uplo::upper, onemkl::transpose::trans, - onemkl::transpose::nontrans, 27, 98, 101, 102, 103, alpha, - beta)); - EXPECT_TRUE(test>(GetParam(), onemkl::uplo::upper, onemkl::transpose::trans, - onemkl::transpose::trans, 27, 98, 101, 102, 103, alpha, - beta)); - EXPECT_TRUE(test>(GetParam(), onemkl::uplo::upper, - onemkl::transpose::nontrans, onemkl::transpose::conjtrans, - 27, 98, 101, 102, 103, alpha, beta)); - EXPECT_TRUE(test>(GetParam(), onemkl::uplo::upper, onemkl::transpose::trans, - onemkl::transpose::conjtrans, 27, 98, 101, 102, 103, - alpha, beta)); - EXPECT_TRUE(test>(GetParam(), onemkl::uplo::upper, - onemkl::transpose::conjtrans, onemkl::transpose::nontrans, - 27, 98, 101, 102, 103, alpha, beta)); - EXPECT_TRUE(test>(GetParam(), onemkl::uplo::upper, - onemkl::transpose::conjtrans, onemkl::transpose::trans, - 27, 98, 101, 102, 103, alpha, 
beta)); - EXPECT_TRUE(test>( + EXPECT_TRUEORSKIP( + test>(GetParam(), onemkl::uplo::upper, onemkl::transpose::nontrans, + onemkl::transpose::nontrans, 27, 98, 101, 102, 103, alpha, beta)); + EXPECT_TRUEORSKIP( + test>(GetParam(), onemkl::uplo::upper, onemkl::transpose::nontrans, + onemkl::transpose::trans, 27, 98, 101, 102, 103, alpha, beta)); + EXPECT_TRUEORSKIP( + test>(GetParam(), onemkl::uplo::upper, onemkl::transpose::trans, + onemkl::transpose::nontrans, 27, 98, 101, 102, 103, alpha, beta)); + EXPECT_TRUEORSKIP(test>(GetParam(), onemkl::uplo::upper, + onemkl::transpose::trans, onemkl::transpose::trans, + 27, 98, 101, 102, 103, alpha, beta)); + EXPECT_TRUEORSKIP(test>( + GetParam(), onemkl::uplo::upper, onemkl::transpose::nontrans, onemkl::transpose::conjtrans, + 27, 98, 101, 102, 103, alpha, beta)); + EXPECT_TRUEORSKIP(test>( + GetParam(), onemkl::uplo::upper, onemkl::transpose::trans, onemkl::transpose::conjtrans, 27, + 98, 101, 102, 103, alpha, beta)); + EXPECT_TRUEORSKIP( + test>(GetParam(), onemkl::uplo::upper, onemkl::transpose::conjtrans, + onemkl::transpose::nontrans, 27, 98, 101, 102, 103, alpha, beta)); + EXPECT_TRUEORSKIP( + test>(GetParam(), onemkl::uplo::upper, onemkl::transpose::conjtrans, + onemkl::transpose::trans, 27, 98, 101, 102, 103, alpha, beta)); + EXPECT_TRUEORSKIP(test>( GetParam(), onemkl::uplo::upper, onemkl::transpose::conjtrans, onemkl::transpose::conjtrans, 27, 98, 101, 102, 103, alpha, beta)); } @@ -225,58 +226,58 @@ TEST_P(GemmtTests, ComplexSinglePrecision) { TEST_P(GemmtTests, ComplexDoublePrecision) { std::complex alpha(2.0); std::complex beta(3.0); - EXPECT_TRUE(test>(GetParam(), onemkl::uplo::lower, - onemkl::transpose::nontrans, onemkl::transpose::nontrans, - 27, 98, 101, 102, 103, alpha, beta)); - EXPECT_TRUE(test>(GetParam(), onemkl::uplo::lower, - onemkl::transpose::nontrans, onemkl::transpose::trans, - 27, 98, 101, 102, 103, alpha, beta)); - EXPECT_TRUE(test>(GetParam(), onemkl::uplo::lower, - onemkl::transpose::trans, 
onemkl::transpose::nontrans, - 27, 98, 101, 102, 103, alpha, beta)); - EXPECT_TRUE(test>(GetParam(), onemkl::uplo::lower, - onemkl::transpose::trans, onemkl::transpose::trans, 27, - 98, 101, 102, 103, alpha, beta)); - EXPECT_TRUE(test>( + EXPECT_TRUEORSKIP(test>( + GetParam(), onemkl::uplo::lower, onemkl::transpose::nontrans, onemkl::transpose::nontrans, + 27, 98, 101, 102, 103, alpha, beta)); + EXPECT_TRUEORSKIP( + test>(GetParam(), onemkl::uplo::lower, onemkl::transpose::nontrans, + onemkl::transpose::trans, 27, 98, 101, 102, 103, alpha, beta)); + EXPECT_TRUEORSKIP(test>( + GetParam(), onemkl::uplo::lower, onemkl::transpose::trans, onemkl::transpose::nontrans, 27, + 98, 101, 102, 103, alpha, beta)); + EXPECT_TRUEORSKIP(test>(GetParam(), onemkl::uplo::lower, + onemkl::transpose::trans, onemkl::transpose::trans, + 27, 98, 101, 102, 103, alpha, beta)); + EXPECT_TRUEORSKIP(test>( GetParam(), onemkl::uplo::lower, onemkl::transpose::nontrans, onemkl::transpose::conjtrans, 27, 98, 101, 102, 103, alpha, beta)); - EXPECT_TRUE(test>(GetParam(), onemkl::uplo::lower, - onemkl::transpose::trans, onemkl::transpose::conjtrans, - 27, 98, 101, 102, 103, alpha, beta)); - EXPECT_TRUE(test>( + EXPECT_TRUEORSKIP(test>( + GetParam(), onemkl::uplo::lower, onemkl::transpose::trans, onemkl::transpose::conjtrans, 27, + 98, 101, 102, 103, alpha, beta)); + EXPECT_TRUEORSKIP(test>( GetParam(), onemkl::uplo::lower, onemkl::transpose::conjtrans, onemkl::transpose::nontrans, 27, 98, 101, 102, 103, alpha, beta)); - EXPECT_TRUE(test>(GetParam(), onemkl::uplo::lower, - onemkl::transpose::conjtrans, onemkl::transpose::trans, - 27, 98, 101, 102, 103, alpha, beta)); - EXPECT_TRUE(test>( + EXPECT_TRUEORSKIP( + test>(GetParam(), onemkl::uplo::lower, onemkl::transpose::conjtrans, + onemkl::transpose::trans, 27, 98, 101, 102, 103, alpha, beta)); + EXPECT_TRUEORSKIP(test>( GetParam(), onemkl::uplo::lower, onemkl::transpose::conjtrans, onemkl::transpose::conjtrans, 27, 98, 101, 102, 103, alpha, beta)); - 
EXPECT_TRUE(test>(GetParam(), onemkl::uplo::upper, - onemkl::transpose::nontrans, onemkl::transpose::nontrans, - 27, 98, 101, 102, 103, alpha, beta)); - EXPECT_TRUE(test>(GetParam(), onemkl::uplo::upper, - onemkl::transpose::nontrans, onemkl::transpose::trans, - 27, 98, 101, 102, 103, alpha, beta)); - EXPECT_TRUE(test>(GetParam(), onemkl::uplo::upper, - onemkl::transpose::trans, onemkl::transpose::nontrans, - 27, 98, 101, 102, 103, alpha, beta)); - EXPECT_TRUE(test>(GetParam(), onemkl::uplo::upper, - onemkl::transpose::trans, onemkl::transpose::trans, 27, - 98, 101, 102, 103, alpha, beta)); - EXPECT_TRUE(test>( + EXPECT_TRUEORSKIP(test>( + GetParam(), onemkl::uplo::upper, onemkl::transpose::nontrans, onemkl::transpose::nontrans, + 27, 98, 101, 102, 103, alpha, beta)); + EXPECT_TRUEORSKIP( + test>(GetParam(), onemkl::uplo::upper, onemkl::transpose::nontrans, + onemkl::transpose::trans, 27, 98, 101, 102, 103, alpha, beta)); + EXPECT_TRUEORSKIP(test>( + GetParam(), onemkl::uplo::upper, onemkl::transpose::trans, onemkl::transpose::nontrans, 27, + 98, 101, 102, 103, alpha, beta)); + EXPECT_TRUEORSKIP(test>(GetParam(), onemkl::uplo::upper, + onemkl::transpose::trans, onemkl::transpose::trans, + 27, 98, 101, 102, 103, alpha, beta)); + EXPECT_TRUEORSKIP(test>( GetParam(), onemkl::uplo::upper, onemkl::transpose::nontrans, onemkl::transpose::conjtrans, 27, 98, 101, 102, 103, alpha, beta)); - EXPECT_TRUE(test>(GetParam(), onemkl::uplo::upper, - onemkl::transpose::trans, onemkl::transpose::conjtrans, - 27, 98, 101, 102, 103, alpha, beta)); - EXPECT_TRUE(test>( + EXPECT_TRUEORSKIP(test>( + GetParam(), onemkl::uplo::upper, onemkl::transpose::trans, onemkl::transpose::conjtrans, 27, + 98, 101, 102, 103, alpha, beta)); + EXPECT_TRUEORSKIP(test>( GetParam(), onemkl::uplo::upper, onemkl::transpose::conjtrans, onemkl::transpose::nontrans, 27, 98, 101, 102, 103, alpha, beta)); - EXPECT_TRUE(test>(GetParam(), onemkl::uplo::upper, - onemkl::transpose::conjtrans, 
onemkl::transpose::trans, - 27, 98, 101, 102, 103, alpha, beta)); - EXPECT_TRUE(test>( + EXPECT_TRUEORSKIP( + test>(GetParam(), onemkl::uplo::upper, onemkl::transpose::conjtrans, + onemkl::transpose::trans, 27, 98, 101, 102, 103, alpha, beta)); + EXPECT_TRUEORSKIP(test>( GetParam(), onemkl::uplo::upper, onemkl::transpose::conjtrans, onemkl::transpose::conjtrans, 27, 98, 101, 102, 103, alpha, beta)); } diff --git a/tests/unit_tests/blas/extensions/gemmt_usm.cpp b/tests/unit_tests/blas/extensions/gemmt_usm.cpp new file mode 100644 index 000000000..b70aad7d1 --- /dev/null +++ b/tests/unit_tests/blas/extensions/gemmt_usm.cpp @@ -0,0 +1,289 @@ +/******************************************************************************* +* Copyright 2020 Intel Corporation +* +* Licensed under the Apache License, Version 2.0 (the "License"); +* you may not use this file except in compliance with the License. +* You may obtain a copy of the License at +* +* http://www.apache.org/licenses/LICENSE-2.0 +* +* Unless required by applicable law or agreed to in writing, +* software distributed under the License is distributed on an "AS IS" BASIS, +* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +* See the License for the specific language governing permissions +* and limitations under the License. 
+* +* +* SPDX-License-Identifier: Apache-2.0 +*******************************************************************************/ + +#include +#include +#include +#include +#include +#include + +#include +#include "cblas.h" +#include "onemkl/detail/config.hpp" +#include "onemkl/onemkl.hpp" +#include "onemkl_blas_helper.hpp" +#include "reference_blas_templates.hpp" +#include "test_common.hpp" +#include "test_helper.hpp" + +#include + +using namespace cl::sycl; +using std::vector; + +extern std::vector devices; + +namespace { + +template +int test(const device& dev, onemkl::uplo upper_lower, onemkl::transpose transa, + onemkl::transpose transb, int n, int k, int lda, int ldb, int ldc, fp alpha, fp beta) { + // Catch asynchronous exceptions. + auto exception_handler = [](exception_list exceptions) { + for (std::exception_ptr const& e : exceptions) { + try { + std::rethrow_exception(e); + } + catch (exception const& e) { + std::cout << "Caught asynchronous SYCL exception during GEMMT:\n" + << e.what() << std::endl + << "OpenCL status: " << e.get_cl_code() << std::endl; + } + } + }; + + queue main_queue(dev, exception_handler); + context cxt = main_queue.get_context(); + event done; + std::vector dependencies; + + // Prepare data. + auto ua = usm_allocator(cxt, dev); + vector A(ua), B(ua), C(ua); + rand_matrix(A, transa, n, k, lda); + rand_matrix(B, transb, k, n, ldb); + rand_matrix(C, onemkl::transpose::nontrans, n, n, ldc); + + auto C_ref = C; + + // Call Reference GEMMT. + const int n_ref = n, k_ref = k; + const int lda_ref = lda, ldb_ref = ldb, ldc_ref = ldc; + + using fp_ref = typename ref_type_info::type; + + ::gemmt(convert_to_cblas_uplo(upper_lower), convert_to_cblas_trans(transa), + convert_to_cblas_trans(transb), &n_ref, &k_ref, (fp_ref*)&alpha, (fp_ref*)A.data(), + &lda_ref, (fp_ref*)B.data(), &ldb_ref, (fp_ref*)&beta, (fp_ref*)C_ref.data(), &ldc_ref); + + // Call DPC++ GEMMT. 
+ + try { +#ifdef CALL_RT_API + done = onemkl::blas::gemmt(main_queue, upper_lower, transa, transb, n, k, alpha, A.data(), + lda, B.data(), ldb, beta, C.data(), ldc, dependencies); + done.wait(); +#else + TEST_RUN_CT(main_queue, onemkl::blas::gemmt, + (main_queue, upper_lower, transa, transb, n, k, alpha, A.data(), lda, B.data(), + ldb, beta, C.data(), ldc, dependencies)); + main_queue.wait(); +#endif + } + catch (exception const& e) { + std::cout << "Caught synchronous SYCL exception during GEMMT:\n" + << e.what() << std::endl + << "OpenCL status: " << e.get_cl_code() << std::endl; + } + + catch (const onemkl::backend_unsupported_exception& e) { + return test_skipped; + } + + catch (const std::runtime_error& error) { + std::cout << "Error raised during execution of GEMMT:\n" << error.what() << std::endl; + } + + // Compare the results of reference implementation and DPC++ implementation. + bool good = check_equal_matrix(C, C_ref, upper_lower, n, n, ldc, 10 * k, std::cout); + + return (int)good; +} + +class GemmtUsmTests : public ::testing::TestWithParam {}; + +TEST_P(GemmtUsmTests, RealSinglePrecision) { + float alpha(2.0); + float beta(3.0); + EXPECT_TRUEORSKIP(test(GetParam(), onemkl::uplo::lower, onemkl::transpose::nontrans, + onemkl::transpose::nontrans, 27, 98, 101, 102, 103, alpha, beta)); + EXPECT_TRUEORSKIP(test(GetParam(), onemkl::uplo::lower, onemkl::transpose::nontrans, + onemkl::transpose::trans, 27, 98, 101, 102, 103, alpha, beta)); + EXPECT_TRUEORSKIP(test(GetParam(), onemkl::uplo::lower, onemkl::transpose::trans, + onemkl::transpose::nontrans, 27, 98, 101, 102, 103, alpha, beta)); + EXPECT_TRUEORSKIP(test(GetParam(), onemkl::uplo::lower, onemkl::transpose::trans, + onemkl::transpose::trans, 27, 98, 101, 102, 103, alpha, beta)); + EXPECT_TRUEORSKIP(test(GetParam(), onemkl::uplo::upper, onemkl::transpose::nontrans, + onemkl::transpose::nontrans, 27, 98, 101, 102, 103, alpha, beta)); + EXPECT_TRUEORSKIP(test(GetParam(), onemkl::uplo::upper, 
onemkl::transpose::nontrans, + onemkl::transpose::trans, 27, 98, 101, 102, 103, alpha, beta)); + EXPECT_TRUEORSKIP(test(GetParam(), onemkl::uplo::upper, onemkl::transpose::trans, + onemkl::transpose::nontrans, 27, 98, 101, 102, 103, alpha, beta)); + EXPECT_TRUEORSKIP(test(GetParam(), onemkl::uplo::upper, onemkl::transpose::trans, + onemkl::transpose::trans, 27, 98, 101, 102, 103, alpha, beta)); +} + +TEST_P(GemmtUsmTests, RealDoublePrecision) { + double alpha(2.0); + double beta(3.0); + EXPECT_TRUEORSKIP(test(GetParam(), onemkl::uplo::lower, onemkl::transpose::nontrans, + onemkl::transpose::nontrans, 27, 98, 101, 102, 103, alpha, + beta)); + EXPECT_TRUEORSKIP(test(GetParam(), onemkl::uplo::lower, onemkl::transpose::nontrans, + onemkl::transpose::trans, 27, 98, 101, 102, 103, alpha, beta)); + EXPECT_TRUEORSKIP(test(GetParam(), onemkl::uplo::lower, onemkl::transpose::trans, + onemkl::transpose::nontrans, 27, 98, 101, 102, 103, alpha, + beta)); + EXPECT_TRUEORSKIP(test(GetParam(), onemkl::uplo::lower, onemkl::transpose::trans, + onemkl::transpose::trans, 27, 98, 101, 102, 103, alpha, beta)); + EXPECT_TRUEORSKIP(test(GetParam(), onemkl::uplo::upper, onemkl::transpose::nontrans, + onemkl::transpose::nontrans, 27, 98, 101, 102, 103, alpha, + beta)); + EXPECT_TRUEORSKIP(test(GetParam(), onemkl::uplo::upper, onemkl::transpose::nontrans, + onemkl::transpose::trans, 27, 98, 101, 102, 103, alpha, beta)); + EXPECT_TRUEORSKIP(test(GetParam(), onemkl::uplo::upper, onemkl::transpose::trans, + onemkl::transpose::nontrans, 27, 98, 101, 102, 103, alpha, + beta)); + EXPECT_TRUEORSKIP(test(GetParam(), onemkl::uplo::upper, onemkl::transpose::trans, + onemkl::transpose::trans, 27, 98, 101, 102, 103, alpha, beta)); +} + +TEST_P(GemmtUsmTests, ComplexSinglePrecision) { + std::complex alpha(2.0); + std::complex beta(3.0); + EXPECT_TRUEORSKIP( + test>(GetParam(), onemkl::uplo::lower, onemkl::transpose::nontrans, + onemkl::transpose::nontrans, 27, 98, 101, 102, 103, alpha, beta)); + 
EXPECT_TRUEORSKIP( + test>(GetParam(), onemkl::uplo::lower, onemkl::transpose::nontrans, + onemkl::transpose::trans, 27, 98, 101, 102, 103, alpha, beta)); + EXPECT_TRUEORSKIP( + test>(GetParam(), onemkl::uplo::lower, onemkl::transpose::trans, + onemkl::transpose::nontrans, 27, 98, 101, 102, 103, alpha, beta)); + EXPECT_TRUEORSKIP(test>(GetParam(), onemkl::uplo::lower, + onemkl::transpose::trans, onemkl::transpose::trans, + 27, 98, 101, 102, 103, alpha, beta)); + EXPECT_TRUEORSKIP(test>( + GetParam(), onemkl::uplo::lower, onemkl::transpose::nontrans, onemkl::transpose::conjtrans, + 27, 98, 101, 102, 103, alpha, beta)); + EXPECT_TRUEORSKIP(test>( + GetParam(), onemkl::uplo::lower, onemkl::transpose::trans, onemkl::transpose::conjtrans, 27, + 98, 101, 102, 103, alpha, beta)); + EXPECT_TRUEORSKIP( + test>(GetParam(), onemkl::uplo::lower, onemkl::transpose::conjtrans, + onemkl::transpose::nontrans, 27, 98, 101, 102, 103, alpha, beta)); + EXPECT_TRUEORSKIP( + test>(GetParam(), onemkl::uplo::lower, onemkl::transpose::conjtrans, + onemkl::transpose::trans, 27, 98, 101, 102, 103, alpha, beta)); + EXPECT_TRUEORSKIP(test>( + GetParam(), onemkl::uplo::lower, onemkl::transpose::conjtrans, onemkl::transpose::conjtrans, + 27, 98, 101, 102, 103, alpha, beta)); + EXPECT_TRUEORSKIP( + test>(GetParam(), onemkl::uplo::upper, onemkl::transpose::nontrans, + onemkl::transpose::nontrans, 27, 98, 101, 102, 103, alpha, beta)); + EXPECT_TRUEORSKIP( + test>(GetParam(), onemkl::uplo::upper, onemkl::transpose::nontrans, + onemkl::transpose::trans, 27, 98, 101, 102, 103, alpha, beta)); + EXPECT_TRUEORSKIP( + test>(GetParam(), onemkl::uplo::upper, onemkl::transpose::trans, + onemkl::transpose::nontrans, 27, 98, 101, 102, 103, alpha, beta)); + EXPECT_TRUEORSKIP(test>(GetParam(), onemkl::uplo::upper, + onemkl::transpose::trans, onemkl::transpose::trans, + 27, 98, 101, 102, 103, alpha, beta)); + EXPECT_TRUEORSKIP(test>( + GetParam(), onemkl::uplo::upper, onemkl::transpose::nontrans, 
onemkl::transpose::conjtrans, + 27, 98, 101, 102, 103, alpha, beta)); + EXPECT_TRUEORSKIP(test>( + GetParam(), onemkl::uplo::upper, onemkl::transpose::trans, onemkl::transpose::conjtrans, 27, + 98, 101, 102, 103, alpha, beta)); + EXPECT_TRUEORSKIP( + test>(GetParam(), onemkl::uplo::upper, onemkl::transpose::conjtrans, + onemkl::transpose::nontrans, 27, 98, 101, 102, 103, alpha, beta)); + EXPECT_TRUEORSKIP( + test>(GetParam(), onemkl::uplo::upper, onemkl::transpose::conjtrans, + onemkl::transpose::trans, 27, 98, 101, 102, 103, alpha, beta)); + EXPECT_TRUEORSKIP(test>( + GetParam(), onemkl::uplo::upper, onemkl::transpose::conjtrans, onemkl::transpose::conjtrans, + 27, 98, 101, 102, 103, alpha, beta)); +} + +TEST_P(GemmtUsmTests, ComplexDoublePrecision) { + std::complex alpha(2.0); + std::complex beta(3.0); + EXPECT_TRUEORSKIP(test>( + GetParam(), onemkl::uplo::lower, onemkl::transpose::nontrans, onemkl::transpose::nontrans, + 27, 98, 101, 102, 103, alpha, beta)); + EXPECT_TRUEORSKIP( + test>(GetParam(), onemkl::uplo::lower, onemkl::transpose::nontrans, + onemkl::transpose::trans, 27, 98, 101, 102, 103, alpha, beta)); + EXPECT_TRUEORSKIP(test>( + GetParam(), onemkl::uplo::lower, onemkl::transpose::trans, onemkl::transpose::nontrans, 27, + 98, 101, 102, 103, alpha, beta)); + EXPECT_TRUEORSKIP(test>(GetParam(), onemkl::uplo::lower, + onemkl::transpose::trans, onemkl::transpose::trans, + 27, 98, 101, 102, 103, alpha, beta)); + EXPECT_TRUEORSKIP(test>( + GetParam(), onemkl::uplo::lower, onemkl::transpose::nontrans, onemkl::transpose::conjtrans, + 27, 98, 101, 102, 103, alpha, beta)); + EXPECT_TRUEORSKIP(test>( + GetParam(), onemkl::uplo::lower, onemkl::transpose::trans, onemkl::transpose::conjtrans, 27, + 98, 101, 102, 103, alpha, beta)); + EXPECT_TRUEORSKIP(test>( + GetParam(), onemkl::uplo::lower, onemkl::transpose::conjtrans, onemkl::transpose::nontrans, + 27, 98, 101, 102, 103, alpha, beta)); + EXPECT_TRUEORSKIP( + test>(GetParam(), onemkl::uplo::lower, 
onemkl::transpose::conjtrans, + onemkl::transpose::trans, 27, 98, 101, 102, 103, alpha, beta)); + EXPECT_TRUEORSKIP(test>( + GetParam(), onemkl::uplo::lower, onemkl::transpose::conjtrans, onemkl::transpose::conjtrans, + 27, 98, 101, 102, 103, alpha, beta)); + EXPECT_TRUEORSKIP(test>( + GetParam(), onemkl::uplo::upper, onemkl::transpose::nontrans, onemkl::transpose::nontrans, + 27, 98, 101, 102, 103, alpha, beta)); + EXPECT_TRUEORSKIP( + test>(GetParam(), onemkl::uplo::upper, onemkl::transpose::nontrans, + onemkl::transpose::trans, 27, 98, 101, 102, 103, alpha, beta)); + EXPECT_TRUEORSKIP(test>( + GetParam(), onemkl::uplo::upper, onemkl::transpose::trans, onemkl::transpose::nontrans, 27, + 98, 101, 102, 103, alpha, beta)); + EXPECT_TRUEORSKIP(test>(GetParam(), onemkl::uplo::upper, + onemkl::transpose::trans, onemkl::transpose::trans, + 27, 98, 101, 102, 103, alpha, beta)); + EXPECT_TRUEORSKIP(test>( + GetParam(), onemkl::uplo::upper, onemkl::transpose::nontrans, onemkl::transpose::conjtrans, + 27, 98, 101, 102, 103, alpha, beta)); + EXPECT_TRUEORSKIP(test>( + GetParam(), onemkl::uplo::upper, onemkl::transpose::trans, onemkl::transpose::conjtrans, 27, + 98, 101, 102, 103, alpha, beta)); + EXPECT_TRUEORSKIP(test>( + GetParam(), onemkl::uplo::upper, onemkl::transpose::conjtrans, onemkl::transpose::nontrans, + 27, 98, 101, 102, 103, alpha, beta)); + EXPECT_TRUEORSKIP( + test>(GetParam(), onemkl::uplo::upper, onemkl::transpose::conjtrans, + onemkl::transpose::trans, 27, 98, 101, 102, 103, alpha, beta)); + EXPECT_TRUEORSKIP(test>( + GetParam(), onemkl::uplo::upper, onemkl::transpose::conjtrans, onemkl::transpose::conjtrans, + 27, 98, 101, 102, 103, alpha, beta)); +} + +INSTANTIATE_TEST_SUITE_P(GemmtUsmTestSuite, GemmtUsmTests, ::testing::ValuesIn(devices), + ::DeviceNamePrint()); + +} // anonymous namespace diff --git a/tests/unit_tests/blas/include/test_common.hpp b/tests/unit_tests/blas/include/test_common.hpp index badfbbb59..9c794f833 100644 --- 
a/tests/unit_tests/blas/include/test_common.hpp +++ b/tests/unit_tests/blas/include/test_common.hpp @@ -23,10 +23,19 @@ #include #include +#include #include #include +// Exceptions +namespace onemkl { +class backend_unsupported_exception : public std::runtime_error { +public: + backend_unsupported_exception() : std::runtime_error("Not yet supported for this backend") {} +}; +} // namespace onemkl + namespace std { static cl::sycl::half abs(cl::sycl::half v) { if (v < cl::sycl::half(0)) @@ -161,6 +170,13 @@ std::complex rand_scalar(int mag) { return rand_complex_scalar(mag); } +template +void rand_vector(fp *v, int n, int inc) { + int abs_inc = std::abs(inc); + for (int i = 0; i < n; i++) + v[i * abs_inc] = rand_scalar(); +} + template void rand_vector(vec &v, int n, int inc) { using fp = typename vec::value_type; @@ -186,6 +202,13 @@ void print_matrix(vec &M, onemkl::transpose trans, int m, int n, int ld, char *n } } +template +void copy_vector(fp *src, int n, int inc, fp *dest) { + int abs_inc = std::abs(inc); + for (int i = 0; i < n; i++) + dest[i * abs_inc] = src[i * abs_inc]; +} + template void copy_matrix(vec_src &src, onemkl::transpose trans, int m, int n, int ld, vec_dest &dest) { using T_data = typename vec_dest::value_type; @@ -202,6 +225,20 @@ void copy_matrix(vec_src &src, onemkl::transpose trans, int m, int n, int ld, ve } } +template +void copy_matrix(fp *src, onemkl::transpose trans, int m, int n, int ld, fp *dest) { + if (trans == onemkl::transpose::nontrans) { + for (int j = 0; j < n; j++) + for (int i = 0; i < m; i++) + dest[i + j * ld] = (fp)src[i + j * ld]; + } + else { + for (int i = 0; i < m; i++) + for (int j = 0; j < n; j++) + dest[j + i * ld] = (fp)src[j + i * ld]; + } +} + template void rand_matrix(vec &M, onemkl::transpose trans, int m, int n, int ld) { using fp = typename vec::value_type; diff --git a/tests/unit_tests/blas/level1/CMakeLists.txt b/tests/unit_tests/blas/level1/CMakeLists.txt index f5213d0ef..08c977739 100644 --- 
a/tests/unit_tests/blas/level1/CMakeLists.txt +++ b/tests/unit_tests/blas/level1/CMakeLists.txt @@ -18,7 +18,7 @@ #=============================================================================== # Build object from all test sources -set(L1_SOURCES "nrm2.cpp" "iamin.cpp" "iamax.cpp" "dotu.cpp" "dot.cpp" "dotc.cpp" "copy.cpp" "axpy.cpp" "asum.cpp" "swap.cpp" "sdsdot.cpp" "scal.cpp" "rotmg.cpp" "rotm.cpp" "rotg.cpp" "rot.cpp") +set(L1_SOURCES "nrm2.cpp" "iamin.cpp" "iamax.cpp" "dotu.cpp" "dot.cpp" "dotc.cpp" "copy.cpp" "axpy.cpp" "asum.cpp" "swap.cpp" "sdsdot.cpp" "scal.cpp" "rotmg.cpp" "rotm.cpp" "rotg.cpp" "rot.cpp" "nrm2_usm.cpp" "iamin_usm.cpp" "iamax_usm.cpp" "dotu_usm.cpp" "dot_usm.cpp" "dotc_usm.cpp" "copy_usm.cpp" "axpy_usm.cpp" "asum_usm.cpp" "swap_usm.cpp" "sdsdot_usm.cpp" "scal_usm.cpp" "rotmg_usm.cpp" "rotm_usm.cpp" "rotg_usm.cpp" "rot_usm.cpp") if(BUILD_SHARED_LIBS) add_library(blas_level1_rt OBJECT ${L1_SOURCES}) diff --git a/tests/unit_tests/blas/level1/asum.cpp b/tests/unit_tests/blas/level1/asum.cpp index 2b192f79d..54d559064 100644 --- a/tests/unit_tests/blas/level1/asum.cpp +++ b/tests/unit_tests/blas/level1/asum.cpp @@ -23,7 +23,6 @@ #include #include -#include #include #include "cblas.h" #include "onemkl/detail/config.hpp" @@ -33,6 +32,8 @@ #include "test_common.hpp" #include "test_helper.hpp" +#include + using namespace cl::sycl; using std::vector; @@ -41,7 +42,7 @@ extern std::vector devices; namespace { template -bool test(const device& dev, int64_t N, int64_t incx) { +int test(const device& dev, int64_t N, int64_t incx) { // Prepare data. 
vector x; fp_res result = fp_res(-1), result_ref = fp_res(-1); @@ -87,6 +88,14 @@ bool test(const device& dev, int64_t N, int64_t incx) { << "OpenCL status: " << e.get_cl_code() << std::endl; } + catch (const onemkl::backend_unsupported_exception& e) { + return test_skipped; + } + + catch (const std::runtime_error& error) { + std::cout << "Error raised during execution of ASUM:\n" << error.what() << std::endl; + } + // Compare the results of reference implementation and DPC++ implementation. bool good; { @@ -94,33 +103,33 @@ bool test(const device& dev, int64_t N, int64_t incx) { good = check_equal(result_accessor[0], result_ref, N, std::cout); } - return good; + return (int)good; } class AsumTests : public ::testing::TestWithParam {}; TEST_P(AsumTests, RealSinglePrecision) { - EXPECT_TRUE((::test(GetParam(), 1357, 2))); - EXPECT_TRUE((::test(GetParam(), 1357, 1))); - EXPECT_TRUE((::test(GetParam(), 1357, -3))); + EXPECT_TRUEORSKIP((::test(GetParam(), 1357, 2))); + EXPECT_TRUEORSKIP((::test(GetParam(), 1357, 1))); + EXPECT_TRUEORSKIP((::test(GetParam(), 1357, -3))); } TEST_P(AsumTests, RealDoublePrecision) { - EXPECT_TRUE((::test(GetParam(), 1357, 2))); - EXPECT_TRUE((::test(GetParam(), 1357, 1))); - EXPECT_TRUE((::test(GetParam(), 1357, -3))); + EXPECT_TRUEORSKIP((::test(GetParam(), 1357, 2))); + EXPECT_TRUEORSKIP((::test(GetParam(), 1357, 1))); + EXPECT_TRUEORSKIP((::test(GetParam(), 1357, -3))); } TEST_P(AsumTests, ComplexSinglePrecision) { - EXPECT_TRUE((::test, float>(GetParam(), 1357, 2))); - EXPECT_TRUE((::test, float>(GetParam(), 1357, 1))); - EXPECT_TRUE((::test, float>(GetParam(), 1357, -3))); + EXPECT_TRUEORSKIP((::test, float>(GetParam(), 1357, 2))); + EXPECT_TRUEORSKIP((::test, float>(GetParam(), 1357, 1))); + EXPECT_TRUEORSKIP((::test, float>(GetParam(), 1357, -3))); } TEST_P(AsumTests, ComplexDoublePrecision) { - EXPECT_TRUE((test, double>(GetParam(), 1357, 2))); - EXPECT_TRUE((test, double>(GetParam(), 1357, 1))); - EXPECT_TRUE((test, 
double>(GetParam(), 1357, -3))); + EXPECT_TRUEORSKIP((test, double>(GetParam(), 1357, 2))); + EXPECT_TRUEORSKIP((test, double>(GetParam(), 1357, 1))); + EXPECT_TRUEORSKIP((test, double>(GetParam(), 1357, -3))); } INSTANTIATE_TEST_SUITE_P(AsumTestSuite, AsumTests, ::testing::ValuesIn(devices), diff --git a/tests/unit_tests/blas/level1/asum_usm.cpp b/tests/unit_tests/blas/level1/asum_usm.cpp new file mode 100644 index 000000000..5d3ec812d --- /dev/null +++ b/tests/unit_tests/blas/level1/asum_usm.cpp @@ -0,0 +1,148 @@ +/******************************************************************************* +* Copyright 2020 Intel Corporation +* +* Licensed under the Apache License, Version 2.0 (the "License"); +* you may not use this file except in compliance with the License. +* You may obtain a copy of the License at +* +* http://www.apache.org/licenses/LICENSE-2.0 +* +* Unless required by applicable law or agreed to in writing, +* software distributed under the License is distributed on an "AS IS" BASIS, +* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +* See the License for the specific language governing permissions +* and limitations under the License. +* +* +* SPDX-License-Identifier: Apache-2.0 +*******************************************************************************/ + +#include +#include +#include +#include +#include + +#include +#include "cblas.h" +#include "onemkl/detail/config.hpp" +#include "onemkl/onemkl.hpp" +#include "onemkl_blas_helper.hpp" +#include "reference_blas_templates.hpp" +#include "test_common.hpp" +#include "test_helper.hpp" + +#include + +using namespace cl::sycl; +using std::vector; + +extern std::vector devices; + +namespace { + +// Runs USM ASUM on dev and checks the result against the reference ::asum via check_equal(). +// Returns that verdict as 0/1, or test_skipped when the backend reports it is unsupported. +template +int test(const device& dev, int64_t N, int64_t incx) { + // Catch asynchronous exceptions. 
+ auto exception_handler = [](exception_list exceptions) { + for (std::exception_ptr const& e : exceptions) { + try { + std::rethrow_exception(e); + } + catch (exception const& e) { + std::cout << "Caught asynchronous SYCL exception during ASUM:\n" + << e.what() << std::endl + << "OpenCL status: " << e.get_cl_code() << std::endl; + } + } + }; + + queue main_queue(dev, exception_handler); + context cxt = main_queue.get_context(); + event done; + std::vector dependencies; + + // Prepare data. + auto ua = usm_allocator(cxt, dev); + vector x(ua); + fp_res result_ref = fp_res(-1); + + rand_vector(x, N, incx); + + // Call Reference ASUM. + using fp_ref = typename ref_type_info::type; + const int N_ref = N, incx_ref = std::abs(incx); + + result_ref = ::asum(&N_ref, (fp_ref*)x.data(), &incx_ref); + + // Call DPC++ ASUM. + + auto result_p = (fp_res*)onemkl::malloc_shared(64, sizeof(fp_res), dev, cxt); + + try { +#ifdef CALL_RT_API + done = onemkl::blas::asum(main_queue, N, x.data(), incx, result_p, dependencies); + done.wait(); +#else + TEST_RUN_CT(main_queue, onemkl::blas::asum, + (main_queue, N, x.data(), incx, result_p, dependencies)); + main_queue.wait(); +#endif + } + catch (exception const& e) { + std::cout << "Caught synchronous SYCL exception during ASUM:\n" + << e.what() << std::endl + << "OpenCL status: " << e.get_cl_code() << std::endl; + } + + catch (const onemkl::backend_unsupported_exception& e) { + // The free_shared() after the comparison is not reached from here; release now. + onemkl::free_shared(result_p, cxt); + return test_skipped; + } + + catch (const std::runtime_error& error) { + std::cout << "Error raised during execution of ASUM:\n" << error.what() << std::endl; + } + + // Compare the results of reference implementation and DPC++ implementation. 
+ + bool good = check_equal(*result_p, result_ref, N, std::cout); + + onemkl::free_shared(result_p, cxt); + + return (int)good; +} + +class AsumUsmTests : public ::testing::TestWithParam {}; + +TEST_P(AsumUsmTests, RealSinglePrecision) { + EXPECT_TRUEORSKIP((::test(GetParam(), 1357, 2))); + EXPECT_TRUEORSKIP((::test(GetParam(), 1357, 1))); + EXPECT_TRUEORSKIP((::test(GetParam(), 1357, -3))); +} + +TEST_P(AsumUsmTests, RealDoublePrecision) { + EXPECT_TRUEORSKIP((::test(GetParam(), 1357, 2))); + EXPECT_TRUEORSKIP((::test(GetParam(), 1357, 1))); + EXPECT_TRUEORSKIP((::test(GetParam(), 1357, -3))); +} + +TEST_P(AsumUsmTests, ComplexSinglePrecision) { + EXPECT_TRUEORSKIP((::test, float>(GetParam(), 1357, 2))); + EXPECT_TRUEORSKIP((::test, float>(GetParam(), 1357, 1))); + EXPECT_TRUEORSKIP((::test, float>(GetParam(), 1357, -3))); +} + +TEST_P(AsumUsmTests, ComplexDoublePrecision) { + EXPECT_TRUEORSKIP((test, double>(GetParam(), 1357, 2))); + EXPECT_TRUEORSKIP((test, double>(GetParam(), 1357, 1))); + EXPECT_TRUEORSKIP((test, double>(GetParam(), 1357, -3))); +} + +INSTANTIATE_TEST_SUITE_P(AsumUsmTestSuite, AsumUsmTests, ::testing::ValuesIn(devices), + ::DeviceNamePrint()); + +} // anonymous namespace diff --git a/tests/unit_tests/blas/level1/axpy.cpp b/tests/unit_tests/blas/level1/axpy.cpp index 950d6c16b..c5ec0a67a 100644 --- a/tests/unit_tests/blas/level1/axpy.cpp +++ b/tests/unit_tests/blas/level1/axpy.cpp @@ -42,7 +42,7 @@ extern std::vector devices; namespace { template -bool test(const device &dev, int N, int incx, int incy, fp alpha) { +int test(const device &dev, int N, int incx, int incy, fp alpha) { // Prepare data. 
vector x, y, y_ref; @@ -92,6 +92,14 @@ bool test(const device &dev, int N, int incx, int incy, fp alpha) { << "OpenCL status: " << e.get_cl_code() << std::endl; } + catch (const onemkl::backend_unsupported_exception &e) { + return test_skipped; + } + + catch (const std::runtime_error &error) { + std::cout << "Error raised during execution of AXPY:\n" << error.what() << std::endl; + } + // Compare the results of reference implementation and DPC++ implementation. bool good; { @@ -99,34 +107,34 @@ bool test(const device &dev, int N, int incx, int incy, fp alpha) { good = check_equal_vector(y_accessor, y_ref, N, incy, N, std::cout); } - return good; + return (int)good; } class AxpyTests : public ::testing::TestWithParam {}; TEST_P(AxpyTests, RealSinglePrecision) { float alpha(2.0); - EXPECT_TRUE(test(GetParam(), 1357, 2, 3, alpha)); - EXPECT_TRUE(test(GetParam(), 1357, 1, 1, alpha)); - EXPECT_TRUE(test(GetParam(), 1357, -3, -2, alpha)); + EXPECT_TRUEORSKIP(test(GetParam(), 1357, 2, 3, alpha)); + EXPECT_TRUEORSKIP(test(GetParam(), 1357, 1, 1, alpha)); + EXPECT_TRUEORSKIP(test(GetParam(), 1357, -3, -2, alpha)); } TEST_P(AxpyTests, RealDoublePrecision) { double alpha(2.0); - EXPECT_TRUE(test(GetParam(), 1357, 2, 3, alpha)); - EXPECT_TRUE(test(GetParam(), 1357, 1, 1, alpha)); - EXPECT_TRUE(test(GetParam(), 1357, -3, -2, alpha)); + EXPECT_TRUEORSKIP(test(GetParam(), 1357, 2, 3, alpha)); + EXPECT_TRUEORSKIP(test(GetParam(), 1357, 1, 1, alpha)); + EXPECT_TRUEORSKIP(test(GetParam(), 1357, -3, -2, alpha)); } TEST_P(AxpyTests, ComplexSinglePrecision) { std::complex alpha(2.0, -0.5); - EXPECT_TRUE(test>(GetParam(), 1357, 2, 3, alpha)); - EXPECT_TRUE(test>(GetParam(), 1357, 1, 1, alpha)); - EXPECT_TRUE(test>(GetParam(), 1357, -3, -2, alpha)); + EXPECT_TRUEORSKIP(test>(GetParam(), 1357, 2, 3, alpha)); + EXPECT_TRUEORSKIP(test>(GetParam(), 1357, 1, 1, alpha)); + EXPECT_TRUEORSKIP(test>(GetParam(), 1357, -3, -2, alpha)); } TEST_P(AxpyTests, ComplexDoublePrecision) { std::complex 
alpha(2.0, -0.5); - EXPECT_TRUE(test>(GetParam(), 1357, 2, 3, alpha)); - EXPECT_TRUE(test>(GetParam(), 1357, 1, 1, alpha)); - EXPECT_TRUE(test>(GetParam(), 1357, -3, -2, alpha)); + EXPECT_TRUEORSKIP(test>(GetParam(), 1357, 2, 3, alpha)); + EXPECT_TRUEORSKIP(test>(GetParam(), 1357, 1, 1, alpha)); + EXPECT_TRUEORSKIP(test>(GetParam(), 1357, -3, -2, alpha)); } INSTANTIATE_TEST_SUITE_P(AxpyTestSuite, AxpyTests, ::testing::ValuesIn(devices), diff --git a/tests/unit_tests/blas/level1/axpy_usm.cpp b/tests/unit_tests/blas/level1/axpy_usm.cpp new file mode 100644 index 000000000..a67779dff --- /dev/null +++ b/tests/unit_tests/blas/level1/axpy_usm.cpp @@ -0,0 +1,145 @@ +/******************************************************************************* +* Copyright 2020 Intel Corporation +* +* Licensed under the Apache License, Version 2.0 (the "License"); +* you may not use this file except in compliance with the License. +* You may obtain a copy of the License at +* +* http://www.apache.org/licenses/LICENSE-2.0 +* +* Unless required by applicable law or agreed to in writing, +* software distributed under the License is distributed on an "AS IS" BASIS, +* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +* See the License for the specific language governing permissions +* and limitations under the License. +* +* +* SPDX-License-Identifier: Apache-2.0 +*******************************************************************************/ + +#include +#include +#include +#include +#include + +#include +#include "cblas.h" +#include "onemkl/detail/config.hpp" +#include "onemkl/onemkl.hpp" +#include "onemkl_blas_helper.hpp" +#include "reference_blas_templates.hpp" +#include "test_common.hpp" +#include "test_helper.hpp" + +#include + +using namespace cl::sycl; +using std::vector; + +extern std::vector devices; + +namespace { + +template +int test(const device &dev, int N, int incx, int incy, fp alpha) { + // Catch asynchronous exceptions. 
+ auto exception_handler = [](exception_list exceptions) { + for (std::exception_ptr const &e : exceptions) { + try { + std::rethrow_exception(e); + } + catch (exception const &e) { + std::cout << "Caught asynchronous SYCL exception during AXPY:\n" + << e.what() << std::endl + << "OpenCL status: " << e.get_cl_code() << std::endl; + } + } + }; + + queue main_queue(dev, exception_handler); + context cxt = main_queue.get_context(); + event done; + std::vector dependencies; + + // Prepare data. + auto ua = usm_allocator(cxt, dev); + vector x(ua), y(ua); + + rand_vector(x, N, incx); + rand_vector(y, N, incy); + + auto y_ref = y; + + // Call Reference AXPY. + using fp_ref = typename ref_type_info::type; + const int N_ref = N, incx_ref = incx, incy_ref = incy; + + ::axpy(&N_ref, (fp_ref *)&alpha, (fp_ref *)x.data(), &incx_ref, (fp_ref *)y_ref.data(), + &incy_ref); + + // Call DPC++ AXPY. + + try { +#ifdef CALL_RT_API + done = + onemkl::blas::axpy(main_queue, N, alpha, x.data(), incx, y.data(), incy, dependencies); + done.wait(); +#else + TEST_RUN_CT(main_queue, onemkl::blas::axpy, + (main_queue, N, alpha, x.data(), incx, y.data(), incy, dependencies)); + main_queue.wait(); +#endif + } + catch (exception const &e) { + std::cout << "Caught synchronous SYCL exception during AXPY:\n" + << e.what() << std::endl + << "OpenCL status: " << e.get_cl_code() << std::endl; + } + + catch (const onemkl::backend_unsupported_exception &e) { + return test_skipped; + } + + catch (const std::runtime_error &error) { + std::cout << "Error raised during execution of AXPY:\n" << error.what() << std::endl; + } + + // Compare the results of reference implementation and DPC++ implementation. 
+ + bool good = check_equal_vector(y, y_ref, N, incy, N, std::cout); + + return (int)good; +} + +class AxpyUsmTests : public ::testing::TestWithParam {}; + +TEST_P(AxpyUsmTests, RealSinglePrecision) { + float alpha(2.0); + EXPECT_TRUEORSKIP(test(GetParam(), 1357, 2, 3, alpha)); + EXPECT_TRUEORSKIP(test(GetParam(), 1357, 1, 1, alpha)); + EXPECT_TRUEORSKIP(test(GetParam(), 1357, -3, -2, alpha)); +} +TEST_P(AxpyUsmTests, RealDoublePrecision) { + double alpha(2.0); + EXPECT_TRUEORSKIP(test(GetParam(), 1357, 2, 3, alpha)); + EXPECT_TRUEORSKIP(test(GetParam(), 1357, 1, 1, alpha)); + EXPECT_TRUEORSKIP(test(GetParam(), 1357, -3, -2, alpha)); +} +TEST_P(AxpyUsmTests, ComplexSinglePrecision) { + std::complex alpha(2.0, -0.5); + EXPECT_TRUEORSKIP(test>(GetParam(), 1357, 2, 3, alpha)); + EXPECT_TRUEORSKIP(test>(GetParam(), 1357, 1, 1, alpha)); + EXPECT_TRUEORSKIP(test>(GetParam(), 1357, -3, -2, alpha)); +} +TEST_P(AxpyUsmTests, ComplexDoublePrecision) { + std::complex alpha(2.0, -0.5); + EXPECT_TRUEORSKIP(test>(GetParam(), 1357, 2, 3, alpha)); + EXPECT_TRUEORSKIP(test>(GetParam(), 1357, 1, 1, alpha)); + EXPECT_TRUEORSKIP(test>(GetParam(), 1357, -3, -2, alpha)); +} + +INSTANTIATE_TEST_SUITE_P(AxpyUsmTestSuite, AxpyUsmTests, ::testing::ValuesIn(devices), + ::DeviceNamePrint()); + +} // anonymous namespace diff --git a/tests/unit_tests/blas/level1/copy.cpp b/tests/unit_tests/blas/level1/copy.cpp index 4c25d9c43..fdba79042 100644 --- a/tests/unit_tests/blas/level1/copy.cpp +++ b/tests/unit_tests/blas/level1/copy.cpp @@ -42,7 +42,7 @@ extern std::vector devices; namespace { template -bool test(const device& dev, int N, int incx, int incy) { +int test(const device& dev, int N, int incx, int incy) { // Prepare data. 
vector x, y, y_ref; @@ -91,6 +91,14 @@ bool test(const device& dev, int N, int incx, int incy) { << "OpenCL status: " << e.get_cl_code() << std::endl; } + catch (const onemkl::backend_unsupported_exception& e) { + return test_skipped; + } + + catch (const std::runtime_error& error) { + std::cout << "Error raised during execution of COPY:\n" << error.what() << std::endl; + } + // Compare the results of reference implementation and DPC++ implementation. bool good; { @@ -98,30 +106,30 @@ bool test(const device& dev, int N, int incx, int incy) { good = check_equal_vector(y_accessor, y_ref, N, incy, N, std::cout); } - return good; + return (int)good; } class CopyTests : public ::testing::TestWithParam {}; TEST_P(CopyTests, RealSinglePrecision) { - EXPECT_TRUE(test(GetParam(), 1357, 2, 3)); - EXPECT_TRUE(test(GetParam(), 1357, 1, 1)); - EXPECT_TRUE(test(GetParam(), 1357, -3, -2)); + EXPECT_TRUEORSKIP(test(GetParam(), 1357, 2, 3)); + EXPECT_TRUEORSKIP(test(GetParam(), 1357, 1, 1)); + EXPECT_TRUEORSKIP(test(GetParam(), 1357, -3, -2)); } TEST_P(CopyTests, RealDoublePrecision) { - EXPECT_TRUE(test(GetParam(), 1357, 2, 3)); - EXPECT_TRUE(test(GetParam(), 1357, 1, 1)); - EXPECT_TRUE(test(GetParam(), 1357, -3, -2)); + EXPECT_TRUEORSKIP(test(GetParam(), 1357, 2, 3)); + EXPECT_TRUEORSKIP(test(GetParam(), 1357, 1, 1)); + EXPECT_TRUEORSKIP(test(GetParam(), 1357, -3, -2)); } TEST_P(CopyTests, ComplexSinglePrecision) { - EXPECT_TRUE(test>(GetParam(), 1357, 2, 3)); - EXPECT_TRUE(test>(GetParam(), 1357, 1, 1)); - EXPECT_TRUE(test>(GetParam(), 1357, -3, -2)); + EXPECT_TRUEORSKIP(test>(GetParam(), 1357, 2, 3)); + EXPECT_TRUEORSKIP(test>(GetParam(), 1357, 1, 1)); + EXPECT_TRUEORSKIP(test>(GetParam(), 1357, -3, -2)); } TEST_P(CopyTests, ComplexDoublePrecision) { - EXPECT_TRUE(test>(GetParam(), 1357, 2, 3)); - EXPECT_TRUE(test>(GetParam(), 1357, 1, 1)); - EXPECT_TRUE(test>(GetParam(), 1357, -3, -2)); + EXPECT_TRUEORSKIP(test>(GetParam(), 1357, 2, 3)); + EXPECT_TRUEORSKIP(test>(GetParam(), 
1357, 1, 1)); + EXPECT_TRUEORSKIP(test>(GetParam(), 1357, -3, -2)); } INSTANTIATE_TEST_SUITE_P(CopyTestSuite, CopyTests, ::testing::ValuesIn(devices), diff --git a/tests/unit_tests/blas/level1/copy_usm.cpp b/tests/unit_tests/blas/level1/copy_usm.cpp new file mode 100644 index 000000000..5a528ebbb --- /dev/null +++ b/tests/unit_tests/blas/level1/copy_usm.cpp @@ -0,0 +1,139 @@ +/******************************************************************************* +* Copyright 2020 Intel Corporation +* +* Licensed under the Apache License, Version 2.0 (the "License"); +* you may not use this file except in compliance with the License. +* You may obtain a copy of the License at +* +* http://www.apache.org/licenses/LICENSE-2.0 +* +* Unless required by applicable law or agreed to in writing, +* software distributed under the License is distributed on an "AS IS" BASIS, +* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +* See the License for the specific language governing permissions +* and limitations under the License. +* +* +* SPDX-License-Identifier: Apache-2.0 +*******************************************************************************/ + +#include +#include +#include +#include +#include + +#include +#include "cblas.h" +#include "onemkl/detail/config.hpp" +#include "onemkl/onemkl.hpp" +#include "onemkl_blas_helper.hpp" +#include "reference_blas_templates.hpp" +#include "test_common.hpp" +#include "test_helper.hpp" + +#include + +using namespace cl::sycl; +using std::vector; + +extern std::vector devices; + +namespace { + +template +int test(const device& dev, int N, int incx, int incy) { + // Catch asynchronous exceptions. 
+ auto exception_handler = [](exception_list exceptions) { + for (std::exception_ptr const& e : exceptions) { + try { + std::rethrow_exception(e); + } + catch (exception const& e) { + std::cout << "Caught asynchronous SYCL exception during COPY:\n" + << e.what() << std::endl + << "OpenCL status: " << e.get_cl_code() << std::endl; + } + } + }; + + queue main_queue(dev, exception_handler); + context cxt = main_queue.get_context(); + event done; + std::vector dependencies; + + // Prepare data. + auto ua = usm_allocator(cxt, dev); + vector x(ua), y(ua); + + rand_vector(x, N, incx); + rand_vector(y, N, incy); + + auto y_ref = y; + + // Call Reference COPY. + using fp_ref = typename ref_type_info::type; + const int N_ref = N, incx_ref = incx, incy_ref = incy; + + ::copy(&N_ref, (fp_ref*)x.data(), &incx_ref, (fp_ref*)y_ref.data(), &incy_ref); + + // Call DPC++ COPY. + + try { +#ifdef CALL_RT_API + done = onemkl::blas::copy(main_queue, N, x.data(), incx, y.data(), incy, dependencies); + done.wait(); +#else + TEST_RUN_CT(main_queue, onemkl::blas::copy, + (main_queue, N, x.data(), incx, y.data(), incy, dependencies)); + main_queue.wait(); +#endif + } + catch (exception const& e) { + std::cout << "Caught synchronous SYCL exception during COPY:\n" + << e.what() << std::endl + << "OpenCL status: " << e.get_cl_code() << std::endl; + } + + catch (const onemkl::backend_unsupported_exception& e) { + return test_skipped; + } + + catch (const std::runtime_error& error) { + std::cout << "Error raised during execution of COPY:\n" << error.what() << std::endl; + } + + // Compare the results of reference implementation and DPC++ implementation. 
+ + bool good = check_equal_vector(y, y_ref, N, incy, N, std::cout); + + return (int)good; +} + +class CopyUsmTests : public ::testing::TestWithParam {}; + +TEST_P(CopyUsmTests, RealSinglePrecision) { + EXPECT_TRUEORSKIP(test(GetParam(), 1357, 2, 3)); + EXPECT_TRUEORSKIP(test(GetParam(), 1357, 1, 1)); + EXPECT_TRUEORSKIP(test(GetParam(), 1357, -3, -2)); +} +TEST_P(CopyUsmTests, RealDoublePrecision) { + EXPECT_TRUEORSKIP(test(GetParam(), 1357, 2, 3)); + EXPECT_TRUEORSKIP(test(GetParam(), 1357, 1, 1)); + EXPECT_TRUEORSKIP(test(GetParam(), 1357, -3, -2)); +} +TEST_P(CopyUsmTests, ComplexSinglePrecision) { + EXPECT_TRUEORSKIP(test>(GetParam(), 1357, 2, 3)); + EXPECT_TRUEORSKIP(test>(GetParam(), 1357, 1, 1)); + EXPECT_TRUEORSKIP(test>(GetParam(), 1357, -3, -2)); +} +TEST_P(CopyUsmTests, ComplexDoublePrecision) { + EXPECT_TRUEORSKIP(test>(GetParam(), 1357, 2, 3)); + EXPECT_TRUEORSKIP(test>(GetParam(), 1357, 1, 1)); + EXPECT_TRUEORSKIP(test>(GetParam(), 1357, -3, -2)); +} + +INSTANTIATE_TEST_SUITE_P(CopyUsmTestSuite, CopyUsmTests, ::testing::ValuesIn(devices), + ::DeviceNamePrint()); + +} // anonymous namespace diff --git a/tests/unit_tests/blas/level1/dot.cpp b/tests/unit_tests/blas/level1/dot.cpp index fbf65a482..3603559b5 100644 --- a/tests/unit_tests/blas/level1/dot.cpp +++ b/tests/unit_tests/blas/level1/dot.cpp @@ -42,7 +42,7 @@ extern std::vector devices; namespace { template -bool test(const device& dev, int N, int incx, int incy) { +int test(const device& dev, int N, int incx, int incy) { // Prepare data. 
vector x, y; fp_res result = fp_res(-1), result_ref = fp_res(-1); @@ -91,6 +91,14 @@ bool test(const device& dev, int N, int incx, int incy) { << "OpenCL status: " << e.get_cl_code() << std::endl; } + catch (const onemkl::backend_unsupported_exception& e) { + return test_skipped; + } + + catch (const std::runtime_error& error) { + std::cout << "Error raised during execution of DOT:\n" << error.what() << std::endl; + } + // Compare the results of reference implementation and DPC++ implementation. bool good; { @@ -98,25 +106,25 @@ bool test(const device& dev, int N, int incx, int incy) { good = check_equal(result_accessor[0], result_ref, N, std::cout); } - return good; + return (int)good; } class DotTests : public ::testing::TestWithParam {}; TEST_P(DotTests, RealSinglePrecision) { - EXPECT_TRUE((test(GetParam(), 1357, 2, 3))); - EXPECT_TRUE((test(GetParam(), 1357, 1, 1))); - EXPECT_TRUE((test(GetParam(), 1357, -3, -2))); + EXPECT_TRUEORSKIP((test(GetParam(), 1357, 2, 3))); + EXPECT_TRUEORSKIP((test(GetParam(), 1357, 1, 1))); + EXPECT_TRUEORSKIP((test(GetParam(), 1357, -3, -2))); } TEST_P(DotTests, RealDoublePrecision) { - EXPECT_TRUE((test(GetParam(), 1357, 2, 3))); - EXPECT_TRUE((test(GetParam(), 1357, 1, 1))); - EXPECT_TRUE((test(GetParam(), 1357, -3, -2))); + EXPECT_TRUEORSKIP((test(GetParam(), 1357, 2, 3))); + EXPECT_TRUEORSKIP((test(GetParam(), 1357, 1, 1))); + EXPECT_TRUEORSKIP((test(GetParam(), 1357, -3, -2))); } //TEST_P(DotTests, RealDoubleSinglePrecision) { -// EXPECT_TRUE((test(GetParam(), 1357, 2, 3))); -// EXPECT_TRUE((test(GetParam(), 1357, 1, 1))); -// EXPECT_TRUE((test(GetParam(), 1357, -3, -2))); +// EXPECT_TRUEORSKIP((test(GetParam(), 1357, 2, 3))); +// EXPECT_TRUEORSKIP((test(GetParam(), 1357, 1, 1))); +// EXPECT_TRUEORSKIP((test(GetParam(), 1357, -3, -2))); //} INSTANTIATE_TEST_SUITE_P(DotTestSuite, DotTests, ::testing::ValuesIn(devices), ::DeviceNamePrint()); diff --git a/tests/unit_tests/blas/level1/dot_usm.cpp 
b/tests/unit_tests/blas/level1/dot_usm.cpp new file mode 100644 index 000000000..e5246df6f --- /dev/null +++ b/tests/unit_tests/blas/level1/dot_usm.cpp @@ -0,0 +1,140 @@ +/******************************************************************************* +* Copyright 2020 Intel Corporation +* +* Licensed under the Apache License, Version 2.0 (the "License"); +* you may not use this file except in compliance with the License. +* You may obtain a copy of the License at +* +* http://www.apache.org/licenses/LICENSE-2.0 +* +* Unless required by applicable law or agreed to in writing, +* software distributed under the License is distributed on an "AS IS" BASIS, +* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +* See the License for the specific language governing permissions +* and limitations under the License. +* +* +* SPDX-License-Identifier: Apache-2.0 +*******************************************************************************/ + +#include +#include +#include +#include +#include + +#include +#include "cblas.h" +#include "onemkl/detail/config.hpp" +#include "onemkl/onemkl.hpp" +#include "onemkl_blas_helper.hpp" +#include "reference_blas_templates.hpp" +#include "test_common.hpp" +#include "test_helper.hpp" + +#include + +using namespace cl::sycl; +using std::vector; + +extern std::vector devices; + +namespace { + +// Runs USM DOT on dev and checks the result against the reference ::dot via check_equal(). +// Returns that verdict as 0/1, or test_skipped when the backend reports it is unsupported. +template +int test(const device& dev, int N, int incx, int incy) { + // Catch asynchronous exceptions. 
+ auto exception_handler = [](exception_list exceptions) { + for (std::exception_ptr const& e : exceptions) { + try { + std::rethrow_exception(e); + } + catch (exception const& e) { + std::cout << "Caught asynchronous SYCL exception during DOT:\n" + << e.what() << std::endl + << "OpenCL status: " << e.get_cl_code() << std::endl; + } + } + }; + + queue main_queue(dev, exception_handler); + context cxt = main_queue.get_context(); + event done; + std::vector dependencies; + + // Prepare data. 
+ auto ua = usm_allocator(cxt, dev); + vector x(ua), y(ua); + fp_res result_ref = fp_res(-1); + + rand_vector(x, N, incx); + rand_vector(y, N, incy); + + // Call Reference DOT. + const int N_ref = N, incx_ref = incx, incy_ref = incy; + + result_ref = ::dot(&N_ref, (fp*)x.data(), &incx_ref, (fp*)y.data(), &incy_ref); + + // Call DPC++ DOT. + + auto result_p = (fp_res*)onemkl::malloc_shared(64, sizeof(fp_res), dev, cxt); + + try { +#ifdef CALL_RT_API + done = onemkl::blas::dot(main_queue, N, x.data(), incx, y.data(), incy, result_p, + dependencies); + done.wait(); +#else + TEST_RUN_CT(main_queue, onemkl::blas::dot, + (main_queue, N, x.data(), incx, y.data(), incy, result_p, dependencies)); + main_queue.wait(); +#endif + } + catch (exception const& e) { + std::cout << "Caught synchronous SYCL exception during DOT:\n" + << e.what() << std::endl + << "OpenCL status: " << e.get_cl_code() << std::endl; + } + + catch (const onemkl::backend_unsupported_exception& e) { + // The free_shared() after the comparison is not reached from here; release now. + onemkl::free_shared(result_p, cxt); + return test_skipped; + } + + catch (const std::runtime_error& error) { + std::cout << "Error raised during execution of DOT:\n" << error.what() << std::endl; + } + + // Compare the results of reference implementation and DPC++ implementation. 
+ + bool good = check_equal(*result_p, result_ref, N, std::cout); + + onemkl::free_shared(result_p, cxt); + return (int)good; +} + +class DotUsmTests : public ::testing::TestWithParam {}; + +TEST_P(DotUsmTests, RealSinglePrecision) { + EXPECT_TRUEORSKIP((test(GetParam(), 1357, 2, 3))); + EXPECT_TRUEORSKIP((test(GetParam(), 1357, 1, 1))); + EXPECT_TRUEORSKIP((test(GetParam(), 1357, -3, -2))); +} +TEST_P(DotUsmTests, RealDoublePrecision) { + EXPECT_TRUEORSKIP((test(GetParam(), 1357, 2, 3))); + EXPECT_TRUEORSKIP((test(GetParam(), 1357, 1, 1))); + EXPECT_TRUEORSKIP((test(GetParam(), 1357, -3, -2))); +} +//TEST_P(DotUsmTests, RealDoubleSinglePrecision) { +// EXPECT_TRUEORSKIP((test(GetParam(), 1357, 2, 3))); +// EXPECT_TRUEORSKIP((test(GetParam(), 1357, 1, 1))); +// EXPECT_TRUEORSKIP((test(GetParam(), 1357, -3, -2))); +//} + +INSTANTIATE_TEST_SUITE_P(DotUsmTestSuite, DotUsmTests, ::testing::ValuesIn(devices), + ::DeviceNamePrint()); + +} // anonymous namespace diff --git a/tests/unit_tests/blas/level1/dotc.cpp b/tests/unit_tests/blas/level1/dotc.cpp index c3cd2d88d..5cb69d1e2 100644 --- a/tests/unit_tests/blas/level1/dotc.cpp +++ b/tests/unit_tests/blas/level1/dotc.cpp @@ -42,7 +42,7 @@ extern std::vector devices; namespace { template -bool test(const device &dev, int N, int incx, int incy) { +int test(const device &dev, int N, int incx, int incy) { // Prepare data. vector x, y; fp result = 0.0, result_reference = 0.0; @@ -93,6 +93,14 @@ bool test(const device &dev, int N, int incx, int incy) { << "OpenCL status: " << e.get_cl_code() << std::endl; } + catch (const onemkl::backend_unsupported_exception &e) { + return test_skipped; + } + + catch (const std::runtime_error &error) { + std::cout << "Error raised during execution of DOTC:\n" << error.what() << std::endl; + } + // Compare the results of reference implementation and DPC++ implementation. 
bool good; { @@ -100,20 +108,20 @@ bool test(const device &dev, int N, int incx, int incy) { good = check_equal(result_accessor[0], result_reference, N, std::cout); } - return good; + return (int)good; } class DotcTests : public ::testing::TestWithParam {}; TEST_P(DotcTests, ComplexSinglePrecision) { - EXPECT_TRUE(test>(GetParam(), 1357, 2, 3)); - EXPECT_TRUE(test>(GetParam(), 1357, 1, 1)); - EXPECT_TRUE(test>(GetParam(), 1357, -3, -2)); + EXPECT_TRUEORSKIP(test>(GetParam(), 1357, 2, 3)); + EXPECT_TRUEORSKIP(test>(GetParam(), 1357, 1, 1)); + EXPECT_TRUEORSKIP(test>(GetParam(), 1357, -3, -2)); } TEST_P(DotcTests, ComplexDoublePrecision) { - EXPECT_TRUE(test>(GetParam(), 1357, 2, 3)); - EXPECT_TRUE(test>(GetParam(), 1357, 1, 1)); - EXPECT_TRUE(test>(GetParam(), 1357, -3, -2)); + EXPECT_TRUEORSKIP(test>(GetParam(), 1357, 2, 3)); + EXPECT_TRUEORSKIP(test>(GetParam(), 1357, 1, 1)); + EXPECT_TRUEORSKIP(test>(GetParam(), 1357, -3, -2)); } INSTANTIATE_TEST_SUITE_P(DotcTestSuite, DotcTests, ::testing::ValuesIn(devices), diff --git a/tests/unit_tests/blas/level1/dotc_usm.cpp b/tests/unit_tests/blas/level1/dotc_usm.cpp new file mode 100644 index 000000000..155565a1d --- /dev/null +++ b/tests/unit_tests/blas/level1/dotc_usm.cpp @@ -0,0 +1,138 @@ +/******************************************************************************* +* Copyright 2020 Intel Corporation +* +* Licensed under the Apache License, Version 2.0 (the "License"); +* you may not use this file except in compliance with the License. +* You may obtain a copy of the License at +* +* http://www.apache.org/licenses/LICENSE-2.0 +* +* Unless required by applicable law or agreed to in writing, +* software distributed under the License is distributed on an "AS IS" BASIS, +* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +* See the License for the specific language governing permissions +* and limitations under the License. 
+* +* +* SPDX-License-Identifier: Apache-2.0 +*******************************************************************************/ + +#include +#include +#include +#include +#include + +#include +#include "cblas.h" +#include "onemkl/detail/config.hpp" +#include "onemkl/onemkl.hpp" +#include "onemkl_blas_helper.hpp" +#include "reference_blas_templates.hpp" +#include "test_common.hpp" +#include "test_helper.hpp" + +#include + +using namespace cl::sycl; +using std::vector; + +extern std::vector devices; + +namespace { + +// Runs USM DOTC on dev and checks the result against the reference ::dotc via check_equal(). +// Returns that verdict as 0/1, or test_skipped when the backend reports it is unsupported. +template +int test(const device &dev, int N, int incx, int incy) { + // Catch asynchronous exceptions. + auto exception_handler = [](exception_list exceptions) { + for (std::exception_ptr const &e : exceptions) { + try { + std::rethrow_exception(e); + } + catch (exception const &e) { + std::cout << "Caught asynchronous SYCL exception during DOTC:\n" + << e.what() << std::endl + << "OpenCL status: " << e.get_cl_code() << std::endl; + } + } + }; + + queue main_queue(dev, exception_handler); + context cxt = main_queue.get_context(); + event done; + std::vector dependencies; + + // Prepare data. + auto ua = usm_allocator(cxt, dev); + vector x(ua), y(ua); + fp result_reference = 0.0; + + rand_vector(x, N, incx); + rand_vector(y, N, incy); + + // Call Reference DOTC. + using fp_ref = typename ref_type_info::type; + const int N_ref = N, incx_ref = incx, incy_ref = incy; + + ::dotc((fp_ref *)&result_reference, &N_ref, (fp_ref *)x.data(), &incx_ref, (fp_ref *)y.data(), + &incy_ref); + + // Call DPC++ DOTC. 
+ + auto result_p = (fp *)onemkl::malloc_shared(64, sizeof(fp), dev, cxt); + + try { +#ifdef CALL_RT_API + done = onemkl::blas::dotc(main_queue, N, x.data(), incx, y.data(), incy, result_p, + dependencies); + done.wait(); +#else + TEST_RUN_CT(main_queue, onemkl::blas::dotc, + (main_queue, N, x.data(), incx, y.data(), incy, result_p, dependencies)); + main_queue.wait(); +#endif + } + catch (exception const &e) { + std::cout << "Caught synchronous SYCL exception during DOTC:\n" + << e.what() << std::endl + << "OpenCL status: " << e.get_cl_code() << std::endl; + } + + catch (const onemkl::backend_unsupported_exception &e) { + // The free_shared() after the comparison is not reached from here; release now. + onemkl::free_shared(result_p, cxt); + return test_skipped; + } + + catch (const std::runtime_error &error) { + std::cout << "Error raised during execution of DOTC:\n" << error.what() << std::endl; + } + + // Compare the results of reference implementation and DPC++ implementation. + + bool good = check_equal(*result_p, result_reference, N, std::cout); + + onemkl::free_shared(result_p, cxt); + + return (int)good; +} + +class DotcUsmTests : public ::testing::TestWithParam {}; + +TEST_P(DotcUsmTests, ComplexSinglePrecision) { + EXPECT_TRUEORSKIP(test>(GetParam(), 1357, 2, 3)); + EXPECT_TRUEORSKIP(test>(GetParam(), 1357, 1, 1)); + EXPECT_TRUEORSKIP(test>(GetParam(), 1357, -3, -2)); +} +TEST_P(DotcUsmTests, ComplexDoublePrecision) { + EXPECT_TRUEORSKIP(test>(GetParam(), 1357, 2, 3)); + EXPECT_TRUEORSKIP(test>(GetParam(), 1357, 1, 1)); + EXPECT_TRUEORSKIP(test>(GetParam(), 1357, -3, -2)); +} + +INSTANTIATE_TEST_SUITE_P(DotcUsmTestSuite, DotcUsmTests, ::testing::ValuesIn(devices), + ::DeviceNamePrint()); + +} // anonymous namespace diff --git a/tests/unit_tests/blas/level1/dotu.cpp b/tests/unit_tests/blas/level1/dotu.cpp index f0aa503a8..f6237746c 100644 --- a/tests/unit_tests/blas/level1/dotu.cpp +++ b/tests/unit_tests/blas/level1/dotu.cpp @@ -42,7 +42,7 @@ extern std::vector 
devices; namespace { template -bool test(const device &dev, int N, int incx, int incy) { +int test(const device 
&dev, int N, int incx, int incy) { // Prepare data. vector x, y; fp result = 0.0, result_reference = 0.0; @@ -93,6 +93,14 @@ bool test(const device &dev, int N, int incx, int incy) { << "OpenCL status: " << e.get_cl_code() << std::endl; } + catch (const onemkl::backend_unsupported_exception &e) { + return test_skipped; + } + + catch (const std::runtime_error &error) { + std::cout << "Error raised during execution of DOTU:\n" << error.what() << std::endl; + } + // Compare the results of reference implementation and DPC++ implementation. bool good; { @@ -100,20 +108,20 @@ bool test(const device &dev, int N, int incx, int incy) { good = check_equal(result_accessor[0], result_reference, N, std::cout); } - return good; + return (int)good; } class DotuTests : public ::testing::TestWithParam {}; TEST_P(DotuTests, ComplexSinglePrecision) { - EXPECT_TRUE(test>(GetParam(), 1357, 2, 3)); - EXPECT_TRUE(test>(GetParam(), 1357, 1, 1)); - EXPECT_TRUE(test>(GetParam(), 1357, -3, -2)); + EXPECT_TRUEORSKIP(test>(GetParam(), 1357, 2, 3)); + EXPECT_TRUEORSKIP(test>(GetParam(), 1357, 1, 1)); + EXPECT_TRUEORSKIP(test>(GetParam(), 1357, -3, -2)); } TEST_P(DotuTests, ComplexDoublePrecision) { - EXPECT_TRUE(test>(GetParam(), 1357, 2, 3)); - EXPECT_TRUE(test>(GetParam(), 1357, 1, 1)); - EXPECT_TRUE(test>(GetParam(), 1357, -3, -2)); + EXPECT_TRUEORSKIP(test>(GetParam(), 1357, 2, 3)); + EXPECT_TRUEORSKIP(test>(GetParam(), 1357, 1, 1)); + EXPECT_TRUEORSKIP(test>(GetParam(), 1357, -3, -2)); } INSTANTIATE_TEST_SUITE_P(DotuTestSuite, DotuTests, ::testing::ValuesIn(devices), diff --git a/tests/unit_tests/blas/level1/dotu_usm.cpp b/tests/unit_tests/blas/level1/dotu_usm.cpp new file mode 100644 index 000000000..17658c56e --- /dev/null +++ b/tests/unit_tests/blas/level1/dotu_usm.cpp @@ -0,0 +1,133 @@ +/******************************************************************************* +* Copyright 2020 Intel Corporation +* +* Licensed under the Apache License, Version 2.0 (the "License"); +* you may not 
use this file except in compliance with the License. +* You may obtain a copy of the License at +* +* http://www.apache.org/licenses/LICENSE-2.0 +* +* Unless required by applicable law or agreed to in writing, +* software distributed under the License is distributed on an "AS IS" BASIS, +* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +* See the License for the specific language governing permissions +* and limitations under the License. +* +* +* SPDX-License-Identifier: Apache-2.0 +*******************************************************************************/ + +#include +#include +#include +#include +#include + +#include +#include "cblas.h" +#include "onemkl/detail/config.hpp" +#include "onemkl/onemkl.hpp" +#include "onemkl_blas_helper.hpp" +#include "reference_blas_templates.hpp" +#include "test_common.hpp" +#include "test_helper.hpp" + +#include + +using namespace cl::sycl; +using std::vector; + +extern std::vector devices; + +namespace { + +template +int test(const device &dev, int N, int incx, int incy) { + // Catch asynchronous exceptions. + auto exception_handler = [](exception_list exceptions) { + for (std::exception_ptr const &e : exceptions) { + try { + std::rethrow_exception(e); + } + catch (exception const &e) { + std::cout << "Caught asynchronous SYCL exception during DOTU:\n" + << e.what() << std::endl + << "OpenCL status: " << e.get_cl_code() << std::endl; + } + } + }; + + queue main_queue(dev, exception_handler); + context cxt = main_queue.get_context(); + event done; + std::vector dependencies; + + // Prepare data. + auto ua = usm_allocator(cxt, dev); + vector x(ua), y(ua); + fp result_reference = 0.0; + + rand_vector(x, N, incx); + rand_vector(y, N, incy); + + // Call Reference DOTU. 
+ using fp_ref = typename ref_type_info::type; + const int N_ref = N, incx_ref = incx, incy_ref = incy; + + ::dotu((fp_ref *)&result_reference, &N_ref, (fp_ref *)x.data(), &incx_ref, (fp_ref *)y.data(), + &incy_ref); + + // Call DPC++ DOTU. + + auto result_p = (fp *)onemkl::malloc_shared(64, sizeof(fp), dev, cxt); + + try { +#ifdef CALL_RT_API + done = onemkl::blas::dotu(main_queue, N, x.data(), incx, y.data(), incy, result_p, + dependencies); + done.wait(); +#else + TEST_RUN_CT(main_queue, onemkl::blas::dotu, + (main_queue, N, x.data(), incx, y.data(), incy, result_p, dependencies)); + main_queue.wait(); +#endif + } + catch (exception const &e) { + std::cout << "Caught synchronous SYCL exception during DOTU:\n" + << e.what() << std::endl + << "OpenCL status: " << e.get_cl_code() << std::endl; + } + + catch (const onemkl::backend_unsupported_exception &e) { + return test_skipped; + } + + catch (const std::runtime_error &error) { + std::cout << "Error raised during execution of DOTU:\n" << error.what() << std::endl; + } + + // Compare the results of reference implementation and DPC++ implementation. 
+ + bool good = check_equal(*result_p, result_reference, N, std::cout); + + onemkl::free_shared(result_p, cxt); + return (int)good; +} + +class DotuUsmTests : public ::testing::TestWithParam {}; + +TEST_P(DotuUsmTests, ComplexSinglePrecision) { + EXPECT_TRUEORSKIP(test>(GetParam(), 1357, 2, 3)); + EXPECT_TRUEORSKIP(test>(GetParam(), 1357, 1, 1)); + EXPECT_TRUEORSKIP(test>(GetParam(), 1357, -3, -2)); +} +TEST_P(DotuUsmTests, ComplexDoublePrecision) { + EXPECT_TRUEORSKIP(test>(GetParam(), 1357, 2, 3)); + EXPECT_TRUEORSKIP(test>(GetParam(), 1357, 1, 1)); + EXPECT_TRUEORSKIP(test>(GetParam(), 1357, -3, -2)); +} + +INSTANTIATE_TEST_SUITE_P(DotuUsmTestSuite, DotuUsmTests, ::testing::ValuesIn(devices), + ::DeviceNamePrint()); + +} // anonymous namespace diff --git a/tests/unit_tests/blas/level1/iamax.cpp b/tests/unit_tests/blas/level1/iamax.cpp index de85b9bd7..949322243 100644 --- a/tests/unit_tests/blas/level1/iamax.cpp +++ b/tests/unit_tests/blas/level1/iamax.cpp @@ -42,7 +42,7 @@ extern std::vector devices; namespace { template -bool test(const device& dev, int N, int incx) { +int test(const device& dev, int N, int incx) { // Prepare data. vector x; int64_t result = -1, result_ref = -1; @@ -89,6 +89,14 @@ bool test(const device& dev, int N, int incx) { << "OpenCL status: " << e.get_cl_code() << std::endl; } + catch (const onemkl::backend_unsupported_exception& e) { + return test_skipped; + } + + catch (const std::runtime_error& error) { + std::cout << "Error raised during execution of IAMAX:\n" << error.what() << std::endl; + } + // Compare the results of reference implementation and DPC++ implementation. 
bool good; { @@ -96,30 +104,30 @@ bool test(const device& dev, int N, int incx) { good = check_equal(result_accessor[0], result_ref, 0, std::cout); } - return good; + return (int)good; } class IamaxTests : public ::testing::TestWithParam {}; TEST_P(IamaxTests, RealSinglePrecision) { - EXPECT_TRUE(test(GetParam(), 1357, 2)); - EXPECT_TRUE(test(GetParam(), 1357, 1)); - EXPECT_TRUE(test(GetParam(), 1357, -3)); + EXPECT_TRUEORSKIP(test(GetParam(), 1357, 2)); + EXPECT_TRUEORSKIP(test(GetParam(), 1357, 1)); + EXPECT_TRUEORSKIP(test(GetParam(), 1357, -3)); } TEST_P(IamaxTests, RealDoublePrecision) { - EXPECT_TRUE(test(GetParam(), 1357, 2)); - EXPECT_TRUE(test(GetParam(), 1357, 1)); - EXPECT_TRUE(test(GetParam(), 1357, -3)); + EXPECT_TRUEORSKIP(test(GetParam(), 1357, 2)); + EXPECT_TRUEORSKIP(test(GetParam(), 1357, 1)); + EXPECT_TRUEORSKIP(test(GetParam(), 1357, -3)); } TEST_P(IamaxTests, ComplexSinglePrecision) { - EXPECT_TRUE(test>(GetParam(), 1357, 2)); - EXPECT_TRUE(test>(GetParam(), 1357, 1)); - EXPECT_TRUE(test>(GetParam(), 1357, -3)); + EXPECT_TRUEORSKIP(test>(GetParam(), 1357, 2)); + EXPECT_TRUEORSKIP(test>(GetParam(), 1357, 1)); + EXPECT_TRUEORSKIP(test>(GetParam(), 1357, -3)); } TEST_P(IamaxTests, ComplexDoublePrecision) { - EXPECT_TRUE(test>(GetParam(), 1357, 2)); - EXPECT_TRUE(test>(GetParam(), 1357, 1)); - EXPECT_TRUE(test>(GetParam(), 1357, -3)); + EXPECT_TRUEORSKIP(test>(GetParam(), 1357, 2)); + EXPECT_TRUEORSKIP(test>(GetParam(), 1357, 1)); + EXPECT_TRUEORSKIP(test>(GetParam(), 1357, -3)); } INSTANTIATE_TEST_SUITE_P(IamaxTestSuite, IamaxTests, ::testing::ValuesIn(devices), diff --git a/tests/unit_tests/blas/level1/iamax_usm.cpp b/tests/unit_tests/blas/level1/iamax_usm.cpp new file mode 100644 index 000000000..08b3d4fd5 --- /dev/null +++ b/tests/unit_tests/blas/level1/iamax_usm.cpp @@ -0,0 +1,139 @@ +/******************************************************************************* +* Copyright 2020 Intel Corporation +* +* Licensed under the Apache License, 
Version 2.0 (the "License"); +* you may not use this file except in compliance with the License. +* You may obtain a copy of the License at +* +* http://www.apache.org/licenses/LICENSE-2.0 +* +* Unless required by applicable law or agreed to in writing, +* software distributed under the License is distributed on an "AS IS" BASIS, +* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +* See the License for the specific language governing permissions +* and limitations under the License. +* +* +* SPDX-License-Identifier: Apache-2.0 +*******************************************************************************/ + +#include +#include +#include +#include +#include + +#include +#include "cblas.h" +#include "onemkl/detail/config.hpp" +#include "onemkl/onemkl.hpp" +#include "onemkl_blas_helper.hpp" +#include "reference_blas_templates.hpp" +#include "test_common.hpp" +#include "test_helper.hpp" + +#include + +using namespace cl::sycl; +using std::vector; + +extern std::vector devices; + +namespace { + +template +int test(const device& dev, int N, int incx) { + // Catch asynchronous exceptions. + auto exception_handler = [](exception_list exceptions) { + for (std::exception_ptr const& e : exceptions) { + try { + std::rethrow_exception(e); + } + catch (exception const& e) { + std::cout << "Caught asynchronous SYCL exception during IAMAX:\n" + << e.what() << std::endl + << "OpenCL status: " << e.get_cl_code() << std::endl; + } + } + }; + + queue main_queue(dev, exception_handler); + context cxt = main_queue.get_context(); + event done; + std::vector dependencies; + + // Prepare data. + auto ua = usm_allocator(cxt, dev); + vector x(ua); + int64_t result_ref = -1; + rand_vector(x, N, incx); + + // Call Reference IAMAX. + using fp_ref = typename ref_type_info::type; + const int N_ref = N, incx_ref = incx; + + result_ref = ::iamax(&N_ref, (fp_ref*)x.data(), &incx_ref); + + // Call DPC++ IAMAX. 
+ + auto result_p = (int64_t*)onemkl::malloc_shared(64, sizeof(int64_t), dev, cxt); + + try { +#ifdef CALL_RT_API + done = onemkl::blas::iamax(main_queue, N, x.data(), incx, result_p, dependencies); + done.wait(); +#else + TEST_RUN_CT(main_queue, onemkl::blas::iamax, + (main_queue, N, x.data(), incx, result_p, dependencies)); + main_queue.wait(); +#endif + } + catch (exception const& e) { + std::cout << "Caught synchronous SYCL exception during IAMAX:\n" + << e.what() << std::endl + << "OpenCL status: " << e.get_cl_code() << std::endl; + } + + catch (const onemkl::backend_unsupported_exception& e) { + return test_skipped; + } + + catch (const std::runtime_error& error) { + std::cout << "Error raised during execution of IAMAX:\n" << error.what() << std::endl; + } + + // Compare the results of reference implementation and DPC++ implementation. + + bool good = check_equal(*result_p, result_ref, 0, std::cout); + + onemkl::free_shared(result_p, cxt); + return (int)good; +} + +class IamaxUsmTests : public ::testing::TestWithParam {}; + +TEST_P(IamaxUsmTests, RealSinglePrecision) { + EXPECT_TRUEORSKIP(test(GetParam(), 1357, 2)); + EXPECT_TRUEORSKIP(test(GetParam(), 1357, 1)); + EXPECT_TRUEORSKIP(test(GetParam(), 1357, -3)); +} +TEST_P(IamaxUsmTests, RealDoublePrecision) { + EXPECT_TRUEORSKIP(test(GetParam(), 1357, 2)); + EXPECT_TRUEORSKIP(test(GetParam(), 1357, 1)); + EXPECT_TRUEORSKIP(test(GetParam(), 1357, -3)); +} +TEST_P(IamaxUsmTests, ComplexSinglePrecision) { + EXPECT_TRUEORSKIP(test>(GetParam(), 1357, 2)); + EXPECT_TRUEORSKIP(test>(GetParam(), 1357, 1)); + EXPECT_TRUEORSKIP(test>(GetParam(), 1357, -3)); +} +TEST_P(IamaxUsmTests, ComplexDoublePrecision) { + EXPECT_TRUEORSKIP(test>(GetParam(), 1357, 2)); + EXPECT_TRUEORSKIP(test>(GetParam(), 1357, 1)); + EXPECT_TRUEORSKIP(test>(GetParam(), 1357, -3)); +} + +INSTANTIATE_TEST_SUITE_P(IamaxUsmTestSuite, IamaxUsmTests, ::testing::ValuesIn(devices), + ::DeviceNamePrint()); + +} // anonymous namespace diff --git 
a/tests/unit_tests/blas/level1/iamin.cpp b/tests/unit_tests/blas/level1/iamin.cpp index 57aa1bb89..1345b3dfa 100644 --- a/tests/unit_tests/blas/level1/iamin.cpp +++ b/tests/unit_tests/blas/level1/iamin.cpp @@ -42,7 +42,7 @@ extern std::vector devices; namespace { template -bool test(const device& dev, int N, int incx) { +int test(const device& dev, int N, int incx) { // Prepare data. vector x; int64_t result = -1, result_ref = -1; @@ -89,6 +89,14 @@ bool test(const device& dev, int N, int incx) { << "OpenCL status: " << e.get_cl_code() << std::endl; } + catch (const onemkl::backend_unsupported_exception& e) { + return test_skipped; + } + + catch (const std::runtime_error& error) { + std::cout << "Error raised during execution of IAMIN:\n" << error.what() << std::endl; + } + // Compare the results of reference implementation and DPC++ implementation. bool good; { @@ -96,30 +104,30 @@ bool test(const device& dev, int N, int incx) { good = check_equal(result_accessor[0], result_ref, 0, std::cout); } - return good; + return (int)good; } class IaminTests : public ::testing::TestWithParam {}; TEST_P(IaminTests, RealSinglePrecision) { - EXPECT_TRUE(test(GetParam(), 1357, 2)); - EXPECT_TRUE(test(GetParam(), 1357, 1)); - EXPECT_TRUE(test(GetParam(), 1357, -3)); + EXPECT_TRUEORSKIP(test(GetParam(), 1357, 2)); + EXPECT_TRUEORSKIP(test(GetParam(), 1357, 1)); + EXPECT_TRUEORSKIP(test(GetParam(), 1357, -3)); } TEST_P(IaminTests, RealDoublePrecision) { - EXPECT_TRUE(test(GetParam(), 1357, 2)); - EXPECT_TRUE(test(GetParam(), 1357, 1)); - EXPECT_TRUE(test(GetParam(), 1357, -3)); + EXPECT_TRUEORSKIP(test(GetParam(), 1357, 2)); + EXPECT_TRUEORSKIP(test(GetParam(), 1357, 1)); + EXPECT_TRUEORSKIP(test(GetParam(), 1357, -3)); } TEST_P(IaminTests, ComplexSinglePrecision) { - EXPECT_TRUE(test>(GetParam(), 1357, 2)); - EXPECT_TRUE(test>(GetParam(), 1357, 1)); - EXPECT_TRUE(test>(GetParam(), 1357, -3)); + EXPECT_TRUEORSKIP(test>(GetParam(), 1357, 2)); + EXPECT_TRUEORSKIP(test>(GetParam(), 
1357, 1)); + EXPECT_TRUEORSKIP(test>(GetParam(), 1357, -3)); } TEST_P(IaminTests, ComplexDoublePrecision) { - EXPECT_TRUE(test>(GetParam(), 1357, 2)); - EXPECT_TRUE(test>(GetParam(), 1357, 1)); - EXPECT_TRUE(test>(GetParam(), 1357, -3)); + EXPECT_TRUEORSKIP(test>(GetParam(), 1357, 2)); + EXPECT_TRUEORSKIP(test>(GetParam(), 1357, 1)); + EXPECT_TRUEORSKIP(test>(GetParam(), 1357, -3)); } INSTANTIATE_TEST_SUITE_P(IaminTestSuite, IaminTests, ::testing::ValuesIn(devices), diff --git a/tests/unit_tests/blas/level1/iamin_usm.cpp b/tests/unit_tests/blas/level1/iamin_usm.cpp new file mode 100644 index 000000000..617efd6b6 --- /dev/null +++ b/tests/unit_tests/blas/level1/iamin_usm.cpp @@ -0,0 +1,139 @@ +/******************************************************************************* +* Copyright 2020 Intel Corporation +* +* Licensed under the Apache License, Version 2.0 (the "License"); +* you may not use this file except in compliance with the License. +* You may obtain a copy of the License at +* +* http://www.apache.org/licenses/LICENSE-2.0 +* +* Unless required by applicable law or agreed to in writing, +* software distributed under the License is distributed on an "AS IS" BASIS, +* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +* See the License for the specific language governing permissions +* and limitations under the License. +* +* +* SPDX-License-Identifier: Apache-2.0 +*******************************************************************************/ + +#include +#include +#include +#include +#include + +#include +#include "cblas.h" +#include "onemkl/detail/config.hpp" +#include "onemkl/onemkl.hpp" +#include "onemkl_blas_helper.hpp" +#include "reference_blas_templates.hpp" +#include "test_common.hpp" +#include "test_helper.hpp" + +#include + +using namespace cl::sycl; +using std::vector; + +extern std::vector devices; + +namespace { + +template +int test(const device& dev, int N, int incx) { + // Catch asynchronous exceptions. 
+ auto exception_handler = [](exception_list exceptions) { + for (std::exception_ptr const& e : exceptions) { + try { + std::rethrow_exception(e); + } + catch (exception const& e) { + std::cout << "Caught asynchronous SYCL exception during IAMIN:\n" + << e.what() << std::endl + << "OpenCL status: " << e.get_cl_code() << std::endl; + } + } + }; + + queue main_queue(dev, exception_handler); + context cxt = main_queue.get_context(); + event done; + std::vector dependencies; + + // Prepare data. + auto ua = usm_allocator(cxt, dev); + vector x(ua); + int64_t result_ref = -1; + rand_vector(x, N, incx); + + // Call Reference IAMIN. + using fp_ref = typename ref_type_info::type; + const int N_ref = N, incx_ref = incx; + + result_ref = ::iamin(&N_ref, (fp_ref*)x.data(), &incx_ref); + + // Call DPC++ IAMIN. + + auto result_p = (int64_t*)onemkl::malloc_shared(64, sizeof(int64_t), dev, cxt); + + try { +#ifdef CALL_RT_API + done = onemkl::blas::iamin(main_queue, N, x.data(), incx, result_p, dependencies); + done.wait(); +#else + TEST_RUN_CT(main_queue, onemkl::blas::iamin, + (main_queue, N, x.data(), incx, result_p, dependencies)); + main_queue.wait(); +#endif + } + catch (exception const& e) { + std::cout << "Caught synchronous SYCL exception during IAMIN:\n" + << e.what() << std::endl + << "OpenCL status: " << e.get_cl_code() << std::endl; + } + + catch (const onemkl::backend_unsupported_exception& e) { + return test_skipped; + } + + catch (const std::runtime_error& error) { + std::cout << "Error raised during execution of IAMIN:\n" << error.what() << std::endl; + } + + // Compare the results of reference implementation and DPC++ implementation. 
+ + bool good = check_equal(*result_p, result_ref, 0, std::cout); + + onemkl::free_shared(result_p, cxt); + return (int)good; +} + +class IaminUsmTests : public ::testing::TestWithParam {}; + +TEST_P(IaminUsmTests, RealSinglePrecision) { + EXPECT_TRUEORSKIP(test(GetParam(), 1357, 2)); + EXPECT_TRUEORSKIP(test(GetParam(), 1357, 1)); + EXPECT_TRUEORSKIP(test(GetParam(), 1357, -3)); +} +TEST_P(IaminUsmTests, RealDoublePrecision) { + EXPECT_TRUEORSKIP(test(GetParam(), 1357, 2)); + EXPECT_TRUEORSKIP(test(GetParam(), 1357, 1)); + EXPECT_TRUEORSKIP(test(GetParam(), 1357, -3)); +} +TEST_P(IaminUsmTests, ComplexSinglePrecision) { + EXPECT_TRUEORSKIP(test>(GetParam(), 1357, 2)); + EXPECT_TRUEORSKIP(test>(GetParam(), 1357, 1)); + EXPECT_TRUEORSKIP(test>(GetParam(), 1357, -3)); +} +TEST_P(IaminUsmTests, ComplexDoublePrecision) { + EXPECT_TRUEORSKIP(test>(GetParam(), 1357, 2)); + EXPECT_TRUEORSKIP(test>(GetParam(), 1357, 1)); + EXPECT_TRUEORSKIP(test>(GetParam(), 1357, -3)); +} + +INSTANTIATE_TEST_SUITE_P(IaminUsmTestSuite, IaminUsmTests, ::testing::ValuesIn(devices), + ::DeviceNamePrint()); + +} // anonymous namespace diff --git a/tests/unit_tests/blas/level1/nrm2.cpp b/tests/unit_tests/blas/level1/nrm2.cpp index ed2099b80..5e2ee3967 100644 --- a/tests/unit_tests/blas/level1/nrm2.cpp +++ b/tests/unit_tests/blas/level1/nrm2.cpp @@ -42,7 +42,7 @@ extern std::vector devices; namespace { template -bool test(const device& dev, int N, int incx) { +int test(const device& dev, int N, int incx) { // Prepare data. 
vector x; fp_res result = fp_res(-1), result_ref = fp_res(-1); @@ -89,6 +89,14 @@ bool test(const device& dev, int N, int incx) { << "OpenCL status: " << e.get_cl_code() << std::endl; } + catch (const onemkl::backend_unsupported_exception& e) { + return test_skipped; + } + + catch (const std::runtime_error& error) { + std::cout << "Error raised during execution of NRM2:\n" << error.what() << std::endl; + } + // Compare the results of reference implementation and DPC++ implementation. bool good; { @@ -96,30 +104,30 @@ bool test(const device& dev, int N, int incx) { good = check_equal(result_accessor[0], result_ref, N, std::cout); } - return good; + return (int)good; } class Nrm2Tests : public ::testing::TestWithParam {}; TEST_P(Nrm2Tests, RealSinglePrecision) { - EXPECT_TRUE((test(GetParam(), 1357, 2))); - EXPECT_TRUE((test(GetParam(), 1357, 1))); - EXPECT_TRUE((test(GetParam(), 1357, -3))); + EXPECT_TRUEORSKIP((test(GetParam(), 1357, 2))); + EXPECT_TRUEORSKIP((test(GetParam(), 1357, 1))); + EXPECT_TRUEORSKIP((test(GetParam(), 1357, -3))); } TEST_P(Nrm2Tests, RealDoublePrecision) { - EXPECT_TRUE((test(GetParam(), 1357, 2))); - EXPECT_TRUE((test(GetParam(), 1357, 1))); - EXPECT_TRUE((test(GetParam(), 1357, -3))); + EXPECT_TRUEORSKIP((test(GetParam(), 1357, 2))); + EXPECT_TRUEORSKIP((test(GetParam(), 1357, 1))); + EXPECT_TRUEORSKIP((test(GetParam(), 1357, -3))); } TEST_P(Nrm2Tests, ComplexSinglePrecision) { - EXPECT_TRUE((test, float>(GetParam(), 1357, 2))); - EXPECT_TRUE((test, float>(GetParam(), 1357, 1))); - EXPECT_TRUE((test, float>(GetParam(), 1357, -3))); + EXPECT_TRUEORSKIP((test, float>(GetParam(), 1357, 2))); + EXPECT_TRUEORSKIP((test, float>(GetParam(), 1357, 1))); + EXPECT_TRUEORSKIP((test, float>(GetParam(), 1357, -3))); } TEST_P(Nrm2Tests, ComplexDoublePrecision) { - EXPECT_TRUE((test, double>(GetParam(), 1357, 2))); - EXPECT_TRUE((test, double>(GetParam(), 1357, 1))); - EXPECT_TRUE((test, double>(GetParam(), 1357, -3))); + EXPECT_TRUEORSKIP((test, 
double>(GetParam(), 1357, 2))); + EXPECT_TRUEORSKIP((test, double>(GetParam(), 1357, 1))); + EXPECT_TRUEORSKIP((test, double>(GetParam(), 1357, -3))); } INSTANTIATE_TEST_SUITE_P(Nrm2TestSuite, Nrm2Tests, ::testing::ValuesIn(devices), diff --git a/tests/unit_tests/blas/level1/nrm2_usm.cpp b/tests/unit_tests/blas/level1/nrm2_usm.cpp new file mode 100644 index 000000000..4b82162ee --- /dev/null +++ b/tests/unit_tests/blas/level1/nrm2_usm.cpp @@ -0,0 +1,140 @@ +/******************************************************************************* +* Copyright 2020 Intel Corporation +* +* Licensed under the Apache License, Version 2.0 (the "License"); +* you may not use this file except in compliance with the License. +* You may obtain a copy of the License at +* +* http://www.apache.org/licenses/LICENSE-2.0 +* +* Unless required by applicable law or agreed to in writing, +* software distributed under the License is distributed on an "AS IS" BASIS, +* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +* See the License for the specific language governing permissions +* and limitations under the License. +* +* +* SPDX-License-Identifier: Apache-2.0 +*******************************************************************************/ + +#include +#include +#include +#include +#include + +#include +#include "cblas.h" +#include "onemkl/detail/config.hpp" +#include "onemkl/onemkl.hpp" +#include "onemkl_blas_helper.hpp" +#include "reference_blas_templates.hpp" +#include "test_common.hpp" +#include "test_helper.hpp" + +#include + +using namespace cl::sycl; +using std::vector; + +extern std::vector devices; + +namespace { + +template +int test(const device& dev, int N, int incx) { + // Catch asynchronous exceptions. 
+ auto exception_handler = [](exception_list exceptions) { + for (std::exception_ptr const& e : exceptions) { + try { + std::rethrow_exception(e); + } + catch (exception const& e) { + std::cout << "Caught asynchronous SYCL exception during NRM2:\n" + << e.what() << std::endl + << "OpenCL status: " << e.get_cl_code() << std::endl; + } + } + }; + + queue main_queue(dev, exception_handler); + context cxt = main_queue.get_context(); + event done; + std::vector dependencies; + + // Prepare data. + auto ua = usm_allocator(cxt, dev); + vector x(ua); + fp_res result_ref = fp_res(-1); + + rand_vector(x, N, incx); + + // Call Reference NRM2. + using fp_ref = typename ref_type_info::type; + const int N_ref = N, incx_ref = std::abs(incx); + + result_ref = ::nrm2(&N_ref, (fp_ref*)x.data(), &incx_ref); + + // Call DPC++ NRM2. + + auto result_p = (fp_res*)onemkl::malloc_shared(64, sizeof(fp_res), dev, cxt); + + try { +#ifdef CALL_RT_API + done = onemkl::blas::nrm2(main_queue, N, x.data(), incx, result_p, dependencies); + done.wait(); +#else + TEST_RUN_CT(main_queue, onemkl::blas::nrm2, + (main_queue, N, x.data(), incx, result_p, dependencies)); + main_queue.wait(); +#endif + } + catch (exception const& e) { + std::cout << "Caught synchronous SYCL exception during NRM2:\n" + << e.what() << std::endl + << "OpenCL status: " << e.get_cl_code() << std::endl; + } + + catch (const onemkl::backend_unsupported_exception& e) { + return test_skipped; + } + + catch (const std::runtime_error& error) { + std::cout << "Error raised during execution of NRM2:\n" << error.what() << std::endl; + } + + // Compare the results of reference implementation and DPC++ implementation. 
+ + bool good = check_equal(*result_p, result_ref, N, std::cout); + + onemkl::free_shared(result_p, cxt); + return (int)good; +} + +class Nrm2UsmTests : public ::testing::TestWithParam {}; + +TEST_P(Nrm2UsmTests, RealSinglePrecision) { + EXPECT_TRUEORSKIP((test(GetParam(), 1357, 2))); + EXPECT_TRUEORSKIP((test(GetParam(), 1357, 1))); + EXPECT_TRUEORSKIP((test(GetParam(), 1357, -3))); +} +TEST_P(Nrm2UsmTests, RealDoublePrecision) { + EXPECT_TRUEORSKIP((test(GetParam(), 1357, 2))); + EXPECT_TRUEORSKIP((test(GetParam(), 1357, 1))); + EXPECT_TRUEORSKIP((test(GetParam(), 1357, -3))); +} +TEST_P(Nrm2UsmTests, ComplexSinglePrecision) { + EXPECT_TRUEORSKIP((test, float>(GetParam(), 1357, 2))); + EXPECT_TRUEORSKIP((test, float>(GetParam(), 1357, 1))); + EXPECT_TRUEORSKIP((test, float>(GetParam(), 1357, -3))); +} +TEST_P(Nrm2UsmTests, ComplexDoublePrecision) { + EXPECT_TRUEORSKIP((test, double>(GetParam(), 1357, 2))); + EXPECT_TRUEORSKIP((test, double>(GetParam(), 1357, 1))); + EXPECT_TRUEORSKIP((test, double>(GetParam(), 1357, -3))); +} + +INSTANTIATE_TEST_SUITE_P(Nrm2UsmTestSuite, Nrm2UsmTests, ::testing::ValuesIn(devices), + ::DeviceNamePrint()); + +} // anonymous namespace diff --git a/tests/unit_tests/blas/level1/rot.cpp b/tests/unit_tests/blas/level1/rot.cpp index b74229302..0a3c8c603 100644 --- a/tests/unit_tests/blas/level1/rot.cpp +++ b/tests/unit_tests/blas/level1/rot.cpp @@ -42,7 +42,7 @@ extern std::vector devices; namespace { template -bool test(const device &dev, int N, int incx, int incy, fp_scalar c, fp_scalar s) { +int test(const device &dev, int N, int incx, int incy, fp_scalar c, fp_scalar s) { // Prepare data. 
vector x, x_ref, y, y_ref; rand_vector(x, N, incx); @@ -92,6 +92,14 @@ bool test(const device &dev, int N, int incx, int incy, fp_scalar c, fp_scalar s << "OpenCL status: " << e.get_cl_code() << std::endl; } + catch (const onemkl::backend_unsupported_exception &e) { + return test_skipped; + } + + catch (const std::runtime_error &error) { + std::cout << "Error raised during execution of ROT:\n" << error.what() << std::endl; + } + // Compare the results of reference implementation and DPC++ implementation. bool good; { @@ -102,7 +110,7 @@ bool test(const device &dev, int N, int incx, int incy, fp_scalar c, fp_scalar s good = good_x && good_y; } - return good; + return (int)good; } class RotTests : public ::testing::TestWithParam {}; @@ -110,30 +118,30 @@ class RotTests : public ::testing::TestWithParam {}; TEST_P(RotTests, RealSinglePrecision) { float c(2.0); float s(-0.5); - EXPECT_TRUE((test(GetParam(), 1357, 2, 3, c, s))); - EXPECT_TRUE((test(GetParam(), 1357, 1, 1, c, s))); - EXPECT_TRUE((test(GetParam(), 1357, -2, -3, c, s))); + EXPECT_TRUEORSKIP((test(GetParam(), 1357, 2, 3, c, s))); + EXPECT_TRUEORSKIP((test(GetParam(), 1357, 1, 1, c, s))); + EXPECT_TRUEORSKIP((test(GetParam(), 1357, -2, -3, c, s))); } TEST_P(RotTests, RealDoublePrecision) { double c(2.0); double s(-0.5); - EXPECT_TRUE((test(GetParam(), 1357, 2, 3, c, s))); - EXPECT_TRUE((test(GetParam(), 1357, 1, 1, c, s))); - EXPECT_TRUE((test(GetParam(), 1357, -2, -3, c, s))); + EXPECT_TRUEORSKIP((test(GetParam(), 1357, 2, 3, c, s))); + EXPECT_TRUEORSKIP((test(GetParam(), 1357, 1, 1, c, s))); + EXPECT_TRUEORSKIP((test(GetParam(), 1357, -2, -3, c, s))); } TEST_P(RotTests, ComplexSinglePrecision) { float c = 2.0; float s = -0.5; - EXPECT_TRUE((test, float>(GetParam(), 1357, 2, 3, c, s))); - EXPECT_TRUE((test, float>(GetParam(), 1357, 1, 1, c, s))); - EXPECT_TRUE((test, float>(GetParam(), 1357, -2, -3, c, s))); + EXPECT_TRUEORSKIP((test, float>(GetParam(), 1357, 2, 3, c, s))); + EXPECT_TRUEORSKIP((test, 
float>(GetParam(), 1357, 1, 1, c, s))); + EXPECT_TRUEORSKIP((test, float>(GetParam(), 1357, -2, -3, c, s))); } TEST_P(RotTests, ComplexDoublePrecision) { double c = 2.0; double s = -0.5; - EXPECT_TRUE((test, double>(GetParam(), 1357, 2, 3, c, s))); - EXPECT_TRUE((test, double>(GetParam(), 1357, 1, 1, c, s))); - EXPECT_TRUE((test, double>(GetParam(), 1357, -2, -3, c, s))); + EXPECT_TRUEORSKIP((test, double>(GetParam(), 1357, 2, 3, c, s))); + EXPECT_TRUEORSKIP((test, double>(GetParam(), 1357, 1, 1, c, s))); + EXPECT_TRUEORSKIP((test, double>(GetParam(), 1357, -2, -3, c, s))); } INSTANTIATE_TEST_SUITE_P(RotTestSuite, RotTests, ::testing::ValuesIn(devices), ::DeviceNamePrint()); diff --git a/tests/unit_tests/blas/level1/rot_usm.cpp b/tests/unit_tests/blas/level1/rot_usm.cpp new file mode 100644 index 000000000..fe1dd83a9 --- /dev/null +++ b/tests/unit_tests/blas/level1/rot_usm.cpp @@ -0,0 +1,150 @@ +/******************************************************************************* +* Copyright 2020 Intel Corporation +* +* Licensed under the Apache License, Version 2.0 (the "License"); +* you may not use this file except in compliance with the License. +* You may obtain a copy of the License at +* +* http://www.apache.org/licenses/LICENSE-2.0 +* +* Unless required by applicable law or agreed to in writing, +* software distributed under the License is distributed on an "AS IS" BASIS, +* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +* See the License for the specific language governing permissions +* and limitations under the License. 
+* +* +* SPDX-License-Identifier: Apache-2.0 +*******************************************************************************/ + +#include +#include +#include +#include +#include + +#include +#include "cblas.h" +#include "onemkl/detail/config.hpp" +#include "onemkl/onemkl.hpp" +#include "onemkl_blas_helper.hpp" +#include "reference_blas_templates.hpp" +#include "test_common.hpp" +#include "test_helper.hpp" + +#include + +using namespace cl::sycl; +using std::vector; + +extern std::vector devices; + +namespace { + +template +int test(const device &dev, int N, int incx, int incy, fp_scalar c, fp_scalar s) { + // Catch asynchronous exceptions. + auto exception_handler = [](exception_list exceptions) { + for (std::exception_ptr const &e : exceptions) { + try { + std::rethrow_exception(e); + } + catch (exception const &e) { + std::cout << "Caught asynchronous SYCL exception during ROT:\n" + << e.what() << std::endl + << "OpenCL status: " << e.get_cl_code() << std::endl; + } + } + }; + + queue main_queue(dev, exception_handler); + context cxt = main_queue.get_context(); + event done; + std::vector dependencies; + + // Prepare data. + auto ua = usm_allocator(cxt, dev); + vector x(ua), y(ua); + rand_vector(x, N, incx); + rand_vector(y, N, incy); + + auto x_ref = x; + auto y_ref = y; + + // Call Reference ROT. + using fp_ref = typename ref_type_info::type; + const int N_ref = N, incx_ref = incx, incy_ref = incy; + + ::rot(&N_ref, (fp_ref *)x_ref.data(), &incx_ref, (fp_ref *)y_ref.data(), &incy_ref, + (fp_scalar *)&c, (fp_scalar *)&s); + + // Call DPC++ ROT. 
+ + try { +#ifdef CALL_RT_API + done = onemkl::blas::rot(main_queue, N, x.data(), incx, y.data(), incy, c, s, dependencies); + done.wait(); +#else + TEST_RUN_CT(main_queue, onemkl::blas::rot, + (main_queue, N, x.data(), incx, y.data(), incy, c, s, dependencies)); + main_queue.wait(); +#endif + } + catch (exception const &e) { + std::cout << "Caught synchronous SYCL exception during ROT:\n" + << e.what() << std::endl + << "OpenCL status: " << e.get_cl_code() << std::endl; + } + + catch (const onemkl::backend_unsupported_exception &e) { + return test_skipped; + } + + catch (const std::runtime_error &error) { + std::cout << "Error raised during execution of ROT:\n" << error.what() << std::endl; + } + + // Compare the results of reference implementation and DPC++ implementation. + + bool good_x = check_equal_vector(x, x_ref, N, incx, N, std::cout); + bool good_y = check_equal_vector(y, y_ref, N, incy, N, std::cout); + bool good = good_x && good_y; + + return (int)good; +} + +class RotUsmTests : public ::testing::TestWithParam {}; + +TEST_P(RotUsmTests, RealSinglePrecision) { + float c(2.0); + float s(-0.5); + EXPECT_TRUEORSKIP((test(GetParam(), 1357, 2, 3, c, s))); + EXPECT_TRUEORSKIP((test(GetParam(), 1357, 1, 1, c, s))); + EXPECT_TRUEORSKIP((test(GetParam(), 1357, -2, -3, c, s))); +} +TEST_P(RotUsmTests, RealDoublePrecision) { + double c(2.0); + double s(-0.5); + EXPECT_TRUEORSKIP((test(GetParam(), 1357, 2, 3, c, s))); + EXPECT_TRUEORSKIP((test(GetParam(), 1357, 1, 1, c, s))); + EXPECT_TRUEORSKIP((test(GetParam(), 1357, -2, -3, c, s))); +} +TEST_P(RotUsmTests, ComplexSinglePrecision) { + float c = 2.0; + float s = -0.5; + EXPECT_TRUEORSKIP((test, float>(GetParam(), 1357, 2, 3, c, s))); + EXPECT_TRUEORSKIP((test, float>(GetParam(), 1357, 1, 1, c, s))); + EXPECT_TRUEORSKIP((test, float>(GetParam(), 1357, -2, -3, c, s))); +} +TEST_P(RotUsmTests, ComplexDoublePrecision) { + double c = 2.0; + double s = -0.5; + EXPECT_TRUEORSKIP((test, double>(GetParam(), 1357, 2, 3, c, 
s))); + EXPECT_TRUEORSKIP((test, double>(GetParam(), 1357, 1, 1, c, s))); + EXPECT_TRUEORSKIP((test, double>(GetParam(), 1357, -2, -3, c, s))); +} + +INSTANTIATE_TEST_SUITE_P(RotUsmTestSuite, RotUsmTests, ::testing::ValuesIn(devices), + ::DeviceNamePrint()); + +} // anonymous namespace diff --git a/tests/unit_tests/blas/level1/rotg.cpp b/tests/unit_tests/blas/level1/rotg.cpp index b71695157..f88d08fc4 100644 --- a/tests/unit_tests/blas/level1/rotg.cpp +++ b/tests/unit_tests/blas/level1/rotg.cpp @@ -42,15 +42,20 @@ extern std::vector devices; namespace { template -bool test(const device &dev, fp s, fp_scalar c) { +int test(const device &dev) { // Prepare data. - fp a, b, a_ref, b_ref, s_ref; - fp_scalar c_ref; + fp a, b, s, a_ref, b_ref, s_ref; + fp_scalar c, c_ref; + + a = rand_scalar(); + b = rand_scalar(); + s = rand_scalar(); + c = rand_scalar(); - a = rand_scalar(); - b = rand_scalar(); a_ref = a; b_ref = b; + s_ref = s; + c_ref = c; // Call Reference ROTG. using fp_ref = typename ref_type_info::type; @@ -94,6 +99,14 @@ bool test(const device &dev, fp s, fp_scalar c) { << "OpenCL status: " << e.get_cl_code() << std::endl; } + catch (const onemkl::backend_unsupported_exception &e) { + return test_skipped; + } + + catch (const std::runtime_error &error) { + std::cout << "Error raised during execution of ROTG:\n" << error.what() << std::endl; + } + // Compare the results of reference implementation and DPC++ implementation. 
bool good; { @@ -109,38 +122,30 @@ bool test(const device &dev, fp s, fp_scalar c) { good = good_a && good_b && good_c && good_s; } - return good; + return (int)good; } class RotgTests : public ::testing::TestWithParam {}; TEST_P(RotgTests, RealSinglePrecision) { - float c(2.0); - float s(-0.5); - EXPECT_TRUE((test(GetParam(), c, s))); - EXPECT_TRUE((test(GetParam(), c, s))); - EXPECT_TRUE((test(GetParam(), c, s))); + EXPECT_TRUEORSKIP((test(GetParam()))); + EXPECT_TRUEORSKIP((test(GetParam()))); + EXPECT_TRUEORSKIP((test(GetParam()))); } TEST_P(RotgTests, RealDoublePrecision) { - double c(2.0); - double s(-0.5); - EXPECT_TRUE((test(GetParam(), c, s))); - EXPECT_TRUE((test(GetParam(), c, s))); - EXPECT_TRUE((test(GetParam(), c, s))); + EXPECT_TRUEORSKIP((test(GetParam()))); + EXPECT_TRUEORSKIP((test(GetParam()))); + EXPECT_TRUEORSKIP((test(GetParam()))); } TEST_P(RotgTests, ComplexSinglePrecision) { - float c = 2.0; - float s = -0.5; - EXPECT_TRUE((test, float>(GetParam(), c, s))); - EXPECT_TRUE((test, float>(GetParam(), c, s))); - EXPECT_TRUE((test, float>(GetParam(), c, s))); + EXPECT_TRUEORSKIP((test, float>(GetParam()))); + EXPECT_TRUEORSKIP((test, float>(GetParam()))); + EXPECT_TRUEORSKIP((test, float>(GetParam()))); } TEST_P(RotgTests, ComplexDoublePrecision) { - double c = 2.0; - double s = -0.5; - EXPECT_TRUE((test, double>(GetParam(), c, s))); - EXPECT_TRUE((test, double>(GetParam(), c, s))); - EXPECT_TRUE((test, double>(GetParam(), c, s))); + EXPECT_TRUEORSKIP((test, double>(GetParam()))); + EXPECT_TRUEORSKIP((test, double>(GetParam()))); + EXPECT_TRUEORSKIP((test, double>(GetParam()))); } INSTANTIATE_TEST_SUITE_P(RotgTestSuite, RotgTests, ::testing::ValuesIn(devices), diff --git a/tests/unit_tests/blas/level1/rotg_usm.cpp b/tests/unit_tests/blas/level1/rotg_usm.cpp new file mode 100644 index 000000000..aa399c77a --- /dev/null +++ b/tests/unit_tests/blas/level1/rotg_usm.cpp @@ -0,0 +1,158 @@ 
+/******************************************************************************* +* Copyright 2020 Intel Corporation +* +* Licensed under the Apache License, Version 2.0 (the "License"); +* you may not use this file except in compliance with the License. +* You may obtain a copy of the License at +* +* http://www.apache.org/licenses/LICENSE-2.0 +* +* Unless required by applicable law or agreed to in writing, +* software distributed under the License is distributed on an "AS IS" BASIS, +* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +* See the License for the specific language governing permissions +* and limitations under the License. +* +* +* SPDX-License-Identifier: Apache-2.0 +*******************************************************************************/ + +#include +#include +#include +#include +#include + +#include +#include "cblas.h" +#include "onemkl/detail/config.hpp" +#include "onemkl/onemkl.hpp" +#include "onemkl_blas_helper.hpp" +#include "reference_blas_templates.hpp" +#include "test_common.hpp" +#include "test_helper.hpp" + +#include + +using namespace cl::sycl; +using std::vector; + +extern std::vector devices; + +namespace { + +template +int test(const device &dev) { + // Catch asynchronous exceptions. + auto exception_handler = [](exception_list exceptions) { + for (std::exception_ptr const &e : exceptions) { + try { + std::rethrow_exception(e); + } + catch (exception const &e) { + std::cout << "Caught asynchronous SYCL exception during ROTG:\n" + << e.what() << std::endl + << "OpenCL status: " << e.get_cl_code() << std::endl; + } + } + }; + + queue main_queue(dev, exception_handler); + context cxt = main_queue.get_context(); + event done; + std::vector dependencies; + + // Prepare data. + + fp a, b, s, a_ref, b_ref, s_ref; + fp_scalar c, c_ref; + + a = rand_scalar(); + b = rand_scalar(); + s = rand_scalar(); + c = rand_scalar(); + a_ref = a; + b_ref = b; + + // Call Reference ROTG. 
+ using fp_ref = typename ref_type_info::type; + + ::rotg((fp_ref *)&a_ref, (fp_ref *)&b_ref, (fp_scalar *)&c_ref, (fp_ref *)&s_ref); + + // Call DPC++ ROTG. + fp *a_p = (fp *)onemkl::malloc_shared(64, sizeof(fp), dev, cxt); + fp *b_p = (fp *)onemkl::malloc_shared(64, sizeof(fp), dev, cxt); + fp *s_p = (fp *)onemkl::malloc_shared(64, sizeof(fp), dev, cxt); + fp_scalar *c_p = (fp_scalar *)onemkl::malloc_shared(64, sizeof(fp), dev, cxt); + + a_p[0] = a; + b_p[0] = b; + s_p[0] = s; + c_p[0] = c; + + try { +#ifdef CALL_RT_API + done = onemkl::blas::rotg(main_queue, a_p, b_p, c_p, s_p, dependencies); + done.wait(); +#else + TEST_RUN_CT(main_queue, onemkl::blas::rotg, (main_queue, a_p, b_p, c_p, s_p, dependencies)); + main_queue.wait(); +#endif + } + catch (exception const &e) { + std::cout << "Caught synchronous SYCL exception during ROTG:\n" + << e.what() << std::endl + << "OpenCL status: " << e.get_cl_code() << std::endl; + } + + catch (const onemkl::backend_unsupported_exception &e) { + return test_skipped; + } + + catch (const std::runtime_error &error) { + std::cout << "Error raised during execution of ROTG:\n" << error.what() << std::endl; + } + + // Compare the results of reference implementation and DPC++ implementation. 
+ + bool good_a = check_equal(a_p[0], a_ref, 4, std::cout); + bool good_b = check_equal(b_p[0], b_ref, 4, std::cout); + bool good_s = check_equal(s_p[0], s_ref, 4, std::cout); + bool good_c = check_equal(c_p[0], c_ref, 4, std::cout); + + bool good = good_a && good_b && good_c && good_s; + + onemkl::free_shared(a_p, cxt); + onemkl::free_shared(b_p, cxt); + onemkl::free_shared(s_p, cxt); + onemkl::free_shared(c_p, cxt); + return (int)good; +} + +class RotgUsmTests : public ::testing::TestWithParam {}; + +TEST_P(RotgUsmTests, RealSinglePrecision) { + EXPECT_TRUEORSKIP((test(GetParam()))); + EXPECT_TRUEORSKIP((test(GetParam()))); + EXPECT_TRUEORSKIP((test(GetParam()))); +} +TEST_P(RotgUsmTests, RealDoublePrecision) { + EXPECT_TRUEORSKIP((test(GetParam()))); + EXPECT_TRUEORSKIP((test(GetParam()))); + EXPECT_TRUEORSKIP((test(GetParam()))); +} +TEST_P(RotgUsmTests, ComplexSinglePrecision) { + EXPECT_TRUEORSKIP((test, float>(GetParam()))); + EXPECT_TRUEORSKIP((test, float>(GetParam()))); + EXPECT_TRUEORSKIP((test, float>(GetParam()))); +} +TEST_P(RotgUsmTests, ComplexDoublePrecision) { + EXPECT_TRUEORSKIP((test, double>(GetParam()))); + EXPECT_TRUEORSKIP((test, double>(GetParam()))); + EXPECT_TRUEORSKIP((test, double>(GetParam()))); +} + +INSTANTIATE_TEST_SUITE_P(RotgUsmTestSuite, RotgUsmTests, ::testing::ValuesIn(devices), + ::DeviceNamePrint()); + +} // anonymous namespace diff --git a/tests/unit_tests/blas/level1/rotm.cpp b/tests/unit_tests/blas/level1/rotm.cpp index 50af06d66..8c33e6333 100644 --- a/tests/unit_tests/blas/level1/rotm.cpp +++ b/tests/unit_tests/blas/level1/rotm.cpp @@ -42,7 +42,7 @@ extern std::vector devices; namespace { template -bool test(const device &dev, int N, int incx, int incy, fp flag) { +int test(const device &dev, int N, int incx, int incy, fp flag) { // Prepare data. 
vector x, x_ref, y, y_ref; vector param; @@ -96,6 +96,14 @@ bool test(const device &dev, int N, int incx, int incy, fp flag) { << "OpenCL status: " << e.get_cl_code() << std::endl; } + catch (const onemkl::backend_unsupported_exception &e) { + return test_skipped; + } + + catch (const std::runtime_error &error) { + std::cout << "Error raised during execution of ROTM:\n" << error.what() << std::endl; + } + // Compare the results of reference implementation and DPC++ implementation. bool good; { @@ -106,46 +114,46 @@ bool test(const device &dev, int N, int incx, int incy, fp flag) { good = good_x && good_y; } - return good; + return (int)good; } class RotmTests : public ::testing::TestWithParam {}; TEST_P(RotmTests, RealSinglePrecision) { float flag(-1.0); - EXPECT_TRUE(test(GetParam(), 1357, 2, 3, flag)); - EXPECT_TRUE(test(GetParam(), 1357, -2, -3, flag)); - EXPECT_TRUE(test(GetParam(), 1357, 1, 1, flag)); + EXPECT_TRUEORSKIP(test(GetParam(), 1357, 2, 3, flag)); + EXPECT_TRUEORSKIP(test(GetParam(), 1357, -2, -3, flag)); + EXPECT_TRUEORSKIP(test(GetParam(), 1357, 1, 1, flag)); flag = 0.0; - EXPECT_TRUE(test(GetParam(), 1357, 2, 3, flag)); - EXPECT_TRUE(test(GetParam(), 1357, -2, -3, flag)); - EXPECT_TRUE(test(GetParam(), 1357, 1, 1, flag)); + EXPECT_TRUEORSKIP(test(GetParam(), 1357, 2, 3, flag)); + EXPECT_TRUEORSKIP(test(GetParam(), 1357, -2, -3, flag)); + EXPECT_TRUEORSKIP(test(GetParam(), 1357, 1, 1, flag)); flag = 1.0; - EXPECT_TRUE(test(GetParam(), 1357, 2, 3, flag)); - EXPECT_TRUE(test(GetParam(), 1357, -2, -3, flag)); - EXPECT_TRUE(test(GetParam(), 1357, 1, 1, flag)); + EXPECT_TRUEORSKIP(test(GetParam(), 1357, 2, 3, flag)); + EXPECT_TRUEORSKIP(test(GetParam(), 1357, -2, -3, flag)); + EXPECT_TRUEORSKIP(test(GetParam(), 1357, 1, 1, flag)); flag = -2.0; - EXPECT_TRUE(test(GetParam(), 1357, 2, 3, flag)); - EXPECT_TRUE(test(GetParam(), 1357, -2, -3, flag)); - EXPECT_TRUE(test(GetParam(), 1357, 1, 1, flag)); + EXPECT_TRUEORSKIP(test(GetParam(), 1357, 2, 3, flag)); + 
EXPECT_TRUEORSKIP(test(GetParam(), 1357, -2, -3, flag)); + EXPECT_TRUEORSKIP(test(GetParam(), 1357, 1, 1, flag)); } TEST_P(RotmTests, RealDoublePrecision) { double flag(-1.0); - EXPECT_TRUE(test(GetParam(), 1357, 2, 3, flag)); - EXPECT_TRUE(test(GetParam(), 1357, -2, -3, flag)); - EXPECT_TRUE(test(GetParam(), 1357, 1, 1, flag)); + EXPECT_TRUEORSKIP(test(GetParam(), 1357, 2, 3, flag)); + EXPECT_TRUEORSKIP(test(GetParam(), 1357, -2, -3, flag)); + EXPECT_TRUEORSKIP(test(GetParam(), 1357, 1, 1, flag)); flag = 0.0; - EXPECT_TRUE(test(GetParam(), 1357, 2, 3, flag)); - EXPECT_TRUE(test(GetParam(), 1357, -2, -3, flag)); - EXPECT_TRUE(test(GetParam(), 1357, 1, 1, flag)); + EXPECT_TRUEORSKIP(test(GetParam(), 1357, 2, 3, flag)); + EXPECT_TRUEORSKIP(test(GetParam(), 1357, -2, -3, flag)); + EXPECT_TRUEORSKIP(test(GetParam(), 1357, 1, 1, flag)); flag = 1.0; - EXPECT_TRUE(test(GetParam(), 1357, 2, 3, flag)); - EXPECT_TRUE(test(GetParam(), 1357, -2, -3, flag)); - EXPECT_TRUE(test(GetParam(), 1357, 1, 1, flag)); + EXPECT_TRUEORSKIP(test(GetParam(), 1357, 2, 3, flag)); + EXPECT_TRUEORSKIP(test(GetParam(), 1357, -2, -3, flag)); + EXPECT_TRUEORSKIP(test(GetParam(), 1357, 1, 1, flag)); flag = -2.0; - EXPECT_TRUE(test(GetParam(), 1357, 2, 3, flag)); - EXPECT_TRUE(test(GetParam(), 1357, -2, -3, flag)); - EXPECT_TRUE(test(GetParam(), 1357, 1, 1, flag)); + EXPECT_TRUEORSKIP(test(GetParam(), 1357, 2, 3, flag)); + EXPECT_TRUEORSKIP(test(GetParam(), 1357, -2, -3, flag)); + EXPECT_TRUEORSKIP(test(GetParam(), 1357, 1, 1, flag)); } INSTANTIATE_TEST_SUITE_P(RotmTestSuite, RotmTests, ::testing::ValuesIn(devices), diff --git a/tests/unit_tests/blas/level1/rotm_usm.cpp b/tests/unit_tests/blas/level1/rotm_usm.cpp new file mode 100644 index 000000000..0c438c383 --- /dev/null +++ b/tests/unit_tests/blas/level1/rotm_usm.cpp @@ -0,0 +1,161 @@ +/******************************************************************************* +* Copyright 2020 Intel Corporation +* +* Licensed under the Apache License, 
Version 2.0 (the "License"); +* you may not use this file except in compliance with the License. +* You may obtain a copy of the License at +* +* http://www.apache.org/licenses/LICENSE-2.0 +* +* Unless required by applicable law or agreed to in writing, +* software distributed under the License is distributed on an "AS IS" BASIS, +* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +* See the License for the specific language governing permissions +* and limitations under the License. +* +* +* SPDX-License-Identifier: Apache-2.0 +*******************************************************************************/ + +#include +#include +#include +#include +#include + +#include +#include "cblas.h" +#include "onemkl/detail/config.hpp" +#include "onemkl/onemkl.hpp" +#include "onemkl_blas_helper.hpp" +#include "reference_blas_templates.hpp" +#include "test_common.hpp" +#include "test_helper.hpp" + +#include + +using namespace cl::sycl; +using std::vector; + +extern std::vector devices; + +namespace { + +template +int test(const device &dev, int N, int incx, int incy, fp flag) { + // Catch asynchronous exceptions. + auto exception_handler = [](exception_list exceptions) { + for (std::exception_ptr const &e : exceptions) { + try { + std::rethrow_exception(e); + } + catch (exception const &e) { + std::cout << "Caught asynchronous SYCL exception during ROTM:\n" + << e.what() << std::endl + << "OpenCL status: " << e.get_cl_code() << std::endl; + } + } + }; + + queue main_queue(dev, exception_handler); + context cxt = main_queue.get_context(); + event done; + std::vector dependencies; + + // Prepare data. + auto ua = usm_allocator(cxt, dev); + vector x(ua), y(ua), param(ua); + rand_vector(x, N, incx); + rand_vector(y, N, incy); + rand_vector(param, 5, 1); + param[0] = flag; + + auto x_ref = x; + auto y_ref = y; + + // Call Reference ROTM. 
+ using fp_ref = typename ref_type_info::type; + const int N_ref = N, incx_ref = incx, incy_ref = incy; + + ::rotm(&N_ref, (fp_ref *)x_ref.data(), &incx_ref, (fp_ref *)y_ref.data(), &incy_ref, + (fp_ref *)param.data()); + + // Call DPC++ ROTM. + + try { +#ifdef CALL_RT_API + done = onemkl::blas::rotm(main_queue, N, x.data(), incx, y.data(), incy, param.data(), + dependencies); + done.wait(); +#else + TEST_RUN_CT(main_queue, onemkl::blas::rotm, + (main_queue, N, x.data(), incx, y.data(), incy, param.data(), dependencies)); + main_queue.wait(); +#endif + } + catch (exception const &e) { + std::cout << "Caught synchronous SYCL exception during ROTM:\n" + << e.what() << std::endl + << "OpenCL status: " << e.get_cl_code() << std::endl; + } + + catch (const onemkl::backend_unsupported_exception &e) { + return test_skipped; + } + + catch (const std::runtime_error &error) { + std::cout << "Error raised during execution of ROTM:\n" << error.what() << std::endl; + } + + // Compare the results of reference implementation and DPC++ implementation. 
+ + bool good_x = check_equal_vector(x, x_ref, N, incx, N, std::cout); + bool good_y = check_equal_vector(y, y_ref, N, incy, N, std::cout); + bool good = good_x && good_y; + + return (int)good; +} + +class RotmUsmTests : public ::testing::TestWithParam {}; + +TEST_P(RotmUsmTests, RealSinglePrecision) { + float flag(-1.0); + EXPECT_TRUEORSKIP(test(GetParam(), 1357, 2, 3, flag)); + EXPECT_TRUEORSKIP(test(GetParam(), 1357, -2, -3, flag)); + EXPECT_TRUEORSKIP(test(GetParam(), 1357, 1, 1, flag)); + flag = 0.0; + EXPECT_TRUEORSKIP(test(GetParam(), 1357, 2, 3, flag)); + EXPECT_TRUEORSKIP(test(GetParam(), 1357, -2, -3, flag)); + EXPECT_TRUEORSKIP(test(GetParam(), 1357, 1, 1, flag)); + flag = 1.0; + EXPECT_TRUEORSKIP(test(GetParam(), 1357, 2, 3, flag)); + EXPECT_TRUEORSKIP(test(GetParam(), 1357, -2, -3, flag)); + EXPECT_TRUEORSKIP(test(GetParam(), 1357, 1, 1, flag)); + flag = -2.0; + EXPECT_TRUEORSKIP(test(GetParam(), 1357, 2, 3, flag)); + EXPECT_TRUEORSKIP(test(GetParam(), 1357, -2, -3, flag)); + EXPECT_TRUEORSKIP(test(GetParam(), 1357, 1, 1, flag)); +} +TEST_P(RotmUsmTests, RealDoublePrecision) { + double flag(-1.0); + EXPECT_TRUEORSKIP(test(GetParam(), 1357, 2, 3, flag)); + EXPECT_TRUEORSKIP(test(GetParam(), 1357, -2, -3, flag)); + EXPECT_TRUEORSKIP(test(GetParam(), 1357, 1, 1, flag)); + flag = 0.0; + EXPECT_TRUEORSKIP(test(GetParam(), 1357, 2, 3, flag)); + EXPECT_TRUEORSKIP(test(GetParam(), 1357, -2, -3, flag)); + EXPECT_TRUEORSKIP(test(GetParam(), 1357, 1, 1, flag)); + flag = 1.0; + EXPECT_TRUEORSKIP(test(GetParam(), 1357, 2, 3, flag)); + EXPECT_TRUEORSKIP(test(GetParam(), 1357, -2, -3, flag)); + EXPECT_TRUEORSKIP(test(GetParam(), 1357, 1, 1, flag)); + flag = -2.0; + EXPECT_TRUEORSKIP(test(GetParam(), 1357, 2, 3, flag)); + EXPECT_TRUEORSKIP(test(GetParam(), 1357, -2, -3, flag)); + EXPECT_TRUEORSKIP(test(GetParam(), 1357, 1, 1, flag)); +} + +INSTANTIATE_TEST_SUITE_P(RotmUsmTestSuite, RotmUsmTests, ::testing::ValuesIn(devices), + ::DeviceNamePrint()); + +} // anonymous 
namespace diff --git a/tests/unit_tests/blas/level1/rotmg.cpp b/tests/unit_tests/blas/level1/rotmg.cpp index 8bbe5a13d..90e907d3b 100644 --- a/tests/unit_tests/blas/level1/rotmg.cpp +++ b/tests/unit_tests/blas/level1/rotmg.cpp @@ -42,7 +42,7 @@ extern std::vector devices; namespace { template -bool test(const device& dev) { +int test(const device& dev) { // Prepare data. fp d1, d2, x1, y1, d1_ref, d2_ref, x1_ref; vector param(5, fp(0)), param_ref(5, fp(0)); @@ -96,6 +96,14 @@ bool test(const device& dev) { << "OpenCL status: " << e.get_cl_code() << std::endl; } + catch (const onemkl::backend_unsupported_exception& e) { + return test_skipped; + } + + catch (const std::runtime_error& error) { + std::cout << "Error raised during execution of ROTMG:\n" << error.what() << std::endl; + } + // Compare the results of reference implementation and DPC++ implementation. bool good; { @@ -110,16 +118,16 @@ bool test(const device& dev) { good = good_d1 && good_d2 && good_x1 && good_param; } - return good; + return (int)good; } class RotmgTests : public ::testing::TestWithParam {}; TEST_P(RotmgTests, RealSinglePrecision) { - EXPECT_TRUE(test(GetParam())); + EXPECT_TRUEORSKIP(test(GetParam())); } TEST_P(RotmgTests, RealDoublePrecision) { - EXPECT_TRUE(test(GetParam())); + EXPECT_TRUEORSKIP(test(GetParam())); } INSTANTIATE_TEST_SUITE_P(RotmgTestSuite, RotmgTests, ::testing::ValuesIn(devices), diff --git a/tests/unit_tests/blas/level1/rotmg_usm.cpp b/tests/unit_tests/blas/level1/rotmg_usm.cpp new file mode 100644 index 000000000..86585b831 --- /dev/null +++ b/tests/unit_tests/blas/level1/rotmg_usm.cpp @@ -0,0 +1,141 @@ +/******************************************************************************* +* Copyright 2020 Intel Corporation +* +* Licensed under the Apache License, Version 2.0 (the "License"); +* you may not use this file except in compliance with the License. 
+* You may obtain a copy of the License at +* +* http://www.apache.org/licenses/LICENSE-2.0 +* +* Unless required by applicable law or agreed to in writing, +* software distributed under the License is distributed on an "AS IS" BASIS, +* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +* See the License for the specific language governing permissions +* and limitations under the License. +* +* +* SPDX-License-Identifier: Apache-2.0 +*******************************************************************************/ + +#include +#include +#include +#include +#include + +#include +#include "cblas.h" +#include "onemkl/detail/config.hpp" +#include "onemkl/onemkl.hpp" +#include "onemkl_blas_helper.hpp" +#include "reference_blas_templates.hpp" +#include "test_common.hpp" +#include "test_helper.hpp" + +#include + +using namespace cl::sycl; +using std::vector; + +extern std::vector devices; + +namespace { + +template +int test(const device &dev) { + // Catch asynchronous exceptions. + auto exception_handler = [](exception_list exceptions) { + for (std::exception_ptr const &e : exceptions) { + try { + std::rethrow_exception(e); + } + catch (exception const &e) { + std::cout << "Caught asynchronous SYCL exception during ROTMG:\n" + << e.what() << std::endl + << "OpenCL status: " << e.get_cl_code() << std::endl; + } + } + }; + + queue main_queue(dev, exception_handler); + context cxt = main_queue.get_context(); + event done; + std::vector dependencies; + + // Prepare data. + auto ua = usm_allocator(cxt, dev); + vector param(5, fp(0), ua), param_ref(5, fp(0), ua); + fp d1, d2, x1, y1, d1_ref, d2_ref, x1_ref; + + d1 = rand_scalar(); + d1 = abs(d1); + d2 = rand_scalar(); + x1 = rand_scalar(); + y1 = rand_scalar(); + d1_ref = d1; + d2_ref = d2; + x1_ref = x1; + + // Call Reference ROTMG. + + ::rotmg(&d1_ref, &d2_ref, &x1_ref, &y1, (fp *)param_ref.data()); + + // Call DPC++ ROTMG. 
+ fp *d1_p = (fp *)onemkl::malloc_shared(64, sizeof(fp), dev, cxt); + fp *d2_p = (fp *)onemkl::malloc_shared(64, sizeof(fp), dev, cxt); + fp *x1_p = (fp *)onemkl::malloc_shared(64, sizeof(fp), dev, cxt); + d1_p[0] = d1; + d2_p[0] = d2; + x1_p[0] = x1; + + try { +#ifdef CALL_RT_API + done = onemkl::blas::rotmg(main_queue, d1_p, d2_p, x1_p, y1, param.data(), dependencies); + done.wait(); +#else + TEST_RUN_CT(main_queue, onemkl::blas::rotmg, + (main_queue, d1_p, d2_p, x1_p, y1, param.data(), dependencies)); + main_queue.wait(); +#endif + } + catch (exception const &e) { + std::cout << "Caught synchronous SYCL exception during ROTMG:\n" + << e.what() << std::endl + << "OpenCL status: " << e.get_cl_code() << std::endl; + } + + catch (const onemkl::backend_unsupported_exception &e) { + return test_skipped; + } + + catch (const std::runtime_error &error) { + std::cout << "Error raised during execution of ROTMG:\n" << error.what() << std::endl; + } + + // Compare the results of reference implementation and DPC++ implementation. 
+ + bool good_d1 = check_equal(d1_p[0], d1_ref, 1, std::cout); + bool good_d2 = check_equal(d2_p[0], d2_ref, 1, std::cout); + bool good_x1 = check_equal(x1_p[0], x1_ref, 1, std::cout); + bool good_param = check_equal_vector(param, param_ref, 5, 1, 1, std::cout); + bool good = good_d1 && good_d2 && good_x1 && good_param; + + onemkl::free_shared(d1_p, cxt); + onemkl::free_shared(d2_p, cxt); + onemkl::free_shared(x1_p, cxt); + return (int)good; +} + +class RotmgUsmTests : public ::testing::TestWithParam {}; + +TEST_P(RotmgUsmTests, RealSinglePrecision) { + EXPECT_TRUEORSKIP(test(GetParam())); +} +TEST_P(RotmgUsmTests, RealDoublePrecision) { + EXPECT_TRUEORSKIP(test(GetParam())); +} + +INSTANTIATE_TEST_SUITE_P(RotmgUsmTestSuite, RotmgUsmTests, ::testing::ValuesIn(devices), + ::DeviceNamePrint()); + +} // anonymous namespace diff --git a/tests/unit_tests/blas/level1/scal.cpp b/tests/unit_tests/blas/level1/scal.cpp index 3c384dde9..4c375c0ba 100644 --- a/tests/unit_tests/blas/level1/scal.cpp +++ b/tests/unit_tests/blas/level1/scal.cpp @@ -42,7 +42,7 @@ extern std::vector devices; namespace { template -bool test(const device& dev, int N, int incx, fp_scalar alpha) { +int test(const device& dev, int N, int incx, fp_scalar alpha) { // Prepare data. vector x, x_ref; @@ -90,6 +90,14 @@ bool test(const device& dev, int N, int incx, fp_scalar alpha) { << "OpenCL status: " << e.get_cl_code() << std::endl; } + catch (const onemkl::backend_unsupported_exception& e) { + return test_skipped; + } + + catch (const std::runtime_error& error) { + std::cout << "Error raised during execution of SCAL:\n" << error.what() << std::endl; + } + // Compare the results of reference implementation and DPC++ implementation. 
bool good; { @@ -97,40 +105,43 @@ bool test(const device& dev, int N, int incx, fp_scalar alpha) { good = check_equal_vector(x_accessor, x_ref, N, incx, N, std::cout); } - return good; + return (int)good; } class ScalTests : public ::testing::TestWithParam {}; TEST_P(ScalTests, RealSinglePrecision) { float alpha(2.0); - EXPECT_TRUE((test(GetParam(), 1357, 2, alpha))); - EXPECT_TRUE((test(GetParam(), 1357, -3, alpha))); + EXPECT_TRUEORSKIP((test(GetParam(), 1357, 2, alpha))); + EXPECT_TRUEORSKIP((test(GetParam(), 1357, -3, alpha))); } TEST_P(ScalTests, RealDoublePrecision) { double alpha(2.0); - EXPECT_TRUE((test(GetParam(), 1357, 2, alpha))); - EXPECT_TRUE((test(GetParam(), 1357, -3, alpha))); + EXPECT_TRUEORSKIP((test(GetParam(), 1357, 2, alpha))); + EXPECT_TRUEORSKIP((test(GetParam(), 1357, -3, alpha))); } TEST_P(ScalTests, ComplexSinglePrecision) { std::complex alpha(2.0, -0.5); - EXPECT_TRUE((test, std::complex>(GetParam(), 1357, 2, alpha))); - EXPECT_TRUE((test, std::complex>(GetParam(), 1357, -3, alpha))); + EXPECT_TRUEORSKIP((test, std::complex>(GetParam(), 1357, 2, alpha))); + EXPECT_TRUEORSKIP( + (test, std::complex>(GetParam(), 1357, -3, alpha))); } TEST_P(ScalTests, ComplexDoublePrecision) { std::complex alpha(2.0, -0.5); - EXPECT_TRUE((test, std::complex>(GetParam(), 1357, 2, alpha))); - EXPECT_TRUE((test, std::complex>(GetParam(), 1357, -3, alpha))); + EXPECT_TRUEORSKIP( + (test, std::complex>(GetParam(), 1357, 2, alpha))); + EXPECT_TRUEORSKIP( + (test, std::complex>(GetParam(), 1357, -3, alpha))); } TEST_P(ScalTests, ComplexRealSinglePrecision) { float alpha(2.0); - EXPECT_TRUE((test, float>(GetParam(), 1357, 2, alpha))); - EXPECT_TRUE((test, float>(GetParam(), 1357, -3, alpha))); + EXPECT_TRUEORSKIP((test, float>(GetParam(), 1357, 2, alpha))); + EXPECT_TRUEORSKIP((test, float>(GetParam(), 1357, -3, alpha))); } TEST_P(ScalTests, ComplexRealDoublePrecision) { double alpha(2.0); - EXPECT_TRUE((test, double>(GetParam(), 1357, 2, alpha))); - 
EXPECT_TRUE((test, double>(GetParam(), 1357, -3, alpha))); + EXPECT_TRUEORSKIP((test, double>(GetParam(), 1357, 2, alpha))); + EXPECT_TRUEORSKIP((test, double>(GetParam(), 1357, -3, alpha))); } INSTANTIATE_TEST_SUITE_P(ScalTestSuite, ScalTests, ::testing::ValuesIn(devices), diff --git a/tests/unit_tests/blas/level1/scal_usm.cpp b/tests/unit_tests/blas/level1/scal_usm.cpp new file mode 100644 index 000000000..f5df7913b --- /dev/null +++ b/tests/unit_tests/blas/level1/scal_usm.cpp @@ -0,0 +1,153 @@ +/******************************************************************************* +* Copyright 2020 Intel Corporation +* +* Licensed under the Apache License, Version 2.0 (the "License"); +* you may not use this file except in compliance with the License. +* You may obtain a copy of the License at +* +* http://www.apache.org/licenses/LICENSE-2.0 +* +* Unless required by applicable law or agreed to in writing, +* software distributed under the License is distributed on an "AS IS" BASIS, +* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +* See the License for the specific language governing permissions +* and limitations under the License. +* +* +* SPDX-License-Identifier: Apache-2.0 +*******************************************************************************/ + +#include +#include +#include +#include +#include + +#include +#include "cblas.h" +#include "onemkl/detail/config.hpp" +#include "onemkl/onemkl.hpp" +#include "onemkl_blas_helper.hpp" +#include "reference_blas_templates.hpp" +#include "test_common.hpp" +#include "test_helper.hpp" + +#include + +using namespace cl::sycl; +using std::vector; + +extern std::vector devices; + +namespace { + +template +int test(const device& dev, int N, int incx, fp_scalar alpha) { + // Catch asynchronous exceptions. 
+ auto exception_handler = [](exception_list exceptions) { + for (std::exception_ptr const& e : exceptions) { + try { + std::rethrow_exception(e); + } + catch (exception const& e) { + std::cout << "Caught asynchronous SYCL exception during SCAL:\n" + << e.what() << std::endl + << "OpenCL status: " << e.get_cl_code() << std::endl; + } + } + }; + + queue main_queue(dev, exception_handler); + context cxt = main_queue.get_context(); + event done; + std::vector dependencies; + + // Prepare data. + auto ua = usm_allocator(cxt, dev); + vector x(ua); + + rand_vector(x, N, incx); + + auto x_ref = x; + + // Call Reference SCAL. + using fp_ref = typename ref_type_info::type; + using fp_scalar_mkl = typename ref_type_info::type; + + const int N_ref = N, incx_ref = std::abs(incx); + + ::scal(&N_ref, (fp_scalar_mkl*)&alpha, (fp_ref*)x_ref.data(), &incx_ref); + + // Call DPC++ SCAL. + + try { +#ifdef CALL_RT_API + done = onemkl::blas::scal(main_queue, N, alpha, x.data(), incx, dependencies); + done.wait(); +#else + TEST_RUN_CT(main_queue, onemkl::blas::scal, + (main_queue, N, alpha, x.data(), incx, dependencies)); + main_queue.wait(); +#endif + } + catch (exception const& e) { + std::cout << "Caught synchronous SYCL exception during SCAL:\n" + << e.what() << std::endl + << "OpenCL status: " << e.get_cl_code() << std::endl; + } + + catch (const onemkl::backend_unsupported_exception& e) { + return test_skipped; + } + + catch (const std::runtime_error& error) { + std::cout << "Error raised during execution of SCAL:\n" << error.what() << std::endl; + } + + // Compare the results of reference implementation and DPC++ implementation. 
+ + bool good = check_equal_vector(x, x_ref, N, incx, N, std::cout); + + return (int)good; +} + +class ScalUsmTests : public ::testing::TestWithParam {}; + +TEST_P(ScalUsmTests, RealSinglePrecision) { + float alpha(2.0); + EXPECT_TRUEORSKIP((test(GetParam(), 1357, 2, alpha))); + EXPECT_TRUEORSKIP((test(GetParam(), 1357, -3, alpha))); +} +TEST_P(ScalUsmTests, RealDoublePrecision) { + double alpha(2.0); + EXPECT_TRUEORSKIP((test(GetParam(), 1357, 2, alpha))); + EXPECT_TRUEORSKIP((test(GetParam(), 1357, -3, alpha))); +} +TEST_P(ScalUsmTests, ComplexSinglePrecision) { + std::complex alpha(2.0, -0.5); + EXPECT_TRUEORSKIP((test, std::complex>(GetParam(), 1357, 2, alpha))); + EXPECT_TRUEORSKIP( + (test, std::complex>(GetParam(), 1357, -3, alpha))); +} +TEST_P(ScalUsmTests, ComplexDoublePrecision) { + std::complex alpha(2.0, -0.5); + EXPECT_TRUEORSKIP( + (test, std::complex>(GetParam(), 1357, 2, alpha))); + EXPECT_TRUEORSKIP( + (test, std::complex>(GetParam(), 1357, -3, alpha))); +} +TEST_P(ScalUsmTests, ComplexRealSinglePrecision) { + float alpha(2.0); + EXPECT_TRUEORSKIP((test, float>(GetParam(), 1357, 2, alpha))); + EXPECT_TRUEORSKIP((test, float>(GetParam(), 1357, -3, alpha))); +} +TEST_P(ScalUsmTests, ComplexRealDoublePrecision) { + double alpha(2.0); + EXPECT_TRUEORSKIP((test, double>(GetParam(), 1357, 2, alpha))); + EXPECT_TRUEORSKIP((test, double>(GetParam(), 1357, -3, alpha))); +} + +INSTANTIATE_TEST_SUITE_P(ScalUsmTestSuite, ScalUsmTests, ::testing::ValuesIn(devices), + ::DeviceNamePrint()); + +} // anonymous namespace diff --git a/tests/unit_tests/blas/level1/sdsdot.cpp b/tests/unit_tests/blas/level1/sdsdot.cpp index a6720a3e9..a5bdc9c89 100644 --- a/tests/unit_tests/blas/level1/sdsdot.cpp +++ b/tests/unit_tests/blas/level1/sdsdot.cpp @@ -41,7 +41,7 @@ extern std::vector devices; namespace { -bool test(const device &dev, int N, int incx, int incy, float alpha) { +int test(const device &dev, int N, int incx, int incy, float alpha) { // Prepare data. 
vector x, y; float result = float(-1), result_ref = float(-1); @@ -91,6 +91,14 @@ bool test(const device &dev, int N, int incx, int incy, float alpha) { << "OpenCL status: " << e.get_cl_code() << std::endl; } + catch (const onemkl::backend_unsupported_exception &e) { + return test_skipped; + } + + catch (const std::runtime_error &error) { + std::cout << "Error raised during execution of SDSDOT:\n" << error.what() << std::endl; + } + // Compare the results of reference implementation and DPC++ implementation. bool good; { @@ -98,15 +106,15 @@ bool test(const device &dev, int N, int incx, int incy, float alpha) { good = check_equal(result_accessor[0], result_ref, N, std::cout); } - return good; + return (int)good; } class SdsdotTests : public ::testing::TestWithParam {}; TEST_P(SdsdotTests, RealSinglePrecision) { - EXPECT_TRUE(test(GetParam(), 1357, 2, 3, 2.0)); - EXPECT_TRUE(test(GetParam(), 1357, -2, -3, 2.0)); - EXPECT_TRUE(test(GetParam(), 1357, 1, 1, 2.0)); + EXPECT_TRUEORSKIP(test(GetParam(), 1357, 2, 3, 2.0)); + EXPECT_TRUEORSKIP(test(GetParam(), 1357, -2, -3, 2.0)); + EXPECT_TRUEORSKIP(test(GetParam(), 1357, 1, 1, 2.0)); } INSTANTIATE_TEST_SUITE_P(SdsdotTestSuite, SdsdotTests, ::testing::ValuesIn(devices), diff --git a/tests/unit_tests/blas/level1/sdsdot_usm.cpp b/tests/unit_tests/blas/level1/sdsdot_usm.cpp new file mode 100644 index 000000000..a93a3d711 --- /dev/null +++ b/tests/unit_tests/blas/level1/sdsdot_usm.cpp @@ -0,0 +1,126 @@ +/******************************************************************************* +* Copyright 2020 Intel Corporation +* +* Licensed under the Apache License, Version 2.0 (the "License"); +* you may not use this file except in compliance with the License. 
+* You may obtain a copy of the License at +* +* http://www.apache.org/licenses/LICENSE-2.0 +* +* Unless required by applicable law or agreed to in writing, +* software distributed under the License is distributed on an "AS IS" BASIS, +* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +* See the License for the specific language governing permissions +* and limitations under the License. +* +* +* SPDX-License-Identifier: Apache-2.0 +*******************************************************************************/ + +#include +#include +#include +#include +#include + +#include +#include "cblas.h" +#include "onemkl/detail/config.hpp" +#include "onemkl/onemkl.hpp" +#include "onemkl_blas_helper.hpp" +#include "reference_blas_templates.hpp" +#include "test_common.hpp" +#include "test_helper.hpp" + +#include + +using namespace cl::sycl; +using std::vector; + +extern std::vector devices; + +namespace { + +int test(const device &dev, int N, int incx, int incy, float alpha) { + // Catch asynchronous exceptions. + auto exception_handler = [](exception_list exceptions) { + for (std::exception_ptr const &e : exceptions) { + try { + std::rethrow_exception(e); + } + catch (exception const &e) { + std::cout << "Caught asynchronous SYCL exception during SDSDOT:\n" + << e.what() << std::endl + << "OpenCL status: " << e.get_cl_code() << std::endl; + } + } + }; + + queue main_queue(dev, exception_handler); + context cxt = main_queue.get_context(); + event done; + std::vector dependencies; + + // Prepare data. + auto ua = usm_allocator(cxt, dev); + vector x(ua), y(ua); + float result_ref = float(-1); + + rand_vector(x, N, incx); + rand_vector(y, N, incy); + + // Call Reference SDSDOT. + const int N_ref = N, incx_ref = incx, incy_ref = incy; + + result_ref = ::sdsdot(&N_ref, (float *)&alpha, (float *)x.data(), &incx_ref, (float *)y.data(), + &incy_ref); + + // Call DPC++ SDSDOT. 
+ + auto result_p = (float *)onemkl::malloc_shared(64, sizeof(float), dev, cxt); + + try { +#ifdef CALL_RT_API + done = onemkl::blas::sdsdot(main_queue, N, alpha, x.data(), incx, y.data(), incy, result_p, + dependencies); + done.wait(); +#else + TEST_RUN_CT(main_queue, onemkl::blas::sdsdot, + (main_queue, N, alpha, x.data(), incx, y.data(), incy, result_p, dependencies)); + main_queue.wait(); +#endif + } + catch (exception const &e) { + std::cout << "Caught synchronous SYCL exception during SDSDOT:\n" + << e.what() << std::endl + << "OpenCL status: " << e.get_cl_code() << std::endl; + } + + catch (const onemkl::backend_unsupported_exception &e) { + return test_skipped; + } + + catch (const std::runtime_error &error) { + std::cout << "Error raised during execution of SDSDOT:\n" << error.what() << std::endl; + } + + // Compare the results of reference implementation and DPC++ implementation. + + bool good = check_equal(*result_p, result_ref, N, std::cout); + + onemkl::free_shared(result_p, cxt); + return (int)good; +} + +class SdsdotUsmTests : public ::testing::TestWithParam {}; + +TEST_P(SdsdotUsmTests, RealSinglePrecision) { + EXPECT_TRUEORSKIP(test(GetParam(), 1357, 2, 3, 2.0)); + EXPECT_TRUEORSKIP(test(GetParam(), 1357, -2, -3, 2.0)); + EXPECT_TRUEORSKIP(test(GetParam(), 1357, 1, 1, 2.0)); +} + +INSTANTIATE_TEST_SUITE_P(SdsdotUsmTestSuite, SdsdotUsmTests, ::testing::ValuesIn(devices), + ::DeviceNamePrint()); + +} // anonymous namespace diff --git a/tests/unit_tests/blas/level1/swap.cpp b/tests/unit_tests/blas/level1/swap.cpp index 0fd2407d2..b35293ea1 100644 --- a/tests/unit_tests/blas/level1/swap.cpp +++ b/tests/unit_tests/blas/level1/swap.cpp @@ -42,7 +42,7 @@ extern std::vector devices; namespace { template -bool test(const device& dev, int N, int incx, int incy) { +int test(const device& dev, int N, int incx, int incy) { // Prepare data. 
vector x, x_ref, y, y_ref; rand_vector(x, N, incx); @@ -91,6 +91,14 @@ bool test(const device& dev, int N, int incx, int incy) { << "OpenCL status: " << e.get_cl_code() << std::endl; } + catch (const onemkl::backend_unsupported_exception& e) { + return test_skipped; + } + + catch (const std::runtime_error& error) { + std::cout << "Error raised during execution of SWAP:\n" << error.what() << std::endl; + } + // Compare the results of reference implementation and DPC++ implementation. bool good; { @@ -101,30 +109,30 @@ bool test(const device& dev, int N, int incx, int incy) { good = good_x && good_y; } - return good; + return (int)good; } class SwapTests : public ::testing::TestWithParam {}; TEST_P(SwapTests, RealSinglePrecision) { - EXPECT_TRUE(test(GetParam(), 1357, 2, 3)); - EXPECT_TRUE(test(GetParam(), 1357, -2, -3)); - EXPECT_TRUE(test(GetParam(), 1357, 1, 1)); + EXPECT_TRUEORSKIP(test(GetParam(), 1357, 2, 3)); + EXPECT_TRUEORSKIP(test(GetParam(), 1357, -2, -3)); + EXPECT_TRUEORSKIP(test(GetParam(), 1357, 1, 1)); } TEST_P(SwapTests, RealDoublePrecision) { - EXPECT_TRUE(test(GetParam(), 1357, 2, 3)); - EXPECT_TRUE(test(GetParam(), 1357, -2, -3)); - EXPECT_TRUE(test(GetParam(), 1357, 1, 1)); + EXPECT_TRUEORSKIP(test(GetParam(), 1357, 2, 3)); + EXPECT_TRUEORSKIP(test(GetParam(), 1357, -2, -3)); + EXPECT_TRUEORSKIP(test(GetParam(), 1357, 1, 1)); } TEST_P(SwapTests, ComplexSinglePrecision) { - EXPECT_TRUE(test>(GetParam(), 1357, 2, 3)); - EXPECT_TRUE(test>(GetParam(), 1357, -2, -3)); - EXPECT_TRUE(test>(GetParam(), 1357, 1, 1)); + EXPECT_TRUEORSKIP(test>(GetParam(), 1357, 2, 3)); + EXPECT_TRUEORSKIP(test>(GetParam(), 1357, -2, -3)); + EXPECT_TRUEORSKIP(test>(GetParam(), 1357, 1, 1)); } TEST_P(SwapTests, ComplexDoublePrecision) { - EXPECT_TRUE(test>(GetParam(), 1357, 2, 3)); - EXPECT_TRUE(test>(GetParam(), 1357, -2, -3)); - EXPECT_TRUE(test>(GetParam(), 1357, 1, 1)); + EXPECT_TRUEORSKIP(test>(GetParam(), 1357, 2, 3)); + EXPECT_TRUEORSKIP(test>(GetParam(), 1357, -2, 
-3)); + EXPECT_TRUEORSKIP(test>(GetParam(), 1357, 1, 1)); } INSTANTIATE_TEST_SUITE_P(SwapTestSuite, SwapTests, ::testing::ValuesIn(devices), diff --git a/tests/unit_tests/blas/level1/swap_usm.cpp b/tests/unit_tests/blas/level1/swap_usm.cpp new file mode 100644 index 000000000..ad51f4acd --- /dev/null +++ b/tests/unit_tests/blas/level1/swap_usm.cpp @@ -0,0 +1,141 @@ +/******************************************************************************* +* Copyright 2020 Intel Corporation +* +* Licensed under the Apache License, Version 2.0 (the "License"); +* you may not use this file except in compliance with the License. +* You may obtain a copy of the License at +* +* http://www.apache.org/licenses/LICENSE-2.0 +* +* Unless required by applicable law or agreed to in writing, +* software distributed under the License is distributed on an "AS IS" BASIS, +* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +* See the License for the specific language governing permissions +* and limitations under the License. +* +* +* SPDX-License-Identifier: Apache-2.0 +*******************************************************************************/ + +#include +#include +#include +#include +#include + +#include +#include "cblas.h" +#include "onemkl/detail/config.hpp" +#include "onemkl/onemkl.hpp" +#include "onemkl_blas_helper.hpp" +#include "reference_blas_templates.hpp" +#include "test_common.hpp" +#include "test_helper.hpp" + +#include + +using namespace cl::sycl; +using std::vector; + +extern std::vector devices; + +namespace { + +template +int test(const device& dev, int N, int incx, int incy) { + // Catch asynchronous exceptions. 
+ auto exception_handler = [](exception_list exceptions) { + for (std::exception_ptr const& e : exceptions) { + try { + std::rethrow_exception(e); + } + catch (exception const& e) { + std::cout << "Caught asynchronous SYCL exception during SWAP:\n" + << e.what() << std::endl + << "OpenCL status: " << e.get_cl_code() << std::endl; + } + } + }; + + queue main_queue(dev, exception_handler); + context cxt = main_queue.get_context(); + event done; + std::vector dependencies; + + // Prepare data. + auto ua = usm_allocator(cxt, dev); + vector x(ua), y(ua); + rand_vector(x, N, incx); + rand_vector(y, N, incy); + + auto x_ref = x; + auto y_ref = y; + + // Call Reference SWAP. + using fp_ref = typename ref_type_info::type; + const int N_ref = N, incx_ref = incx, incy_ref = incy; + + ::swap(&N_ref, (fp_ref*)x_ref.data(), &incx_ref, (fp_ref*)y_ref.data(), &incy_ref); + + // Call DPC++ SWAP. + + try { +#ifdef CALL_RT_API + done = onemkl::blas::swap(main_queue, N, x.data(), incx, y.data(), incy, dependencies); + done.wait(); +#else + TEST_RUN_CT(main_queue, onemkl::blas::swap, + (main_queue, N, x.data(), incx, y.data(), incy, dependencies)); + main_queue.wait(); +#endif + } + catch (exception const& e) { + std::cout << "Caught synchronous SYCL exception during SWAP:\n" + << e.what() << std::endl + << "OpenCL status: " << e.get_cl_code() << std::endl; + } + + catch (const onemkl::backend_unsupported_exception& e) { + return test_skipped; + } + + catch (const std::runtime_error& error) { + std::cout << "Error raised during execution of SWAP:\n" << error.what() << std::endl; + } + + // Compare the results of reference implementation and DPC++ implementation. 
+ + bool good_y = check_equal_vector(y, y_ref, N, incy, N, std::cout); + bool good_x = check_equal_vector(x, x_ref, N, incx, N, std::cout); + bool good = good_x && good_y; + + return (int)good; +} + +class SwapUsmTests : public ::testing::TestWithParam {}; + +TEST_P(SwapUsmTests, RealSinglePrecision) { + EXPECT_TRUEORSKIP(test(GetParam(), 1357, 2, 3)); + EXPECT_TRUEORSKIP(test(GetParam(), 1357, -2, -3)); + EXPECT_TRUEORSKIP(test(GetParam(), 1357, 1, 1)); +} +TEST_P(SwapUsmTests, RealDoublePrecision) { + EXPECT_TRUEORSKIP(test(GetParam(), 1357, 2, 3)); + EXPECT_TRUEORSKIP(test(GetParam(), 1357, -2, -3)); + EXPECT_TRUEORSKIP(test(GetParam(), 1357, 1, 1)); +} +TEST_P(SwapUsmTests, ComplexSinglePrecision) { + EXPECT_TRUEORSKIP(test>(GetParam(), 1357, 2, 3)); + EXPECT_TRUEORSKIP(test>(GetParam(), 1357, -2, -3)); + EXPECT_TRUEORSKIP(test>(GetParam(), 1357, 1, 1)); +} +TEST_P(SwapUsmTests, ComplexDoublePrecision) { + EXPECT_TRUEORSKIP(test>(GetParam(), 1357, 2, 3)); + EXPECT_TRUEORSKIP(test>(GetParam(), 1357, -2, -3)); + EXPECT_TRUEORSKIP(test>(GetParam(), 1357, 1, 1)); +} + +INSTANTIATE_TEST_SUITE_P(SwapUsmTestSuite, SwapUsmTests, ::testing::ValuesIn(devices), + ::DeviceNamePrint()); + +} // anonymous namespace diff --git a/tests/unit_tests/blas/level2/CMakeLists.txt b/tests/unit_tests/blas/level2/CMakeLists.txt index 7e2db862b..4facc1456 100644 --- a/tests/unit_tests/blas/level2/CMakeLists.txt +++ b/tests/unit_tests/blas/level2/CMakeLists.txt @@ -18,7 +18,7 @@ #=============================================================================== # Build object from all test sources -set(L2_SOURCES "hpr2.cpp" "hpmv.cpp" "her.cpp" "her2.cpp" "hemv.cpp" "hbmv.cpp" "geru.cpp" "ger.cpp" "gerc.cpp" "gemv.cpp" "gbmv.cpp" "trsv.cpp" "trmv.cpp" "tpsv.cpp" "tpmv.cpp" "tbsv.cpp" "tbmv.cpp" "syr.cpp" "syr2.cpp" "symv.cpp" "spr.cpp" "spr2.cpp" "spmv.cpp" "sbmv.cpp" "hpr.cpp") +set(L2_SOURCES "hpr2.cpp" "hpmv.cpp" "her.cpp" "her2.cpp" "hemv.cpp" "hbmv.cpp" "geru.cpp" "ger.cpp" "gerc.cpp" 
"gemv.cpp" "gbmv.cpp" "trsv.cpp" "trmv.cpp" "tpsv.cpp" "tpmv.cpp" "tbsv.cpp" "tbmv.cpp" "syr.cpp" "syr2.cpp" "symv.cpp" "spr.cpp" "spr2.cpp" "spmv.cpp" "sbmv.cpp" "hpr.cpp" "hpr2_usm.cpp" "hpmv_usm.cpp" "her_usm.cpp" "her2_usm.cpp" "hemv_usm.cpp" "hbmv_usm.cpp" "geru_usm.cpp" "ger_usm.cpp" "gerc_usm.cpp" "gemv_usm.cpp" "gbmv_usm.cpp" "trsv_usm.cpp" "trmv_usm.cpp" "tpsv_usm.cpp" "tpmv_usm.cpp" "tbsv_usm.cpp" "tbmv_usm.cpp" "syr_usm.cpp" "syr2_usm.cpp" "symv_usm.cpp" "spr_usm.cpp" "spr2_usm.cpp" "spmv_usm.cpp" "sbmv_usm.cpp" "hpr_usm.cpp") if(BUILD_SHARED_LIBS) add_library(blas_level2_rt OBJECT ${L2_SOURCES}) diff --git a/tests/unit_tests/blas/level2/gbmv.cpp b/tests/unit_tests/blas/level2/gbmv.cpp index b03534b30..e16bbf180 100644 --- a/tests/unit_tests/blas/level2/gbmv.cpp +++ b/tests/unit_tests/blas/level2/gbmv.cpp @@ -43,8 +43,8 @@ extern std::vector devices; namespace { template -bool test(const device &dev, onemkl::transpose transa, int m, int n, int kl, int ku, fp alpha, - fp beta, int incx, int incy, int lda) { +int test(const device &dev, onemkl::transpose transa, int m, int n, int kl, int ku, fp alpha, + fp beta, int incx, int incy, int lda) { // Prepare data. int x_len = outer_dimension(transa, m, n); int y_len = inner_dimension(transa, m, n); @@ -103,6 +103,14 @@ bool test(const device &dev, onemkl::transpose transa, int m, int n, int kl, int << "OpenCL status: " << e.get_cl_code() << std::endl; } + catch (const onemkl::backend_unsupported_exception &e) { + return test_skipped; + } + + catch (const std::runtime_error &error) { + std::cout << "Error raised during execution of GBMV:\n" << error.what() << std::endl; + } + // Compare the results of reference implementation and DPC++ implementation. 
bool good; { @@ -110,7 +118,7 @@ bool test(const device &dev, onemkl::transpose transa, int m, int n, int kl, int good = check_equal_vector(y_accessor, y_ref, y_len, incy, std::max(m, n), std::cout); } - return good; + return (int)good; } class GbmvTests : public ::testing::TestWithParam {}; @@ -118,78 +126,78 @@ class GbmvTests : public ::testing::TestWithParam {}; TEST_P(GbmvTests, RealSinglePrecision) { float alpha(2.0); float beta(3.0); - EXPECT_TRUE( + EXPECT_TRUEORSKIP( test(GetParam(), onemkl::transpose::nontrans, 25, 30, 5, 7, alpha, beta, 2, 3, 42)); - EXPECT_TRUE(test(GetParam(), onemkl::transpose::nontrans, 25, 30, 5, 7, alpha, beta, -2, - -3, 42)); - EXPECT_TRUE( + EXPECT_TRUEORSKIP(test(GetParam(), onemkl::transpose::nontrans, 25, 30, 5, 7, alpha, + beta, -2, -3, 42)); + EXPECT_TRUEORSKIP( test(GetParam(), onemkl::transpose::nontrans, 25, 30, 5, 7, alpha, beta, 1, 1, 42)); - EXPECT_TRUE( + EXPECT_TRUEORSKIP( test(GetParam(), onemkl::transpose::trans, 25, 30, 5, 7, alpha, beta, 2, 3, 42)); - EXPECT_TRUE( + EXPECT_TRUEORSKIP( test(GetParam(), onemkl::transpose::trans, 25, 30, 5, 7, alpha, beta, -2, -3, 42)); - EXPECT_TRUE( + EXPECT_TRUEORSKIP( test(GetParam(), onemkl::transpose::trans, 25, 30, 5, 7, alpha, beta, 1, 1, 42)); } TEST_P(GbmvTests, RealDoublePrecision) { double alpha(2.0); double beta(3.0); - EXPECT_TRUE( + EXPECT_TRUEORSKIP( test(GetParam(), onemkl::transpose::nontrans, 25, 30, 5, 7, alpha, beta, 2, 3, 42)); - EXPECT_TRUE(test(GetParam(), onemkl::transpose::nontrans, 25, 30, 5, 7, alpha, beta, -2, - -3, 42)); - EXPECT_TRUE( + EXPECT_TRUEORSKIP(test(GetParam(), onemkl::transpose::nontrans, 25, 30, 5, 7, alpha, + beta, -2, -3, 42)); + EXPECT_TRUEORSKIP( test(GetParam(), onemkl::transpose::nontrans, 25, 30, 5, 7, alpha, beta, 1, 1, 42)); - EXPECT_TRUE( + EXPECT_TRUEORSKIP( test(GetParam(), onemkl::transpose::trans, 25, 30, 5, 7, alpha, beta, 2, 3, 42)); - EXPECT_TRUE( + EXPECT_TRUEORSKIP( test(GetParam(), onemkl::transpose::trans, 25, 30, 5, 7, 
alpha, beta, -2, -3, 42)); - EXPECT_TRUE( + EXPECT_TRUEORSKIP( test(GetParam(), onemkl::transpose::trans, 25, 30, 5, 7, alpha, beta, 1, 1, 42)); } TEST_P(GbmvTests, ComplexSinglePrecision) { std::complex alpha(2.0, -0.5); std::complex beta(3.0, -1.5); - EXPECT_TRUE(test>(GetParam(), onemkl::transpose::nontrans, 25, 30, 5, 7, - alpha, beta, 2, 3, 42)); - EXPECT_TRUE(test>(GetParam(), onemkl::transpose::nontrans, 25, 30, 5, 7, - alpha, beta, -2, -3, 42)); - EXPECT_TRUE(test>(GetParam(), onemkl::transpose::nontrans, 25, 30, 5, 7, - alpha, beta, 1, 1, 42)); - EXPECT_TRUE(test>(GetParam(), onemkl::transpose::trans, 25, 30, 5, 7, alpha, - beta, 2, 3, 42)); - EXPECT_TRUE(test>(GetParam(), onemkl::transpose::trans, 25, 30, 5, 7, alpha, - beta, -2, -3, 42)); - EXPECT_TRUE(test>(GetParam(), onemkl::transpose::trans, 25, 30, 5, 7, alpha, - beta, 1, 1, 42)); - EXPECT_TRUE(test>(GetParam(), onemkl::transpose::conjtrans, 25, 30, 5, 7, - alpha, beta, 2, 3, 42)); - EXPECT_TRUE(test>(GetParam(), onemkl::transpose::conjtrans, 25, 30, 5, 7, - alpha, beta, -2, -3, 42)); - EXPECT_TRUE(test>(GetParam(), onemkl::transpose::conjtrans, 25, 30, 5, 7, - alpha, beta, 1, 1, 42)); + EXPECT_TRUEORSKIP(test>(GetParam(), onemkl::transpose::nontrans, 25, 30, 5, + 7, alpha, beta, 2, 3, 42)); + EXPECT_TRUEORSKIP(test>(GetParam(), onemkl::transpose::nontrans, 25, 30, 5, + 7, alpha, beta, -2, -3, 42)); + EXPECT_TRUEORSKIP(test>(GetParam(), onemkl::transpose::nontrans, 25, 30, 5, + 7, alpha, beta, 1, 1, 42)); + EXPECT_TRUEORSKIP(test>(GetParam(), onemkl::transpose::trans, 25, 30, 5, 7, + alpha, beta, 2, 3, 42)); + EXPECT_TRUEORSKIP(test>(GetParam(), onemkl::transpose::trans, 25, 30, 5, 7, + alpha, beta, -2, -3, 42)); + EXPECT_TRUEORSKIP(test>(GetParam(), onemkl::transpose::trans, 25, 30, 5, 7, + alpha, beta, 1, 1, 42)); + EXPECT_TRUEORSKIP(test>(GetParam(), onemkl::transpose::conjtrans, 25, 30, 5, + 7, alpha, beta, 2, 3, 42)); + EXPECT_TRUEORSKIP(test>(GetParam(), onemkl::transpose::conjtrans, 25, 30, 
5, + 7, alpha, beta, -2, -3, 42)); + EXPECT_TRUEORSKIP(test>(GetParam(), onemkl::transpose::conjtrans, 25, 30, 5, + 7, alpha, beta, 1, 1, 42)); } TEST_P(GbmvTests, ComplexDoublePrecision) { std::complex alpha(2.0, -0.5); std::complex beta(3.0, -1.5); - EXPECT_TRUE(test>(GetParam(), onemkl::transpose::nontrans, 25, 30, 5, 7, - alpha, beta, 2, 3, 42)); - EXPECT_TRUE(test>(GetParam(), onemkl::transpose::nontrans, 25, 30, 5, 7, - alpha, beta, -2, -3, 42)); - EXPECT_TRUE(test>(GetParam(), onemkl::transpose::nontrans, 25, 30, 5, 7, - alpha, beta, 1, 1, 42)); - EXPECT_TRUE(test>(GetParam(), onemkl::transpose::trans, 25, 30, 5, 7, - alpha, beta, 2, 3, 42)); - EXPECT_TRUE(test>(GetParam(), onemkl::transpose::trans, 25, 30, 5, 7, - alpha, beta, -2, -3, 42)); - EXPECT_TRUE(test>(GetParam(), onemkl::transpose::trans, 25, 30, 5, 7, - alpha, beta, 1, 1, 42)); - EXPECT_TRUE(test>(GetParam(), onemkl::transpose::conjtrans, 25, 30, 5, 7, - alpha, beta, 2, 3, 42)); - EXPECT_TRUE(test>(GetParam(), onemkl::transpose::conjtrans, 25, 30, 5, 7, - alpha, beta, -2, -3, 42)); - EXPECT_TRUE(test>(GetParam(), onemkl::transpose::conjtrans, 25, 30, 5, 7, - alpha, beta, 1, 1, 42)); + EXPECT_TRUEORSKIP(test>(GetParam(), onemkl::transpose::nontrans, 25, 30, 5, + 7, alpha, beta, 2, 3, 42)); + EXPECT_TRUEORSKIP(test>(GetParam(), onemkl::transpose::nontrans, 25, 30, 5, + 7, alpha, beta, -2, -3, 42)); + EXPECT_TRUEORSKIP(test>(GetParam(), onemkl::transpose::nontrans, 25, 30, 5, + 7, alpha, beta, 1, 1, 42)); + EXPECT_TRUEORSKIP(test>(GetParam(), onemkl::transpose::trans, 25, 30, 5, 7, + alpha, beta, 2, 3, 42)); + EXPECT_TRUEORSKIP(test>(GetParam(), onemkl::transpose::trans, 25, 30, 5, 7, + alpha, beta, -2, -3, 42)); + EXPECT_TRUEORSKIP(test>(GetParam(), onemkl::transpose::trans, 25, 30, 5, 7, + alpha, beta, 1, 1, 42)); + EXPECT_TRUEORSKIP(test>(GetParam(), onemkl::transpose::conjtrans, 25, 30, + 5, 7, alpha, beta, 2, 3, 42)); + EXPECT_TRUEORSKIP(test>(GetParam(), onemkl::transpose::conjtrans, 25, 30, + 
5, 7, alpha, beta, -2, -3, 42)); + EXPECT_TRUEORSKIP(test>(GetParam(), onemkl::transpose::conjtrans, 25, 30, + 5, 7, alpha, beta, 1, 1, 42)); } INSTANTIATE_TEST_SUITE_P(GbmvTestSuite, GbmvTests, ::testing::ValuesIn(devices), diff --git a/tests/unit_tests/blas/level2/gbmv_usm.cpp b/tests/unit_tests/blas/level2/gbmv_usm.cpp new file mode 100644 index 000000000..d6a7ebe58 --- /dev/null +++ b/tests/unit_tests/blas/level2/gbmv_usm.cpp @@ -0,0 +1,205 @@ +/******************************************************************************* +* Copyright 2020 Intel Corporation +* +* Licensed under the Apache License, Version 2.0 (the "License"); +* you may not use this file except in compliance with the License. +* You may obtain a copy of the License at +* +* http://www.apache.org/licenses/LICENSE-2.0 +* +* Unless required by applicable law or agreed to in writing, +* software distributed under the License is distributed on an "AS IS" BASIS, +* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +* See the License for the specific language governing permissions +* and limitations under the License. +* +* +* SPDX-License-Identifier: Apache-2.0 +*******************************************************************************/ + +#include +#include +#include +#include +#include +#include + +#include +#include "cblas.h" +#include "onemkl/detail/config.hpp" +#include "onemkl/onemkl.hpp" +#include "onemkl_blas_helper.hpp" +#include "reference_blas_templates.hpp" +#include "test_common.hpp" +#include "test_helper.hpp" + +#include + +using namespace cl::sycl; +using std::vector; + +extern std::vector devices; + +namespace { + +template +int test(const device &dev, onemkl::transpose transa, int m, int n, int kl, int ku, fp alpha, + fp beta, int incx, int incy, int lda) { + // Catch asynchronous exceptions. 
+ auto exception_handler = [](exception_list exceptions) { + for (std::exception_ptr const &e : exceptions) { + try { + std::rethrow_exception(e); + } + catch (exception const &e) { + std::cout << "Caught asynchronous SYCL exception during GBMV:\n" + << e.what() << std::endl + << "OpenCL status: " << e.get_cl_code() << std::endl; + } + } + }; + + queue main_queue(dev, exception_handler); + context cxt = main_queue.get_context(); + event done; + std::vector dependencies; + + // Prepare data. + auto ua = usm_allocator(cxt, dev); + vector x(ua), y(ua), A(ua); + int x_len = outer_dimension(transa, m, n); + int y_len = inner_dimension(transa, m, n); + + rand_vector(x, x_len, incx); + rand_vector(y, y_len, incy); + rand_matrix(A, onemkl::transpose::nontrans, m, n, lda); + + auto y_ref = y; + + // Call Reference GBMV. + const int m_ref = m, n_ref = n, incx_ref = incx, incy_ref = incy, lda_ref = lda; + int kl_ref = kl, ku_ref = ku; + using fp_ref = typename ref_type_info::type; + + ::gbmv(convert_to_cblas_trans(transa), &m_ref, &n_ref, &kl_ref, &ku_ref, (fp_ref *)&alpha, + (fp_ref *)A.data(), &lda_ref, (fp_ref *)x.data(), &incx_ref, (fp_ref *)&beta, + (fp_ref *)y_ref.data(), &incy_ref); + + // Call DPC++ GBMV. 
+ + try { +#ifdef CALL_RT_API + done = onemkl::blas::gbmv(main_queue, transa, m, n, kl, ku, alpha, A.data(), lda, x.data(), + incx, beta, y.data(), incy, dependencies); + done.wait(); +#else + TEST_RUN_CT(main_queue, onemkl::blas::gbmv, + (main_queue, transa, m, n, kl, ku, alpha, A.data(), lda, x.data(), incx, beta, + y.data(), incy, dependencies)); + main_queue.wait(); +#endif + } + catch (exception const &e) { + std::cout << "Caught synchronous SYCL exception during GBMV:\n" + << e.what() << std::endl + << "OpenCL status: " << e.get_cl_code() << std::endl; + } + + catch (const onemkl::backend_unsupported_exception &e) { + return test_skipped; + } + + catch (const std::runtime_error &error) { + std::cout << "Error raised during execution of GBMV:\n" << error.what() << std::endl; + } + + // Compare the results of reference implementation and DPC++ implementation. + + bool good = check_equal_vector(y, y_ref, y_len, incy, std::max(m, n), std::cout); + + return (int)good; +} + +class GbmvUsmTests : public ::testing::TestWithParam {}; + +TEST_P(GbmvUsmTests, RealSinglePrecision) { + float alpha(2.0); + float beta(3.0); + EXPECT_TRUEORSKIP( + test(GetParam(), onemkl::transpose::nontrans, 25, 30, 5, 7, alpha, beta, 2, 3, 42)); + EXPECT_TRUEORSKIP(test(GetParam(), onemkl::transpose::nontrans, 25, 30, 5, 7, alpha, + beta, -2, -3, 42)); + EXPECT_TRUEORSKIP( + test(GetParam(), onemkl::transpose::nontrans, 25, 30, 5, 7, alpha, beta, 1, 1, 42)); + EXPECT_TRUEORSKIP( + test(GetParam(), onemkl::transpose::trans, 25, 30, 5, 7, alpha, beta, 2, 3, 42)); + EXPECT_TRUEORSKIP( + test(GetParam(), onemkl::transpose::trans, 25, 30, 5, 7, alpha, beta, -2, -3, 42)); + EXPECT_TRUEORSKIP( + test(GetParam(), onemkl::transpose::trans, 25, 30, 5, 7, alpha, beta, 1, 1, 42)); +} +TEST_P(GbmvUsmTests, RealDoublePrecision) { + double alpha(2.0); + double beta(3.0); + EXPECT_TRUEORSKIP( + test(GetParam(), onemkl::transpose::nontrans, 25, 30, 5, 7, alpha, beta, 2, 3, 42)); + 
EXPECT_TRUEORSKIP(test(GetParam(), onemkl::transpose::nontrans, 25, 30, 5, 7, alpha, + beta, -2, -3, 42)); + EXPECT_TRUEORSKIP( + test(GetParam(), onemkl::transpose::nontrans, 25, 30, 5, 7, alpha, beta, 1, 1, 42)); + EXPECT_TRUEORSKIP( + test(GetParam(), onemkl::transpose::trans, 25, 30, 5, 7, alpha, beta, 2, 3, 42)); + EXPECT_TRUEORSKIP( + test(GetParam(), onemkl::transpose::trans, 25, 30, 5, 7, alpha, beta, -2, -3, 42)); + EXPECT_TRUEORSKIP( + test(GetParam(), onemkl::transpose::trans, 25, 30, 5, 7, alpha, beta, 1, 1, 42)); +} +TEST_P(GbmvUsmTests, ComplexSinglePrecision) { + std::complex alpha(2.0, -0.5); + std::complex beta(3.0, -1.5); + EXPECT_TRUEORSKIP(test>(GetParam(), onemkl::transpose::nontrans, 25, 30, 5, + 7, alpha, beta, 2, 3, 42)); + EXPECT_TRUEORSKIP(test>(GetParam(), onemkl::transpose::nontrans, 25, 30, 5, + 7, alpha, beta, -2, -3, 42)); + EXPECT_TRUEORSKIP(test>(GetParam(), onemkl::transpose::nontrans, 25, 30, 5, + 7, alpha, beta, 1, 1, 42)); + EXPECT_TRUEORSKIP(test>(GetParam(), onemkl::transpose::trans, 25, 30, 5, 7, + alpha, beta, 2, 3, 42)); + EXPECT_TRUEORSKIP(test>(GetParam(), onemkl::transpose::trans, 25, 30, 5, 7, + alpha, beta, -2, -3, 42)); + EXPECT_TRUEORSKIP(test>(GetParam(), onemkl::transpose::trans, 25, 30, 5, 7, + alpha, beta, 1, 1, 42)); + EXPECT_TRUEORSKIP(test>(GetParam(), onemkl::transpose::conjtrans, 25, 30, 5, + 7, alpha, beta, 2, 3, 42)); + EXPECT_TRUEORSKIP(test>(GetParam(), onemkl::transpose::conjtrans, 25, 30, 5, + 7, alpha, beta, -2, -3, 42)); + EXPECT_TRUEORSKIP(test>(GetParam(), onemkl::transpose::conjtrans, 25, 30, 5, + 7, alpha, beta, 1, 1, 42)); +} +TEST_P(GbmvUsmTests, ComplexDoublePrecision) { + std::complex alpha(2.0, -0.5); + std::complex beta(3.0, -1.5); + EXPECT_TRUEORSKIP(test>(GetParam(), onemkl::transpose::nontrans, 25, 30, 5, + 7, alpha, beta, 2, 3, 42)); + EXPECT_TRUEORSKIP(test>(GetParam(), onemkl::transpose::nontrans, 25, 30, 5, + 7, alpha, beta, -2, -3, 42)); + EXPECT_TRUEORSKIP(test>(GetParam(), 
onemkl::transpose::nontrans, 25, 30, 5, + 7, alpha, beta, 1, 1, 42)); + EXPECT_TRUEORSKIP(test>(GetParam(), onemkl::transpose::trans, 25, 30, 5, 7, + alpha, beta, 2, 3, 42)); + EXPECT_TRUEORSKIP(test>(GetParam(), onemkl::transpose::trans, 25, 30, 5, 7, + alpha, beta, -2, -3, 42)); + EXPECT_TRUEORSKIP(test>(GetParam(), onemkl::transpose::trans, 25, 30, 5, 7, + alpha, beta, 1, 1, 42)); + EXPECT_TRUEORSKIP(test>(GetParam(), onemkl::transpose::conjtrans, 25, 30, + 5, 7, alpha, beta, 2, 3, 42)); + EXPECT_TRUEORSKIP(test>(GetParam(), onemkl::transpose::conjtrans, 25, 30, + 5, 7, alpha, beta, -2, -3, 42)); + EXPECT_TRUEORSKIP(test>(GetParam(), onemkl::transpose::conjtrans, 25, 30, + 5, 7, alpha, beta, 1, 1, 42)); +} + +INSTANTIATE_TEST_SUITE_P(GbmvUsmTestSuite, GbmvUsmTests, ::testing::ValuesIn(devices), + ::DeviceNamePrint()); + +} // anonymous namespace diff --git a/tests/unit_tests/blas/level2/gemv.cpp b/tests/unit_tests/blas/level2/gemv.cpp index ed238d42b..8c2899c8f 100644 --- a/tests/unit_tests/blas/level2/gemv.cpp +++ b/tests/unit_tests/blas/level2/gemv.cpp @@ -43,8 +43,8 @@ extern std::vector devices; namespace { template -bool test(const device &dev, onemkl::transpose transa, int m, int n, fp alpha, fp beta, int incx, - int incy, int lda) { +int test(const device &dev, onemkl::transpose transa, int m, int n, fp alpha, fp beta, int incx, + int incy, int lda) { // Prepare data. int x_len = outer_dimension(transa, m, n); int y_len = inner_dimension(transa, m, n); @@ -102,6 +102,14 @@ bool test(const device &dev, onemkl::transpose transa, int m, int n, fp alpha, f << "OpenCL status: " << e.get_cl_code() << std::endl; } + catch (const onemkl::backend_unsupported_exception &e) { + return test_skipped; + } + + catch (const std::runtime_error &error) { + std::cout << "Error raised during execution of GEMV:\n" << error.what() << std::endl; + } + // Compare the results of reference implementation and DPC++ implementation. 
bool good; { @@ -109,7 +117,7 @@ bool test(const device &dev, onemkl::transpose transa, int m, int n, fp alpha, f good = check_equal_vector(y_accessor, y_ref, y_len, incy, std::max(m, n), std::cout); } - return good; + return (int)good; } class GemvTests : public ::testing::TestWithParam {}; @@ -117,73 +125,78 @@ class GemvTests : public ::testing::TestWithParam {}; TEST_P(GemvTests, RealSinglePrecision) { float alpha(2.0); float beta(3.0); - EXPECT_TRUE( + EXPECT_TRUEORSKIP( test(GetParam(), onemkl::transpose::nontrans, 25, 30, alpha, beta, 2, 3, 42)); - EXPECT_TRUE( + EXPECT_TRUEORSKIP( test(GetParam(), onemkl::transpose::nontrans, 25, 30, alpha, beta, -2, -3, 42)); - EXPECT_TRUE( + EXPECT_TRUEORSKIP( test(GetParam(), onemkl::transpose::nontrans, 25, 30, alpha, beta, 1, 1, 42)); - EXPECT_TRUE(test(GetParam(), onemkl::transpose::trans, 25, 30, alpha, beta, 2, 3, 42)); - EXPECT_TRUE(test(GetParam(), onemkl::transpose::trans, 25, 30, alpha, beta, -2, -3, 42)); - EXPECT_TRUE(test(GetParam(), onemkl::transpose::trans, 25, 30, alpha, beta, 1, 1, 42)); + EXPECT_TRUEORSKIP( + test(GetParam(), onemkl::transpose::trans, 25, 30, alpha, beta, 2, 3, 42)); + EXPECT_TRUEORSKIP( + test(GetParam(), onemkl::transpose::trans, 25, 30, alpha, beta, -2, -3, 42)); + EXPECT_TRUEORSKIP( + test(GetParam(), onemkl::transpose::trans, 25, 30, alpha, beta, 1, 1, 42)); } TEST_P(GemvTests, RealDoublePrecision) { double alpha(2.0); double beta(3.0); - EXPECT_TRUE( + EXPECT_TRUEORSKIP( test(GetParam(), onemkl::transpose::nontrans, 25, 30, alpha, beta, 2, 3, 42)); - EXPECT_TRUE( + EXPECT_TRUEORSKIP( test(GetParam(), onemkl::transpose::nontrans, 25, 30, alpha, beta, -2, -3, 42)); - EXPECT_TRUE( + EXPECT_TRUEORSKIP( test(GetParam(), onemkl::transpose::nontrans, 25, 30, alpha, beta, 1, 1, 42)); - EXPECT_TRUE(test(GetParam(), onemkl::transpose::trans, 25, 30, alpha, beta, 2, 3, 42)); - EXPECT_TRUE( + EXPECT_TRUEORSKIP( + test(GetParam(), onemkl::transpose::trans, 25, 30, alpha, beta, 2, 3, 42)); + 
EXPECT_TRUEORSKIP( test(GetParam(), onemkl::transpose::trans, 25, 30, alpha, beta, -2, -3, 42)); - EXPECT_TRUE(test(GetParam(), onemkl::transpose::trans, 25, 30, alpha, beta, 1, 1, 42)); + EXPECT_TRUEORSKIP( + test(GetParam(), onemkl::transpose::trans, 25, 30, alpha, beta, 1, 1, 42)); } TEST_P(GemvTests, ComplexSinglePrecision) { std::complex alpha(2.0, -0.5); std::complex beta(3.0, -1.5); - EXPECT_TRUE(test>(GetParam(), onemkl::transpose::nontrans, 25, 30, alpha, - beta, 2, 3, 42)); - EXPECT_TRUE(test>(GetParam(), onemkl::transpose::nontrans, 25, 30, alpha, - beta, -2, -3, 42)); - EXPECT_TRUE(test>(GetParam(), onemkl::transpose::nontrans, 25, 30, alpha, - beta, 1, 1, 42)); - EXPECT_TRUE(test>(GetParam(), onemkl::transpose::trans, 25, 30, alpha, beta, - 2, 3, 42)); - EXPECT_TRUE(test>(GetParam(), onemkl::transpose::trans, 25, 30, alpha, beta, - -2, -3, 42)); - EXPECT_TRUE(test>(GetParam(), onemkl::transpose::trans, 25, 30, alpha, beta, - 1, 1, 42)); - EXPECT_TRUE(test>(GetParam(), onemkl::transpose::conjtrans, 25, 30, alpha, - beta, 2, 3, 42)); - EXPECT_TRUE(test>(GetParam(), onemkl::transpose::conjtrans, 25, 30, alpha, - beta, -2, -3, 42)); - EXPECT_TRUE(test>(GetParam(), onemkl::transpose::conjtrans, 25, 30, alpha, - beta, 1, 1, 42)); + EXPECT_TRUEORSKIP(test>(GetParam(), onemkl::transpose::nontrans, 25, 30, + alpha, beta, 2, 3, 42)); + EXPECT_TRUEORSKIP(test>(GetParam(), onemkl::transpose::nontrans, 25, 30, + alpha, beta, -2, -3, 42)); + EXPECT_TRUEORSKIP(test>(GetParam(), onemkl::transpose::nontrans, 25, 30, + alpha, beta, 1, 1, 42)); + EXPECT_TRUEORSKIP(test>(GetParam(), onemkl::transpose::trans, 25, 30, alpha, + beta, 2, 3, 42)); + EXPECT_TRUEORSKIP(test>(GetParam(), onemkl::transpose::trans, 25, 30, alpha, + beta, -2, -3, 42)); + EXPECT_TRUEORSKIP(test>(GetParam(), onemkl::transpose::trans, 25, 30, alpha, + beta, 1, 1, 42)); + EXPECT_TRUEORSKIP(test>(GetParam(), onemkl::transpose::conjtrans, 25, 30, + alpha, beta, 2, 3, 42)); + 
EXPECT_TRUEORSKIP(test>(GetParam(), onemkl::transpose::conjtrans, 25, 30, + alpha, beta, -2, -3, 42)); + EXPECT_TRUEORSKIP(test>(GetParam(), onemkl::transpose::conjtrans, 25, 30, + alpha, beta, 1, 1, 42)); } TEST_P(GemvTests, ComplexDoublePrecision) { std::complex alpha(2.0, -0.5); std::complex beta(3.0, -1.5); - EXPECT_TRUE(test>(GetParam(), onemkl::transpose::nontrans, 25, 30, alpha, - beta, 2, 3, 42)); - EXPECT_TRUE(test>(GetParam(), onemkl::transpose::nontrans, 25, 30, alpha, - beta, -2, -3, 42)); - EXPECT_TRUE(test>(GetParam(), onemkl::transpose::nontrans, 25, 30, alpha, - beta, 1, 1, 42)); - EXPECT_TRUE(test>(GetParam(), onemkl::transpose::trans, 25, 30, alpha, - beta, 2, 3, 42)); - EXPECT_TRUE(test>(GetParam(), onemkl::transpose::trans, 25, 30, alpha, - beta, -2, -3, 42)); - EXPECT_TRUE(test>(GetParam(), onemkl::transpose::trans, 25, 30, alpha, - beta, 1, 1, 42)); - EXPECT_TRUE(test>(GetParam(), onemkl::transpose::conjtrans, 25, 30, alpha, - beta, 2, 3, 42)); - EXPECT_TRUE(test>(GetParam(), onemkl::transpose::conjtrans, 25, 30, alpha, - beta, -2, -3, 42)); - EXPECT_TRUE(test>(GetParam(), onemkl::transpose::conjtrans, 25, 30, alpha, - beta, 1, 1, 42)); + EXPECT_TRUEORSKIP(test>(GetParam(), onemkl::transpose::nontrans, 25, 30, + alpha, beta, 2, 3, 42)); + EXPECT_TRUEORSKIP(test>(GetParam(), onemkl::transpose::nontrans, 25, 30, + alpha, beta, -2, -3, 42)); + EXPECT_TRUEORSKIP(test>(GetParam(), onemkl::transpose::nontrans, 25, 30, + alpha, beta, 1, 1, 42)); + EXPECT_TRUEORSKIP(test>(GetParam(), onemkl::transpose::trans, 25, 30, + alpha, beta, 2, 3, 42)); + EXPECT_TRUEORSKIP(test>(GetParam(), onemkl::transpose::trans, 25, 30, + alpha, beta, -2, -3, 42)); + EXPECT_TRUEORSKIP(test>(GetParam(), onemkl::transpose::trans, 25, 30, + alpha, beta, 1, 1, 42)); + EXPECT_TRUEORSKIP(test>(GetParam(), onemkl::transpose::conjtrans, 25, 30, + alpha, beta, 2, 3, 42)); + EXPECT_TRUEORSKIP(test>(GetParam(), onemkl::transpose::conjtrans, 25, 30, + alpha, beta, -2, -3, 42)); + 
EXPECT_TRUEORSKIP(test>(GetParam(), onemkl::transpose::conjtrans, 25, 30, + alpha, beta, 1, 1, 42)); } INSTANTIATE_TEST_SUITE_P(GemvTestSuite, GemvTests, ::testing::ValuesIn(devices), diff --git a/tests/unit_tests/blas/level2/gemv_usm.cpp b/tests/unit_tests/blas/level2/gemv_usm.cpp new file mode 100644 index 000000000..bec22b770 --- /dev/null +++ b/tests/unit_tests/blas/level2/gemv_usm.cpp @@ -0,0 +1,204 @@ +/******************************************************************************* +* Copyright 2020 Intel Corporation +* +* Licensed under the Apache License, Version 2.0 (the "License"); +* you may not use this file except in compliance with the License. +* You may obtain a copy of the License at +* +* http://www.apache.org/licenses/LICENSE-2.0 +* +* Unless required by applicable law or agreed to in writing, +* software distributed under the License is distributed on an "AS IS" BASIS, +* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +* See the License for the specific language governing permissions +* and limitations under the License. +* +* +* SPDX-License-Identifier: Apache-2.0 +*******************************************************************************/ + +#include +#include +#include +#include +#include +#include + +#include +#include "cblas.h" +#include "onemkl/detail/config.hpp" +#include "onemkl/onemkl.hpp" +#include "onemkl_blas_helper.hpp" +#include "reference_blas_templates.hpp" +#include "test_common.hpp" +#include "test_helper.hpp" + +#include + +using namespace cl::sycl; +using std::vector; + +extern std::vector devices; + +namespace { + +template +int test(const device &dev, onemkl::transpose transa, int m, int n, fp alpha, fp beta, int incx, + int incy, int lda) { + // Catch asynchronous exceptions. 
+ auto exception_handler = [](exception_list exceptions) { + for (std::exception_ptr const &e : exceptions) { + try { + std::rethrow_exception(e); + } + catch (exception const &e) { + std::cout << "Caught asynchronous SYCL exception during GEMV:\n" + << e.what() << std::endl + << "OpenCL status: " << e.get_cl_code() << std::endl; + } + } + }; + + queue main_queue(dev, exception_handler); + context cxt = main_queue.get_context(); + event done; + std::vector dependencies; + + // Prepare data. + auto ua = usm_allocator(cxt, dev); + vector x(ua), y(ua), A(ua); + int x_len = outer_dimension(transa, m, n); + int y_len = inner_dimension(transa, m, n); + + rand_vector(x, x_len, incx); + rand_vector(y, y_len, incy); + rand_matrix(A, onemkl::transpose::nontrans, m, n, lda); + + auto y_ref = y; + + // Call Reference GEMV. + const int m_ref = m, n_ref = n, incx_ref = incx, incy_ref = incy, lda_ref = lda; + using fp_ref = typename ref_type_info::type; + + ::gemv(convert_to_cblas_trans(transa), &m_ref, &n_ref, (fp_ref *)&alpha, (fp_ref *)A.data(), + &lda_ref, (fp_ref *)x.data(), &incx_ref, (fp_ref *)&beta, (fp_ref *)y_ref.data(), + &incy_ref); + + // Call DPC++ GEMV. 
+ + try { +#ifdef CALL_RT_API + done = onemkl::blas::gemv(main_queue, transa, m, n, alpha, A.data(), lda, x.data(), incx, + beta, y.data(), incy, dependencies); + done.wait(); +#else + TEST_RUN_CT(main_queue, onemkl::blas::gemv, + (main_queue, transa, m, n, alpha, A.data(), lda, x.data(), incx, beta, y.data(), + incy, dependencies)); + main_queue.wait(); +#endif + } + catch (exception const &e) { + std::cout << "Caught synchronous SYCL exception during GEMV:\n" + << e.what() << std::endl + << "OpenCL status: " << e.get_cl_code() << std::endl; + } + + catch (const onemkl::backend_unsupported_exception &e) { + return test_skipped; + } + + catch (const std::runtime_error &error) { + std::cout << "Error raised during execution of GEMV:\n" << error.what() << std::endl; + } + + // Compare the results of reference implementation and DPC++ implementation. + + bool good = check_equal_vector(y, y_ref, y_len, incy, std::max(m, n), std::cout); + + return (int)good; +} + +class GemvUsmTests : public ::testing::TestWithParam {}; + +TEST_P(GemvUsmTests, RealSinglePrecision) { + float alpha(2.0); + float beta(3.0); + EXPECT_TRUEORSKIP( + test(GetParam(), onemkl::transpose::nontrans, 25, 30, alpha, beta, 2, 3, 42)); + EXPECT_TRUEORSKIP( + test(GetParam(), onemkl::transpose::nontrans, 25, 30, alpha, beta, -2, -3, 42)); + EXPECT_TRUEORSKIP( + test(GetParam(), onemkl::transpose::nontrans, 25, 30, alpha, beta, 1, 1, 42)); + EXPECT_TRUEORSKIP( + test(GetParam(), onemkl::transpose::trans, 25, 30, alpha, beta, 2, 3, 42)); + EXPECT_TRUEORSKIP( + test(GetParam(), onemkl::transpose::trans, 25, 30, alpha, beta, -2, -3, 42)); + EXPECT_TRUEORSKIP( + test(GetParam(), onemkl::transpose::trans, 25, 30, alpha, beta, 1, 1, 42)); +} +TEST_P(GemvUsmTests, RealDoublePrecision) { + double alpha(2.0); + double beta(3.0); + EXPECT_TRUEORSKIP( + test(GetParam(), onemkl::transpose::nontrans, 25, 30, alpha, beta, 2, 3, 42)); + EXPECT_TRUEORSKIP( + test(GetParam(), onemkl::transpose::nontrans, 25, 30, alpha, 
beta, -2, -3, 42)); + EXPECT_TRUEORSKIP( + test(GetParam(), onemkl::transpose::nontrans, 25, 30, alpha, beta, 1, 1, 42)); + EXPECT_TRUEORSKIP( + test(GetParam(), onemkl::transpose::trans, 25, 30, alpha, beta, 2, 3, 42)); + EXPECT_TRUEORSKIP( + test(GetParam(), onemkl::transpose::trans, 25, 30, alpha, beta, -2, -3, 42)); + EXPECT_TRUEORSKIP( + test(GetParam(), onemkl::transpose::trans, 25, 30, alpha, beta, 1, 1, 42)); +} +TEST_P(GemvUsmTests, ComplexSinglePrecision) { + std::complex alpha(2.0, -0.5); + std::complex beta(3.0, -1.5); + EXPECT_TRUEORSKIP(test>(GetParam(), onemkl::transpose::nontrans, 25, 30, + alpha, beta, 2, 3, 42)); + EXPECT_TRUEORSKIP(test>(GetParam(), onemkl::transpose::nontrans, 25, 30, + alpha, beta, -2, -3, 42)); + EXPECT_TRUEORSKIP(test>(GetParam(), onemkl::transpose::nontrans, 25, 30, + alpha, beta, 1, 1, 42)); + EXPECT_TRUEORSKIP(test>(GetParam(), onemkl::transpose::trans, 25, 30, alpha, + beta, 2, 3, 42)); + EXPECT_TRUEORSKIP(test>(GetParam(), onemkl::transpose::trans, 25, 30, alpha, + beta, -2, -3, 42)); + EXPECT_TRUEORSKIP(test>(GetParam(), onemkl::transpose::trans, 25, 30, alpha, + beta, 1, 1, 42)); + EXPECT_TRUEORSKIP(test>(GetParam(), onemkl::transpose::conjtrans, 25, 30, + alpha, beta, 2, 3, 42)); + EXPECT_TRUEORSKIP(test>(GetParam(), onemkl::transpose::conjtrans, 25, 30, + alpha, beta, -2, -3, 42)); + EXPECT_TRUEORSKIP(test>(GetParam(), onemkl::transpose::conjtrans, 25, 30, + alpha, beta, 1, 1, 42)); +} +TEST_P(GemvUsmTests, ComplexDoublePrecision) { + std::complex alpha(2.0, -0.5); + std::complex beta(3.0, -1.5); + EXPECT_TRUEORSKIP(test>(GetParam(), onemkl::transpose::nontrans, 25, 30, + alpha, beta, 2, 3, 42)); + EXPECT_TRUEORSKIP(test>(GetParam(), onemkl::transpose::nontrans, 25, 30, + alpha, beta, -2, -3, 42)); + EXPECT_TRUEORSKIP(test>(GetParam(), onemkl::transpose::nontrans, 25, 30, + alpha, beta, 1, 1, 42)); + EXPECT_TRUEORSKIP(test>(GetParam(), onemkl::transpose::trans, 25, 30, + alpha, beta, 2, 3, 42)); + 
EXPECT_TRUEORSKIP(test>(GetParam(), onemkl::transpose::trans, 25, 30, + alpha, beta, -2, -3, 42)); + EXPECT_TRUEORSKIP(test>(GetParam(), onemkl::transpose::trans, 25, 30, + alpha, beta, 1, 1, 42)); + EXPECT_TRUEORSKIP(test>(GetParam(), onemkl::transpose::conjtrans, 25, 30, + alpha, beta, 2, 3, 42)); + EXPECT_TRUEORSKIP(test>(GetParam(), onemkl::transpose::conjtrans, 25, 30, + alpha, beta, -2, -3, 42)); + EXPECT_TRUEORSKIP(test>(GetParam(), onemkl::transpose::conjtrans, 25, 30, + alpha, beta, 1, 1, 42)); +} + +INSTANTIATE_TEST_SUITE_P(GemvUsmTestSuite, GemvUsmTests, ::testing::ValuesIn(devices), + ::DeviceNamePrint()); + +} // anonymous namespace diff --git a/tests/unit_tests/blas/level2/ger.cpp b/tests/unit_tests/blas/level2/ger.cpp index 4b05612f6..9bffff959 100644 --- a/tests/unit_tests/blas/level2/ger.cpp +++ b/tests/unit_tests/blas/level2/ger.cpp @@ -43,7 +43,7 @@ extern std::vector devices; namespace { template -bool test(const device &dev, int m, int n, fp alpha, int incx, int incy, int lda) { +int test(const device &dev, int m, int n, fp alpha, int incx, int incy, int lda) { // Prepare data. vector x, y, A_ref, A; @@ -96,6 +96,14 @@ bool test(const device &dev, int m, int n, fp alpha, int incx, int incy, int lda << "OpenCL status: " << e.get_cl_code() << std::endl; } + catch (const onemkl::backend_unsupported_exception &e) { + return test_skipped; + } + + catch (const std::runtime_error &error) { + std::cout << "Error raised during execution of GER:\n" << error.what() << std::endl; + } + // Compare the results of reference implementation and DPC++ implementation. 
bool good; { @@ -103,22 +111,22 @@ bool test(const device &dev, int m, int n, fp alpha, int incx, int incy, int lda good = check_equal_matrix(A_accessor, A_ref, m, n, lda, std::max(m, n), std::cout); } - return good; + return (int)good; } class GerTests : public ::testing::TestWithParam {}; TEST_P(GerTests, RealSinglePrecision) { float alpha(2.0); - EXPECT_TRUE(test(GetParam(), 25, 30, alpha, 2, 3, 42)); - EXPECT_TRUE(test(GetParam(), 25, 30, alpha, -2, -3, 42)); - EXPECT_TRUE(test(GetParam(), 25, 30, alpha, 1, 1, 42)); + EXPECT_TRUEORSKIP(test(GetParam(), 25, 30, alpha, 2, 3, 42)); + EXPECT_TRUEORSKIP(test(GetParam(), 25, 30, alpha, -2, -3, 42)); + EXPECT_TRUEORSKIP(test(GetParam(), 25, 30, alpha, 1, 1, 42)); } TEST_P(GerTests, RealDoublePrecision) { double alpha(2.0); - EXPECT_TRUE(test(GetParam(), 25, 30, alpha, 2, 3, 42)); - EXPECT_TRUE(test(GetParam(), 25, 30, alpha, -2, -3, 42)); - EXPECT_TRUE(test(GetParam(), 25, 30, alpha, 1, 1, 42)); + EXPECT_TRUEORSKIP(test(GetParam(), 25, 30, alpha, 2, 3, 42)); + EXPECT_TRUEORSKIP(test(GetParam(), 25, 30, alpha, -2, -3, 42)); + EXPECT_TRUEORSKIP(test(GetParam(), 25, 30, alpha, 1, 1, 42)); } INSTANTIATE_TEST_SUITE_P(GerTestSuite, GerTests, ::testing::ValuesIn(devices), ::DeviceNamePrint()); diff --git a/tests/unit_tests/blas/level2/ger_usm.cpp b/tests/unit_tests/blas/level2/ger_usm.cpp new file mode 100644 index 000000000..810627c1c --- /dev/null +++ b/tests/unit_tests/blas/level2/ger_usm.cpp @@ -0,0 +1,136 @@ +/******************************************************************************* +* Copyright 2020 Intel Corporation +* +* Licensed under the Apache License, Version 2.0 (the "License"); +* you may not use this file except in compliance with the License. 
+* You may obtain a copy of the License at +* +* http://www.apache.org/licenses/LICENSE-2.0 +* +* Unless required by applicable law or agreed to in writing, +* software distributed under the License is distributed on an "AS IS" BASIS, +* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +* See the License for the specific language governing permissions +* and limitations under the License. +* +* +* SPDX-License-Identifier: Apache-2.0 +*******************************************************************************/ + +#include +#include +#include +#include +#include +#include + +#include +#include "cblas.h" +#include "onemkl/detail/config.hpp" +#include "onemkl/onemkl.hpp" +#include "onemkl_blas_helper.hpp" +#include "reference_blas_templates.hpp" +#include "test_common.hpp" +#include "test_helper.hpp" + +#include + +using namespace cl::sycl; +using std::vector; + +extern std::vector devices; + +namespace { + +template +int test(const device &dev, int m, int n, fp alpha, int incx, int incy, int lda) { + // Catch asynchronous exceptions. + auto exception_handler = [](exception_list exceptions) { + for (std::exception_ptr const &e : exceptions) { + try { + std::rethrow_exception(e); + } + catch (exception const &e) { + std::cout << "Caught asynchronous SYCL exception during GER:\n" + << e.what() << std::endl + << "OpenCL status: " << e.get_cl_code() << std::endl; + } + } + }; + + queue main_queue(dev, exception_handler); + context cxt = main_queue.get_context(); + event done; + std::vector dependencies; + + // Prepare data. + auto ua = usm_allocator(cxt, dev); + vector x(ua), y(ua), A(ua); + + rand_vector(x, m, incx); + rand_vector(y, n, incy); + rand_matrix(A, onemkl::transpose::nontrans, m, n, lda); + + auto A_ref = A; + + // Call Reference GER. 
+ const int m_ref = m, n_ref = n, incx_ref = incx, incy_ref = incy, lda_ref = lda; + using fp_ref = typename ref_type_info::type; + + ::ger(&m_ref, &n_ref, (fp_ref *)&alpha, (fp_ref *)x.data(), &incx_ref, (fp_ref *)y.data(), + &incy_ref, (fp_ref *)A_ref.data(), &lda_ref); + + // Call DPC++ GER. + + try { +#ifdef CALL_RT_API + done = onemkl::blas::ger(main_queue, m, n, alpha, x.data(), incx, y.data(), incy, A.data(), + lda, dependencies); + done.wait(); +#else + TEST_RUN_CT( + main_queue, onemkl::blas::ger, + (main_queue, m, n, alpha, x.data(), incx, y.data(), incy, A.data(), lda, dependencies)); + main_queue.wait(); +#endif + } + catch (exception const &e) { + std::cout << "Caught synchronous SYCL exception during GER:\n" + << e.what() << std::endl + << "OpenCL status: " << e.get_cl_code() << std::endl; + } + + catch (const onemkl::backend_unsupported_exception &e) { + return test_skipped; + } + + catch (const std::runtime_error &error) { + std::cout << "Error raised during execution of GER:\n" << error.what() << std::endl; + } + + // Compare the results of reference implementation and DPC++ implementation. 
+ + bool good = check_equal_matrix(A, A_ref, m, n, lda, std::max(m, n), std::cout); + + return (int)good; +} + +class GerUsmTests : public ::testing::TestWithParam {}; + +TEST_P(GerUsmTests, RealSinglePrecision) { + float alpha(2.0); + EXPECT_TRUEORSKIP(test(GetParam(), 25, 30, alpha, 2, 3, 42)); + EXPECT_TRUEORSKIP(test(GetParam(), 25, 30, alpha, -2, -3, 42)); + EXPECT_TRUEORSKIP(test(GetParam(), 25, 30, alpha, 1, 1, 42)); +} +TEST_P(GerUsmTests, RealDoublePrecision) { + double alpha(2.0); + EXPECT_TRUEORSKIP(test(GetParam(), 25, 30, alpha, 2, 3, 42)); + EXPECT_TRUEORSKIP(test(GetParam(), 25, 30, alpha, -2, -3, 42)); + EXPECT_TRUEORSKIP(test(GetParam(), 25, 30, alpha, 1, 1, 42)); +} + +INSTANTIATE_TEST_SUITE_P(GerUsmTestSuite, GerUsmTests, ::testing::ValuesIn(devices), + ::DeviceNamePrint()); + +} // anonymous namespace diff --git a/tests/unit_tests/blas/level2/gerc.cpp b/tests/unit_tests/blas/level2/gerc.cpp index 7ac1996ab..7d12c6b1a 100644 --- a/tests/unit_tests/blas/level2/gerc.cpp +++ b/tests/unit_tests/blas/level2/gerc.cpp @@ -43,7 +43,7 @@ extern std::vector devices; namespace { template -bool test(const device &dev, int m, int n, fp alpha, int incx, int incy, int lda) { +int test(const device &dev, int m, int n, fp alpha, int incx, int incy, int lda) { // Prepare data. vector x, y, A_ref, A; @@ -96,6 +96,14 @@ bool test(const device &dev, int m, int n, fp alpha, int incx, int incy, int lda << "OpenCL status: " << e.get_cl_code() << std::endl; } + catch (const onemkl::backend_unsupported_exception &e) { + return test_skipped; + } + + catch (const std::runtime_error &error) { + std::cout << "Error raised during execution of GERC:\n" << error.what() << std::endl; + } + // Compare the results of reference implementation and DPC++ implementation. 
bool good; { @@ -103,22 +111,22 @@ bool test(const device &dev, int m, int n, fp alpha, int incx, int incy, int lda good = check_equal_matrix(A_accessor, A_ref, m, n, lda, std::max(m, n), std::cout); } - return good; + return (int)good; } class GercTests : public ::testing::TestWithParam {}; TEST_P(GercTests, ComplexSinglePrecision) { std::complex alpha(2.0, -0.5); - EXPECT_TRUE(test>(GetParam(), 25, 30, alpha, 2, 3, 42)); - EXPECT_TRUE(test>(GetParam(), 25, 30, alpha, -2, -3, 42)); - EXPECT_TRUE(test>(GetParam(), 25, 30, alpha, 1, 1, 42)); + EXPECT_TRUEORSKIP(test>(GetParam(), 25, 30, alpha, 2, 3, 42)); + EXPECT_TRUEORSKIP(test>(GetParam(), 25, 30, alpha, -2, -3, 42)); + EXPECT_TRUEORSKIP(test>(GetParam(), 25, 30, alpha, 1, 1, 42)); } TEST_P(GercTests, ComplexDoublePrecision) { std::complex alpha(2.0, -0.5); - EXPECT_TRUE(test>(GetParam(), 25, 30, alpha, 2, 3, 42)); - EXPECT_TRUE(test>(GetParam(), 25, 30, alpha, -2, -3, 42)); - EXPECT_TRUE(test>(GetParam(), 25, 30, alpha, 1, 1, 42)); + EXPECT_TRUEORSKIP(test>(GetParam(), 25, 30, alpha, 2, 3, 42)); + EXPECT_TRUEORSKIP(test>(GetParam(), 25, 30, alpha, -2, -3, 42)); + EXPECT_TRUEORSKIP(test>(GetParam(), 25, 30, alpha, 1, 1, 42)); } INSTANTIATE_TEST_SUITE_P(GercTestSuite, GercTests, ::testing::ValuesIn(devices), diff --git a/tests/unit_tests/blas/level2/gerc_usm.cpp b/tests/unit_tests/blas/level2/gerc_usm.cpp new file mode 100644 index 000000000..0a8139d7a --- /dev/null +++ b/tests/unit_tests/blas/level2/gerc_usm.cpp @@ -0,0 +1,136 @@ +/******************************************************************************* +* Copyright 2020 Intel Corporation +* +* Licensed under the Apache License, Version 2.0 (the "License"); +* you may not use this file except in compliance with the License. 
+* You may obtain a copy of the License at +* +* http://www.apache.org/licenses/LICENSE-2.0 +* +* Unless required by applicable law or agreed to in writing, +* software distributed under the License is distributed on an "AS IS" BASIS, +* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +* See the License for the specific language governing permissions +* and limitations under the License. +* +* +* SPDX-License-Identifier: Apache-2.0 +*******************************************************************************/ + +#include +#include +#include +#include +#include +#include + +#include +#include "cblas.h" +#include "onemkl/detail/config.hpp" +#include "onemkl/onemkl.hpp" +#include "onemkl_blas_helper.hpp" +#include "reference_blas_templates.hpp" +#include "test_common.hpp" +#include "test_helper.hpp" + +#include + +using namespace cl::sycl; +using std::vector; + +extern std::vector devices; + +namespace { + +template +int test(const device &dev, int m, int n, fp alpha, int incx, int incy, int lda) { + // Catch asynchronous exceptions. + auto exception_handler = [](exception_list exceptions) { + for (std::exception_ptr const &e : exceptions) { + try { + std::rethrow_exception(e); + } + catch (exception const &e) { + std::cout << "Caught asynchronous SYCL exception during GERC:\n" + << e.what() << std::endl + << "OpenCL status: " << e.get_cl_code() << std::endl; + } + } + }; + + queue main_queue(dev, exception_handler); + context cxt = main_queue.get_context(); + event done; + std::vector dependencies; + + // Prepare data. + auto ua = usm_allocator(cxt, dev); + vector x(ua), y(ua), A(ua); + + rand_vector(x, m, incx); + rand_vector(y, n, incy); + rand_matrix(A, onemkl::transpose::nontrans, m, n, lda); + + auto A_ref = A; + + // Call Reference GERC. 
+ const int m_ref = m, n_ref = n, incx_ref = incx, incy_ref = incy, lda_ref = lda; + using fp_ref = typename ref_type_info::type; + + ::gerc(&m_ref, &n_ref, (fp_ref *)&alpha, (fp_ref *)x.data(), &incx_ref, (fp_ref *)y.data(), + &incy_ref, (fp_ref *)A_ref.data(), &lda_ref); + + // Call DPC++ GERC. + + try { +#ifdef CALL_RT_API + done = onemkl::blas::gerc(main_queue, m, n, alpha, x.data(), incx, y.data(), incy, A.data(), + lda, dependencies); + done.wait(); +#else + TEST_RUN_CT( + main_queue, onemkl::blas::gerc, + (main_queue, m, n, alpha, x.data(), incx, y.data(), incy, A.data(), lda, dependencies)); + main_queue.wait(); +#endif + } + catch (exception const &e) { + std::cout << "Caught synchronous SYCL exception during GERC:\n" + << e.what() << std::endl + << "OpenCL status: " << e.get_cl_code() << std::endl; + } + + catch (const onemkl::backend_unsupported_exception &e) { + return test_skipped; + } + + catch (const std::runtime_error &error) { + std::cout << "Error raised during execution of GERC:\n" << error.what() << std::endl; + } + + // Compare the results of reference implementation and DPC++ implementation. 
+ + bool good = check_equal_matrix(A, A_ref, m, n, lda, std::max(m, n), std::cout); + + return (int)good; +} + +class GercUsmTests : public ::testing::TestWithParam {}; + +TEST_P(GercUsmTests, ComplexSinglePrecision) { + std::complex alpha(2.0, -0.5); + EXPECT_TRUEORSKIP(test>(GetParam(), 25, 30, alpha, 2, 3, 42)); + EXPECT_TRUEORSKIP(test>(GetParam(), 25, 30, alpha, -2, -3, 42)); + EXPECT_TRUEORSKIP(test>(GetParam(), 25, 30, alpha, 1, 1, 42)); +} +TEST_P(GercUsmTests, ComplexDoublePrecision) { + std::complex alpha(2.0, -0.5); + EXPECT_TRUEORSKIP(test>(GetParam(), 25, 30, alpha, 2, 3, 42)); + EXPECT_TRUEORSKIP(test>(GetParam(), 25, 30, alpha, -2, -3, 42)); + EXPECT_TRUEORSKIP(test>(GetParam(), 25, 30, alpha, 1, 1, 42)); +} + +INSTANTIATE_TEST_SUITE_P(GercUsmTestSuite, GercUsmTests, ::testing::ValuesIn(devices), + ::DeviceNamePrint()); + +} // anonymous namespace diff --git a/tests/unit_tests/blas/level2/geru.cpp b/tests/unit_tests/blas/level2/geru.cpp index 13b423116..de358687d 100644 --- a/tests/unit_tests/blas/level2/geru.cpp +++ b/tests/unit_tests/blas/level2/geru.cpp @@ -43,7 +43,7 @@ extern std::vector devices; namespace { template -bool test(const device &dev, int m, int n, fp alpha, int incx, int incy, int lda) { +int test(const device &dev, int m, int n, fp alpha, int incx, int incy, int lda) { // Prepare data. vector x, y, A_ref, A; @@ -96,6 +96,14 @@ bool test(const device &dev, int m, int n, fp alpha, int incx, int incy, int lda << "OpenCL status: " << e.get_cl_code() << std::endl; } + catch (const onemkl::backend_unsupported_exception &e) { + return test_skipped; + } + + catch (const std::runtime_error &error) { + std::cout << "Error raised during execution of GERU:\n" << error.what() << std::endl; + } + // Compare the results of reference implementation and DPC++ implementation. 
bool good; { @@ -103,22 +111,22 @@ bool test(const device &dev, int m, int n, fp alpha, int incx, int incy, int lda good = check_equal_matrix(A_accessor, A_ref, m, n, lda, std::max(m, n), std::cout); } - return good; + return (int)good; } class GeruTests : public ::testing::TestWithParam {}; TEST_P(GeruTests, ComplexSinglePrecision) { std::complex alpha(2.0, -0.5); - EXPECT_TRUE(test>(GetParam(), 25, 30, alpha, 2, 3, 42)); - EXPECT_TRUE(test>(GetParam(), 25, 30, alpha, -2, -3, 42)); - EXPECT_TRUE(test>(GetParam(), 25, 30, alpha, 1, 1, 42)); + EXPECT_TRUEORSKIP(test>(GetParam(), 25, 30, alpha, 2, 3, 42)); + EXPECT_TRUEORSKIP(test>(GetParam(), 25, 30, alpha, -2, -3, 42)); + EXPECT_TRUEORSKIP(test>(GetParam(), 25, 30, alpha, 1, 1, 42)); } TEST_P(GeruTests, ComplexDoublePrecision) { std::complex alpha(2.0, -0.5); - EXPECT_TRUE(test>(GetParam(), 25, 30, alpha, 2, 3, 42)); - EXPECT_TRUE(test>(GetParam(), 25, 30, alpha, -2, -3, 42)); - EXPECT_TRUE(test>(GetParam(), 25, 30, alpha, 1, 1, 42)); + EXPECT_TRUEORSKIP(test>(GetParam(), 25, 30, alpha, 2, 3, 42)); + EXPECT_TRUEORSKIP(test>(GetParam(), 25, 30, alpha, -2, -3, 42)); + EXPECT_TRUEORSKIP(test>(GetParam(), 25, 30, alpha, 1, 1, 42)); } INSTANTIATE_TEST_SUITE_P(GeruTestSuite, GeruTests, ::testing::ValuesIn(devices), diff --git a/tests/unit_tests/blas/level2/geru_usm.cpp b/tests/unit_tests/blas/level2/geru_usm.cpp new file mode 100644 index 000000000..3edcfaf30 --- /dev/null +++ b/tests/unit_tests/blas/level2/geru_usm.cpp @@ -0,0 +1,136 @@ +/******************************************************************************* +* Copyright 2020 Intel Corporation +* +* Licensed under the Apache License, Version 2.0 (the "License"); +* you may not use this file except in compliance with the License. 
+* You may obtain a copy of the License at +* +* http://www.apache.org/licenses/LICENSE-2.0 +* +* Unless required by applicable law or agreed to in writing, +* software distributed under the License is distributed on an "AS IS" BASIS, +* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +* See the License for the specific language governing permissions +* and limitations under the License. +* +* +* SPDX-License-Identifier: Apache-2.0 +*******************************************************************************/ + +#include +#include +#include +#include +#include +#include + +#include +#include "cblas.h" +#include "onemkl/detail/config.hpp" +#include "onemkl/onemkl.hpp" +#include "onemkl_blas_helper.hpp" +#include "reference_blas_templates.hpp" +#include "test_common.hpp" +#include "test_helper.hpp" + +#include + +using namespace cl::sycl; +using std::vector; + +extern std::vector devices; + +namespace { + +template +int test(const device &dev, int m, int n, fp alpha, int incx, int incy, int lda) { + // Catch asynchronous exceptions. + auto exception_handler = [](exception_list exceptions) { + for (std::exception_ptr const &e : exceptions) { + try { + std::rethrow_exception(e); + } + catch (exception const &e) { + std::cout << "Caught asynchronous SYCL exception during GERU:\n" + << e.what() << std::endl + << "OpenCL status: " << e.get_cl_code() << std::endl; + } + } + }; + + queue main_queue(dev, exception_handler); + context cxt = main_queue.get_context(); + event done; + std::vector dependencies; + + // Prepare data. + auto ua = usm_allocator(cxt, dev); + vector x(ua), y(ua), A(ua); + + rand_vector(x, m, incx); + rand_vector(y, n, incy); + rand_matrix(A, onemkl::transpose::nontrans, m, n, lda); + + auto A_ref = A; + + // Call Reference GERU. 
+ const int m_ref = m, n_ref = n, incx_ref = incx, incy_ref = incy, lda_ref = lda; + using fp_ref = typename ref_type_info::type; + + ::geru(&m_ref, &n_ref, (fp_ref *)&alpha, (fp_ref *)x.data(), &incx_ref, (fp_ref *)y.data(), + &incy_ref, (fp_ref *)A_ref.data(), &lda_ref); + + // Call DPC++ GERU. + + try { +#ifdef CALL_RT_API + done = onemkl::blas::geru(main_queue, m, n, alpha, x.data(), incx, y.data(), incy, A.data(), + lda, dependencies); + done.wait(); +#else + TEST_RUN_CT( + main_queue, onemkl::blas::geru, + (main_queue, m, n, alpha, x.data(), incx, y.data(), incy, A.data(), lda, dependencies)); + main_queue.wait(); +#endif + } + catch (exception const &e) { + std::cout << "Caught synchronous SYCL exception during GERU:\n" + << e.what() << std::endl + << "OpenCL status: " << e.get_cl_code() << std::endl; + } + + catch (const onemkl::backend_unsupported_exception &e) { + return test_skipped; + } + + catch (const std::runtime_error &error) { + std::cout << "Error raised during execution of GERU:\n" << error.what() << std::endl; + } + + // Compare the results of reference implementation and DPC++ implementation. 
+ + bool good = check_equal_matrix(A, A_ref, m, n, lda, std::max(m, n), std::cout); + + return (int)good; +} + +class GeruUsmTests : public ::testing::TestWithParam {}; + +TEST_P(GeruUsmTests, ComplexSinglePrecision) { + std::complex alpha(2.0, -0.5); + EXPECT_TRUEORSKIP(test>(GetParam(), 25, 30, alpha, 2, 3, 42)); + EXPECT_TRUEORSKIP(test>(GetParam(), 25, 30, alpha, -2, -3, 42)); + EXPECT_TRUEORSKIP(test>(GetParam(), 25, 30, alpha, 1, 1, 42)); +} +TEST_P(GeruUsmTests, ComplexDoublePrecision) { + std::complex alpha(2.0, -0.5); + EXPECT_TRUEORSKIP(test>(GetParam(), 25, 30, alpha, 2, 3, 42)); + EXPECT_TRUEORSKIP(test>(GetParam(), 25, 30, alpha, -2, -3, 42)); + EXPECT_TRUEORSKIP(test>(GetParam(), 25, 30, alpha, 1, 1, 42)); +} + +INSTANTIATE_TEST_SUITE_P(GeruUsmTestSuite, GeruUsmTests, ::testing::ValuesIn(devices), + ::DeviceNamePrint()); + +} // anonymous namespace diff --git a/tests/unit_tests/blas/level2/hbmv.cpp b/tests/unit_tests/blas/level2/hbmv.cpp index 2882f2c26..7d4f45734 100644 --- a/tests/unit_tests/blas/level2/hbmv.cpp +++ b/tests/unit_tests/blas/level2/hbmv.cpp @@ -43,8 +43,8 @@ extern std::vector devices; namespace { template -bool test(const device &dev, onemkl::uplo upper_lower, int n, int k, fp alpha, fp beta, int incx, - int incy, int lda) { +int test(const device &dev, onemkl::uplo upper_lower, int n, int k, fp alpha, fp beta, int incx, + int incy, int lda) { // Prepare data. vector x, y, y_ref, A; @@ -101,6 +101,14 @@ bool test(const device &dev, onemkl::uplo upper_lower, int n, int k, fp alpha, f << "OpenCL status: " << e.get_cl_code() << std::endl; } + catch (const onemkl::backend_unsupported_exception &e) { + return test_skipped; + } + + catch (const std::runtime_error &error) { + std::cout << "Error raised during execution of HBMV:\n" << error.what() << std::endl; + } + // Compare the results of reference implementation and DPC++ implementation. 
bool good; { @@ -108,7 +116,7 @@ bool test(const device &dev, onemkl::uplo upper_lower, int n, int k, fp alpha, f good = check_equal_vector(y_accessor, y_ref, n, incy, n, std::cout); } - return good; + return (int)good; } class HbmvTests : public ::testing::TestWithParam {}; @@ -116,33 +124,33 @@ class HbmvTests : public ::testing::TestWithParam {}; TEST_P(HbmvTests, ComplexSinglePrecision) { std::complex alpha(2.0, -0.5); std::complex beta(3.0, -1.5); - EXPECT_TRUE( + EXPECT_TRUEORSKIP( test>(GetParam(), onemkl::uplo::lower, 30, 5, alpha, beta, 2, 3, 42)); - EXPECT_TRUE( + EXPECT_TRUEORSKIP( test>(GetParam(), onemkl::uplo::upper, 30, 5, alpha, beta, 2, 3, 42)); - EXPECT_TRUE( + EXPECT_TRUEORSKIP( test>(GetParam(), onemkl::uplo::lower, 30, 5, alpha, beta, -2, -3, 42)); - EXPECT_TRUE( + EXPECT_TRUEORSKIP( test>(GetParam(), onemkl::uplo::upper, 30, 5, alpha, beta, -2, -3, 42)); - EXPECT_TRUE( + EXPECT_TRUEORSKIP( test>(GetParam(), onemkl::uplo::lower, 30, 5, alpha, beta, 1, 1, 42)); - EXPECT_TRUE( + EXPECT_TRUEORSKIP( test>(GetParam(), onemkl::uplo::upper, 30, 5, alpha, beta, 1, 1, 42)); } TEST_P(HbmvTests, ComplexDoublePrecision) { std::complex alpha(2.0, -0.5); std::complex beta(3.0, -1.5); - EXPECT_TRUE( + EXPECT_TRUEORSKIP( test>(GetParam(), onemkl::uplo::lower, 30, 5, alpha, beta, 2, 3, 42)); - EXPECT_TRUE( + EXPECT_TRUEORSKIP( test>(GetParam(), onemkl::uplo::upper, 30, 5, alpha, beta, 2, 3, 42)); - EXPECT_TRUE(test>(GetParam(), onemkl::uplo::lower, 30, 5, alpha, beta, -2, - -3, 42)); - EXPECT_TRUE(test>(GetParam(), onemkl::uplo::upper, 30, 5, alpha, beta, -2, - -3, 42)); - EXPECT_TRUE( + EXPECT_TRUEORSKIP(test>(GetParam(), onemkl::uplo::lower, 30, 5, alpha, + beta, -2, -3, 42)); + EXPECT_TRUEORSKIP(test>(GetParam(), onemkl::uplo::upper, 30, 5, alpha, + beta, -2, -3, 42)); + EXPECT_TRUEORSKIP( test>(GetParam(), onemkl::uplo::lower, 30, 5, alpha, beta, 1, 1, 42)); - EXPECT_TRUE( + EXPECT_TRUEORSKIP( test>(GetParam(), onemkl::uplo::upper, 30, 5, alpha, beta, 1, 1, 
42)); } diff --git a/tests/unit_tests/blas/level2/hbmv_usm.cpp b/tests/unit_tests/blas/level2/hbmv_usm.cpp new file mode 100644 index 000000000..2b00a4515 --- /dev/null +++ b/tests/unit_tests/blas/level2/hbmv_usm.cpp @@ -0,0 +1,159 @@ +/******************************************************************************* +* Copyright 2020 Intel Corporation +* +* Licensed under the Apache License, Version 2.0 (the "License"); +* you may not use this file except in compliance with the License. +* You may obtain a copy of the License at +* +* http://www.apache.org/licenses/LICENSE-2.0 +* +* Unless required by applicable law or agreed to in writing, +* software distributed under the License is distributed on an "AS IS" BASIS, +* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +* See the License for the specific language governing permissions +* and limitations under the License. +* +* +* SPDX-License-Identifier: Apache-2.0 +*******************************************************************************/ + +#include +#include +#include +#include +#include +#include + +#include +#include "cblas.h" +#include "onemkl/detail/config.hpp" +#include "onemkl/onemkl.hpp" +#include "onemkl_blas_helper.hpp" +#include "reference_blas_templates.hpp" +#include "test_common.hpp" +#include "test_helper.hpp" + +#include + +using namespace cl::sycl; +using std::vector; + +extern std::vector devices; + +namespace { + +template +int test(const device &dev, onemkl::uplo upper_lower, int n, int k, fp alpha, fp beta, int incx, + int incy, int lda) { + // Catch asynchronous exceptions. 
+ auto exception_handler = [](exception_list exceptions) { + for (std::exception_ptr const &e : exceptions) { + try { + std::rethrow_exception(e); + } + catch (exception const &e) { + std::cout << "Caught asynchronous SYCL exception during HBMV:\n" + << e.what() << std::endl + << "OpenCL status: " << e.get_cl_code() << std::endl; + } + } + }; + + queue main_queue(dev, exception_handler); + context cxt = main_queue.get_context(); + event done; + std::vector dependencies; + + // Prepare data. + auto ua = usm_allocator(cxt, dev); + vector x(ua), y(ua), A(ua); + + rand_vector(x, n, incx); + rand_vector(y, n, incy); + rand_matrix(A, onemkl::transpose::nontrans, n, n, lda); + + auto y_ref = y; + + // Call Reference HBMV. + const int n_ref = n, incx_ref = incx, incy_ref = incy, lda_ref = lda; + const int k_ref = k; + using fp_ref = typename ref_type_info::type; + + ::hbmv(convert_to_cblas_uplo(upper_lower), &n_ref, &k_ref, (fp_ref *)&alpha, (fp_ref *)A.data(), + &lda_ref, (fp_ref *)x.data(), &incx_ref, (fp_ref *)&beta, (fp_ref *)y_ref.data(), + &incy_ref); + + // Call DPC++ HBMV. + + try { +#ifdef CALL_RT_API + done = onemkl::blas::hbmv(main_queue, upper_lower, n, k, alpha, A.data(), lda, x.data(), + incx, beta, y.data(), incy, dependencies); + done.wait(); +#else + TEST_RUN_CT(main_queue, onemkl::blas::hbmv, + (main_queue, upper_lower, n, k, alpha, A.data(), lda, x.data(), incx, beta, + y.data(), incy, dependencies)); + main_queue.wait(); +#endif + } + catch (exception const &e) { + std::cout << "Caught synchronous SYCL exception during HBMV:\n" + << e.what() << std::endl + << "OpenCL status: " << e.get_cl_code() << std::endl; + } + + catch (const onemkl::backend_unsupported_exception &e) { + return test_skipped; + } + + catch (const std::runtime_error &error) { + std::cout << "Error raised during execution of HBMV:\n" << error.what() << std::endl; + } + + // Compare the results of reference implementation and DPC++ implementation. 
+ + bool good = check_equal_vector(y, y_ref, n, incy, n, std::cout); + + return (int)good; +} + +class HbmvUsmTests : public ::testing::TestWithParam {}; + +TEST_P(HbmvUsmTests, ComplexSinglePrecision) { + std::complex alpha(2.0, -0.5); + std::complex beta(3.0, -1.5); + EXPECT_TRUEORSKIP( + test>(GetParam(), onemkl::uplo::lower, 30, 5, alpha, beta, 2, 3, 42)); + EXPECT_TRUEORSKIP( + test>(GetParam(), onemkl::uplo::upper, 30, 5, alpha, beta, 2, 3, 42)); + EXPECT_TRUEORSKIP( + test>(GetParam(), onemkl::uplo::lower, 30, 5, alpha, beta, -2, -3, 42)); + EXPECT_TRUEORSKIP( + test>(GetParam(), onemkl::uplo::upper, 30, 5, alpha, beta, -2, -3, 42)); + EXPECT_TRUEORSKIP( + test>(GetParam(), onemkl::uplo::lower, 30, 5, alpha, beta, 1, 1, 42)); + EXPECT_TRUEORSKIP( + test>(GetParam(), onemkl::uplo::upper, 30, 5, alpha, beta, 1, 1, 42)); +} +TEST_P(HbmvUsmTests, ComplexDoublePrecision) { + std::complex alpha(2.0, -0.5); + std::complex beta(3.0, -1.5); + EXPECT_TRUEORSKIP( + test>(GetParam(), onemkl::uplo::lower, 30, 5, alpha, beta, 2, 3, 42)); + EXPECT_TRUEORSKIP( + test>(GetParam(), onemkl::uplo::upper, 30, 5, alpha, beta, 2, 3, 42)); + EXPECT_TRUEORSKIP(test>(GetParam(), onemkl::uplo::lower, 30, 5, alpha, + beta, -2, -3, 42)); + EXPECT_TRUEORSKIP(test>(GetParam(), onemkl::uplo::upper, 30, 5, alpha, + beta, -2, -3, 42)); + EXPECT_TRUEORSKIP( + test>(GetParam(), onemkl::uplo::lower, 30, 5, alpha, beta, 1, 1, 42)); + EXPECT_TRUEORSKIP( + test>(GetParam(), onemkl::uplo::upper, 30, 5, alpha, beta, 1, 1, 42)); +} + +INSTANTIATE_TEST_SUITE_P(HbmvUsmTestSuite, HbmvUsmTests, ::testing::ValuesIn(devices), + ::DeviceNamePrint()); + +} // anonymous namespace diff --git a/tests/unit_tests/blas/level2/hemv.cpp b/tests/unit_tests/blas/level2/hemv.cpp index ef3c0bbc8..848a0e1ef 100644 --- a/tests/unit_tests/blas/level2/hemv.cpp +++ b/tests/unit_tests/blas/level2/hemv.cpp @@ -43,8 +43,8 @@ extern std::vector devices; namespace { template -bool test(const device &dev, onemkl::uplo upper_lower, 
int n, fp alpha, fp beta, int incx, int incy, - int lda) { +int test(const device &dev, onemkl::uplo upper_lower, int n, fp alpha, fp beta, int incx, int incy, + int lda) { // Prepare data. vector x, y, y_ref, A; @@ -99,6 +99,14 @@ bool test(const device &dev, onemkl::uplo upper_lower, int n, fp alpha, fp beta, << "OpenCL status: " << e.get_cl_code() << std::endl; } + catch (const onemkl::backend_unsupported_exception &e) { + return test_skipped; + } + + catch (const std::runtime_error &error) { + std::cout << "Error raised during execution of HEMV:\n" << error.what() << std::endl; + } + // Compare the results of reference implementation and DPC++ implementation. bool good; { @@ -106,7 +114,7 @@ bool test(const device &dev, onemkl::uplo upper_lower, int n, fp alpha, fp beta, good = check_equal_vector(y_accessor, y_ref, n, incy, n, std::cout); } - return good; + return (int)good; } class HemvTests : public ::testing::TestWithParam {}; @@ -114,33 +122,33 @@ class HemvTests : public ::testing::TestWithParam {}; TEST_P(HemvTests, ComplexSinglePrecision) { std::complex alpha(2.0, -0.5); std::complex beta(3.0, -1.5); - EXPECT_TRUE( + EXPECT_TRUEORSKIP( test>(GetParam(), onemkl::uplo::lower, 30, alpha, beta, 2, 3, 42)); - EXPECT_TRUE( + EXPECT_TRUEORSKIP( test>(GetParam(), onemkl::uplo::upper, 30, alpha, beta, 2, 3, 42)); - EXPECT_TRUE( + EXPECT_TRUEORSKIP( test>(GetParam(), onemkl::uplo::lower, 30, alpha, beta, -2, -3, 42)); - EXPECT_TRUE( + EXPECT_TRUEORSKIP( test>(GetParam(), onemkl::uplo::upper, 30, alpha, beta, -2, -3, 42)); - EXPECT_TRUE( + EXPECT_TRUEORSKIP( test>(GetParam(), onemkl::uplo::lower, 30, alpha, beta, 1, 1, 42)); - EXPECT_TRUE( + EXPECT_TRUEORSKIP( test>(GetParam(), onemkl::uplo::upper, 30, alpha, beta, 1, 1, 42)); } TEST_P(HemvTests, ComplexDoublePrecision) { std::complex alpha(2.0, -0.5); std::complex beta(3.0, -1.5); - EXPECT_TRUE( + EXPECT_TRUEORSKIP( test>(GetParam(), onemkl::uplo::lower, 30, alpha, beta, 2, 3, 42)); - EXPECT_TRUE( + 
EXPECT_TRUEORSKIP( test>(GetParam(), onemkl::uplo::upper, 30, alpha, beta, 2, 3, 42)); - EXPECT_TRUE( + EXPECT_TRUEORSKIP( test>(GetParam(), onemkl::uplo::lower, 30, alpha, beta, -2, -3, 42)); - EXPECT_TRUE( + EXPECT_TRUEORSKIP( test>(GetParam(), onemkl::uplo::upper, 30, alpha, beta, -2, -3, 42)); - EXPECT_TRUE( + EXPECT_TRUEORSKIP( test>(GetParam(), onemkl::uplo::lower, 30, alpha, beta, 1, 1, 42)); - EXPECT_TRUE( + EXPECT_TRUEORSKIP( test>(GetParam(), onemkl::uplo::upper, 30, alpha, beta, 1, 1, 42)); } diff --git a/tests/unit_tests/blas/level2/hemv_usm.cpp b/tests/unit_tests/blas/level2/hemv_usm.cpp new file mode 100644 index 000000000..ab55ceca2 --- /dev/null +++ b/tests/unit_tests/blas/level2/hemv_usm.cpp @@ -0,0 +1,158 @@ +/******************************************************************************* +* Copyright 2020 Intel Corporation +* +* Licensed under the Apache License, Version 2.0 (the "License"); +* you may not use this file except in compliance with the License. +* You may obtain a copy of the License at +* +* http://www.apache.org/licenses/LICENSE-2.0 +* +* Unless required by applicable law or agreed to in writing, +* software distributed under the License is distributed on an "AS IS" BASIS, +* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +* See the License for the specific language governing permissions +* and limitations under the License. 
+* +* +* SPDX-License-Identifier: Apache-2.0 +*******************************************************************************/ + +#include +#include +#include +#include +#include +#include + +#include +#include "cblas.h" +#include "onemkl/detail/config.hpp" +#include "onemkl/onemkl.hpp" +#include "onemkl_blas_helper.hpp" +#include "reference_blas_templates.hpp" +#include "test_common.hpp" +#include "test_helper.hpp" + +#include + +using namespace cl::sycl; +using std::vector; + +extern std::vector devices; + +namespace { + +template +int test(const device &dev, onemkl::uplo upper_lower, int n, fp alpha, fp beta, int incx, int incy, + int lda) { + // Catch asynchronous exceptions. + auto exception_handler = [](exception_list exceptions) { + for (std::exception_ptr const &e : exceptions) { + try { + std::rethrow_exception(e); + } + catch (exception const &e) { + std::cout << "Caught asynchronous SYCL exception during HEMV:\n" + << e.what() << std::endl + << "OpenCL status: " << e.get_cl_code() << std::endl; + } + } + }; + + queue main_queue(dev, exception_handler); + context cxt = main_queue.get_context(); + event done; + std::vector dependencies; + + // Prepare data. + auto ua = usm_allocator(cxt, dev); + vector x(ua), y(ua), A(ua); + + rand_vector(x, n, incx); + rand_vector(y, n, incy); + rand_matrix(A, onemkl::transpose::nontrans, n, n, lda); + + auto y_ref = y; + + // Call Reference HEMV. + const int n_ref = n, incx_ref = incx, incy_ref = incy, lda_ref = lda; + using fp_ref = typename ref_type_info::type; + + ::hemv(convert_to_cblas_uplo(upper_lower), &n_ref, (fp_ref *)&alpha, (fp_ref *)A.data(), + &lda_ref, (fp_ref *)x.data(), &incx_ref, (fp_ref *)&beta, (fp_ref *)y_ref.data(), + &incy_ref); + + // Call DPC++ HEMV. 
+ + try { +#ifdef CALL_RT_API + done = onemkl::blas::hemv(main_queue, upper_lower, n, alpha, A.data(), lda, x.data(), incx, + beta, y.data(), incy, dependencies); + done.wait(); +#else + TEST_RUN_CT(main_queue, onemkl::blas::hemv, + (main_queue, upper_lower, n, alpha, A.data(), lda, x.data(), incx, beta, + y.data(), incy, dependencies)); + main_queue.wait(); +#endif + } + catch (exception const &e) { + std::cout << "Caught synchronous SYCL exception during HEMV:\n" + << e.what() << std::endl + << "OpenCL status: " << e.get_cl_code() << std::endl; + } + + catch (const onemkl::backend_unsupported_exception &e) { + return test_skipped; + } + + catch (const std::runtime_error &error) { + std::cout << "Error raised during execution of HEMV:\n" << error.what() << std::endl; + } + + // Compare the results of reference implementation and DPC++ implementation. + + bool good = check_equal_vector(y, y_ref, n, incy, n, std::cout); + + return (int)good; +} + +class HemvUsmTests : public ::testing::TestWithParam {}; + +TEST_P(HemvUsmTests, ComplexSinglePrecision) { + std::complex alpha(2.0, -0.5); + std::complex beta(3.0, -1.5); + EXPECT_TRUEORSKIP( + test>(GetParam(), onemkl::uplo::lower, 30, alpha, beta, 2, 3, 42)); + EXPECT_TRUEORSKIP( + test>(GetParam(), onemkl::uplo::upper, 30, alpha, beta, 2, 3, 42)); + EXPECT_TRUEORSKIP( + test>(GetParam(), onemkl::uplo::lower, 30, alpha, beta, -2, -3, 42)); + EXPECT_TRUEORSKIP( + test>(GetParam(), onemkl::uplo::upper, 30, alpha, beta, -2, -3, 42)); + EXPECT_TRUEORSKIP( + test>(GetParam(), onemkl::uplo::lower, 30, alpha, beta, 1, 1, 42)); + EXPECT_TRUEORSKIP( + test>(GetParam(), onemkl::uplo::upper, 30, alpha, beta, 1, 1, 42)); +} +TEST_P(HemvUsmTests, ComplexDoublePrecision) { + std::complex alpha(2.0, -0.5); + std::complex beta(3.0, -1.5); + EXPECT_TRUEORSKIP( + test>(GetParam(), onemkl::uplo::lower, 30, alpha, beta, 2, 3, 42)); + EXPECT_TRUEORSKIP( + test>(GetParam(), onemkl::uplo::upper, 30, alpha, beta, 2, 3, 42)); + 
EXPECT_TRUEORSKIP( + test>(GetParam(), onemkl::uplo::lower, 30, alpha, beta, -2, -3, 42)); + EXPECT_TRUEORSKIP( + test>(GetParam(), onemkl::uplo::upper, 30, alpha, beta, -2, -3, 42)); + EXPECT_TRUEORSKIP( + test>(GetParam(), onemkl::uplo::lower, 30, alpha, beta, 1, 1, 42)); + EXPECT_TRUEORSKIP( + test>(GetParam(), onemkl::uplo::upper, 30, alpha, beta, 1, 1, 42)); +} + +INSTANTIATE_TEST_SUITE_P(HemvUsmTestSuite, HemvUsmTests, ::testing::ValuesIn(devices), + ::DeviceNamePrint()); + +} // anonymous namespace diff --git a/tests/unit_tests/blas/level2/her.cpp b/tests/unit_tests/blas/level2/her.cpp index 60686797f..e5d79586f 100644 --- a/tests/unit_tests/blas/level2/her.cpp +++ b/tests/unit_tests/blas/level2/her.cpp @@ -43,7 +43,7 @@ extern std::vector devices; namespace { template -bool test(const device &dev, onemkl::uplo upper_lower, int n, fp_scalar alpha, int incx, int lda) { +int test(const device &dev, onemkl::uplo upper_lower, int n, fp_scalar alpha, int incx, int lda) { // Prepare data. vector x, A_ref, A; rand_vector(x, n, incx); @@ -93,6 +93,14 @@ bool test(const device &dev, onemkl::uplo upper_lower, int n, fp_scalar alpha, i << "OpenCL status: " << e.get_cl_code() << std::endl; } + catch (const onemkl::backend_unsupported_exception &e) { + return test_skipped; + } + + catch (const std::runtime_error &error) { + std::cout << "Error raised during execution of HER:\n" << error.what() << std::endl; + } + // Compare the results of reference implementation and DPC++ implementation. 
bool good; { @@ -100,39 +108,39 @@ bool test(const device &dev, onemkl::uplo upper_lower, int n, fp_scalar alpha, i good = check_equal_matrix(A_accessor, A_ref, n, n, lda, n, std::cout); } - return good; + return (int)good; } class HerTests : public ::testing::TestWithParam {}; TEST_P(HerTests, ComplexSinglePrecision) { float alpha(2.0); - EXPECT_TRUE( + EXPECT_TRUEORSKIP( (test, float>(GetParam(), onemkl::uplo::lower, 30, alpha, 2, 42))); - EXPECT_TRUE( + EXPECT_TRUEORSKIP( (test, float>(GetParam(), onemkl::uplo::upper, 30, alpha, 2, 42))); - EXPECT_TRUE( + EXPECT_TRUEORSKIP( (test, float>(GetParam(), onemkl::uplo::lower, 30, alpha, -2, 42))); - EXPECT_TRUE( + EXPECT_TRUEORSKIP( (test, float>(GetParam(), onemkl::uplo::upper, 30, alpha, -2, 42))); - EXPECT_TRUE( + EXPECT_TRUEORSKIP( (test, float>(GetParam(), onemkl::uplo::lower, 30, alpha, 1, 42))); - EXPECT_TRUE( + EXPECT_TRUEORSKIP( (test, float>(GetParam(), onemkl::uplo::upper, 30, alpha, 1, 42))); } TEST_P(HerTests, ComplexDoublePrecision) { double alpha(2.0); - EXPECT_TRUE( + EXPECT_TRUEORSKIP( (test, double>(GetParam(), onemkl::uplo::lower, 30, alpha, 2, 42))); - EXPECT_TRUE( + EXPECT_TRUEORSKIP( (test, double>(GetParam(), onemkl::uplo::upper, 30, alpha, 2, 42))); - EXPECT_TRUE( + EXPECT_TRUEORSKIP( (test, double>(GetParam(), onemkl::uplo::lower, 30, alpha, -2, 42))); - EXPECT_TRUE( + EXPECT_TRUEORSKIP( (test, double>(GetParam(), onemkl::uplo::upper, 30, alpha, -2, 42))); - EXPECT_TRUE( + EXPECT_TRUEORSKIP( (test, double>(GetParam(), onemkl::uplo::lower, 30, alpha, 1, 42))); - EXPECT_TRUE( + EXPECT_TRUEORSKIP( (test, double>(GetParam(), onemkl::uplo::upper, 30, alpha, 1, 42))); } diff --git a/tests/unit_tests/blas/level2/her2.cpp b/tests/unit_tests/blas/level2/her2.cpp index 4b3d2c1de..dc2a20781 100644 --- a/tests/unit_tests/blas/level2/her2.cpp +++ b/tests/unit_tests/blas/level2/her2.cpp @@ -43,8 +43,8 @@ extern std::vector devices; namespace { template -bool test(const device &dev, onemkl::uplo upper_lower, 
int n, fp alpha, int incx, int incy, - int lda) { +int test(const device &dev, onemkl::uplo upper_lower, int n, fp alpha, int incx, int incy, + int lda) { // Prepare data. vector x, y, A_ref, A; @@ -98,6 +98,14 @@ bool test(const device &dev, onemkl::uplo upper_lower, int n, fp alpha, int incx << "OpenCL status: " << e.get_cl_code() << std::endl; } + catch (const onemkl::backend_unsupported_exception &e) { + return test_skipped; + } + + catch (const std::runtime_error &error) { + std::cout << "Error raised during execution of HER2:\n" << error.what() << std::endl; + } + // Compare the results of reference implementation and DPC++ implementation. bool good; { @@ -105,28 +113,40 @@ bool test(const device &dev, onemkl::uplo upper_lower, int n, fp alpha, int incx good = check_equal_matrix(A_accessor, A_ref, n, n, lda, n, std::cout); } - return good; + return (int)good; } class Her2Tests : public ::testing::TestWithParam {}; TEST_P(Her2Tests, ComplexSinglePrecision) { std::complex alpha(2.0, -0.5); - EXPECT_TRUE(test>(GetParam(), onemkl::uplo::lower, 30, alpha, 2, 3, 42)); - EXPECT_TRUE(test>(GetParam(), onemkl::uplo::upper, 30, alpha, 2, 3, 42)); - EXPECT_TRUE(test>(GetParam(), onemkl::uplo::lower, 30, alpha, -2, -3, 42)); - EXPECT_TRUE(test>(GetParam(), onemkl::uplo::upper, 30, alpha, -2, -3, 42)); - EXPECT_TRUE(test>(GetParam(), onemkl::uplo::lower, 30, alpha, 1, 1, 42)); - EXPECT_TRUE(test>(GetParam(), onemkl::uplo::upper, 30, alpha, 1, 1, 42)); + EXPECT_TRUEORSKIP( + test>(GetParam(), onemkl::uplo::lower, 30, alpha, 2, 3, 42)); + EXPECT_TRUEORSKIP( + test>(GetParam(), onemkl::uplo::upper, 30, alpha, 2, 3, 42)); + EXPECT_TRUEORSKIP( + test>(GetParam(), onemkl::uplo::lower, 30, alpha, -2, -3, 42)); + EXPECT_TRUEORSKIP( + test>(GetParam(), onemkl::uplo::upper, 30, alpha, -2, -3, 42)); + EXPECT_TRUEORSKIP( + test>(GetParam(), onemkl::uplo::lower, 30, alpha, 1, 1, 42)); + EXPECT_TRUEORSKIP( + test>(GetParam(), onemkl::uplo::upper, 30, alpha, 1, 1, 42)); } 
TEST_P(Her2Tests, ComplexDoublePrecision) { std::complex alpha(2.0, -0.5); - EXPECT_TRUE(test>(GetParam(), onemkl::uplo::lower, 30, alpha, 2, 3, 42)); - EXPECT_TRUE(test>(GetParam(), onemkl::uplo::upper, 30, alpha, 2, 3, 42)); - EXPECT_TRUE(test>(GetParam(), onemkl::uplo::lower, 30, alpha, -2, -3, 42)); - EXPECT_TRUE(test>(GetParam(), onemkl::uplo::upper, 30, alpha, -2, -3, 42)); - EXPECT_TRUE(test>(GetParam(), onemkl::uplo::lower, 30, alpha, 1, 1, 42)); - EXPECT_TRUE(test>(GetParam(), onemkl::uplo::upper, 30, alpha, 1, 1, 42)); + EXPECT_TRUEORSKIP( + test>(GetParam(), onemkl::uplo::lower, 30, alpha, 2, 3, 42)); + EXPECT_TRUEORSKIP( + test>(GetParam(), onemkl::uplo::upper, 30, alpha, 2, 3, 42)); + EXPECT_TRUEORSKIP( + test>(GetParam(), onemkl::uplo::lower, 30, alpha, -2, -3, 42)); + EXPECT_TRUEORSKIP( + test>(GetParam(), onemkl::uplo::upper, 30, alpha, -2, -3, 42)); + EXPECT_TRUEORSKIP( + test>(GetParam(), onemkl::uplo::lower, 30, alpha, 1, 1, 42)); + EXPECT_TRUEORSKIP( + test>(GetParam(), onemkl::uplo::upper, 30, alpha, 1, 1, 42)); } INSTANTIATE_TEST_SUITE_P(Her2TestSuite, Her2Tests, ::testing::ValuesIn(devices), diff --git a/tests/unit_tests/blas/level2/her2_usm.cpp b/tests/unit_tests/blas/level2/her2_usm.cpp new file mode 100644 index 000000000..bb5dc6107 --- /dev/null +++ b/tests/unit_tests/blas/level2/her2_usm.cpp @@ -0,0 +1,155 @@ +/******************************************************************************* +* Copyright 2020 Intel Corporation +* +* Licensed under the Apache License, Version 2.0 (the "License"); +* you may not use this file except in compliance with the License. +* You may obtain a copy of the License at +* +* http://www.apache.org/licenses/LICENSE-2.0 +* +* Unless required by applicable law or agreed to in writing, +* software distributed under the License is distributed on an "AS IS" BASIS, +* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+* See the License for the specific language governing permissions +* and limitations under the License. +* +* +* SPDX-License-Identifier: Apache-2.0 +*******************************************************************************/ + +#include +#include +#include +#include +#include +#include + +#include +#include "cblas.h" +#include "onemkl/detail/config.hpp" +#include "onemkl/onemkl.hpp" +#include "onemkl_blas_helper.hpp" +#include "reference_blas_templates.hpp" +#include "test_common.hpp" +#include "test_helper.hpp" + +#include + +using namespace cl::sycl; +using std::vector; + +extern std::vector devices; + +namespace { + +template +int test(const device &dev, onemkl::uplo upper_lower, int n, fp alpha, int incx, int incy, + int lda) { + // Catch asynchronous exceptions. + auto exception_handler = [](exception_list exceptions) { + for (std::exception_ptr const &e : exceptions) { + try { + std::rethrow_exception(e); + } + catch (exception const &e) { + std::cout << "Caught asynchronous SYCL exception during HER2:\n" + << e.what() << std::endl + << "OpenCL status: " << e.get_cl_code() << std::endl; + } + } + }; + + queue main_queue(dev, exception_handler); + context cxt = main_queue.get_context(); + event done; + std::vector dependencies; + + // Prepare data. + auto ua = usm_allocator(cxt, dev); + vector x(ua), y(ua), A(ua); + + rand_vector(x, n, incx); + rand_vector(y, n, incy); + rand_matrix(A, onemkl::transpose::nontrans, n, n, lda); + + auto A_ref = A; + + // Call Reference HER2. + const int n_ref = n, incx_ref = incx, incy_ref = incy, lda_ref = lda; + using fp_ref = typename ref_type_info::type; + + ::her2(convert_to_cblas_uplo(upper_lower), &n_ref, (fp_ref *)&alpha, (fp_ref *)x.data(), + &incx_ref, (fp_ref *)y.data(), &incy_ref, (fp_ref *)A_ref.data(), &lda_ref); + + // Call DPC++ HER2. 
+ + try { +#ifdef CALL_RT_API + done = onemkl::blas::her2(main_queue, upper_lower, n, alpha, x.data(), incx, y.data(), incy, + A.data(), lda, dependencies); + done.wait(); +#else + TEST_RUN_CT(main_queue, onemkl::blas::her2, + (main_queue, upper_lower, n, alpha, x.data(), incx, y.data(), incy, A.data(), + lda, dependencies)); + main_queue.wait(); +#endif + } + catch (exception const &e) { + std::cout << "Caught synchronous SYCL exception during HER2:\n" + << e.what() << std::endl + << "OpenCL status: " << e.get_cl_code() << std::endl; + } + + catch (const onemkl::backend_unsupported_exception &e) { + return test_skipped; + } + + catch (const std::runtime_error &error) { + std::cout << "Error raised during execution of HER2:\n" << error.what() << std::endl; + } + + // Compare the results of reference implementation and DPC++ implementation. + + bool good = check_equal_matrix(A, A_ref, n, n, lda, n, std::cout); + + return (int)good; +} + +class Her2UsmTests : public ::testing::TestWithParam {}; + +TEST_P(Her2UsmTests, ComplexSinglePrecision) { + std::complex alpha(2.0, -0.5); + EXPECT_TRUEORSKIP( + test>(GetParam(), onemkl::uplo::lower, 30, alpha, 2, 3, 42)); + EXPECT_TRUEORSKIP( + test>(GetParam(), onemkl::uplo::upper, 30, alpha, 2, 3, 42)); + EXPECT_TRUEORSKIP( + test>(GetParam(), onemkl::uplo::lower, 30, alpha, -2, -3, 42)); + EXPECT_TRUEORSKIP( + test>(GetParam(), onemkl::uplo::upper, 30, alpha, -2, -3, 42)); + EXPECT_TRUEORSKIP( + test>(GetParam(), onemkl::uplo::lower, 30, alpha, 1, 1, 42)); + EXPECT_TRUEORSKIP( + test>(GetParam(), onemkl::uplo::upper, 30, alpha, 1, 1, 42)); +} +TEST_P(Her2UsmTests, ComplexDoublePrecision) { + std::complex alpha(2.0, -0.5); + EXPECT_TRUEORSKIP( + test>(GetParam(), onemkl::uplo::lower, 30, alpha, 2, 3, 42)); + EXPECT_TRUEORSKIP( + test>(GetParam(), onemkl::uplo::upper, 30, alpha, 2, 3, 42)); + EXPECT_TRUEORSKIP( + test>(GetParam(), onemkl::uplo::lower, 30, alpha, -2, -3, 42)); + EXPECT_TRUEORSKIP( + test>(GetParam(), 
onemkl::uplo::upper, 30, alpha, -2, -3, 42)); + EXPECT_TRUEORSKIP( + test>(GetParam(), onemkl::uplo::lower, 30, alpha, 1, 1, 42)); + EXPECT_TRUEORSKIP( + test>(GetParam(), onemkl::uplo::upper, 30, alpha, 1, 1, 42)); +} + +INSTANTIATE_TEST_SUITE_P(Her2UsmTestSuite, Her2UsmTests, ::testing::ValuesIn(devices), + ::DeviceNamePrint()); + +} // anonymous namespace diff --git a/tests/unit_tests/blas/level2/her_usm.cpp b/tests/unit_tests/blas/level2/her_usm.cpp new file mode 100644 index 000000000..e99cb9273 --- /dev/null +++ b/tests/unit_tests/blas/level2/her_usm.cpp @@ -0,0 +1,153 @@ +/******************************************************************************* +* Copyright 2020 Intel Corporation +* +* Licensed under the Apache License, Version 2.0 (the "License"); +* you may not use this file except in compliance with the License. +* You may obtain a copy of the License at +* +* http://www.apache.org/licenses/LICENSE-2.0 +* +* Unless required by applicable law or agreed to in writing, +* software distributed under the License is distributed on an "AS IS" BASIS, +* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +* See the License for the specific language governing permissions +* and limitations under the License. +* +* +* SPDX-License-Identifier: Apache-2.0 +*******************************************************************************/ + +#include +#include +#include +#include +#include +#include + +#include +#include "cblas.h" +#include "onemkl/detail/config.hpp" +#include "onemkl/onemkl.hpp" +#include "onemkl_blas_helper.hpp" +#include "reference_blas_templates.hpp" +#include "test_common.hpp" +#include "test_helper.hpp" + +#include + +using namespace cl::sycl; +using std::vector; + +extern std::vector devices; + +namespace { + +template +int test(const device &dev, onemkl::uplo upper_lower, int n, fp_scalar alpha, int incx, int lda) { + // Catch asynchronous exceptions. 
+ auto exception_handler = [](exception_list exceptions) { + for (std::exception_ptr const &e : exceptions) { + try { + std::rethrow_exception(e); + } + catch (exception const &e) { + std::cout << "Caught asynchronous SYCL exception during HER:\n" + << e.what() << std::endl + << "OpenCL status: " << e.get_cl_code() << std::endl; + } + } + }; + + queue main_queue(dev, exception_handler); + context cxt = main_queue.get_context(); + event done; + std::vector dependencies; + + // Prepare data. + auto ua = usm_allocator(cxt, dev); + vector x(ua), A(ua); + rand_vector(x, n, incx); + rand_matrix(A, onemkl::transpose::nontrans, n, n, lda); + + auto A_ref = A; + + // Call Reference HER. + const int n_ref = n, incx_ref = incx, lda_ref = lda; + using fp_ref = typename ref_type_info::type; + using fp_scalar_mkl = typename ref_type_info::type; + + ::her(convert_to_cblas_uplo(upper_lower), &n_ref, (fp_scalar_mkl *)&alpha, (fp_ref *)x.data(), + &incx_ref, (fp_ref *)A_ref.data(), &lda_ref); + + // Call DPC++ HER. + + try { +#ifdef CALL_RT_API + done = onemkl::blas::her(main_queue, upper_lower, n, alpha, x.data(), incx, A.data(), lda, + dependencies); + done.wait(); +#else + TEST_RUN_CT( + main_queue, onemkl::blas::her, + (main_queue, upper_lower, n, alpha, x.data(), incx, A.data(), lda, dependencies)); + main_queue.wait(); +#endif + } + catch (exception const &e) { + std::cout << "Caught synchronous SYCL exception during HER:\n" + << e.what() << std::endl + << "OpenCL status: " << e.get_cl_code() << std::endl; + } + + catch (const onemkl::backend_unsupported_exception &e) { + return test_skipped; + } + + catch (const std::runtime_error &error) { + std::cout << "Error raised during execution of HER:\n" << error.what() << std::endl; + } + + // Compare the results of reference implementation and DPC++ implementation. 
+ + bool good = check_equal_matrix(A, A_ref, n, n, lda, n, std::cout); + + return (int)good; +} + +class HerUsmTests : public ::testing::TestWithParam {}; + +TEST_P(HerUsmTests, ComplexSinglePrecision) { + float alpha(2.0); + EXPECT_TRUEORSKIP( + (test, float>(GetParam(), onemkl::uplo::lower, 30, alpha, 2, 42))); + EXPECT_TRUEORSKIP( + (test, float>(GetParam(), onemkl::uplo::upper, 30, alpha, 2, 42))); + EXPECT_TRUEORSKIP( + (test, float>(GetParam(), onemkl::uplo::lower, 30, alpha, -2, 42))); + EXPECT_TRUEORSKIP( + (test, float>(GetParam(), onemkl::uplo::upper, 30, alpha, -2, 42))); + EXPECT_TRUEORSKIP( + (test, float>(GetParam(), onemkl::uplo::lower, 30, alpha, 1, 42))); + EXPECT_TRUEORSKIP( + (test, float>(GetParam(), onemkl::uplo::upper, 30, alpha, 1, 42))); +} +TEST_P(HerUsmTests, ComplexDoublePrecision) { + double alpha(2.0); + EXPECT_TRUEORSKIP( + (test, double>(GetParam(), onemkl::uplo::lower, 30, alpha, 2, 42))); + EXPECT_TRUEORSKIP( + (test, double>(GetParam(), onemkl::uplo::upper, 30, alpha, 2, 42))); + EXPECT_TRUEORSKIP( + (test, double>(GetParam(), onemkl::uplo::lower, 30, alpha, -2, 42))); + EXPECT_TRUEORSKIP( + (test, double>(GetParam(), onemkl::uplo::upper, 30, alpha, -2, 42))); + EXPECT_TRUEORSKIP( + (test, double>(GetParam(), onemkl::uplo::lower, 30, alpha, 1, 42))); + EXPECT_TRUEORSKIP( + (test, double>(GetParam(), onemkl::uplo::upper, 30, alpha, 1, 42))); +} + +INSTANTIATE_TEST_SUITE_P(HerUsmTestSuite, HerUsmTests, ::testing::ValuesIn(devices), + ::DeviceNamePrint()); + +} // anonymous namespace diff --git a/tests/unit_tests/blas/level2/hpmv.cpp b/tests/unit_tests/blas/level2/hpmv.cpp index 64e89724e..4df20c625 100644 --- a/tests/unit_tests/blas/level2/hpmv.cpp +++ b/tests/unit_tests/blas/level2/hpmv.cpp @@ -43,8 +43,8 @@ extern std::vector devices; namespace { template -bool test(const device &dev, onemkl::uplo upper_lower, int n, fp alpha, fp beta, int incx, - int incy) { +int test(const device &dev, onemkl::uplo upper_lower, int n, fp alpha, 
fp beta, int incx, + int incy) { // Prepare data. vector x, y, y_ref, A; rand_vector(x, n, incx); @@ -97,6 +97,14 @@ bool test(const device &dev, onemkl::uplo upper_lower, int n, fp alpha, fp beta, << "OpenCL status: " << e.get_cl_code() << std::endl; } + catch (const onemkl::backend_unsupported_exception &e) { + return test_skipped; + } + + catch (const std::runtime_error &error) { + std::cout << "Error raised during execution of HPMV:\n" << error.what() << std::endl; + } + // Compare the results of reference implementation and DPC++ implementation. bool good; { @@ -104,7 +112,7 @@ bool test(const device &dev, onemkl::uplo upper_lower, int n, fp alpha, fp beta, good = check_equal_vector(y_accessor, y_ref, n, incy, n, std::cout); } - return good; + return (int)good; } class HpmvTests : public ::testing::TestWithParam {}; @@ -112,26 +120,34 @@ class HpmvTests : public ::testing::TestWithParam {}; TEST_P(HpmvTests, ComplexSinglePrecision) { std::complex alpha(2.0, -0.5); std::complex beta(3.0, -1.5); - EXPECT_TRUE(test>(GetParam(), onemkl::uplo::lower, 30, alpha, beta, 2, 3)); - EXPECT_TRUE(test>(GetParam(), onemkl::uplo::upper, 30, alpha, beta, 2, 3)); - EXPECT_TRUE( + EXPECT_TRUEORSKIP( + test>(GetParam(), onemkl::uplo::lower, 30, alpha, beta, 2, 3)); + EXPECT_TRUEORSKIP( + test>(GetParam(), onemkl::uplo::upper, 30, alpha, beta, 2, 3)); + EXPECT_TRUEORSKIP( test>(GetParam(), onemkl::uplo::lower, 30, alpha, beta, -2, -3)); - EXPECT_TRUE( + EXPECT_TRUEORSKIP( test>(GetParam(), onemkl::uplo::upper, 30, alpha, beta, -2, -3)); - EXPECT_TRUE(test>(GetParam(), onemkl::uplo::lower, 30, alpha, beta, 1, 1)); - EXPECT_TRUE(test>(GetParam(), onemkl::uplo::upper, 30, alpha, beta, 1, 1)); + EXPECT_TRUEORSKIP( + test>(GetParam(), onemkl::uplo::lower, 30, alpha, beta, 1, 1)); + EXPECT_TRUEORSKIP( + test>(GetParam(), onemkl::uplo::upper, 30, alpha, beta, 1, 1)); } TEST_P(HpmvTests, ComplexDoublePrecision) { std::complex alpha(2.0, -0.5); std::complex beta(3.0, -1.5); - 
EXPECT_TRUE(test>(GetParam(), onemkl::uplo::lower, 30, alpha, beta, 2, 3)); - EXPECT_TRUE(test>(GetParam(), onemkl::uplo::upper, 30, alpha, beta, 2, 3)); - EXPECT_TRUE( + EXPECT_TRUEORSKIP( + test>(GetParam(), onemkl::uplo::lower, 30, alpha, beta, 2, 3)); + EXPECT_TRUEORSKIP( + test>(GetParam(), onemkl::uplo::upper, 30, alpha, beta, 2, 3)); + EXPECT_TRUEORSKIP( test>(GetParam(), onemkl::uplo::lower, 30, alpha, beta, -2, -3)); - EXPECT_TRUE( + EXPECT_TRUEORSKIP( test>(GetParam(), onemkl::uplo::upper, 30, alpha, beta, -2, -3)); - EXPECT_TRUE(test>(GetParam(), onemkl::uplo::lower, 30, alpha, beta, 1, 1)); - EXPECT_TRUE(test>(GetParam(), onemkl::uplo::upper, 30, alpha, beta, 1, 1)); + EXPECT_TRUEORSKIP( + test>(GetParam(), onemkl::uplo::lower, 30, alpha, beta, 1, 1)); + EXPECT_TRUEORSKIP( + test>(GetParam(), onemkl::uplo::upper, 30, alpha, beta, 1, 1)); } INSTANTIATE_TEST_SUITE_P(HpmvTestSuite, HpmvTests, ::testing::ValuesIn(devices), diff --git a/tests/unit_tests/blas/level2/hpmv_usm.cpp b/tests/unit_tests/blas/level2/hpmv_usm.cpp new file mode 100644 index 000000000..1e42d96d6 --- /dev/null +++ b/tests/unit_tests/blas/level2/hpmv_usm.cpp @@ -0,0 +1,156 @@ +/******************************************************************************* +* Copyright 2020 Intel Corporation +* +* Licensed under the Apache License, Version 2.0 (the "License"); +* you may not use this file except in compliance with the License. +* You may obtain a copy of the License at +* +* http://www.apache.org/licenses/LICENSE-2.0 +* +* Unless required by applicable law or agreed to in writing, +* software distributed under the License is distributed on an "AS IS" BASIS, +* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +* See the License for the specific language governing permissions +* and limitations under the License. 
+* +* +* SPDX-License-Identifier: Apache-2.0 +*******************************************************************************/ + +#include +#include +#include +#include +#include +#include + +#include +#include "cblas.h" +#include "onemkl/detail/config.hpp" +#include "onemkl/onemkl.hpp" +#include "onemkl_blas_helper.hpp" +#include "reference_blas_templates.hpp" +#include "test_common.hpp" +#include "test_helper.hpp" + +#include + +using namespace cl::sycl; +using std::vector; + +extern std::vector devices; + +namespace { + +template +int test(const device &dev, onemkl::uplo upper_lower, int n, fp alpha, fp beta, int incx, + int incy) { + // Catch asynchronous exceptions. + auto exception_handler = [](exception_list exceptions) { + for (std::exception_ptr const &e : exceptions) { + try { + std::rethrow_exception(e); + } + catch (exception const &e) { + std::cout << "Caught asynchronous SYCL exception during HPMV:\n" + << e.what() << std::endl + << "OpenCL status: " << e.get_cl_code() << std::endl; + } + } + }; + + queue main_queue(dev, exception_handler); + context cxt = main_queue.get_context(); + event done; + std::vector dependencies; + + // Prepare data. + auto ua = usm_allocator(cxt, dev); + vector x(ua), y(ua), A(ua); + rand_vector(x, n, incx); + rand_vector(y, n, incy); + rand_matrix(A, onemkl::transpose::nontrans, n, n, n); + + auto y_ref = y; + + // Call Reference HPMV. + const int n_ref = n, incx_ref = incx, incy_ref = incy; + using fp_ref = typename ref_type_info::type; + + ::hpmv(convert_to_cblas_uplo(upper_lower), &n_ref, (fp_ref *)&alpha, (fp_ref *)A.data(), + (fp_ref *)x.data(), &incx_ref, (fp_ref *)&beta, (fp_ref *)y_ref.data(), &incy_ref); + + // Call DPC++ HPMV. 
+ + try { +#ifdef CALL_RT_API + done = onemkl::blas::hpmv(main_queue, upper_lower, n, alpha, A.data(), x.data(), incx, beta, + y.data(), incy, dependencies); + done.wait(); +#else + TEST_RUN_CT(main_queue, onemkl::blas::hpmv, + (main_queue, upper_lower, n, alpha, A.data(), x.data(), incx, beta, y.data(), + incy, dependencies)); + main_queue.wait(); +#endif + } + catch (exception const &e) { + std::cout << "Caught synchronous SYCL exception during HPMV:\n" + << e.what() << std::endl + << "OpenCL status: " << e.get_cl_code() << std::endl; + } + + catch (const onemkl::backend_unsupported_exception &e) { + return test_skipped; + } + + catch (const std::runtime_error &error) { + std::cout << "Error raised during execution of HPMV:\n" << error.what() << std::endl; + } + + // Compare the results of reference implementation and DPC++ implementation. + + bool good = check_equal_vector(y, y_ref, n, incy, n, std::cout); + + return (int)good; +} + +class HpmvUsmTests : public ::testing::TestWithParam {}; + +TEST_P(HpmvUsmTests, ComplexSinglePrecision) { + std::complex alpha(2.0, -0.5); + std::complex beta(3.0, -1.5); + EXPECT_TRUEORSKIP( + test>(GetParam(), onemkl::uplo::lower, 30, alpha, beta, 2, 3)); + EXPECT_TRUEORSKIP( + test>(GetParam(), onemkl::uplo::upper, 30, alpha, beta, 2, 3)); + EXPECT_TRUEORSKIP( + test>(GetParam(), onemkl::uplo::lower, 30, alpha, beta, -2, -3)); + EXPECT_TRUEORSKIP( + test>(GetParam(), onemkl::uplo::upper, 30, alpha, beta, -2, -3)); + EXPECT_TRUEORSKIP( + test>(GetParam(), onemkl::uplo::lower, 30, alpha, beta, 1, 1)); + EXPECT_TRUEORSKIP( + test>(GetParam(), onemkl::uplo::upper, 30, alpha, beta, 1, 1)); +} +TEST_P(HpmvUsmTests, ComplexDoublePrecision) { + std::complex alpha(2.0, -0.5); + std::complex beta(3.0, -1.5); + EXPECT_TRUEORSKIP( + test>(GetParam(), onemkl::uplo::lower, 30, alpha, beta, 2, 3)); + EXPECT_TRUEORSKIP( + test>(GetParam(), onemkl::uplo::upper, 30, alpha, beta, 2, 3)); + EXPECT_TRUEORSKIP( + test>(GetParam(), 
onemkl::uplo::lower, 30, alpha, beta, -2, -3)); + EXPECT_TRUEORSKIP( + test>(GetParam(), onemkl::uplo::upper, 30, alpha, beta, -2, -3)); + EXPECT_TRUEORSKIP( + test>(GetParam(), onemkl::uplo::lower, 30, alpha, beta, 1, 1)); + EXPECT_TRUEORSKIP( + test>(GetParam(), onemkl::uplo::upper, 30, alpha, beta, 1, 1)); +} + +INSTANTIATE_TEST_SUITE_P(HpmvUsmTestSuite, HpmvUsmTests, ::testing::ValuesIn(devices), + ::DeviceNamePrint()); + +} // anonymous namespace diff --git a/tests/unit_tests/blas/level2/hpr.cpp b/tests/unit_tests/blas/level2/hpr.cpp index d16d9ccee..45098543e 100644 --- a/tests/unit_tests/blas/level2/hpr.cpp +++ b/tests/unit_tests/blas/level2/hpr.cpp @@ -43,7 +43,7 @@ extern std::vector devices; namespace { template -bool test(const device &dev, onemkl::uplo upper_lower, int n, fp_scalar alpha, int incx) { +int test(const device &dev, onemkl::uplo upper_lower, int n, fp_scalar alpha, int incx) { // Prepare data. vector x, A_ref, A; rand_vector(x, n, incx); @@ -93,6 +93,14 @@ bool test(const device &dev, onemkl::uplo upper_lower, int n, fp_scalar alpha, i << "OpenCL status: " << e.get_cl_code() << std::endl; } + catch (const onemkl::backend_unsupported_exception &e) { + return test_skipped; + } + + catch (const std::runtime_error &error) { + std::cout << "Error raised during execution of HPR:\n" << error.what() << std::endl; + } + // Compare the results of reference implementation and DPC++ implementation. 
bool good; { @@ -100,34 +108,40 @@ bool test(const device &dev, onemkl::uplo upper_lower, int n, fp_scalar alpha, i good = check_equal_matrix(A_accessor, A_ref, n, n, n, n, std::cout); } - return good; + return (int)good; } class HprTests : public ::testing::TestWithParam {}; TEST_P(HprTests, ComplexSinglePrecision) { float alpha(2.0); - EXPECT_TRUE((test, float>(GetParam(), onemkl::uplo::lower, 30, alpha, 2))); - EXPECT_TRUE((test, float>(GetParam(), onemkl::uplo::upper, 30, alpha, 2))); - EXPECT_TRUE((test, float>(GetParam(), onemkl::uplo::lower, 30, alpha, -2))); - EXPECT_TRUE((test, float>(GetParam(), onemkl::uplo::upper, 30, alpha, -2))); - EXPECT_TRUE((test, float>(GetParam(), onemkl::uplo::lower, 30, alpha, 1))); - EXPECT_TRUE((test, float>(GetParam(), onemkl::uplo::upper, 30, alpha, 1))); + EXPECT_TRUEORSKIP( + (test, float>(GetParam(), onemkl::uplo::lower, 30, alpha, 2))); + EXPECT_TRUEORSKIP( + (test, float>(GetParam(), onemkl::uplo::upper, 30, alpha, 2))); + EXPECT_TRUEORSKIP( + (test, float>(GetParam(), onemkl::uplo::lower, 30, alpha, -2))); + EXPECT_TRUEORSKIP( + (test, float>(GetParam(), onemkl::uplo::upper, 30, alpha, -2))); + EXPECT_TRUEORSKIP( + (test, float>(GetParam(), onemkl::uplo::lower, 30, alpha, 1))); + EXPECT_TRUEORSKIP( + (test, float>(GetParam(), onemkl::uplo::upper, 30, alpha, 1))); } TEST_P(HprTests, ComplexDoublePrecision) { double alpha(2.0); - EXPECT_TRUE( + EXPECT_TRUEORSKIP( (test, double>(GetParam(), onemkl::uplo::lower, 30, alpha, 2))); - EXPECT_TRUE( + EXPECT_TRUEORSKIP( (test, double>(GetParam(), onemkl::uplo::upper, 30, alpha, 2))); - EXPECT_TRUE( + EXPECT_TRUEORSKIP( (test, double>(GetParam(), onemkl::uplo::lower, 30, alpha, -2))); - EXPECT_TRUE( + EXPECT_TRUEORSKIP( (test, double>(GetParam(), onemkl::uplo::upper, 30, alpha, -2))); - EXPECT_TRUE( + EXPECT_TRUEORSKIP( (test, double>(GetParam(), onemkl::uplo::lower, 30, alpha, 1))); - EXPECT_TRUE( + EXPECT_TRUEORSKIP( (test, double>(GetParam(), onemkl::uplo::upper, 30, alpha, 
1))); } diff --git a/tests/unit_tests/blas/level2/hpr2.cpp b/tests/unit_tests/blas/level2/hpr2.cpp index 754407bb6..9aab3cc6c 100644 --- a/tests/unit_tests/blas/level2/hpr2.cpp +++ b/tests/unit_tests/blas/level2/hpr2.cpp @@ -43,7 +43,7 @@ extern std::vector devices; namespace { template -bool test(const device &dev, onemkl::uplo upper_lower, int n, fp alpha, int incx, int incy) { +int test(const device &dev, onemkl::uplo upper_lower, int n, fp alpha, int incx, int incy) { // Prepare data. vector x, y, A_ref, A; rand_vector(x, n, incx); @@ -95,6 +95,14 @@ bool test(const device &dev, onemkl::uplo upper_lower, int n, fp alpha, int incx << "OpenCL status: " << e.get_cl_code() << std::endl; } + catch (const onemkl::backend_unsupported_exception &e) { + return test_skipped; + } + + catch (const std::runtime_error &error) { + std::cout << "Error raised during execution of HPR2:\n" << error.what() << std::endl; + } + // Compare the results of reference implementation and DPC++ implementation. bool good; { @@ -102,28 +110,32 @@ bool test(const device &dev, onemkl::uplo upper_lower, int n, fp alpha, int incx good = check_equal_matrix(A_accessor, A_ref, n, n, n, n, std::cout); } - return good; + return (int)good; } class Hpr2Tests : public ::testing::TestWithParam {}; TEST_P(Hpr2Tests, ComplexSinglePrecision) { std::complex alpha(2.0, -0.5); - EXPECT_TRUE(test>(GetParam(), onemkl::uplo::lower, 30, alpha, 2, 3)); - EXPECT_TRUE(test>(GetParam(), onemkl::uplo::upper, 30, alpha, 2, 3)); - EXPECT_TRUE(test>(GetParam(), onemkl::uplo::lower, 30, alpha, -2, -3)); - EXPECT_TRUE(test>(GetParam(), onemkl::uplo::upper, 30, alpha, -2, -3)); - EXPECT_TRUE(test>(GetParam(), onemkl::uplo::lower, 30, alpha, 1, 1)); - EXPECT_TRUE(test>(GetParam(), onemkl::uplo::upper, 30, alpha, 1, 1)); + EXPECT_TRUEORSKIP(test>(GetParam(), onemkl::uplo::lower, 30, alpha, 2, 3)); + EXPECT_TRUEORSKIP(test>(GetParam(), onemkl::uplo::upper, 30, alpha, 2, 3)); + EXPECT_TRUEORSKIP( + test>(GetParam(), 
onemkl::uplo::lower, 30, alpha, -2, -3)); + EXPECT_TRUEORSKIP( + test>(GetParam(), onemkl::uplo::upper, 30, alpha, -2, -3)); + EXPECT_TRUEORSKIP(test>(GetParam(), onemkl::uplo::lower, 30, alpha, 1, 1)); + EXPECT_TRUEORSKIP(test>(GetParam(), onemkl::uplo::upper, 30, alpha, 1, 1)); } TEST_P(Hpr2Tests, ComplexDoublePrecision) { std::complex alpha(2.0, -0.5); - EXPECT_TRUE(test>(GetParam(), onemkl::uplo::lower, 30, alpha, 2, 3)); - EXPECT_TRUE(test>(GetParam(), onemkl::uplo::upper, 30, alpha, 2, 3)); - EXPECT_TRUE(test>(GetParam(), onemkl::uplo::lower, 30, alpha, -2, -3)); - EXPECT_TRUE(test>(GetParam(), onemkl::uplo::upper, 30, alpha, -2, -3)); - EXPECT_TRUE(test>(GetParam(), onemkl::uplo::lower, 30, alpha, 1, 1)); - EXPECT_TRUE(test>(GetParam(), onemkl::uplo::upper, 30, alpha, 1, 1)); + EXPECT_TRUEORSKIP(test>(GetParam(), onemkl::uplo::lower, 30, alpha, 2, 3)); + EXPECT_TRUEORSKIP(test>(GetParam(), onemkl::uplo::upper, 30, alpha, 2, 3)); + EXPECT_TRUEORSKIP( + test>(GetParam(), onemkl::uplo::lower, 30, alpha, -2, -3)); + EXPECT_TRUEORSKIP( + test>(GetParam(), onemkl::uplo::upper, 30, alpha, -2, -3)); + EXPECT_TRUEORSKIP(test>(GetParam(), onemkl::uplo::lower, 30, alpha, 1, 1)); + EXPECT_TRUEORSKIP(test>(GetParam(), onemkl::uplo::upper, 30, alpha, 1, 1)); } INSTANTIATE_TEST_SUITE_P(Hpr2TestSuite, Hpr2Tests, ::testing::ValuesIn(devices), diff --git a/tests/unit_tests/blas/level2/hpr2_usm.cpp b/tests/unit_tests/blas/level2/hpr2_usm.cpp new file mode 100644 index 000000000..4b8b65fe5 --- /dev/null +++ b/tests/unit_tests/blas/level2/hpr2_usm.cpp @@ -0,0 +1,145 @@ +/******************************************************************************* +* Copyright 2020 Intel Corporation +* +* Licensed under the Apache License, Version 2.0 (the "License"); +* you may not use this file except in compliance with the License. 
+* You may obtain a copy of the License at +* +* http://www.apache.org/licenses/LICENSE-2.0 +* +* Unless required by applicable law or agreed to in writing, +* software distributed under the License is distributed on an "AS IS" BASIS, +* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +* See the License for the specific language governing permissions +* and limitations under the License. +* +* +* SPDX-License-Identifier: Apache-2.0 +*******************************************************************************/ + +#include +#include +#include +#include +#include +#include + +#include +#include "cblas.h" +#include "onemkl/detail/config.hpp" +#include "onemkl/onemkl.hpp" +#include "onemkl_blas_helper.hpp" +#include "reference_blas_templates.hpp" +#include "test_common.hpp" +#include "test_helper.hpp" + +#include + +using namespace cl::sycl; +using std::vector; + +extern std::vector devices; + +namespace { + +template +int test(const device &dev, onemkl::uplo upper_lower, int n, fp alpha, int incx, int incy) { + // Catch asynchronous exceptions. + auto exception_handler = [](exception_list exceptions) { + for (std::exception_ptr const &e : exceptions) { + try { + std::rethrow_exception(e); + } + catch (exception const &e) { + std::cout << "Caught asynchronous SYCL exception during HPR2:\n" + << e.what() << std::endl + << "OpenCL status: " << e.get_cl_code() << std::endl; + } + } + }; + + queue main_queue(dev, exception_handler); + context cxt = main_queue.get_context(); + event done; + std::vector dependencies; + + // Prepare data. + auto ua = usm_allocator(cxt, dev); + vector x(ua), y(ua), A(ua); + rand_vector(x, n, incx); + rand_vector(y, n, incy); + rand_matrix(A, onemkl::transpose::nontrans, n, n, n); + + auto A_ref = A; + + // Call Reference HPR2. 
+ const int n_ref = n, incx_ref = incx, incy_ref = incy; + using fp_ref = typename ref_type_info::type; + + ::hpr2(convert_to_cblas_uplo(upper_lower), &n_ref, (fp_ref *)&alpha, (fp_ref *)x.data(), + &incx_ref, (fp_ref *)y.data(), &incy_ref, (fp_ref *)A_ref.data()); + + // Call DPC++ HPR2. + + try { +#ifdef CALL_RT_API + done = onemkl::blas::hpr2(main_queue, upper_lower, n, alpha, x.data(), incx, y.data(), incy, + A.data(), dependencies); + done.wait(); +#else + TEST_RUN_CT(main_queue, onemkl::blas::hpr2, + (main_queue, upper_lower, n, alpha, x.data(), incx, y.data(), incy, A.data(), + dependencies)); + main_queue.wait(); +#endif + } + catch (exception const &e) { + std::cout << "Caught synchronous SYCL exception during HPR2:\n" + << e.what() << std::endl + << "OpenCL status: " << e.get_cl_code() << std::endl; + } + + catch (const onemkl::backend_unsupported_exception &e) { + return test_skipped; + } + + catch (const std::runtime_error &error) { + std::cout << "Error raised during execution of HPR2:\n" << error.what() << std::endl; + } + + // Compare the results of reference implementation and DPC++ implementation. 
+ + bool good = check_equal_matrix(A, A_ref, n, n, n, n, std::cout); + + return (int)good; +} + +class Hpr2UsmTests : public ::testing::TestWithParam {}; + +TEST_P(Hpr2UsmTests, ComplexSinglePrecision) { + std::complex alpha(2.0, -0.5); + EXPECT_TRUEORSKIP(test>(GetParam(), onemkl::uplo::lower, 30, alpha, 2, 3)); + EXPECT_TRUEORSKIP(test>(GetParam(), onemkl::uplo::upper, 30, alpha, 2, 3)); + EXPECT_TRUEORSKIP( + test>(GetParam(), onemkl::uplo::lower, 30, alpha, -2, -3)); + EXPECT_TRUEORSKIP( + test>(GetParam(), onemkl::uplo::upper, 30, alpha, -2, -3)); + EXPECT_TRUEORSKIP(test>(GetParam(), onemkl::uplo::lower, 30, alpha, 1, 1)); + EXPECT_TRUEORSKIP(test>(GetParam(), onemkl::uplo::upper, 30, alpha, 1, 1)); +} +TEST_P(Hpr2UsmTests, ComplexDoublePrecision) { + std::complex alpha(2.0, -0.5); + EXPECT_TRUEORSKIP(test>(GetParam(), onemkl::uplo::lower, 30, alpha, 2, 3)); + EXPECT_TRUEORSKIP(test>(GetParam(), onemkl::uplo::upper, 30, alpha, 2, 3)); + EXPECT_TRUEORSKIP( + test>(GetParam(), onemkl::uplo::lower, 30, alpha, -2, -3)); + EXPECT_TRUEORSKIP( + test>(GetParam(), onemkl::uplo::upper, 30, alpha, -2, -3)); + EXPECT_TRUEORSKIP(test>(GetParam(), onemkl::uplo::lower, 30, alpha, 1, 1)); + EXPECT_TRUEORSKIP(test>(GetParam(), onemkl::uplo::upper, 30, alpha, 1, 1)); +} + +INSTANTIATE_TEST_SUITE_P(Hpr2UsmTestSuite, Hpr2UsmTests, ::testing::ValuesIn(devices), + ::DeviceNamePrint()); + +} // anonymous namespace diff --git a/tests/unit_tests/blas/level2/hpr_usm.cpp b/tests/unit_tests/blas/level2/hpr_usm.cpp new file mode 100644 index 000000000..71c3b3d74 --- /dev/null +++ b/tests/unit_tests/blas/level2/hpr_usm.cpp @@ -0,0 +1,153 @@ +/******************************************************************************* +* Copyright 2020 Intel Corporation +* +* Licensed under the Apache License, Version 2.0 (the "License"); +* you may not use this file except in compliance with the License. 
+* You may obtain a copy of the License at +* +* http://www.apache.org/licenses/LICENSE-2.0 +* +* Unless required by applicable law or agreed to in writing, +* software distributed under the License is distributed on an "AS IS" BASIS, +* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +* See the License for the specific language governing permissions +* and limitations under the License. +* +* +* SPDX-License-Identifier: Apache-2.0 +*******************************************************************************/ + +#include +#include +#include +#include +#include +#include + +#include +#include "cblas.h" +#include "onemkl/detail/config.hpp" +#include "onemkl/onemkl.hpp" +#include "onemkl_blas_helper.hpp" +#include "reference_blas_templates.hpp" +#include "test_common.hpp" +#include "test_helper.hpp" + +#include + +using namespace cl::sycl; +using std::vector; + +extern std::vector devices; + +namespace { + +template +int test(const device &dev, onemkl::uplo upper_lower, int n, fp_scalar alpha, int incx) { + // Catch asynchronous exceptions. + auto exception_handler = [](exception_list exceptions) { + for (std::exception_ptr const &e : exceptions) { + try { + std::rethrow_exception(e); + } + catch (exception const &e) { + std::cout << "Caught asynchronous SYCL exception during HPR:\n" + << e.what() << std::endl + << "OpenCL status: " << e.get_cl_code() << std::endl; + } + } + }; + + queue main_queue(dev, exception_handler); + context cxt = main_queue.get_context(); + event done; + std::vector dependencies; + + // Prepare data. + auto ua = usm_allocator(cxt, dev); + vector x(ua), A(ua); + rand_vector(x, n, incx); + rand_matrix(A, onemkl::transpose::nontrans, n, n, n); + + auto A_ref = A; + + // Call Reference HPR. 
+ const int n_ref = n, incx_ref = incx; + using fp_ref = typename ref_type_info::type; + using fp_scalar_mkl = typename ref_type_info::type; + + ::hpr(convert_to_cblas_uplo(upper_lower), &n_ref, (fp_scalar_mkl *)&alpha, (fp_ref *)x.data(), + &incx_ref, (fp_ref *)A_ref.data()); + + // Call DPC++ HPR. + + try { +#ifdef CALL_RT_API + done = onemkl::blas::hpr(main_queue, upper_lower, n, alpha, x.data(), incx, A.data(), + dependencies); + done.wait(); +#else + TEST_RUN_CT(main_queue, onemkl::blas::hpr, + (main_queue, upper_lower, n, alpha, x.data(), incx, A.data(), dependencies)); + main_queue.wait(); +#endif + } + catch (exception const &e) { + std::cout << "Caught synchronous SYCL exception during HPR:\n" + << e.what() << std::endl + << "OpenCL status: " << e.get_cl_code() << std::endl; + } + + catch (const onemkl::backend_unsupported_exception &e) { + return test_skipped; + } + + catch (const std::runtime_error &error) { + std::cout << "Error raised during execution of HPR:\n" << error.what() << std::endl; + } + + // Compare the results of reference implementation and DPC++ implementation. 
+ + bool good = check_equal_matrix(A, A_ref, n, n, n, n, std::cout); + + return (int)good; +} + +class HprUsmTests : public ::testing::TestWithParam {}; + +TEST_P(HprUsmTests, ComplexSinglePrecision) { + float alpha(2.0); + EXPECT_TRUEORSKIP( + (test, float>(GetParam(), onemkl::uplo::lower, 30, alpha, 2))); + EXPECT_TRUEORSKIP( + (test, float>(GetParam(), onemkl::uplo::upper, 30, alpha, 2))); + EXPECT_TRUEORSKIP( + (test, float>(GetParam(), onemkl::uplo::lower, 30, alpha, -2))); + EXPECT_TRUEORSKIP( + (test, float>(GetParam(), onemkl::uplo::upper, 30, alpha, -2))); + EXPECT_TRUEORSKIP( + (test, float>(GetParam(), onemkl::uplo::lower, 30, alpha, 1))); + EXPECT_TRUEORSKIP( + (test, float>(GetParam(), onemkl::uplo::upper, 30, alpha, 1))); +} + +TEST_P(HprUsmTests, ComplexDoublePrecision) { + double alpha(2.0); + EXPECT_TRUEORSKIP( + (test, double>(GetParam(), onemkl::uplo::lower, 30, alpha, 2))); + EXPECT_TRUEORSKIP( + (test, double>(GetParam(), onemkl::uplo::upper, 30, alpha, 2))); + EXPECT_TRUEORSKIP( + (test, double>(GetParam(), onemkl::uplo::lower, 30, alpha, -2))); + EXPECT_TRUEORSKIP( + (test, double>(GetParam(), onemkl::uplo::upper, 30, alpha, -2))); + EXPECT_TRUEORSKIP( + (test, double>(GetParam(), onemkl::uplo::lower, 30, alpha, 1))); + EXPECT_TRUEORSKIP( + (test, double>(GetParam(), onemkl::uplo::upper, 30, alpha, 1))); +} + +INSTANTIATE_TEST_SUITE_P(HprUsmTestSuite, HprUsmTests, ::testing::ValuesIn(devices), + ::DeviceNamePrint()); + +} // anonymous namespace diff --git a/tests/unit_tests/blas/level2/sbmv.cpp b/tests/unit_tests/blas/level2/sbmv.cpp index d79e555f1..fdcf0da4c 100644 --- a/tests/unit_tests/blas/level2/sbmv.cpp +++ b/tests/unit_tests/blas/level2/sbmv.cpp @@ -43,8 +43,8 @@ extern std::vector devices; namespace { template -bool test(const device &dev, onemkl::uplo upper_lower, int n, int k, fp alpha, fp beta, int incx, - int incy, int lda) { +int test(const device &dev, onemkl::uplo upper_lower, int n, int k, fp alpha, fp beta, int incx, + int 
incy, int lda) { // Prepare data. vector x, y, y_ref, A; rand_vector(x, n, incx); @@ -98,6 +98,14 @@ bool test(const device &dev, onemkl::uplo upper_lower, int n, int k, fp alpha, f << "OpenCL status: " << e.get_cl_code() << std::endl; } + catch (const onemkl::backend_unsupported_exception &e) { + return test_skipped; + } + + catch (const std::runtime_error &error) { + std::cout << "Error raised during execution of SBMV:\n" << error.what() << std::endl; + } + // Compare the results of reference implementation and DPC++ implementation. bool good; { @@ -105,7 +113,7 @@ bool test(const device &dev, onemkl::uplo upper_lower, int n, int k, fp alpha, f good = check_equal_vector(y_accessor, y_ref, n, incy, n, std::cout); } - return good; + return (int)good; } class SbmvTests : public ::testing::TestWithParam {}; @@ -113,22 +121,24 @@ class SbmvTests : public ::testing::TestWithParam {}; TEST_P(SbmvTests, RealSinglePrecision) { float alpha(2.0); float beta(3.0); - EXPECT_TRUE(test(GetParam(), onemkl::uplo::lower, 30, 5, alpha, beta, 2, 3, 42)); - EXPECT_TRUE(test(GetParam(), onemkl::uplo::upper, 30, 5, alpha, beta, 2, 3, 42)); - EXPECT_TRUE(test(GetParam(), onemkl::uplo::lower, 30, 5, alpha, beta, -2, -3, 42)); - EXPECT_TRUE(test(GetParam(), onemkl::uplo::upper, 30, 5, alpha, beta, -2, -3, 42)); - EXPECT_TRUE(test(GetParam(), onemkl::uplo::lower, 30, 5, alpha, beta, 1, 1, 42)); - EXPECT_TRUE(test(GetParam(), onemkl::uplo::upper, 30, 5, alpha, beta, 1, 1, 42)); + EXPECT_TRUEORSKIP(test(GetParam(), onemkl::uplo::lower, 30, 5, alpha, beta, 2, 3, 42)); + EXPECT_TRUEORSKIP(test(GetParam(), onemkl::uplo::upper, 30, 5, alpha, beta, 2, 3, 42)); + EXPECT_TRUEORSKIP(test(GetParam(), onemkl::uplo::lower, 30, 5, alpha, beta, -2, -3, 42)); + EXPECT_TRUEORSKIP(test(GetParam(), onemkl::uplo::upper, 30, 5, alpha, beta, -2, -3, 42)); + EXPECT_TRUEORSKIP(test(GetParam(), onemkl::uplo::lower, 30, 5, alpha, beta, 1, 1, 42)); + EXPECT_TRUEORSKIP(test(GetParam(), onemkl::uplo::upper, 30, 5, 
alpha, beta, 1, 1, 42)); } TEST_P(SbmvTests, RealDoublePrecision) { double alpha(2.0); double beta(3.0); - EXPECT_TRUE(test(GetParam(), onemkl::uplo::lower, 30, 5, alpha, beta, 2, 3, 42)); - EXPECT_TRUE(test(GetParam(), onemkl::uplo::upper, 30, 5, alpha, beta, 2, 3, 42)); - EXPECT_TRUE(test(GetParam(), onemkl::uplo::lower, 30, 5, alpha, beta, -2, -3, 42)); - EXPECT_TRUE(test(GetParam(), onemkl::uplo::upper, 30, 5, alpha, beta, -2, -3, 42)); - EXPECT_TRUE(test(GetParam(), onemkl::uplo::lower, 30, 5, alpha, beta, 1, 1, 42)); - EXPECT_TRUE(test(GetParam(), onemkl::uplo::upper, 30, 5, alpha, beta, 1, 1, 42)); + EXPECT_TRUEORSKIP(test(GetParam(), onemkl::uplo::lower, 30, 5, alpha, beta, 2, 3, 42)); + EXPECT_TRUEORSKIP(test(GetParam(), onemkl::uplo::upper, 30, 5, alpha, beta, 2, 3, 42)); + EXPECT_TRUEORSKIP( + test(GetParam(), onemkl::uplo::lower, 30, 5, alpha, beta, -2, -3, 42)); + EXPECT_TRUEORSKIP( + test(GetParam(), onemkl::uplo::upper, 30, 5, alpha, beta, -2, -3, 42)); + EXPECT_TRUEORSKIP(test(GetParam(), onemkl::uplo::lower, 30, 5, alpha, beta, 1, 1, 42)); + EXPECT_TRUEORSKIP(test(GetParam(), onemkl::uplo::upper, 30, 5, alpha, beta, 1, 1, 42)); } INSTANTIATE_TEST_SUITE_P(SbmvTestSuite, SbmvTests, ::testing::ValuesIn(devices), diff --git a/tests/unit_tests/blas/level2/sbmv_usm.cpp b/tests/unit_tests/blas/level2/sbmv_usm.cpp new file mode 100644 index 000000000..63921f870 --- /dev/null +++ b/tests/unit_tests/blas/level2/sbmv_usm.cpp @@ -0,0 +1,148 @@ +/******************************************************************************* +* Copyright 2020 Intel Corporation +* +* Licensed under the Apache License, Version 2.0 (the "License"); +* you may not use this file except in compliance with the License. 
+* You may obtain a copy of the License at +* +* http://www.apache.org/licenses/LICENSE-2.0 +* +* Unless required by applicable law or agreed to in writing, +* software distributed under the License is distributed on an "AS IS" BASIS, +* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +* See the License for the specific language governing permissions +* and limitations under the License. +* +* +* SPDX-License-Identifier: Apache-2.0 +*******************************************************************************/ + +#include +#include +#include +#include +#include +#include + +#include +#include "cblas.h" +#include "onemkl/detail/config.hpp" +#include "onemkl/onemkl.hpp" +#include "onemkl_blas_helper.hpp" +#include "reference_blas_templates.hpp" +#include "test_common.hpp" +#include "test_helper.hpp" + +#include + +using namespace cl::sycl; +using std::vector; + +extern std::vector devices; + +namespace { + +template +int test(const device &dev, onemkl::uplo upper_lower, int n, int k, fp alpha, fp beta, int incx, + int incy, int lda) { + // Catch asynchronous exceptions. + auto exception_handler = [](exception_list exceptions) { + for (std::exception_ptr const &e : exceptions) { + try { + std::rethrow_exception(e); + } + catch (exception const &e) { + std::cout << "Caught asynchronous SYCL exception during SBMV:\n" + << e.what() << std::endl + << "OpenCL status: " << e.get_cl_code() << std::endl; + } + } + }; + + queue main_queue(dev, exception_handler); + context cxt = main_queue.get_context(); + event done; + std::vector dependencies; + + // Prepare data. + auto ua = usm_allocator(cxt, dev); + vector x(ua), y(ua), A(ua); + rand_vector(x, n, incx); + rand_vector(y, n, incy); + rand_matrix(A, onemkl::transpose::nontrans, n, n, lda); + + auto y_ref = y; + + // Call Reference SBMV. 
+ const int n_ref = n, incx_ref = incx, incy_ref = incy, lda_ref = lda; + const int k_ref = k; + using fp_ref = typename ref_type_info::type; + + ::sbmv(convert_to_cblas_uplo(upper_lower), &n_ref, &k_ref, (fp_ref *)&alpha, (fp_ref *)A.data(), + &lda_ref, (fp_ref *)x.data(), &incx_ref, (fp_ref *)&beta, (fp_ref *)y_ref.data(), + &incy_ref); + + // Call DPC++ SBMV. + + try { +#ifdef CALL_RT_API + done = onemkl::blas::sbmv(main_queue, upper_lower, n, k, alpha, A.data(), lda, x.data(), + incx, beta, y.data(), incy, dependencies); + done.wait(); +#else + TEST_RUN_CT(main_queue, onemkl::blas::sbmv, + (main_queue, upper_lower, n, k, alpha, A.data(), lda, x.data(), incx, beta, + y.data(), incy, dependencies)); + main_queue.wait(); +#endif + } + catch (exception const &e) { + std::cout << "Caught synchronous SYCL exception during SBMV:\n" + << e.what() << std::endl + << "OpenCL status: " << e.get_cl_code() << std::endl; + } + + catch (const onemkl::backend_unsupported_exception &e) { + return test_skipped; + } + + catch (const std::runtime_error &error) { + std::cout << "Error raised during execution of SBMV:\n" << error.what() << std::endl; + } + + // Compare the results of reference implementation and DPC++ implementation. 
+ + bool good = check_equal_vector(y, y_ref, n, incy, n, std::cout); + + return (int)good; +} + +class SbmvUsmTests : public ::testing::TestWithParam {}; + +TEST_P(SbmvUsmTests, RealSinglePrecision) { + float alpha(2.0); + float beta(3.0); + EXPECT_TRUEORSKIP(test(GetParam(), onemkl::uplo::lower, 30, 5, alpha, beta, 2, 3, 42)); + EXPECT_TRUEORSKIP(test(GetParam(), onemkl::uplo::upper, 30, 5, alpha, beta, 2, 3, 42)); + EXPECT_TRUEORSKIP(test(GetParam(), onemkl::uplo::lower, 30, 5, alpha, beta, -2, -3, 42)); + EXPECT_TRUEORSKIP(test(GetParam(), onemkl::uplo::upper, 30, 5, alpha, beta, -2, -3, 42)); + EXPECT_TRUEORSKIP(test(GetParam(), onemkl::uplo::lower, 30, 5, alpha, beta, 1, 1, 42)); + EXPECT_TRUEORSKIP(test(GetParam(), onemkl::uplo::upper, 30, 5, alpha, beta, 1, 1, 42)); +} +TEST_P(SbmvUsmTests, RealDoublePrecision) { + double alpha(2.0); + double beta(3.0); + EXPECT_TRUEORSKIP(test(GetParam(), onemkl::uplo::lower, 30, 5, alpha, beta, 2, 3, 42)); + EXPECT_TRUEORSKIP(test(GetParam(), onemkl::uplo::upper, 30, 5, alpha, beta, 2, 3, 42)); + EXPECT_TRUEORSKIP( + test(GetParam(), onemkl::uplo::lower, 30, 5, alpha, beta, -2, -3, 42)); + EXPECT_TRUEORSKIP( + test(GetParam(), onemkl::uplo::upper, 30, 5, alpha, beta, -2, -3, 42)); + EXPECT_TRUEORSKIP(test(GetParam(), onemkl::uplo::lower, 30, 5, alpha, beta, 1, 1, 42)); + EXPECT_TRUEORSKIP(test(GetParam(), onemkl::uplo::upper, 30, 5, alpha, beta, 1, 1, 42)); +} + +INSTANTIATE_TEST_SUITE_P(SbmvUsmTestSuite, SbmvUsmTests, ::testing::ValuesIn(devices), + ::DeviceNamePrint()); + +} // anonymous namespace diff --git a/tests/unit_tests/blas/level2/spmv.cpp b/tests/unit_tests/blas/level2/spmv.cpp index 4f1de876b..6d2831ce3 100644 --- a/tests/unit_tests/blas/level2/spmv.cpp +++ b/tests/unit_tests/blas/level2/spmv.cpp @@ -43,8 +43,8 @@ extern std::vector devices; namespace { template -bool test(const device &dev, onemkl::uplo upper_lower, int n, fp alpha, fp beta, int incx, - int incy) { +int test(const device &dev, onemkl::uplo 
upper_lower, int n, fp alpha, fp beta, int incx, + int incy) { // Prepare data. vector x, y, y_ref, A; rand_vector(x, n, incx); @@ -97,6 +97,14 @@ bool test(const device &dev, onemkl::uplo upper_lower, int n, fp alpha, fp beta, << "OpenCL status: " << e.get_cl_code() << std::endl; } + catch (const onemkl::backend_unsupported_exception &e) { + return test_skipped; + } + + catch (const std::runtime_error &error) { + std::cout << "Error raised during execution of SPMV:\n" << error.what() << std::endl; + } + // Compare the results of reference implementation and DPC++ implementation. bool good; { @@ -104,7 +112,7 @@ bool test(const device &dev, onemkl::uplo upper_lower, int n, fp alpha, fp beta, good = check_equal_vector(y_accessor, y_ref, n, incy, n, std::cout); } - return good; + return (int)good; } class SpmvTests : public ::testing::TestWithParam {}; @@ -112,22 +120,22 @@ class SpmvTests : public ::testing::TestWithParam {}; TEST_P(SpmvTests, RealSinglePrecision) { float alpha(2.0); float beta(3.0); - EXPECT_TRUE(test(GetParam(), onemkl::uplo::lower, 30, alpha, beta, 2, 3)); - EXPECT_TRUE(test(GetParam(), onemkl::uplo::upper, 30, alpha, beta, 2, 3)); - EXPECT_TRUE(test(GetParam(), onemkl::uplo::lower, 30, alpha, beta, -2, -3)); - EXPECT_TRUE(test(GetParam(), onemkl::uplo::upper, 30, alpha, beta, -2, -3)); - EXPECT_TRUE(test(GetParam(), onemkl::uplo::lower, 30, alpha, beta, 1, 1)); - EXPECT_TRUE(test(GetParam(), onemkl::uplo::upper, 30, alpha, beta, 1, 1)); + EXPECT_TRUEORSKIP(test(GetParam(), onemkl::uplo::lower, 30, alpha, beta, 2, 3)); + EXPECT_TRUEORSKIP(test(GetParam(), onemkl::uplo::upper, 30, alpha, beta, 2, 3)); + EXPECT_TRUEORSKIP(test(GetParam(), onemkl::uplo::lower, 30, alpha, beta, -2, -3)); + EXPECT_TRUEORSKIP(test(GetParam(), onemkl::uplo::upper, 30, alpha, beta, -2, -3)); + EXPECT_TRUEORSKIP(test(GetParam(), onemkl::uplo::lower, 30, alpha, beta, 1, 1)); + EXPECT_TRUEORSKIP(test(GetParam(), onemkl::uplo::upper, 30, alpha, beta, 1, 1)); } 
TEST_P(SpmvTests, RealDoublePrecision) { double alpha(2.0); double beta(3.0); - EXPECT_TRUE(test(GetParam(), onemkl::uplo::lower, 30, alpha, beta, 2, 3)); - EXPECT_TRUE(test(GetParam(), onemkl::uplo::upper, 30, alpha, beta, 2, 3)); - EXPECT_TRUE(test(GetParam(), onemkl::uplo::lower, 30, alpha, beta, -2, -3)); - EXPECT_TRUE(test(GetParam(), onemkl::uplo::upper, 30, alpha, beta, -2, -3)); - EXPECT_TRUE(test(GetParam(), onemkl::uplo::lower, 30, alpha, beta, 1, 1)); - EXPECT_TRUE(test(GetParam(), onemkl::uplo::upper, 30, alpha, beta, 1, 1)); + EXPECT_TRUEORSKIP(test(GetParam(), onemkl::uplo::lower, 30, alpha, beta, 2, 3)); + EXPECT_TRUEORSKIP(test(GetParam(), onemkl::uplo::upper, 30, alpha, beta, 2, 3)); + EXPECT_TRUEORSKIP(test(GetParam(), onemkl::uplo::lower, 30, alpha, beta, -2, -3)); + EXPECT_TRUEORSKIP(test(GetParam(), onemkl::uplo::upper, 30, alpha, beta, -2, -3)); + EXPECT_TRUEORSKIP(test(GetParam(), onemkl::uplo::lower, 30, alpha, beta, 1, 1)); + EXPECT_TRUEORSKIP(test(GetParam(), onemkl::uplo::upper, 30, alpha, beta, 1, 1)); } INSTANTIATE_TEST_SUITE_P(SpmvTestSuite, SpmvTests, ::testing::ValuesIn(devices), diff --git a/tests/unit_tests/blas/level2/spmv_usm.cpp b/tests/unit_tests/blas/level2/spmv_usm.cpp new file mode 100644 index 000000000..95866fead --- /dev/null +++ b/tests/unit_tests/blas/level2/spmv_usm.cpp @@ -0,0 +1,144 @@ +/******************************************************************************* +* Copyright 2020 Intel Corporation +* +* Licensed under the Apache License, Version 2.0 (the "License"); +* you may not use this file except in compliance with the License. +* You may obtain a copy of the License at +* +* http://www.apache.org/licenses/LICENSE-2.0 +* +* Unless required by applicable law or agreed to in writing, +* software distributed under the License is distributed on an "AS IS" BASIS, +* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+* See the License for the specific language governing permissions +* and limitations under the License. +* +* +* SPDX-License-Identifier: Apache-2.0 +*******************************************************************************/ + +#include +#include +#include +#include +#include +#include + +#include +#include "cblas.h" +#include "onemkl/detail/config.hpp" +#include "onemkl/onemkl.hpp" +#include "onemkl_blas_helper.hpp" +#include "reference_blas_templates.hpp" +#include "test_common.hpp" +#include "test_helper.hpp" + +#include + +using namespace cl::sycl; +using std::vector; + +extern std::vector devices; + +namespace { + +template +int test(const device &dev, onemkl::uplo upper_lower, int n, fp alpha, fp beta, int incx, + int incy) { + // Catch asynchronous exceptions. + auto exception_handler = [](exception_list exceptions) { + for (std::exception_ptr const &e : exceptions) { + try { + std::rethrow_exception(e); + } + catch (exception const &e) { + std::cout << "Caught asynchronous SYCL exception during SPMV:\n" + << e.what() << std::endl + << "OpenCL status: " << e.get_cl_code() << std::endl; + } + } + }; + + queue main_queue(dev, exception_handler); + context cxt = main_queue.get_context(); + event done; + std::vector dependencies; + + // Prepare data. + auto ua = usm_allocator(cxt, dev); + vector x(ua), y(ua), A(ua); + rand_vector(x, n, incx); + rand_vector(y, n, incy); + rand_matrix(A, onemkl::transpose::nontrans, n, n, n); + + auto y_ref = y; + + // Call Reference SPMV. + const int n_ref = n, incx_ref = incx, incy_ref = incy; + using fp_ref = typename ref_type_info::type; + + ::spmv(convert_to_cblas_uplo(upper_lower), &n_ref, (fp_ref *)&alpha, (fp_ref *)A.data(), + (fp_ref *)x.data(), &incx_ref, (fp_ref *)&beta, (fp_ref *)y_ref.data(), &incy_ref); + + // Call DPC++ SPMV. 
+ + try { +#ifdef CALL_RT_API + done = onemkl::blas::spmv(main_queue, upper_lower, n, alpha, A.data(), x.data(), incx, beta, + y.data(), incy, dependencies); + done.wait(); +#else + TEST_RUN_CT(main_queue, onemkl::blas::spmv, + (main_queue, upper_lower, n, alpha, A.data(), x.data(), incx, beta, y.data(), + incy, dependencies)); + main_queue.wait(); +#endif + } + catch (exception const &e) { + std::cout << "Caught synchronous SYCL exception during SPMV:\n" + << e.what() << std::endl + << "OpenCL status: " << e.get_cl_code() << std::endl; + } + + catch (const onemkl::backend_unsupported_exception &e) { + return test_skipped; + } + + catch (const std::runtime_error &error) { + std::cout << "Error raised during execution of SPMV:\n" << error.what() << std::endl; + } + + // Compare the results of reference implementation and DPC++ implementation. + + bool good = check_equal_vector(y, y_ref, n, incy, n, std::cout); + + return (int)good; +} + +class SpmvUsmTests : public ::testing::TestWithParam {}; + +TEST_P(SpmvUsmTests, RealSinglePrecision) { + float alpha(2.0); + float beta(3.0); + EXPECT_TRUEORSKIP(test(GetParam(), onemkl::uplo::lower, 30, alpha, beta, 2, 3)); + EXPECT_TRUEORSKIP(test(GetParam(), onemkl::uplo::upper, 30, alpha, beta, 2, 3)); + EXPECT_TRUEORSKIP(test(GetParam(), onemkl::uplo::lower, 30, alpha, beta, -2, -3)); + EXPECT_TRUEORSKIP(test(GetParam(), onemkl::uplo::upper, 30, alpha, beta, -2, -3)); + EXPECT_TRUEORSKIP(test(GetParam(), onemkl::uplo::lower, 30, alpha, beta, 1, 1)); + EXPECT_TRUEORSKIP(test(GetParam(), onemkl::uplo::upper, 30, alpha, beta, 1, 1)); +} +TEST_P(SpmvUsmTests, RealDoublePrecision) { + double alpha(2.0); + double beta(3.0); + EXPECT_TRUEORSKIP(test(GetParam(), onemkl::uplo::lower, 30, alpha, beta, 2, 3)); + EXPECT_TRUEORSKIP(test(GetParam(), onemkl::uplo::upper, 30, alpha, beta, 2, 3)); + EXPECT_TRUEORSKIP(test(GetParam(), onemkl::uplo::lower, 30, alpha, beta, -2, -3)); + EXPECT_TRUEORSKIP(test(GetParam(), onemkl::uplo::upper, 30, 
alpha, beta, -2, -3)); + EXPECT_TRUEORSKIP(test(GetParam(), onemkl::uplo::lower, 30, alpha, beta, 1, 1)); + EXPECT_TRUEORSKIP(test(GetParam(), onemkl::uplo::upper, 30, alpha, beta, 1, 1)); +} + +INSTANTIATE_TEST_SUITE_P(SpmvUsmTestSuite, SpmvUsmTests, ::testing::ValuesIn(devices), + ::DeviceNamePrint()); + +} // anonymous namespace diff --git a/tests/unit_tests/blas/level2/spr.cpp b/tests/unit_tests/blas/level2/spr.cpp index b9c457898..522460211 100644 --- a/tests/unit_tests/blas/level2/spr.cpp +++ b/tests/unit_tests/blas/level2/spr.cpp @@ -43,7 +43,7 @@ extern std::vector devices; namespace { template -bool test(const device &dev, onemkl::uplo upper_lower, int n, fp alpha, int incx) { +int test(const device &dev, onemkl::uplo upper_lower, int n, fp alpha, int incx) { // Prepare data. vector x, A_ref, A; rand_vector(x, n, incx); @@ -92,6 +92,14 @@ bool test(const device &dev, onemkl::uplo upper_lower, int n, fp alpha, int incx << "OpenCL status: " << e.get_cl_code() << std::endl; } + catch (const onemkl::backend_unsupported_exception &e) { + return test_skipped; + } + + catch (const std::runtime_error &error) { + std::cout << "Error raised during execution of SPR:\n" << error.what() << std::endl; + } + // Compare the results of reference implementation and DPC++ implementation. 
bool good; { @@ -99,28 +107,28 @@ bool test(const device &dev, onemkl::uplo upper_lower, int n, fp alpha, int incx good = check_equal_matrix(A_accessor, A_ref, n, n, n, n, std::cout); } - return good; + return (int)good; } class SprTests : public ::testing::TestWithParam {}; TEST_P(SprTests, RealSinglePrecision) { float alpha(2.0); - EXPECT_TRUE(test(GetParam(), onemkl::uplo::lower, 30, alpha, 2)); - EXPECT_TRUE(test(GetParam(), onemkl::uplo::upper, 30, alpha, 2)); - EXPECT_TRUE(test(GetParam(), onemkl::uplo::lower, 30, alpha, -2)); - EXPECT_TRUE(test(GetParam(), onemkl::uplo::upper, 30, alpha, -2)); - EXPECT_TRUE(test(GetParam(), onemkl::uplo::lower, 30, alpha, 1)); - EXPECT_TRUE(test(GetParam(), onemkl::uplo::upper, 30, alpha, 1)); + EXPECT_TRUEORSKIP(test(GetParam(), onemkl::uplo::lower, 30, alpha, 2)); + EXPECT_TRUEORSKIP(test(GetParam(), onemkl::uplo::upper, 30, alpha, 2)); + EXPECT_TRUEORSKIP(test(GetParam(), onemkl::uplo::lower, 30, alpha, -2)); + EXPECT_TRUEORSKIP(test(GetParam(), onemkl::uplo::upper, 30, alpha, -2)); + EXPECT_TRUEORSKIP(test(GetParam(), onemkl::uplo::lower, 30, alpha, 1)); + EXPECT_TRUEORSKIP(test(GetParam(), onemkl::uplo::upper, 30, alpha, 1)); } TEST_P(SprTests, RealDoublePrecision) { double alpha(2.0); - EXPECT_TRUE(test(GetParam(), onemkl::uplo::lower, 30, alpha, 2)); - EXPECT_TRUE(test(GetParam(), onemkl::uplo::upper, 30, alpha, 2)); - EXPECT_TRUE(test(GetParam(), onemkl::uplo::lower, 30, alpha, -2)); - EXPECT_TRUE(test(GetParam(), onemkl::uplo::upper, 30, alpha, -2)); - EXPECT_TRUE(test(GetParam(), onemkl::uplo::lower, 30, alpha, 1)); - EXPECT_TRUE(test(GetParam(), onemkl::uplo::upper, 30, alpha, 1)); + EXPECT_TRUEORSKIP(test(GetParam(), onemkl::uplo::lower, 30, alpha, 2)); + EXPECT_TRUEORSKIP(test(GetParam(), onemkl::uplo::upper, 30, alpha, 2)); + EXPECT_TRUEORSKIP(test(GetParam(), onemkl::uplo::lower, 30, alpha, -2)); + EXPECT_TRUEORSKIP(test(GetParam(), onemkl::uplo::upper, 30, alpha, -2)); + EXPECT_TRUEORSKIP(test(GetParam(), 
onemkl::uplo::lower, 30, alpha, 1)); + EXPECT_TRUEORSKIP(test(GetParam(), onemkl::uplo::upper, 30, alpha, 1)); } INSTANTIATE_TEST_SUITE_P(SprTestSuite, SprTests, ::testing::ValuesIn(devices), ::DeviceNamePrint()); diff --git a/tests/unit_tests/blas/level2/spr2.cpp b/tests/unit_tests/blas/level2/spr2.cpp index ccd2977b0..e3ce90ad7 100644 --- a/tests/unit_tests/blas/level2/spr2.cpp +++ b/tests/unit_tests/blas/level2/spr2.cpp @@ -43,7 +43,7 @@ extern std::vector devices; namespace { template -bool test(const device &dev, onemkl::uplo upper_lower, int n, fp alpha, int incx, int incy) { +int test(const device &dev, onemkl::uplo upper_lower, int n, fp alpha, int incx, int incy) { // Prepare data. vector x, y, A_ref, A; rand_vector(x, n, incx); @@ -95,6 +95,14 @@ bool test(const device &dev, onemkl::uplo upper_lower, int n, fp alpha, int incx << "OpenCL status: " << e.get_cl_code() << std::endl; } + catch (const onemkl::backend_unsupported_exception &e) { + return test_skipped; + } + + catch (const std::runtime_error &error) { + std::cout << "Error raised during execution of SPR2:\n" << error.what() << std::endl; + } + // Compare the results of reference implementation and DPC++ implementation. 
bool good; { @@ -102,28 +110,28 @@ bool test(const device &dev, onemkl::uplo upper_lower, int n, fp alpha, int incx good = check_equal_matrix(A_accessor, A_ref, n, n, n, n, std::cout); } - return good; + return (int)good; } class Spr2Tests : public ::testing::TestWithParam {}; TEST_P(Spr2Tests, RealSinglePrecision) { float alpha(2.0); - EXPECT_TRUE(test(GetParam(), onemkl::uplo::lower, 30, alpha, 2, 3)); - EXPECT_TRUE(test(GetParam(), onemkl::uplo::upper, 30, alpha, 2, 3)); - EXPECT_TRUE(test(GetParam(), onemkl::uplo::lower, 30, alpha, -2, -3)); - EXPECT_TRUE(test(GetParam(), onemkl::uplo::upper, 30, alpha, -2, -3)); - EXPECT_TRUE(test(GetParam(), onemkl::uplo::lower, 30, alpha, 1, 1)); - EXPECT_TRUE(test(GetParam(), onemkl::uplo::upper, 30, alpha, 1, 1)); + EXPECT_TRUEORSKIP(test(GetParam(), onemkl::uplo::lower, 30, alpha, 2, 3)); + EXPECT_TRUEORSKIP(test(GetParam(), onemkl::uplo::upper, 30, alpha, 2, 3)); + EXPECT_TRUEORSKIP(test(GetParam(), onemkl::uplo::lower, 30, alpha, -2, -3)); + EXPECT_TRUEORSKIP(test(GetParam(), onemkl::uplo::upper, 30, alpha, -2, -3)); + EXPECT_TRUEORSKIP(test(GetParam(), onemkl::uplo::lower, 30, alpha, 1, 1)); + EXPECT_TRUEORSKIP(test(GetParam(), onemkl::uplo::upper, 30, alpha, 1, 1)); } TEST_P(Spr2Tests, RealDoublePrecision) { double alpha(2.0); - EXPECT_TRUE(test(GetParam(), onemkl::uplo::lower, 30, alpha, 2, 3)); - EXPECT_TRUE(test(GetParam(), onemkl::uplo::upper, 30, alpha, 2, 3)); - EXPECT_TRUE(test(GetParam(), onemkl::uplo::lower, 30, alpha, -2, -3)); - EXPECT_TRUE(test(GetParam(), onemkl::uplo::upper, 30, alpha, -2, -3)); - EXPECT_TRUE(test(GetParam(), onemkl::uplo::lower, 30, alpha, 1, 1)); - EXPECT_TRUE(test(GetParam(), onemkl::uplo::upper, 30, alpha, 1, 1)); + EXPECT_TRUEORSKIP(test(GetParam(), onemkl::uplo::lower, 30, alpha, 2, 3)); + EXPECT_TRUEORSKIP(test(GetParam(), onemkl::uplo::upper, 30, alpha, 2, 3)); + EXPECT_TRUEORSKIP(test(GetParam(), onemkl::uplo::lower, 30, alpha, -2, -3)); + EXPECT_TRUEORSKIP(test(GetParam(), 
onemkl::uplo::upper, 30, alpha, -2, -3)); + EXPECT_TRUEORSKIP(test(GetParam(), onemkl::uplo::lower, 30, alpha, 1, 1)); + EXPECT_TRUEORSKIP(test(GetParam(), onemkl::uplo::upper, 30, alpha, 1, 1)); } INSTANTIATE_TEST_SUITE_P(Spr2TestSuite, Spr2Tests, ::testing::ValuesIn(devices), diff --git a/tests/unit_tests/blas/level2/spr2_usm.cpp b/tests/unit_tests/blas/level2/spr2_usm.cpp new file mode 100644 index 000000000..b07fb7e1b --- /dev/null +++ b/tests/unit_tests/blas/level2/spr2_usm.cpp @@ -0,0 +1,141 @@ +/******************************************************************************* +* Copyright 2020 Intel Corporation +* +* Licensed under the Apache License, Version 2.0 (the "License"); +* you may not use this file except in compliance with the License. +* You may obtain a copy of the License at +* +* http://www.apache.org/licenses/LICENSE-2.0 +* +* Unless required by applicable law or agreed to in writing, +* software distributed under the License is distributed on an "AS IS" BASIS, +* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +* See the License for the specific language governing permissions +* and limitations under the License. +* +* +* SPDX-License-Identifier: Apache-2.0 +*******************************************************************************/ + +#include +#include +#include +#include +#include +#include + +#include +#include "cblas.h" +#include "onemkl/detail/config.hpp" +#include "onemkl/onemkl.hpp" +#include "onemkl_blas_helper.hpp" +#include "reference_blas_templates.hpp" +#include "test_common.hpp" +#include "test_helper.hpp" + +#include + +using namespace cl::sycl; +using std::vector; + +extern std::vector devices; + +namespace { + +template +int test(const device &dev, onemkl::uplo upper_lower, int n, fp alpha, int incx, int incy) { + // Catch asynchronous exceptions. 
+ auto exception_handler = [](exception_list exceptions) { + for (std::exception_ptr const &e : exceptions) { + try { + std::rethrow_exception(e); + } + catch (exception const &e) { + std::cout << "Caught asynchronous SYCL exception during SPR2:\n" + << e.what() << std::endl + << "OpenCL status: " << e.get_cl_code() << std::endl; + } + } + }; + + queue main_queue(dev, exception_handler); + context cxt = main_queue.get_context(); + event done; + std::vector dependencies; + + // Prepare data. + auto ua = usm_allocator(cxt, dev); + vector x(ua), y(ua), A(ua); + rand_vector(x, n, incx); + rand_vector(y, n, incy); + rand_matrix(A, onemkl::transpose::nontrans, n, n, n); + + auto A_ref = A; + + // Call Reference SPR2. + const int n_ref = n, incx_ref = incx, incy_ref = incy; + using fp_ref = typename ref_type_info::type; + + ::spr2(convert_to_cblas_uplo(upper_lower), &n_ref, (fp_ref *)&alpha, (fp_ref *)x.data(), + &incx_ref, (fp_ref *)y.data(), &incy_ref, (fp_ref *)A_ref.data()); + + // Call DPC++ SPR2. + + try { +#ifdef CALL_RT_API + done = onemkl::blas::spr2(main_queue, upper_lower, n, alpha, x.data(), incx, y.data(), incy, + A.data(), dependencies); + done.wait(); +#else + TEST_RUN_CT(main_queue, onemkl::blas::spr2, + (main_queue, upper_lower, n, alpha, x.data(), incx, y.data(), incy, A.data(), + dependencies)); + main_queue.wait(); +#endif + } + catch (exception const &e) { + std::cout << "Caught synchronous SYCL exception during SPR2:\n" + << e.what() << std::endl + << "OpenCL status: " << e.get_cl_code() << std::endl; + } + + catch (const onemkl::backend_unsupported_exception &e) { + return test_skipped; + } + + catch (const std::runtime_error &error) { + std::cout << "Error raised during execution of SPR2:\n" << error.what() << std::endl; + } + + // Compare the results of reference implementation and DPC++ implementation. 
+ + bool good = check_equal_matrix(A, A_ref, n, n, n, n, std::cout); + + return (int)good; +} + +class Spr2UsmTests : public ::testing::TestWithParam {}; + +TEST_P(Spr2UsmTests, RealSinglePrecision) { + float alpha(2.0); + EXPECT_TRUEORSKIP(test(GetParam(), onemkl::uplo::lower, 30, alpha, 2, 3)); + EXPECT_TRUEORSKIP(test(GetParam(), onemkl::uplo::upper, 30, alpha, 2, 3)); + EXPECT_TRUEORSKIP(test(GetParam(), onemkl::uplo::lower, 30, alpha, -2, -3)); + EXPECT_TRUEORSKIP(test(GetParam(), onemkl::uplo::upper, 30, alpha, -2, -3)); + EXPECT_TRUEORSKIP(test(GetParam(), onemkl::uplo::lower, 30, alpha, 1, 1)); + EXPECT_TRUEORSKIP(test(GetParam(), onemkl::uplo::upper, 30, alpha, 1, 1)); +} +TEST_P(Spr2UsmTests, RealDoublePrecision) { + double alpha(2.0); + EXPECT_TRUEORSKIP(test(GetParam(), onemkl::uplo::lower, 30, alpha, 2, 3)); + EXPECT_TRUEORSKIP(test(GetParam(), onemkl::uplo::upper, 30, alpha, 2, 3)); + EXPECT_TRUEORSKIP(test(GetParam(), onemkl::uplo::lower, 30, alpha, -2, -3)); + EXPECT_TRUEORSKIP(test(GetParam(), onemkl::uplo::upper, 30, alpha, -2, -3)); + EXPECT_TRUEORSKIP(test(GetParam(), onemkl::uplo::lower, 30, alpha, 1, 1)); + EXPECT_TRUEORSKIP(test(GetParam(), onemkl::uplo::upper, 30, alpha, 1, 1)); +} + +INSTANTIATE_TEST_SUITE_P(Spr2UsmTestSuite, Spr2UsmTests, ::testing::ValuesIn(devices), + ::DeviceNamePrint()); + +} // anonymous namespace diff --git a/tests/unit_tests/blas/level2/spr_usm.cpp b/tests/unit_tests/blas/level2/spr_usm.cpp new file mode 100644 index 000000000..09c958d6a --- /dev/null +++ b/tests/unit_tests/blas/level2/spr_usm.cpp @@ -0,0 +1,139 @@ +/******************************************************************************* +* Copyright 2020 Intel Corporation +* +* Licensed under the Apache License, Version 2.0 (the "License"); +* you may not use this file except in compliance with the License. 
+* You may obtain a copy of the License at +* +* http://www.apache.org/licenses/LICENSE-2.0 +* +* Unless required by applicable law or agreed to in writing, +* software distributed under the License is distributed on an "AS IS" BASIS, +* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +* See the License for the specific language governing permissions +* and limitations under the License. +* +* +* SPDX-License-Identifier: Apache-2.0 +*******************************************************************************/ + +#include +#include +#include +#include +#include +#include + +#include +#include "cblas.h" +#include "onemkl/detail/config.hpp" +#include "onemkl/onemkl.hpp" +#include "onemkl_blas_helper.hpp" +#include "reference_blas_templates.hpp" +#include "test_common.hpp" +#include "test_helper.hpp" + +#include + +using namespace cl::sycl; +using std::vector; + +extern std::vector devices; + +namespace { + +template +int test(const device &dev, onemkl::uplo upper_lower, int n, fp alpha, int incx) { + // Catch asynchronous exceptions. + auto exception_handler = [](exception_list exceptions) { + for (std::exception_ptr const &e : exceptions) { + try { + std::rethrow_exception(e); + } + catch (exception const &e) { + std::cout << "Caught asynchronous SYCL exception during SPR:\n" + << e.what() << std::endl + << "OpenCL status: " << e.get_cl_code() << std::endl; + } + } + }; + + queue main_queue(dev, exception_handler); + context cxt = main_queue.get_context(); + event done; + std::vector dependencies; + + // Prepare data. + auto ua = usm_allocator(cxt, dev); + vector x(ua), A(ua); + rand_vector(x, n, incx); + rand_matrix(A, onemkl::transpose::nontrans, n, n, n); + + auto A_ref = A; + + // Call Reference SPR. + const int n_ref = n, incx_ref = incx; + using fp_ref = typename ref_type_info::type; + + ::spr(convert_to_cblas_uplo(upper_lower), &n_ref, (fp_ref *)&alpha, (fp_ref *)x.data(), + &incx_ref, (fp_ref *)A_ref.data()); + + // Call DPC++ SPR. 
+ + try { +#ifdef CALL_RT_API + done = onemkl::blas::spr(main_queue, upper_lower, n, alpha, x.data(), incx, A.data(), + dependencies); + done.wait(); +#else + TEST_RUN_CT(main_queue, onemkl::blas::spr, + (main_queue, upper_lower, n, alpha, x.data(), incx, A.data(), dependencies)); + main_queue.wait(); +#endif + } + catch (exception const &e) { + std::cout << "Caught synchronous SYCL exception during SPR:\n" + << e.what() << std::endl + << "OpenCL status: " << e.get_cl_code() << std::endl; + } + + catch (const onemkl::backend_unsupported_exception &e) { + return test_skipped; + } + + catch (const std::runtime_error &error) { + std::cout << "Error raised during execution of SPR:\n" << error.what() << std::endl; + } + + // Compare the results of reference implementation and DPC++ implementation. + + bool good = check_equal_matrix(A, A_ref, n, n, n, n, std::cout); + + return (int)good; +} + +class SprUsmTests : public ::testing::TestWithParam {}; + +TEST_P(SprUsmTests, RealSinglePrecision) { + float alpha(2.0); + EXPECT_TRUEORSKIP(test(GetParam(), onemkl::uplo::lower, 30, alpha, 2)); + EXPECT_TRUEORSKIP(test(GetParam(), onemkl::uplo::upper, 30, alpha, 2)); + EXPECT_TRUEORSKIP(test(GetParam(), onemkl::uplo::lower, 30, alpha, -2)); + EXPECT_TRUEORSKIP(test(GetParam(), onemkl::uplo::upper, 30, alpha, -2)); + EXPECT_TRUEORSKIP(test(GetParam(), onemkl::uplo::lower, 30, alpha, 1)); + EXPECT_TRUEORSKIP(test(GetParam(), onemkl::uplo::upper, 30, alpha, 1)); +} +TEST_P(SprUsmTests, RealDoublePrecision) { + double alpha(2.0); + EXPECT_TRUEORSKIP(test(GetParam(), onemkl::uplo::lower, 30, alpha, 2)); + EXPECT_TRUEORSKIP(test(GetParam(), onemkl::uplo::upper, 30, alpha, 2)); + EXPECT_TRUEORSKIP(test(GetParam(), onemkl::uplo::lower, 30, alpha, -2)); + EXPECT_TRUEORSKIP(test(GetParam(), onemkl::uplo::upper, 30, alpha, -2)); + EXPECT_TRUEORSKIP(test(GetParam(), onemkl::uplo::lower, 30, alpha, 1)); + EXPECT_TRUEORSKIP(test(GetParam(), onemkl::uplo::upper, 30, alpha, 1)); +} + 
+INSTANTIATE_TEST_SUITE_P(SprUsmTestSuite, SprUsmTests, ::testing::ValuesIn(devices), + ::DeviceNamePrint()); + +} // anonymous namespace diff --git a/tests/unit_tests/blas/level2/symv.cpp b/tests/unit_tests/blas/level2/symv.cpp index a8d154f6b..4152be934 100644 --- a/tests/unit_tests/blas/level2/symv.cpp +++ b/tests/unit_tests/blas/level2/symv.cpp @@ -43,8 +43,8 @@ extern std::vector devices; namespace { template -bool test(const device &dev, onemkl::uplo upper_lower, int n, fp alpha, fp beta, int incx, int incy, - int lda) { +int test(const device &dev, onemkl::uplo upper_lower, int n, fp alpha, fp beta, int incx, int incy, + int lda) { // Prepare data. vector x, y, y_ref, A; rand_vector(x, n, incx); @@ -98,6 +98,14 @@ bool test(const device &dev, onemkl::uplo upper_lower, int n, fp alpha, fp beta, << "OpenCL status: " << e.get_cl_code() << std::endl; } + catch (const onemkl::backend_unsupported_exception &e) { + return test_skipped; + } + + catch (const std::runtime_error &error) { + std::cout << "Error raised during execution of SYMV:\n" << error.what() << std::endl; + } + // Compare the results of reference implementation and DPC++ implementation. 
bool good; { @@ -105,7 +113,7 @@ bool test(const device &dev, onemkl::uplo upper_lower, int n, fp alpha, fp beta, good = check_equal_vector(y_accessor, y_ref, n, incy, n, std::cout); } - return good; + return (int)good; } class SymvTests : public ::testing::TestWithParam {}; @@ -113,22 +121,22 @@ class SymvTests : public ::testing::TestWithParam {}; TEST_P(SymvTests, RealSinglePrecision) { float alpha(2.0); float beta(3.0); - EXPECT_TRUE(test(GetParam(), onemkl::uplo::lower, 30, alpha, beta, 2, 3, 42)); - EXPECT_TRUE(test(GetParam(), onemkl::uplo::upper, 30, alpha, beta, 2, 3, 42)); - EXPECT_TRUE(test(GetParam(), onemkl::uplo::lower, 30, alpha, beta, -2, -3, 42)); - EXPECT_TRUE(test(GetParam(), onemkl::uplo::upper, 30, alpha, beta, -2, -3, 42)); - EXPECT_TRUE(test(GetParam(), onemkl::uplo::lower, 30, alpha, beta, 1, 1, 42)); - EXPECT_TRUE(test(GetParam(), onemkl::uplo::upper, 30, alpha, beta, 1, 1, 42)); + EXPECT_TRUEORSKIP(test(GetParam(), onemkl::uplo::lower, 30, alpha, beta, 2, 3, 42)); + EXPECT_TRUEORSKIP(test(GetParam(), onemkl::uplo::upper, 30, alpha, beta, 2, 3, 42)); + EXPECT_TRUEORSKIP(test(GetParam(), onemkl::uplo::lower, 30, alpha, beta, -2, -3, 42)); + EXPECT_TRUEORSKIP(test(GetParam(), onemkl::uplo::upper, 30, alpha, beta, -2, -3, 42)); + EXPECT_TRUEORSKIP(test(GetParam(), onemkl::uplo::lower, 30, alpha, beta, 1, 1, 42)); + EXPECT_TRUEORSKIP(test(GetParam(), onemkl::uplo::upper, 30, alpha, beta, 1, 1, 42)); } TEST_P(SymvTests, RealDoublePrecision) { double alpha(2.0); double beta(3.0); - EXPECT_TRUE(test(GetParam(), onemkl::uplo::lower, 30, alpha, beta, 2, 3, 42)); - EXPECT_TRUE(test(GetParam(), onemkl::uplo::upper, 30, alpha, beta, 2, 3, 42)); - EXPECT_TRUE(test(GetParam(), onemkl::uplo::lower, 30, alpha, beta, -2, -3, 42)); - EXPECT_TRUE(test(GetParam(), onemkl::uplo::upper, 30, alpha, beta, -2, -3, 42)); - EXPECT_TRUE(test(GetParam(), onemkl::uplo::lower, 30, alpha, beta, 1, 1, 42)); - EXPECT_TRUE(test(GetParam(), onemkl::uplo::upper, 30, alpha, 
beta, 1, 1, 42)); + EXPECT_TRUEORSKIP(test(GetParam(), onemkl::uplo::lower, 30, alpha, beta, 2, 3, 42)); + EXPECT_TRUEORSKIP(test(GetParam(), onemkl::uplo::upper, 30, alpha, beta, 2, 3, 42)); + EXPECT_TRUEORSKIP(test(GetParam(), onemkl::uplo::lower, 30, alpha, beta, -2, -3, 42)); + EXPECT_TRUEORSKIP(test(GetParam(), onemkl::uplo::upper, 30, alpha, beta, -2, -3, 42)); + EXPECT_TRUEORSKIP(test(GetParam(), onemkl::uplo::lower, 30, alpha, beta, 1, 1, 42)); + EXPECT_TRUEORSKIP(test(GetParam(), onemkl::uplo::upper, 30, alpha, beta, 1, 1, 42)); } INSTANTIATE_TEST_SUITE_P(SymvTestSuite, SymvTests, ::testing::ValuesIn(devices), diff --git a/tests/unit_tests/blas/level2/symv_usm.cpp b/tests/unit_tests/blas/level2/symv_usm.cpp new file mode 100644 index 000000000..b4569db53 --- /dev/null +++ b/tests/unit_tests/blas/level2/symv_usm.cpp @@ -0,0 +1,145 @@ +/******************************************************************************* +* Copyright 2020 Intel Corporation +* +* Licensed under the Apache License, Version 2.0 (the "License"); +* you may not use this file except in compliance with the License. +* You may obtain a copy of the License at +* +* http://www.apache.org/licenses/LICENSE-2.0 +* +* Unless required by applicable law or agreed to in writing, +* software distributed under the License is distributed on an "AS IS" BASIS, +* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +* See the License for the specific language governing permissions +* and limitations under the License. 
+* +* +* SPDX-License-Identifier: Apache-2.0 +*******************************************************************************/ + +#include +#include +#include +#include +#include +#include + +#include +#include "cblas.h" +#include "onemkl/detail/config.hpp" +#include "onemkl/onemkl.hpp" +#include "onemkl_blas_helper.hpp" +#include "reference_blas_templates.hpp" +#include "test_common.hpp" +#include "test_helper.hpp" + +#include + +using namespace cl::sycl; +using std::vector; + +extern std::vector devices; + +namespace { + +template +int test(const device &dev, onemkl::uplo upper_lower, int n, fp alpha, fp beta, int incx, int incy, + int lda) { + // Catch asynchronous exceptions. + auto exception_handler = [](exception_list exceptions) { + for (std::exception_ptr const &e : exceptions) { + try { + std::rethrow_exception(e); + } + catch (exception const &e) { + std::cout << "Caught asynchronous SYCL exception during SYMV:\n" + << e.what() << std::endl + << "OpenCL status: " << e.get_cl_code() << std::endl; + } + } + }; + + queue main_queue(dev, exception_handler); + context cxt = main_queue.get_context(); + event done; + std::vector dependencies; + + // Prepare data. + auto ua = usm_allocator(cxt, dev); + vector x(ua), y(ua), A(ua); + rand_vector(x, n, incx); + rand_vector(y, n, incy); + rand_matrix(A, onemkl::transpose::nontrans, n, n, lda); + + auto y_ref = y; + + // Call Reference SYMV. + const int n_ref = n, incx_ref = incx, incy_ref = incy, lda_ref = lda; + using fp_ref = typename ref_type_info::type; + + ::symv(convert_to_cblas_uplo(upper_lower), &n_ref, (fp_ref *)&alpha, (fp_ref *)A.data(), + &lda_ref, (fp_ref *)x.data(), &incx_ref, (fp_ref *)&beta, (fp_ref *)y_ref.data(), + &incy_ref); + + // Call DPC++ SYMV. 
+ + try { +#ifdef CALL_RT_API + done = onemkl::blas::symv(main_queue, upper_lower, n, alpha, A.data(), lda, x.data(), incx, + beta, y.data(), incy, dependencies); + done.wait(); +#else + TEST_RUN_CT(main_queue, onemkl::blas::symv, + (main_queue, upper_lower, n, alpha, A.data(), lda, x.data(), incx, beta, + y.data(), incy, dependencies)); + main_queue.wait(); +#endif + } + catch (exception const &e) { + std::cout << "Caught synchronous SYCL exception during SYMV:\n" + << e.what() << std::endl + << "OpenCL status: " << e.get_cl_code() << std::endl; + } + + catch (const onemkl::backend_unsupported_exception &e) { + return test_skipped; + } + + catch (const std::runtime_error &error) { + std::cout << "Error raised during execution of SYMV:\n" << error.what() << std::endl; + } + + // Compare the results of reference implementation and DPC++ implementation. + + bool good = check_equal_vector(y, y_ref, n, incy, n, std::cout); + + return (int)good; +} + +class SymvUsmTests : public ::testing::TestWithParam {}; + +TEST_P(SymvUsmTests, RealSinglePrecision) { + float alpha(2.0); + float beta(3.0); + EXPECT_TRUEORSKIP(test(GetParam(), onemkl::uplo::lower, 30, alpha, beta, 2, 3, 42)); + EXPECT_TRUEORSKIP(test(GetParam(), onemkl::uplo::upper, 30, alpha, beta, 2, 3, 42)); + EXPECT_TRUEORSKIP(test(GetParam(), onemkl::uplo::lower, 30, alpha, beta, -2, -3, 42)); + EXPECT_TRUEORSKIP(test(GetParam(), onemkl::uplo::upper, 30, alpha, beta, -2, -3, 42)); + EXPECT_TRUEORSKIP(test(GetParam(), onemkl::uplo::lower, 30, alpha, beta, 1, 1, 42)); + EXPECT_TRUEORSKIP(test(GetParam(), onemkl::uplo::upper, 30, alpha, beta, 1, 1, 42)); +} +TEST_P(SymvUsmTests, RealDoublePrecision) { + double alpha(2.0); + double beta(3.0); + EXPECT_TRUEORSKIP(test(GetParam(), onemkl::uplo::lower, 30, alpha, beta, 2, 3, 42)); + EXPECT_TRUEORSKIP(test(GetParam(), onemkl::uplo::upper, 30, alpha, beta, 2, 3, 42)); + EXPECT_TRUEORSKIP(test(GetParam(), onemkl::uplo::lower, 30, alpha, beta, -2, -3, 42)); + 
EXPECT_TRUEORSKIP(test(GetParam(), onemkl::uplo::upper, 30, alpha, beta, -2, -3, 42)); + EXPECT_TRUEORSKIP(test(GetParam(), onemkl::uplo::lower, 30, alpha, beta, 1, 1, 42)); + EXPECT_TRUEORSKIP(test(GetParam(), onemkl::uplo::upper, 30, alpha, beta, 1, 1, 42)); +} + +INSTANTIATE_TEST_SUITE_P(SymvUsmTestSuite, SymvUsmTests, ::testing::ValuesIn(devices), + ::DeviceNamePrint()); + +} // anonymous namespace diff --git a/tests/unit_tests/blas/level2/syr.cpp b/tests/unit_tests/blas/level2/syr.cpp index 8d7618f9e..9bc5947f9 100644 --- a/tests/unit_tests/blas/level2/syr.cpp +++ b/tests/unit_tests/blas/level2/syr.cpp @@ -43,7 +43,7 @@ extern std::vector devices; namespace { template -bool test(const device &dev, onemkl::uplo upper_lower, int n, fp alpha, int incx, int lda) { +int test(const device &dev, onemkl::uplo upper_lower, int n, fp alpha, int incx, int lda) { // Prepare data. vector x, A_ref, A; rand_vector(x, n, incx); @@ -92,6 +92,14 @@ bool test(const device &dev, onemkl::uplo upper_lower, int n, fp alpha, int incx << "OpenCL status: " << e.get_cl_code() << std::endl; } + catch (const onemkl::backend_unsupported_exception &e) { + return test_skipped; + } + + catch (const std::runtime_error &error) { + std::cout << "Error raised during execution of SYR:\n" << error.what() << std::endl; + } + // Compare the results of reference implementation and DPC++ implementation. 
bool good; { @@ -99,28 +107,28 @@ bool test(const device &dev, onemkl::uplo upper_lower, int n, fp alpha, int incx good = check_equal_matrix(A_accessor, A_ref, n, n, lda, n, std::cout); } - return good; + return (int)good; } class SyrTests : public ::testing::TestWithParam {}; TEST_P(SyrTests, RealSinglePrecision) { float alpha(2.0); - EXPECT_TRUE(test(GetParam(), onemkl::uplo::lower, 30, alpha, 2, 42)); - EXPECT_TRUE(test(GetParam(), onemkl::uplo::upper, 30, alpha, 2, 42)); - EXPECT_TRUE(test(GetParam(), onemkl::uplo::lower, 30, alpha, -2, 42)); - EXPECT_TRUE(test(GetParam(), onemkl::uplo::upper, 30, alpha, -2, 42)); - EXPECT_TRUE(test(GetParam(), onemkl::uplo::lower, 30, alpha, 1, 42)); - EXPECT_TRUE(test(GetParam(), onemkl::uplo::upper, 30, alpha, 1, 42)); + EXPECT_TRUEORSKIP(test(GetParam(), onemkl::uplo::lower, 30, alpha, 2, 42)); + EXPECT_TRUEORSKIP(test(GetParam(), onemkl::uplo::upper, 30, alpha, 2, 42)); + EXPECT_TRUEORSKIP(test(GetParam(), onemkl::uplo::lower, 30, alpha, -2, 42)); + EXPECT_TRUEORSKIP(test(GetParam(), onemkl::uplo::upper, 30, alpha, -2, 42)); + EXPECT_TRUEORSKIP(test(GetParam(), onemkl::uplo::lower, 30, alpha, 1, 42)); + EXPECT_TRUEORSKIP(test(GetParam(), onemkl::uplo::upper, 30, alpha, 1, 42)); } TEST_P(SyrTests, RealDoublePrecision) { double alpha(2.0); - EXPECT_TRUE(test(GetParam(), onemkl::uplo::lower, 30, alpha, 2, 42)); - EXPECT_TRUE(test(GetParam(), onemkl::uplo::upper, 30, alpha, 2, 42)); - EXPECT_TRUE(test(GetParam(), onemkl::uplo::lower, 30, alpha, -2, 42)); - EXPECT_TRUE(test(GetParam(), onemkl::uplo::upper, 30, alpha, -2, 42)); - EXPECT_TRUE(test(GetParam(), onemkl::uplo::lower, 30, alpha, 1, 42)); - EXPECT_TRUE(test(GetParam(), onemkl::uplo::upper, 30, alpha, 1, 42)); + EXPECT_TRUEORSKIP(test(GetParam(), onemkl::uplo::lower, 30, alpha, 2, 42)); + EXPECT_TRUEORSKIP(test(GetParam(), onemkl::uplo::upper, 30, alpha, 2, 42)); + EXPECT_TRUEORSKIP(test(GetParam(), onemkl::uplo::lower, 30, alpha, -2, 42)); + 
EXPECT_TRUEORSKIP(test(GetParam(), onemkl::uplo::upper, 30, alpha, -2, 42)); + EXPECT_TRUEORSKIP(test(GetParam(), onemkl::uplo::lower, 30, alpha, 1, 42)); + EXPECT_TRUEORSKIP(test(GetParam(), onemkl::uplo::upper, 30, alpha, 1, 42)); } INSTANTIATE_TEST_SUITE_P(SyrTestSuite, SyrTests, ::testing::ValuesIn(devices), ::DeviceNamePrint()); diff --git a/tests/unit_tests/blas/level2/syr2.cpp b/tests/unit_tests/blas/level2/syr2.cpp index 3b704fb22..e0eba5a15 100644 --- a/tests/unit_tests/blas/level2/syr2.cpp +++ b/tests/unit_tests/blas/level2/syr2.cpp @@ -43,8 +43,8 @@ extern std::vector devices; namespace { template -bool test(const device &dev, onemkl::uplo upper_lower, int n, fp alpha, int incx, int incy, - int lda) { +int test(const device &dev, onemkl::uplo upper_lower, int n, fp alpha, int incx, int incy, + int lda) { // Prepare data. vector x, y, A_ref, A; rand_vector(x, n, incx); @@ -97,6 +97,14 @@ bool test(const device &dev, onemkl::uplo upper_lower, int n, fp alpha, int incx << "OpenCL status: " << e.get_cl_code() << std::endl; } + catch (const onemkl::backend_unsupported_exception &e) { + return test_skipped; + } + + catch (const std::runtime_error &error) { + std::cout << "Error raised during execution of SYR2:\n" << error.what() << std::endl; + } + // Compare the results of reference implementation and DPC++ implementation. 
bool good; { @@ -104,28 +112,28 @@ bool test(const device &dev, onemkl::uplo upper_lower, int n, fp alpha, int incx good = check_equal_matrix(A_accessor, A_ref, n, n, lda, n, std::cout); } - return good; + return (int)good; } class Syr2Tests : public ::testing::TestWithParam {}; TEST_P(Syr2Tests, RealSinglePrecision) { float alpha(2.0); - EXPECT_TRUE(test(GetParam(), onemkl::uplo::lower, 30, alpha, 2, 3, 42)); - EXPECT_TRUE(test(GetParam(), onemkl::uplo::upper, 30, alpha, 2, 3, 42)); - EXPECT_TRUE(test(GetParam(), onemkl::uplo::lower, 30, alpha, -2, -3, 42)); - EXPECT_TRUE(test(GetParam(), onemkl::uplo::upper, 30, alpha, -2, -3, 42)); - EXPECT_TRUE(test(GetParam(), onemkl::uplo::lower, 30, alpha, 1, 1, 42)); - EXPECT_TRUE(test(GetParam(), onemkl::uplo::upper, 30, alpha, 1, 1, 42)); + EXPECT_TRUEORSKIP(test(GetParam(), onemkl::uplo::lower, 30, alpha, 2, 3, 42)); + EXPECT_TRUEORSKIP(test(GetParam(), onemkl::uplo::upper, 30, alpha, 2, 3, 42)); + EXPECT_TRUEORSKIP(test(GetParam(), onemkl::uplo::lower, 30, alpha, -2, -3, 42)); + EXPECT_TRUEORSKIP(test(GetParam(), onemkl::uplo::upper, 30, alpha, -2, -3, 42)); + EXPECT_TRUEORSKIP(test(GetParam(), onemkl::uplo::lower, 30, alpha, 1, 1, 42)); + EXPECT_TRUEORSKIP(test(GetParam(), onemkl::uplo::upper, 30, alpha, 1, 1, 42)); } TEST_P(Syr2Tests, RealDoublePrecision) { double alpha(2.0); - EXPECT_TRUE(test(GetParam(), onemkl::uplo::lower, 30, alpha, 2, 3, 42)); - EXPECT_TRUE(test(GetParam(), onemkl::uplo::upper, 30, alpha, 2, 3, 42)); - EXPECT_TRUE(test(GetParam(), onemkl::uplo::lower, 30, alpha, -2, -3, 42)); - EXPECT_TRUE(test(GetParam(), onemkl::uplo::upper, 30, alpha, -2, -3, 42)); - EXPECT_TRUE(test(GetParam(), onemkl::uplo::lower, 30, alpha, 1, 1, 42)); - EXPECT_TRUE(test(GetParam(), onemkl::uplo::upper, 30, alpha, 1, 1, 42)); + EXPECT_TRUEORSKIP(test(GetParam(), onemkl::uplo::lower, 30, alpha, 2, 3, 42)); + EXPECT_TRUEORSKIP(test(GetParam(), onemkl::uplo::upper, 30, alpha, 2, 3, 42)); + EXPECT_TRUEORSKIP(test(GetParam(), 
onemkl::uplo::lower, 30, alpha, -2, -3, 42)); + EXPECT_TRUEORSKIP(test(GetParam(), onemkl::uplo::upper, 30, alpha, -2, -3, 42)); + EXPECT_TRUEORSKIP(test(GetParam(), onemkl::uplo::lower, 30, alpha, 1, 1, 42)); + EXPECT_TRUEORSKIP(test(GetParam(), onemkl::uplo::upper, 30, alpha, 1, 1, 42)); } INSTANTIATE_TEST_SUITE_P(Syr2TestSuite, Syr2Tests, ::testing::ValuesIn(devices), diff --git a/tests/unit_tests/blas/level2/syr2_usm.cpp b/tests/unit_tests/blas/level2/syr2_usm.cpp new file mode 100644 index 000000000..a356be237 --- /dev/null +++ b/tests/unit_tests/blas/level2/syr2_usm.cpp @@ -0,0 +1,142 @@ +/******************************************************************************* +* Copyright 2020 Intel Corporation +* +* Licensed under the Apache License, Version 2.0 (the "License"); +* you may not use this file except in compliance with the License. +* You may obtain a copy of the License at +* +* http://www.apache.org/licenses/LICENSE-2.0 +* +* Unless required by applicable law or agreed to in writing, +* software distributed under the License is distributed on an "AS IS" BASIS, +* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +* See the License for the specific language governing permissions +* and limitations under the License. +* +* +* SPDX-License-Identifier: Apache-2.0 +*******************************************************************************/ + +#include +#include +#include +#include +#include +#include + +#include +#include "cblas.h" +#include "onemkl/detail/config.hpp" +#include "onemkl/onemkl.hpp" +#include "onemkl_blas_helper.hpp" +#include "reference_blas_templates.hpp" +#include "test_common.hpp" +#include "test_helper.hpp" + +#include + +using namespace cl::sycl; +using std::vector; + +extern std::vector devices; + +namespace { + +template +int test(const device &dev, onemkl::uplo upper_lower, int n, fp alpha, int incx, int incy, + int lda) { + // Catch asynchronous exceptions. 
+ auto exception_handler = [](exception_list exceptions) { + for (std::exception_ptr const &e : exceptions) { + try { + std::rethrow_exception(e); + } + catch (exception const &e) { + std::cout << "Caught asynchronous SYCL exception during SYR2:\n" + << e.what() << std::endl + << "OpenCL status: " << e.get_cl_code() << std::endl; + } + } + }; + + queue main_queue(dev, exception_handler); + context cxt = main_queue.get_context(); + event done; + std::vector dependencies; + + // Prepare data. + auto ua = usm_allocator(cxt, dev); + vector x(ua), y(ua), A(ua); + rand_vector(x, n, incx); + rand_vector(y, n, incy); + rand_matrix(A, onemkl::transpose::nontrans, n, n, lda); + + auto A_ref = A; + + // Call Reference SYR2. + const int n_ref = n, incx_ref = incx, incy_ref = incy, lda_ref = lda; + using fp_ref = typename ref_type_info::type; + + ::syr2(convert_to_cblas_uplo(upper_lower), &n_ref, (fp_ref *)&alpha, (fp_ref *)x.data(), + &incx_ref, (fp_ref *)y.data(), &incy_ref, (fp_ref *)A_ref.data(), &lda_ref); + + // Call DPC++ SYR2. + + try { +#ifdef CALL_RT_API + done = onemkl::blas::syr2(main_queue, upper_lower, n, alpha, x.data(), incx, y.data(), incy, + A.data(), lda, dependencies); + done.wait(); +#else + TEST_RUN_CT(main_queue, onemkl::blas::syr2, + (main_queue, upper_lower, n, alpha, x.data(), incx, y.data(), incy, A.data(), + lda, dependencies)); + main_queue.wait(); +#endif + } + catch (exception const &e) { + std::cout << "Caught synchronous SYCL exception during SYR2:\n" + << e.what() << std::endl + << "OpenCL status: " << e.get_cl_code() << std::endl; + } + + catch (const onemkl::backend_unsupported_exception &e) { + return test_skipped; + } + + catch (const std::runtime_error &error) { + std::cout << "Error raised during execution of SYR2:\n" << error.what() << std::endl; + } + + // Compare the results of reference implementation and DPC++ implementation. 
+ + bool good = check_equal_matrix(A, A_ref, n, n, lda, n, std::cout); + + return (int)good; +} + +class Syr2UsmTests : public ::testing::TestWithParam {}; + +TEST_P(Syr2UsmTests, RealSinglePrecision) { + float alpha(2.0); + EXPECT_TRUEORSKIP(test(GetParam(), onemkl::uplo::lower, 30, alpha, 2, 3, 42)); + EXPECT_TRUEORSKIP(test(GetParam(), onemkl::uplo::upper, 30, alpha, 2, 3, 42)); + EXPECT_TRUEORSKIP(test(GetParam(), onemkl::uplo::lower, 30, alpha, -2, -3, 42)); + EXPECT_TRUEORSKIP(test(GetParam(), onemkl::uplo::upper, 30, alpha, -2, -3, 42)); + EXPECT_TRUEORSKIP(test(GetParam(), onemkl::uplo::lower, 30, alpha, 1, 1, 42)); + EXPECT_TRUEORSKIP(test(GetParam(), onemkl::uplo::upper, 30, alpha, 1, 1, 42)); +} +TEST_P(Syr2UsmTests, RealDoublePrecision) { + double alpha(2.0); + EXPECT_TRUEORSKIP(test(GetParam(), onemkl::uplo::lower, 30, alpha, 2, 3, 42)); + EXPECT_TRUEORSKIP(test(GetParam(), onemkl::uplo::upper, 30, alpha, 2, 3, 42)); + EXPECT_TRUEORSKIP(test(GetParam(), onemkl::uplo::lower, 30, alpha, -2, -3, 42)); + EXPECT_TRUEORSKIP(test(GetParam(), onemkl::uplo::upper, 30, alpha, -2, -3, 42)); + EXPECT_TRUEORSKIP(test(GetParam(), onemkl::uplo::lower, 30, alpha, 1, 1, 42)); + EXPECT_TRUEORSKIP(test(GetParam(), onemkl::uplo::upper, 30, alpha, 1, 1, 42)); +} + +INSTANTIATE_TEST_SUITE_P(Syr2UsmTestSuite, Syr2UsmTests, ::testing::ValuesIn(devices), + ::DeviceNamePrint()); + +} // anonymous namespace diff --git a/tests/unit_tests/blas/level2/syr_usm.cpp b/tests/unit_tests/blas/level2/syr_usm.cpp new file mode 100644 index 000000000..c32347f69 --- /dev/null +++ b/tests/unit_tests/blas/level2/syr_usm.cpp @@ -0,0 +1,140 @@ +/******************************************************************************* +* Copyright 2020 Intel Corporation +* +* Licensed under the Apache License, Version 2.0 (the "License"); +* you may not use this file except in compliance with the License. 
+* You may obtain a copy of the License at +* +* http://www.apache.org/licenses/LICENSE-2.0 +* +* Unless required by applicable law or agreed to in writing, +* software distributed under the License is distributed on an "AS IS" BASIS, +* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +* See the License for the specific language governing permissions +* and limitations under the License. +* +* +* SPDX-License-Identifier: Apache-2.0 +*******************************************************************************/ + +#include +#include +#include +#include +#include +#include + +#include +#include "cblas.h" +#include "onemkl/detail/config.hpp" +#include "onemkl/onemkl.hpp" +#include "onemkl_blas_helper.hpp" +#include "reference_blas_templates.hpp" +#include "test_common.hpp" +#include "test_helper.hpp" + +#include + +using namespace cl::sycl; +using std::vector; + +extern std::vector devices; + +namespace { + +template +int test(const device &dev, onemkl::uplo upper_lower, int n, fp alpha, int incx, int lda) { + // Catch asynchronous exceptions. + auto exception_handler = [](exception_list exceptions) { + for (std::exception_ptr const &e : exceptions) { + try { + std::rethrow_exception(e); + } + catch (exception const &e) { + std::cout << "Caught asynchronous SYCL exception during SYR:\n" + << e.what() << std::endl + << "OpenCL status: " << e.get_cl_code() << std::endl; + } + } + }; + + queue main_queue(dev, exception_handler); + context cxt = main_queue.get_context(); + event done; + std::vector dependencies; + + // Prepare data. + auto ua = usm_allocator(cxt, dev); + vector x(ua), A(ua); + rand_vector(x, n, incx); + rand_matrix(A, onemkl::transpose::nontrans, n, n, lda); + + auto A_ref = A; + + // Call Reference SYR. 
+ const int n_ref = n, incx_ref = incx, lda_ref = lda; + using fp_ref = typename ref_type_info::type; + + ::syr(convert_to_cblas_uplo(upper_lower), &n_ref, (fp_ref *)&alpha, (fp_ref *)x.data(), + &incx_ref, (fp_ref *)A_ref.data(), &lda_ref); + + // Call DPC++ SYR. + + try { +#ifdef CALL_RT_API + done = onemkl::blas::syr(main_queue, upper_lower, n, alpha, x.data(), incx, A.data(), lda, + dependencies); + done.wait(); +#else + TEST_RUN_CT( + main_queue, onemkl::blas::syr, + (main_queue, upper_lower, n, alpha, x.data(), incx, A.data(), lda, dependencies)); + main_queue.wait(); +#endif + } + catch (exception const &e) { + std::cout << "Caught synchronous SYCL exception during SYR:\n" + << e.what() << std::endl + << "OpenCL status: " << e.get_cl_code() << std::endl; + } + + catch (const onemkl::backend_unsupported_exception &e) { + return test_skipped; + } + + catch (const std::runtime_error &error) { + std::cout << "Error raised during execution of SYR:\n" << error.what() << std::endl; + } + + // Compare the results of reference implementation and DPC++ implementation. 
+ + bool good = check_equal_matrix(A, A_ref, n, n, lda, n, std::cout); + + return (int)good; +} + +class SyrUsmTests : public ::testing::TestWithParam {}; + +TEST_P(SyrUsmTests, RealSinglePrecision) { + float alpha(2.0); + EXPECT_TRUEORSKIP(test(GetParam(), onemkl::uplo::lower, 30, alpha, 2, 42)); + EXPECT_TRUEORSKIP(test(GetParam(), onemkl::uplo::upper, 30, alpha, 2, 42)); + EXPECT_TRUEORSKIP(test(GetParam(), onemkl::uplo::lower, 30, alpha, -2, 42)); + EXPECT_TRUEORSKIP(test(GetParam(), onemkl::uplo::upper, 30, alpha, -2, 42)); + EXPECT_TRUEORSKIP(test(GetParam(), onemkl::uplo::lower, 30, alpha, 1, 42)); + EXPECT_TRUEORSKIP(test(GetParam(), onemkl::uplo::upper, 30, alpha, 1, 42)); +} +TEST_P(SyrUsmTests, RealDoublePrecision) { + double alpha(2.0); + EXPECT_TRUEORSKIP(test(GetParam(), onemkl::uplo::lower, 30, alpha, 2, 42)); + EXPECT_TRUEORSKIP(test(GetParam(), onemkl::uplo::upper, 30, alpha, 2, 42)); + EXPECT_TRUEORSKIP(test(GetParam(), onemkl::uplo::lower, 30, alpha, -2, 42)); + EXPECT_TRUEORSKIP(test(GetParam(), onemkl::uplo::upper, 30, alpha, -2, 42)); + EXPECT_TRUEORSKIP(test(GetParam(), onemkl::uplo::lower, 30, alpha, 1, 42)); + EXPECT_TRUEORSKIP(test(GetParam(), onemkl::uplo::upper, 30, alpha, 1, 42)); +} + +INSTANTIATE_TEST_SUITE_P(SyrUsmTestSuite, SyrUsmTests, ::testing::ValuesIn(devices), + ::DeviceNamePrint()); + +} // anonymous namespace diff --git a/tests/unit_tests/blas/level2/tbmv.cpp b/tests/unit_tests/blas/level2/tbmv.cpp index 89145cf5a..68b8f6e28 100644 --- a/tests/unit_tests/blas/level2/tbmv.cpp +++ b/tests/unit_tests/blas/level2/tbmv.cpp @@ -43,8 +43,8 @@ extern std::vector devices; namespace { template -bool test(const device& dev, onemkl::uplo upper_lower, onemkl::transpose transa, - onemkl::diag unit_nonunit, int n, int k, int incx, int lda) { +int test(const device& dev, onemkl::uplo upper_lower, onemkl::transpose transa, + onemkl::diag unit_nonunit, int n, int k, int incx, int lda) { // Prepare data. 
vector x, x_ref, A; rand_vector(x, n, incx); @@ -97,6 +97,14 @@ bool test(const device& dev, onemkl::uplo upper_lower, onemkl::transpose transa, << "OpenCL status: " << e.get_cl_code() << std::endl; } + catch (const onemkl::backend_unsupported_exception& e) { + return test_skipped; + } + + catch (const std::runtime_error& error) { + std::cout << "Error raised during execution of TBMV:\n" << error.what() << std::endl; + } + // Compare the results of reference implementation and DPC++ implementation. bool good; { @@ -104,118 +112,122 @@ bool test(const device& dev, onemkl::uplo upper_lower, onemkl::transpose transa, good = check_equal_vector(x_accessor, x_ref, n, incx, n, std::cout); } - return good; + return (int)good; } class TbmvTests : public ::testing::TestWithParam {}; TEST_P(TbmvTests, RealSinglePrecision) { - EXPECT_TRUE(test(GetParam(), onemkl::uplo::lower, onemkl::transpose::nontrans, - onemkl::diag::unit, 30, 5, 2, 42)); - EXPECT_TRUE(test(GetParam(), onemkl::uplo::upper, onemkl::transpose::nontrans, - onemkl::diag::unit, 30, 5, 2, 42)); - EXPECT_TRUE(test(GetParam(), onemkl::uplo::lower, onemkl::transpose::trans, - onemkl::diag::unit, 30, 5, 2, 42)); - EXPECT_TRUE(test(GetParam(), onemkl::uplo::upper, onemkl::transpose::trans, - onemkl::diag::unit, 30, 5, 2, 42)); - EXPECT_TRUE(test(GetParam(), onemkl::uplo::lower, onemkl::transpose::nontrans, - onemkl::diag::nonunit, 30, 5, 2, 42)); - EXPECT_TRUE(test(GetParam(), onemkl::uplo::upper, onemkl::transpose::nontrans, - onemkl::diag::nonunit, 30, 5, 2, 42)); - EXPECT_TRUE(test(GetParam(), onemkl::uplo::lower, onemkl::transpose::trans, - onemkl::diag::nonunit, 30, 5, 2, 42)); - EXPECT_TRUE(test(GetParam(), onemkl::uplo::upper, onemkl::transpose::trans, - onemkl::diag::nonunit, 30, 5, 2, 42)); + EXPECT_TRUEORSKIP(test(GetParam(), onemkl::uplo::lower, onemkl::transpose::nontrans, + onemkl::diag::unit, 30, 5, 2, 42)); + EXPECT_TRUEORSKIP(test(GetParam(), onemkl::uplo::upper, onemkl::transpose::nontrans, + 
onemkl::diag::unit, 30, 5, 2, 42)); + EXPECT_TRUEORSKIP(test(GetParam(), onemkl::uplo::lower, onemkl::transpose::trans, + onemkl::diag::unit, 30, 5, 2, 42)); + EXPECT_TRUEORSKIP(test(GetParam(), onemkl::uplo::upper, onemkl::transpose::trans, + onemkl::diag::unit, 30, 5, 2, 42)); + EXPECT_TRUEORSKIP(test(GetParam(), onemkl::uplo::lower, onemkl::transpose::nontrans, + onemkl::diag::nonunit, 30, 5, 2, 42)); + EXPECT_TRUEORSKIP(test(GetParam(), onemkl::uplo::upper, onemkl::transpose::nontrans, + onemkl::diag::nonunit, 30, 5, 2, 42)); + EXPECT_TRUEORSKIP(test(GetParam(), onemkl::uplo::lower, onemkl::transpose::trans, + onemkl::diag::nonunit, 30, 5, 2, 42)); + EXPECT_TRUEORSKIP(test(GetParam(), onemkl::uplo::upper, onemkl::transpose::trans, + onemkl::diag::nonunit, 30, 5, 2, 42)); } TEST_P(TbmvTests, RealDoublePrecision) { - EXPECT_TRUE(test(GetParam(), onemkl::uplo::lower, onemkl::transpose::nontrans, - onemkl::diag::unit, 30, 5, 2, 42)); - EXPECT_TRUE(test(GetParam(), onemkl::uplo::upper, onemkl::transpose::nontrans, - onemkl::diag::unit, 30, 5, 2, 42)); - EXPECT_TRUE(test(GetParam(), onemkl::uplo::lower, onemkl::transpose::trans, - onemkl::diag::unit, 30, 5, 2, 42)); - EXPECT_TRUE(test(GetParam(), onemkl::uplo::upper, onemkl::transpose::trans, - onemkl::diag::unit, 30, 5, 2, 42)); - EXPECT_TRUE(test(GetParam(), onemkl::uplo::lower, onemkl::transpose::nontrans, - onemkl::diag::nonunit, 30, 5, 2, 42)); - EXPECT_TRUE(test(GetParam(), onemkl::uplo::upper, onemkl::transpose::nontrans, - onemkl::diag::nonunit, 30, 5, 2, 42)); - EXPECT_TRUE(test(GetParam(), onemkl::uplo::lower, onemkl::transpose::trans, - onemkl::diag::nonunit, 30, 5, 2, 42)); - EXPECT_TRUE(test(GetParam(), onemkl::uplo::upper, onemkl::transpose::trans, - onemkl::diag::nonunit, 30, 5, 2, 42)); + EXPECT_TRUEORSKIP(test(GetParam(), onemkl::uplo::lower, onemkl::transpose::nontrans, + onemkl::diag::unit, 30, 5, 2, 42)); + EXPECT_TRUEORSKIP(test(GetParam(), onemkl::uplo::upper, onemkl::transpose::nontrans, + 
onemkl::diag::unit, 30, 5, 2, 42)); + EXPECT_TRUEORSKIP(test(GetParam(), onemkl::uplo::lower, onemkl::transpose::trans, + onemkl::diag::unit, 30, 5, 2, 42)); + EXPECT_TRUEORSKIP(test(GetParam(), onemkl::uplo::upper, onemkl::transpose::trans, + onemkl::diag::unit, 30, 5, 2, 42)); + EXPECT_TRUEORSKIP(test(GetParam(), onemkl::uplo::lower, onemkl::transpose::nontrans, + onemkl::diag::nonunit, 30, 5, 2, 42)); + EXPECT_TRUEORSKIP(test(GetParam(), onemkl::uplo::upper, onemkl::transpose::nontrans, + onemkl::diag::nonunit, 30, 5, 2, 42)); + EXPECT_TRUEORSKIP(test(GetParam(), onemkl::uplo::lower, onemkl::transpose::trans, + onemkl::diag::nonunit, 30, 5, 2, 42)); + EXPECT_TRUEORSKIP(test(GetParam(), onemkl::uplo::upper, onemkl::transpose::trans, + onemkl::diag::nonunit, 30, 5, 2, 42)); } TEST_P(TbmvTests, ComplexSinglePrecision) { - EXPECT_TRUE(test>(GetParam(), onemkl::uplo::lower, - onemkl::transpose::nontrans, onemkl::diag::unit, 30, 5, 2, - 42)); - EXPECT_TRUE(test>(GetParam(), onemkl::uplo::upper, - onemkl::transpose::nontrans, onemkl::diag::unit, 30, 5, 2, - 42)); - EXPECT_TRUE(test>(GetParam(), onemkl::uplo::lower, onemkl::transpose::trans, - onemkl::diag::unit, 30, 5, 2, 42)); - EXPECT_TRUE(test>(GetParam(), onemkl::uplo::upper, onemkl::transpose::trans, - onemkl::diag::unit, 30, 5, 2, 42)); - EXPECT_TRUE(test>(GetParam(), onemkl::uplo::lower, - onemkl::transpose::conjtrans, onemkl::diag::unit, 30, 5, - 2, 42)); - EXPECT_TRUE(test>(GetParam(), onemkl::uplo::upper, - onemkl::transpose::conjtrans, onemkl::diag::unit, 30, 5, - 2, 42)); - EXPECT_TRUE(test>(GetParam(), onemkl::uplo::lower, - onemkl::transpose::nontrans, onemkl::diag::nonunit, 30, 5, - 2, 42)); - EXPECT_TRUE(test>(GetParam(), onemkl::uplo::upper, - onemkl::transpose::nontrans, onemkl::diag::nonunit, 30, 5, - 2, 42)); - EXPECT_TRUE(test>(GetParam(), onemkl::uplo::lower, onemkl::transpose::trans, - onemkl::diag::nonunit, 30, 5, 2, 42)); - EXPECT_TRUE(test>(GetParam(), onemkl::uplo::upper, 
onemkl::transpose::trans, - onemkl::diag::nonunit, 30, 5, 2, 42)); - EXPECT_TRUE(test>(GetParam(), onemkl::uplo::lower, - onemkl::transpose::conjtrans, onemkl::diag::nonunit, 30, - 5, 2, 42)); - EXPECT_TRUE(test>(GetParam(), onemkl::uplo::upper, - onemkl::transpose::conjtrans, onemkl::diag::nonunit, 30, - 5, 2, 42)); + EXPECT_TRUEORSKIP(test>(GetParam(), onemkl::uplo::lower, + onemkl::transpose::nontrans, onemkl::diag::unit, 30, + 5, 2, 42)); + EXPECT_TRUEORSKIP(test>(GetParam(), onemkl::uplo::upper, + onemkl::transpose::nontrans, onemkl::diag::unit, 30, + 5, 2, 42)); + EXPECT_TRUEORSKIP(test>(GetParam(), onemkl::uplo::lower, + onemkl::transpose::trans, onemkl::diag::unit, 30, 5, + 2, 42)); + EXPECT_TRUEORSKIP(test>(GetParam(), onemkl::uplo::upper, + onemkl::transpose::trans, onemkl::diag::unit, 30, 5, + 2, 42)); + EXPECT_TRUEORSKIP(test>(GetParam(), onemkl::uplo::lower, + onemkl::transpose::conjtrans, onemkl::diag::unit, + 30, 5, 2, 42)); + EXPECT_TRUEORSKIP(test>(GetParam(), onemkl::uplo::upper, + onemkl::transpose::conjtrans, onemkl::diag::unit, + 30, 5, 2, 42)); + EXPECT_TRUEORSKIP(test>(GetParam(), onemkl::uplo::lower, + onemkl::transpose::nontrans, onemkl::diag::nonunit, + 30, 5, 2, 42)); + EXPECT_TRUEORSKIP(test>(GetParam(), onemkl::uplo::upper, + onemkl::transpose::nontrans, onemkl::diag::nonunit, + 30, 5, 2, 42)); + EXPECT_TRUEORSKIP(test>(GetParam(), onemkl::uplo::lower, + onemkl::transpose::trans, onemkl::diag::nonunit, 30, + 5, 2, 42)); + EXPECT_TRUEORSKIP(test>(GetParam(), onemkl::uplo::upper, + onemkl::transpose::trans, onemkl::diag::nonunit, 30, + 5, 2, 42)); + EXPECT_TRUEORSKIP(test>(GetParam(), onemkl::uplo::lower, + onemkl::transpose::conjtrans, onemkl::diag::nonunit, + 30, 5, 2, 42)); + EXPECT_TRUEORSKIP(test>(GetParam(), onemkl::uplo::upper, + onemkl::transpose::conjtrans, onemkl::diag::nonunit, + 30, 5, 2, 42)); } TEST_P(TbmvTests, ComplexDoublePrecision) { - EXPECT_TRUE(test>(GetParam(), onemkl::uplo::lower, - onemkl::transpose::nontrans, 
onemkl::diag::unit, 30, 5, - 2, 42)); - EXPECT_TRUE(test>(GetParam(), onemkl::uplo::upper, - onemkl::transpose::nontrans, onemkl::diag::unit, 30, 5, - 2, 42)); - EXPECT_TRUE(test>(GetParam(), onemkl::uplo::lower, - onemkl::transpose::trans, onemkl::diag::unit, 30, 5, 2, - 42)); - EXPECT_TRUE(test>(GetParam(), onemkl::uplo::upper, - onemkl::transpose::trans, onemkl::diag::unit, 30, 5, 2, - 42)); - EXPECT_TRUE(test>(GetParam(), onemkl::uplo::lower, - onemkl::transpose::conjtrans, onemkl::diag::unit, 30, 5, - 2, 42)); - EXPECT_TRUE(test>(GetParam(), onemkl::uplo::upper, - onemkl::transpose::conjtrans, onemkl::diag::unit, 30, 5, - 2, 42)); - EXPECT_TRUE(test>(GetParam(), onemkl::uplo::lower, - onemkl::transpose::nontrans, onemkl::diag::nonunit, 30, - 5, 2, 42)); - EXPECT_TRUE(test>(GetParam(), onemkl::uplo::upper, - onemkl::transpose::nontrans, onemkl::diag::nonunit, 30, - 5, 2, 42)); - EXPECT_TRUE(test>(GetParam(), onemkl::uplo::lower, - onemkl::transpose::trans, onemkl::diag::nonunit, 30, 5, - 2, 42)); - EXPECT_TRUE(test>(GetParam(), onemkl::uplo::upper, - onemkl::transpose::trans, onemkl::diag::nonunit, 30, 5, - 2, 42)); - EXPECT_TRUE(test>(GetParam(), onemkl::uplo::lower, - onemkl::transpose::conjtrans, onemkl::diag::nonunit, 30, - 5, 2, 42)); - EXPECT_TRUE(test>(GetParam(), onemkl::uplo::upper, - onemkl::transpose::conjtrans, onemkl::diag::nonunit, 30, - 5, 2, 42)); + EXPECT_TRUEORSKIP(test>(GetParam(), onemkl::uplo::lower, + onemkl::transpose::nontrans, onemkl::diag::unit, + 30, 5, 2, 42)); + EXPECT_TRUEORSKIP(test>(GetParam(), onemkl::uplo::upper, + onemkl::transpose::nontrans, onemkl::diag::unit, + 30, 5, 2, 42)); + EXPECT_TRUEORSKIP(test>(GetParam(), onemkl::uplo::lower, + onemkl::transpose::trans, onemkl::diag::unit, 30, + 5, 2, 42)); + EXPECT_TRUEORSKIP(test>(GetParam(), onemkl::uplo::upper, + onemkl::transpose::trans, onemkl::diag::unit, 30, + 5, 2, 42)); + EXPECT_TRUEORSKIP(test>(GetParam(), onemkl::uplo::lower, + onemkl::transpose::conjtrans, 
onemkl::diag::unit, + 30, 5, 2, 42)); + EXPECT_TRUEORSKIP(test>(GetParam(), onemkl::uplo::upper, + onemkl::transpose::conjtrans, onemkl::diag::unit, + 30, 5, 2, 42)); + EXPECT_TRUEORSKIP(test>(GetParam(), onemkl::uplo::lower, + onemkl::transpose::nontrans, onemkl::diag::nonunit, + 30, 5, 2, 42)); + EXPECT_TRUEORSKIP(test>(GetParam(), onemkl::uplo::upper, + onemkl::transpose::nontrans, onemkl::diag::nonunit, + 30, 5, 2, 42)); + EXPECT_TRUEORSKIP(test>(GetParam(), onemkl::uplo::lower, + onemkl::transpose::trans, onemkl::diag::nonunit, + 30, 5, 2, 42)); + EXPECT_TRUEORSKIP(test>(GetParam(), onemkl::uplo::upper, + onemkl::transpose::trans, onemkl::diag::nonunit, + 30, 5, 2, 42)); + EXPECT_TRUEORSKIP(test>(GetParam(), onemkl::uplo::lower, + onemkl::transpose::conjtrans, + onemkl::diag::nonunit, 30, 5, 2, 42)); + EXPECT_TRUEORSKIP(test>(GetParam(), onemkl::uplo::upper, + onemkl::transpose::conjtrans, + onemkl::diag::nonunit, 30, 5, 2, 42)); } INSTANTIATE_TEST_SUITE_P(TbmvTestSuite, TbmvTests, ::testing::ValuesIn(devices), diff --git a/tests/unit_tests/blas/level2/tbmv_usm.cpp b/tests/unit_tests/blas/level2/tbmv_usm.cpp new file mode 100644 index 000000000..c38ba20a9 --- /dev/null +++ b/tests/unit_tests/blas/level2/tbmv_usm.cpp @@ -0,0 +1,237 @@ +/******************************************************************************* +* Copyright 2020 Intel Corporation +* +* Licensed under the Apache License, Version 2.0 (the "License"); +* you may not use this file except in compliance with the License. +* You may obtain a copy of the License at +* +* http://www.apache.org/licenses/LICENSE-2.0 +* +* Unless required by applicable law or agreed to in writing, +* software distributed under the License is distributed on an "AS IS" BASIS, +* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +* See the License for the specific language governing permissions +* and limitations under the License. 
+* +* +* SPDX-License-Identifier: Apache-2.0 +*******************************************************************************/ + +#include +#include +#include +#include +#include +#include + +#include +#include "cblas.h" +#include "onemkl/detail/config.hpp" +#include "onemkl/onemkl.hpp" +#include "onemkl_blas_helper.hpp" +#include "reference_blas_templates.hpp" +#include "test_common.hpp" +#include "test_helper.hpp" + +#include + +using namespace cl::sycl; +using std::vector; + +extern std::vector devices; + +namespace { + +template +int test(const device& dev, onemkl::uplo upper_lower, onemkl::transpose transa, + onemkl::diag unit_nonunit, int n, int k, int incx, int lda) { + // Catch asynchronous exceptions. + auto exception_handler = [](exception_list exceptions) { + for (std::exception_ptr const& e : exceptions) { + try { + std::rethrow_exception(e); + } + catch (exception const& e) { + std::cout << "Caught asynchronous SYCL exception during TBMV:\n" + << e.what() << std::endl + << "OpenCL status: " << e.get_cl_code() << std::endl; + } + } + }; + + queue main_queue(dev, exception_handler); + context cxt = main_queue.get_context(); + event done; + std::vector dependencies; + + // Prepare data. + auto ua = usm_allocator(cxt, dev); + vector x(ua), A(ua); + rand_vector(x, n, incx); + rand_matrix(A, transa, n, n, lda); + + auto x_ref = x; + + // Call Reference TBMV. + const int n_ref = n, incx_ref = incx, lda_ref = lda; + const int k_ref = k; + using fp_ref = typename ref_type_info::type; + + ::tbmv(convert_to_cblas_uplo(upper_lower), convert_to_cblas_trans(transa), + convert_to_cblas_diag(unit_nonunit), &n_ref, &k_ref, (fp_ref*)A.data(), &lda_ref, + (fp_ref*)x_ref.data(), &incx_ref); + + // Call DPC++ TBMV. 
+ + try { +#ifdef CALL_RT_API + done = onemkl::blas::tbmv(main_queue, upper_lower, transa, unit_nonunit, n, k, A.data(), + lda, x.data(), incx, dependencies); + done.wait(); +#else + TEST_RUN_CT(main_queue, onemkl::blas::tbmv, + (main_queue, upper_lower, transa, unit_nonunit, n, k, A.data(), lda, x.data(), + incx, dependencies)); + main_queue.wait(); +#endif + } + catch (exception const& e) { + std::cout << "Caught synchronous SYCL exception during TBMV:\n" + << e.what() << std::endl + << "OpenCL status: " << e.get_cl_code() << std::endl; + } + + catch (const onemkl::backend_unsupported_exception& e) { + return test_skipped; + } + + catch (const std::runtime_error& error) { + std::cout << "Error raised during execution of TBMV:\n" << error.what() << std::endl; + } + + // Compare the results of reference implementation and DPC++ implementation. + + bool good = check_equal_vector(x, x_ref, n, incx, n, std::cout); + + return (int)good; +} + +class TbmvUsmTests : public ::testing::TestWithParam {}; + +TEST_P(TbmvUsmTests, RealSinglePrecision) { + EXPECT_TRUEORSKIP(test(GetParam(), onemkl::uplo::lower, onemkl::transpose::nontrans, + onemkl::diag::unit, 30, 5, 2, 42)); + EXPECT_TRUEORSKIP(test(GetParam(), onemkl::uplo::upper, onemkl::transpose::nontrans, + onemkl::diag::unit, 30, 5, 2, 42)); + EXPECT_TRUEORSKIP(test(GetParam(), onemkl::uplo::lower, onemkl::transpose::trans, + onemkl::diag::unit, 30, 5, 2, 42)); + EXPECT_TRUEORSKIP(test(GetParam(), onemkl::uplo::upper, onemkl::transpose::trans, + onemkl::diag::unit, 30, 5, 2, 42)); + EXPECT_TRUEORSKIP(test(GetParam(), onemkl::uplo::lower, onemkl::transpose::nontrans, + onemkl::diag::nonunit, 30, 5, 2, 42)); + EXPECT_TRUEORSKIP(test(GetParam(), onemkl::uplo::upper, onemkl::transpose::nontrans, + onemkl::diag::nonunit, 30, 5, 2, 42)); + EXPECT_TRUEORSKIP(test(GetParam(), onemkl::uplo::lower, onemkl::transpose::trans, + onemkl::diag::nonunit, 30, 5, 2, 42)); + EXPECT_TRUEORSKIP(test(GetParam(), onemkl::uplo::upper, 
onemkl::transpose::trans, + onemkl::diag::nonunit, 30, 5, 2, 42)); +} +TEST_P(TbmvUsmTests, RealDoublePrecision) { + EXPECT_TRUEORSKIP(test(GetParam(), onemkl::uplo::lower, onemkl::transpose::nontrans, + onemkl::diag::unit, 30, 5, 2, 42)); + EXPECT_TRUEORSKIP(test(GetParam(), onemkl::uplo::upper, onemkl::transpose::nontrans, + onemkl::diag::unit, 30, 5, 2, 42)); + EXPECT_TRUEORSKIP(test(GetParam(), onemkl::uplo::lower, onemkl::transpose::trans, + onemkl::diag::unit, 30, 5, 2, 42)); + EXPECT_TRUEORSKIP(test(GetParam(), onemkl::uplo::upper, onemkl::transpose::trans, + onemkl::diag::unit, 30, 5, 2, 42)); + EXPECT_TRUEORSKIP(test(GetParam(), onemkl::uplo::lower, onemkl::transpose::nontrans, + onemkl::diag::nonunit, 30, 5, 2, 42)); + EXPECT_TRUEORSKIP(test(GetParam(), onemkl::uplo::upper, onemkl::transpose::nontrans, + onemkl::diag::nonunit, 30, 5, 2, 42)); + EXPECT_TRUEORSKIP(test(GetParam(), onemkl::uplo::lower, onemkl::transpose::trans, + onemkl::diag::nonunit, 30, 5, 2, 42)); + EXPECT_TRUEORSKIP(test(GetParam(), onemkl::uplo::upper, onemkl::transpose::trans, + onemkl::diag::nonunit, 30, 5, 2, 42)); +} +TEST_P(TbmvUsmTests, ComplexSinglePrecision) { + EXPECT_TRUEORSKIP(test>(GetParam(), onemkl::uplo::lower, + onemkl::transpose::nontrans, onemkl::diag::unit, 30, + 5, 2, 42)); + EXPECT_TRUEORSKIP(test>(GetParam(), onemkl::uplo::upper, + onemkl::transpose::nontrans, onemkl::diag::unit, 30, + 5, 2, 42)); + EXPECT_TRUEORSKIP(test>(GetParam(), onemkl::uplo::lower, + onemkl::transpose::trans, onemkl::diag::unit, 30, 5, + 2, 42)); + EXPECT_TRUEORSKIP(test>(GetParam(), onemkl::uplo::upper, + onemkl::transpose::trans, onemkl::diag::unit, 30, 5, + 2, 42)); + EXPECT_TRUEORSKIP(test>(GetParam(), onemkl::uplo::lower, + onemkl::transpose::conjtrans, onemkl::diag::unit, + 30, 5, 2, 42)); + EXPECT_TRUEORSKIP(test>(GetParam(), onemkl::uplo::upper, + onemkl::transpose::conjtrans, onemkl::diag::unit, + 30, 5, 2, 42)); + EXPECT_TRUEORSKIP(test>(GetParam(), onemkl::uplo::lower, + 
onemkl::transpose::nontrans, onemkl::diag::nonunit, + 30, 5, 2, 42)); + EXPECT_TRUEORSKIP(test>(GetParam(), onemkl::uplo::upper, + onemkl::transpose::nontrans, onemkl::diag::nonunit, + 30, 5, 2, 42)); + EXPECT_TRUEORSKIP(test>(GetParam(), onemkl::uplo::lower, + onemkl::transpose::trans, onemkl::diag::nonunit, 30, + 5, 2, 42)); + EXPECT_TRUEORSKIP(test>(GetParam(), onemkl::uplo::upper, + onemkl::transpose::trans, onemkl::diag::nonunit, 30, + 5, 2, 42)); + EXPECT_TRUEORSKIP(test>(GetParam(), onemkl::uplo::lower, + onemkl::transpose::conjtrans, onemkl::diag::nonunit, + 30, 5, 2, 42)); + EXPECT_TRUEORSKIP(test>(GetParam(), onemkl::uplo::upper, + onemkl::transpose::conjtrans, onemkl::diag::nonunit, + 30, 5, 2, 42)); +} +TEST_P(TbmvUsmTests, ComplexDoublePrecision) { + EXPECT_TRUEORSKIP(test>(GetParam(), onemkl::uplo::lower, + onemkl::transpose::nontrans, onemkl::diag::unit, + 30, 5, 2, 42)); + EXPECT_TRUEORSKIP(test>(GetParam(), onemkl::uplo::upper, + onemkl::transpose::nontrans, onemkl::diag::unit, + 30, 5, 2, 42)); + EXPECT_TRUEORSKIP(test>(GetParam(), onemkl::uplo::lower, + onemkl::transpose::trans, onemkl::diag::unit, 30, + 5, 2, 42)); + EXPECT_TRUEORSKIP(test>(GetParam(), onemkl::uplo::upper, + onemkl::transpose::trans, onemkl::diag::unit, 30, + 5, 2, 42)); + EXPECT_TRUEORSKIP(test>(GetParam(), onemkl::uplo::lower, + onemkl::transpose::conjtrans, onemkl::diag::unit, + 30, 5, 2, 42)); + EXPECT_TRUEORSKIP(test>(GetParam(), onemkl::uplo::upper, + onemkl::transpose::conjtrans, onemkl::diag::unit, + 30, 5, 2, 42)); + EXPECT_TRUEORSKIP(test>(GetParam(), onemkl::uplo::lower, + onemkl::transpose::nontrans, onemkl::diag::nonunit, + 30, 5, 2, 42)); + EXPECT_TRUEORSKIP(test>(GetParam(), onemkl::uplo::upper, + onemkl::transpose::nontrans, onemkl::diag::nonunit, + 30, 5, 2, 42)); + EXPECT_TRUEORSKIP(test>(GetParam(), onemkl::uplo::lower, + onemkl::transpose::trans, onemkl::diag::nonunit, + 30, 5, 2, 42)); + EXPECT_TRUEORSKIP(test>(GetParam(), onemkl::uplo::upper, + 
onemkl::transpose::trans, onemkl::diag::nonunit, + 30, 5, 2, 42)); + EXPECT_TRUEORSKIP(test>(GetParam(), onemkl::uplo::lower, + onemkl::transpose::conjtrans, + onemkl::diag::nonunit, 30, 5, 2, 42)); + EXPECT_TRUEORSKIP(test>(GetParam(), onemkl::uplo::upper, + onemkl::transpose::conjtrans, + onemkl::diag::nonunit, 30, 5, 2, 42)); +} + +INSTANTIATE_TEST_SUITE_P(TbmvUsmTestSuite, TbmvUsmTests, ::testing::ValuesIn(devices), + ::DeviceNamePrint()); + +} // anonymous namespace diff --git a/tests/unit_tests/blas/level2/tbsv.cpp b/tests/unit_tests/blas/level2/tbsv.cpp index 1a09c8dbf..9f3a3b68f 100644 --- a/tests/unit_tests/blas/level2/tbsv.cpp +++ b/tests/unit_tests/blas/level2/tbsv.cpp @@ -43,8 +43,8 @@ extern std::vector devices; namespace { template -bool test(const device& dev, onemkl::uplo upper_lower, onemkl::transpose transa, - onemkl::diag unit_nonunit, int n, int k, int incx, int lda) { +int test(const device& dev, onemkl::uplo upper_lower, onemkl::transpose transa, + onemkl::diag unit_nonunit, int n, int k, int incx, int lda) { // Prepare data. vector x, x_ref, A; rand_vector(x, n, incx); @@ -97,6 +97,14 @@ bool test(const device& dev, onemkl::uplo upper_lower, onemkl::transpose transa, << "OpenCL status: " << e.get_cl_code() << std::endl; } + catch (const onemkl::backend_unsupported_exception& e) { + return test_skipped; + } + + catch (const std::runtime_error& error) { + std::cout << "Error raised during execution of TBSV:\n" << error.what() << std::endl; + } + // Compare the results of reference implementation and DPC++ implementation. 
bool good; { @@ -104,118 +112,122 @@ bool test(const device& dev, onemkl::uplo upper_lower, onemkl::transpose transa, good = check_equal_trsv_vector(x_accessor, x_ref, n, incx, n, std::cout); } - return good; + return (int)good; } class TbsvTests : public ::testing::TestWithParam {}; TEST_P(TbsvTests, RealSinglePrecision) { - EXPECT_TRUE(test(GetParam(), onemkl::uplo::lower, onemkl::transpose::nontrans, - onemkl::diag::unit, 30, 5, 2, 42)); - EXPECT_TRUE(test(GetParam(), onemkl::uplo::upper, onemkl::transpose::nontrans, - onemkl::diag::unit, 30, 5, 2, 42)); - EXPECT_TRUE(test(GetParam(), onemkl::uplo::lower, onemkl::transpose::trans, - onemkl::diag::unit, 30, 5, 2, 42)); - EXPECT_TRUE(test(GetParam(), onemkl::uplo::upper, onemkl::transpose::trans, - onemkl::diag::unit, 30, 5, 2, 42)); - EXPECT_TRUE(test(GetParam(), onemkl::uplo::lower, onemkl::transpose::nontrans, - onemkl::diag::nonunit, 30, 5, 2, 42)); - EXPECT_TRUE(test(GetParam(), onemkl::uplo::upper, onemkl::transpose::nontrans, - onemkl::diag::nonunit, 30, 5, 2, 42)); - EXPECT_TRUE(test(GetParam(), onemkl::uplo::lower, onemkl::transpose::trans, - onemkl::diag::nonunit, 30, 5, 2, 42)); - EXPECT_TRUE(test(GetParam(), onemkl::uplo::upper, onemkl::transpose::trans, - onemkl::diag::nonunit, 30, 5, 2, 42)); + EXPECT_TRUEORSKIP(test(GetParam(), onemkl::uplo::lower, onemkl::transpose::nontrans, + onemkl::diag::unit, 30, 5, 2, 42)); + EXPECT_TRUEORSKIP(test(GetParam(), onemkl::uplo::upper, onemkl::transpose::nontrans, + onemkl::diag::unit, 30, 5, 2, 42)); + EXPECT_TRUEORSKIP(test(GetParam(), onemkl::uplo::lower, onemkl::transpose::trans, + onemkl::diag::unit, 30, 5, 2, 42)); + EXPECT_TRUEORSKIP(test(GetParam(), onemkl::uplo::upper, onemkl::transpose::trans, + onemkl::diag::unit, 30, 5, 2, 42)); + EXPECT_TRUEORSKIP(test(GetParam(), onemkl::uplo::lower, onemkl::transpose::nontrans, + onemkl::diag::nonunit, 30, 5, 2, 42)); + EXPECT_TRUEORSKIP(test(GetParam(), onemkl::uplo::upper, onemkl::transpose::nontrans, + 
onemkl::diag::nonunit, 30, 5, 2, 42)); + EXPECT_TRUEORSKIP(test(GetParam(), onemkl::uplo::lower, onemkl::transpose::trans, + onemkl::diag::nonunit, 30, 5, 2, 42)); + EXPECT_TRUEORSKIP(test(GetParam(), onemkl::uplo::upper, onemkl::transpose::trans, + onemkl::diag::nonunit, 30, 5, 2, 42)); } TEST_P(TbsvTests, RealDoublePrecision) { - EXPECT_TRUE(test(GetParam(), onemkl::uplo::lower, onemkl::transpose::nontrans, - onemkl::diag::unit, 30, 5, 2, 42)); - EXPECT_TRUE(test(GetParam(), onemkl::uplo::upper, onemkl::transpose::nontrans, - onemkl::diag::unit, 30, 5, 2, 42)); - EXPECT_TRUE(test(GetParam(), onemkl::uplo::lower, onemkl::transpose::trans, - onemkl::diag::unit, 30, 5, 2, 42)); - EXPECT_TRUE(test(GetParam(), onemkl::uplo::upper, onemkl::transpose::trans, - onemkl::diag::unit, 30, 5, 2, 42)); - EXPECT_TRUE(test(GetParam(), onemkl::uplo::lower, onemkl::transpose::nontrans, - onemkl::diag::nonunit, 30, 5, 2, 42)); - EXPECT_TRUE(test(GetParam(), onemkl::uplo::upper, onemkl::transpose::nontrans, - onemkl::diag::nonunit, 30, 5, 2, 42)); - EXPECT_TRUE(test(GetParam(), onemkl::uplo::lower, onemkl::transpose::trans, - onemkl::diag::nonunit, 30, 5, 2, 42)); - EXPECT_TRUE(test(GetParam(), onemkl::uplo::upper, onemkl::transpose::trans, - onemkl::diag::nonunit, 30, 5, 2, 42)); + EXPECT_TRUEORSKIP(test(GetParam(), onemkl::uplo::lower, onemkl::transpose::nontrans, + onemkl::diag::unit, 30, 5, 2, 42)); + EXPECT_TRUEORSKIP(test(GetParam(), onemkl::uplo::upper, onemkl::transpose::nontrans, + onemkl::diag::unit, 30, 5, 2, 42)); + EXPECT_TRUEORSKIP(test(GetParam(), onemkl::uplo::lower, onemkl::transpose::trans, + onemkl::diag::unit, 30, 5, 2, 42)); + EXPECT_TRUEORSKIP(test(GetParam(), onemkl::uplo::upper, onemkl::transpose::trans, + onemkl::diag::unit, 30, 5, 2, 42)); + EXPECT_TRUEORSKIP(test(GetParam(), onemkl::uplo::lower, onemkl::transpose::nontrans, + onemkl::diag::nonunit, 30, 5, 2, 42)); + EXPECT_TRUEORSKIP(test(GetParam(), onemkl::uplo::upper, onemkl::transpose::nontrans, + 
onemkl::diag::nonunit, 30, 5, 2, 42)); + EXPECT_TRUEORSKIP(test(GetParam(), onemkl::uplo::lower, onemkl::transpose::trans, + onemkl::diag::nonunit, 30, 5, 2, 42)); + EXPECT_TRUEORSKIP(test(GetParam(), onemkl::uplo::upper, onemkl::transpose::trans, + onemkl::diag::nonunit, 30, 5, 2, 42)); } TEST_P(TbsvTests, ComplexSinglePrecision) { - EXPECT_TRUE(test>(GetParam(), onemkl::uplo::lower, - onemkl::transpose::nontrans, onemkl::diag::unit, 30, 5, 2, - 42)); - EXPECT_TRUE(test>(GetParam(), onemkl::uplo::upper, - onemkl::transpose::nontrans, onemkl::diag::unit, 30, 5, 2, - 42)); - EXPECT_TRUE(test>(GetParam(), onemkl::uplo::lower, onemkl::transpose::trans, - onemkl::diag::unit, 30, 5, 2, 42)); - EXPECT_TRUE(test>(GetParam(), onemkl::uplo::upper, onemkl::transpose::trans, - onemkl::diag::unit, 30, 5, 2, 42)); - EXPECT_TRUE(test>(GetParam(), onemkl::uplo::lower, - onemkl::transpose::conjtrans, onemkl::diag::unit, 30, 5, - 2, 42)); - EXPECT_TRUE(test>(GetParam(), onemkl::uplo::upper, - onemkl::transpose::conjtrans, onemkl::diag::unit, 30, 5, - 2, 42)); - EXPECT_TRUE(test>(GetParam(), onemkl::uplo::lower, - onemkl::transpose::nontrans, onemkl::diag::nonunit, 30, 5, - 2, 42)); - EXPECT_TRUE(test>(GetParam(), onemkl::uplo::upper, - onemkl::transpose::nontrans, onemkl::diag::nonunit, 30, 5, - 2, 42)); - EXPECT_TRUE(test>(GetParam(), onemkl::uplo::lower, onemkl::transpose::trans, - onemkl::diag::nonunit, 30, 5, 2, 42)); - EXPECT_TRUE(test>(GetParam(), onemkl::uplo::upper, onemkl::transpose::trans, - onemkl::diag::nonunit, 30, 5, 2, 42)); - EXPECT_TRUE(test>(GetParam(), onemkl::uplo::lower, - onemkl::transpose::conjtrans, onemkl::diag::nonunit, 30, - 5, 2, 42)); - EXPECT_TRUE(test>(GetParam(), onemkl::uplo::upper, - onemkl::transpose::conjtrans, onemkl::diag::nonunit, 30, - 5, 2, 42)); + EXPECT_TRUEORSKIP(test>(GetParam(), onemkl::uplo::lower, + onemkl::transpose::nontrans, onemkl::diag::unit, 30, + 5, 2, 42)); + EXPECT_TRUEORSKIP(test>(GetParam(), onemkl::uplo::upper, + 
onemkl::transpose::nontrans, onemkl::diag::unit, 30, + 5, 2, 42)); + EXPECT_TRUEORSKIP(test>(GetParam(), onemkl::uplo::lower, + onemkl::transpose::trans, onemkl::diag::unit, 30, 5, + 2, 42)); + EXPECT_TRUEORSKIP(test>(GetParam(), onemkl::uplo::upper, + onemkl::transpose::trans, onemkl::diag::unit, 30, 5, + 2, 42)); + EXPECT_TRUEORSKIP(test>(GetParam(), onemkl::uplo::lower, + onemkl::transpose::conjtrans, onemkl::diag::unit, + 30, 5, 2, 42)); + EXPECT_TRUEORSKIP(test>(GetParam(), onemkl::uplo::upper, + onemkl::transpose::conjtrans, onemkl::diag::unit, + 30, 5, 2, 42)); + EXPECT_TRUEORSKIP(test>(GetParam(), onemkl::uplo::lower, + onemkl::transpose::nontrans, onemkl::diag::nonunit, + 30, 5, 2, 42)); + EXPECT_TRUEORSKIP(test>(GetParam(), onemkl::uplo::upper, + onemkl::transpose::nontrans, onemkl::diag::nonunit, + 30, 5, 2, 42)); + EXPECT_TRUEORSKIP(test>(GetParam(), onemkl::uplo::lower, + onemkl::transpose::trans, onemkl::diag::nonunit, 30, + 5, 2, 42)); + EXPECT_TRUEORSKIP(test>(GetParam(), onemkl::uplo::upper, + onemkl::transpose::trans, onemkl::diag::nonunit, 30, + 5, 2, 42)); + EXPECT_TRUEORSKIP(test>(GetParam(), onemkl::uplo::lower, + onemkl::transpose::conjtrans, onemkl::diag::nonunit, + 30, 5, 2, 42)); + EXPECT_TRUEORSKIP(test>(GetParam(), onemkl::uplo::upper, + onemkl::transpose::conjtrans, onemkl::diag::nonunit, + 30, 5, 2, 42)); } TEST_P(TbsvTests, ComplexDoublePrecision) { - EXPECT_TRUE(test>(GetParam(), onemkl::uplo::lower, - onemkl::transpose::nontrans, onemkl::diag::unit, 30, 5, - 2, 42)); - EXPECT_TRUE(test>(GetParam(), onemkl::uplo::upper, - onemkl::transpose::nontrans, onemkl::diag::unit, 30, 5, - 2, 42)); - EXPECT_TRUE(test>(GetParam(), onemkl::uplo::lower, - onemkl::transpose::trans, onemkl::diag::unit, 30, 5, 2, - 42)); - EXPECT_TRUE(test>(GetParam(), onemkl::uplo::upper, - onemkl::transpose::trans, onemkl::diag::unit, 30, 5, 2, - 42)); - EXPECT_TRUE(test>(GetParam(), onemkl::uplo::lower, - onemkl::transpose::conjtrans, onemkl::diag::unit, 30, 5, - 
2, 42)); - EXPECT_TRUE(test>(GetParam(), onemkl::uplo::upper, - onemkl::transpose::conjtrans, onemkl::diag::unit, 30, 5, - 2, 42)); - EXPECT_TRUE(test>(GetParam(), onemkl::uplo::lower, - onemkl::transpose::nontrans, onemkl::diag::nonunit, 30, - 5, 2, 42)); - EXPECT_TRUE(test>(GetParam(), onemkl::uplo::upper, - onemkl::transpose::nontrans, onemkl::diag::nonunit, 30, - 5, 2, 42)); - EXPECT_TRUE(test>(GetParam(), onemkl::uplo::lower, - onemkl::transpose::trans, onemkl::diag::nonunit, 30, 5, - 2, 42)); - EXPECT_TRUE(test>(GetParam(), onemkl::uplo::upper, - onemkl::transpose::trans, onemkl::diag::nonunit, 30, 5, - 2, 42)); - EXPECT_TRUE(test>(GetParam(), onemkl::uplo::lower, - onemkl::transpose::conjtrans, onemkl::diag::nonunit, 30, - 5, 2, 42)); - EXPECT_TRUE(test>(GetParam(), onemkl::uplo::upper, - onemkl::transpose::conjtrans, onemkl::diag::nonunit, 30, - 5, 2, 42)); + EXPECT_TRUEORSKIP(test>(GetParam(), onemkl::uplo::lower, + onemkl::transpose::nontrans, onemkl::diag::unit, + 30, 5, 2, 42)); + EXPECT_TRUEORSKIP(test>(GetParam(), onemkl::uplo::upper, + onemkl::transpose::nontrans, onemkl::diag::unit, + 30, 5, 2, 42)); + EXPECT_TRUEORSKIP(test>(GetParam(), onemkl::uplo::lower, + onemkl::transpose::trans, onemkl::diag::unit, 30, + 5, 2, 42)); + EXPECT_TRUEORSKIP(test>(GetParam(), onemkl::uplo::upper, + onemkl::transpose::trans, onemkl::diag::unit, 30, + 5, 2, 42)); + EXPECT_TRUEORSKIP(test>(GetParam(), onemkl::uplo::lower, + onemkl::transpose::conjtrans, onemkl::diag::unit, + 30, 5, 2, 42)); + EXPECT_TRUEORSKIP(test>(GetParam(), onemkl::uplo::upper, + onemkl::transpose::conjtrans, onemkl::diag::unit, + 30, 5, 2, 42)); + EXPECT_TRUEORSKIP(test>(GetParam(), onemkl::uplo::lower, + onemkl::transpose::nontrans, onemkl::diag::nonunit, + 30, 5, 2, 42)); + EXPECT_TRUEORSKIP(test>(GetParam(), onemkl::uplo::upper, + onemkl::transpose::nontrans, onemkl::diag::nonunit, + 30, 5, 2, 42)); + EXPECT_TRUEORSKIP(test>(GetParam(), onemkl::uplo::lower, + onemkl::transpose::trans, 
onemkl::diag::nonunit, + 30, 5, 2, 42)); + EXPECT_TRUEORSKIP(test>(GetParam(), onemkl::uplo::upper, + onemkl::transpose::trans, onemkl::diag::nonunit, + 30, 5, 2, 42)); + EXPECT_TRUEORSKIP(test>(GetParam(), onemkl::uplo::lower, + onemkl::transpose::conjtrans, + onemkl::diag::nonunit, 30, 5, 2, 42)); + EXPECT_TRUEORSKIP(test>(GetParam(), onemkl::uplo::upper, + onemkl::transpose::conjtrans, + onemkl::diag::nonunit, 30, 5, 2, 42)); } INSTANTIATE_TEST_SUITE_P(TbsvTestSuite, TbsvTests, ::testing::ValuesIn(devices), diff --git a/tests/unit_tests/blas/level2/tbsv_usm.cpp b/tests/unit_tests/blas/level2/tbsv_usm.cpp new file mode 100644 index 000000000..b01a03ee8 --- /dev/null +++ b/tests/unit_tests/blas/level2/tbsv_usm.cpp @@ -0,0 +1,237 @@ +/******************************************************************************* +* Copyright 2020 Intel Corporation +* +* Licensed under the Apache License, Version 2.0 (the "License"); +* you may not use this file except in compliance with the License. +* You may obtain a copy of the License at +* +* http://www.apache.org/licenses/LICENSE-2.0 +* +* Unless required by applicable law or agreed to in writing, +* software distributed under the License is distributed on an "AS IS" BASIS, +* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +* See the License for the specific language governing permissions +* and limitations under the License. 
+* +* +* SPDX-License-Identifier: Apache-2.0 +*******************************************************************************/ + +#include +#include +#include +#include +#include +#include + +#include +#include "cblas.h" +#include "onemkl/detail/config.hpp" +#include "onemkl/onemkl.hpp" +#include "onemkl_blas_helper.hpp" +#include "reference_blas_templates.hpp" +#include "test_common.hpp" +#include "test_helper.hpp" + +#include + +using namespace cl::sycl; +using std::vector; + +extern std::vector devices; + +namespace { + +template +int test(const device& dev, onemkl::uplo upper_lower, onemkl::transpose transa, + onemkl::diag unit_nonunit, int n, int k, int incx, int lda) { + // Catch asynchronous exceptions. + auto exception_handler = [](exception_list exceptions) { + for (std::exception_ptr const& e : exceptions) { + try { + std::rethrow_exception(e); + } + catch (exception const& e) { + std::cout << "Caught asynchronous SYCL exception during TBSV:\n" + << e.what() << std::endl + << "OpenCL status: " << e.get_cl_code() << std::endl; + } + } + }; + + queue main_queue(dev, exception_handler); + context cxt = main_queue.get_context(); + event done; + std::vector dependencies; + + // Prepare data. + auto ua = usm_allocator(cxt, dev); + vector x(ua), A(ua); + rand_vector(x, n, incx); + rand_trsm_matrix(A, transa, n, n, lda); + + auto x_ref = x; + + // Call Reference TBSV. + const int n_ref = n, incx_ref = incx, lda_ref = lda; + const int k_ref = k; + using fp_ref = typename ref_type_info::type; + + ::tbsv(convert_to_cblas_uplo(upper_lower), convert_to_cblas_trans(transa), + convert_to_cblas_diag(unit_nonunit), &n_ref, &k_ref, (fp_ref*)A.data(), &lda_ref, + (fp_ref*)x_ref.data(), &incx_ref); + + // Call DPC++ TBSV. 
+ + try { +#ifdef CALL_RT_API + done = onemkl::blas::tbsv(main_queue, upper_lower, transa, unit_nonunit, n, k, A.data(), + lda, x.data(), incx, dependencies); + done.wait(); +#else + TEST_RUN_CT(main_queue, onemkl::blas::tbsv, + (main_queue, upper_lower, transa, unit_nonunit, n, k, A.data(), lda, x.data(), + incx, dependencies)); + main_queue.wait(); +#endif + } + catch (exception const& e) { + std::cout << "Caught synchronous SYCL exception during TBSV:\n" + << e.what() << std::endl + << "OpenCL status: " << e.get_cl_code() << std::endl; + } + + catch (const onemkl::backend_unsupported_exception& e) { + return test_skipped; + } + + catch (const std::runtime_error& error) { + std::cout << "Error raised during execution of TBSV:\n" << error.what() << std::endl; + } + + // Compare the results of reference implementation and DPC++ implementation. + + bool good = check_equal_trsv_vector(x, x_ref, n, incx, n, std::cout); + + return (int)good; +} + +class TbsvUsmTests : public ::testing::TestWithParam {}; + +TEST_P(TbsvUsmTests, RealSinglePrecision) { + EXPECT_TRUEORSKIP(test(GetParam(), onemkl::uplo::lower, onemkl::transpose::nontrans, + onemkl::diag::unit, 30, 5, 2, 42)); + EXPECT_TRUEORSKIP(test(GetParam(), onemkl::uplo::upper, onemkl::transpose::nontrans, + onemkl::diag::unit, 30, 5, 2, 42)); + EXPECT_TRUEORSKIP(test(GetParam(), onemkl::uplo::lower, onemkl::transpose::trans, + onemkl::diag::unit, 30, 5, 2, 42)); + EXPECT_TRUEORSKIP(test(GetParam(), onemkl::uplo::upper, onemkl::transpose::trans, + onemkl::diag::unit, 30, 5, 2, 42)); + EXPECT_TRUEORSKIP(test(GetParam(), onemkl::uplo::lower, onemkl::transpose::nontrans, + onemkl::diag::nonunit, 30, 5, 2, 42)); + EXPECT_TRUEORSKIP(test(GetParam(), onemkl::uplo::upper, onemkl::transpose::nontrans, + onemkl::diag::nonunit, 30, 5, 2, 42)); + EXPECT_TRUEORSKIP(test(GetParam(), onemkl::uplo::lower, onemkl::transpose::trans, + onemkl::diag::nonunit, 30, 5, 2, 42)); + EXPECT_TRUEORSKIP(test(GetParam(), onemkl::uplo::upper, 
onemkl::transpose::trans, + onemkl::diag::nonunit, 30, 5, 2, 42)); +} +TEST_P(TbsvUsmTests, RealDoublePrecision) { + EXPECT_TRUEORSKIP(test(GetParam(), onemkl::uplo::lower, onemkl::transpose::nontrans, + onemkl::diag::unit, 30, 5, 2, 42)); + EXPECT_TRUEORSKIP(test(GetParam(), onemkl::uplo::upper, onemkl::transpose::nontrans, + onemkl::diag::unit, 30, 5, 2, 42)); + EXPECT_TRUEORSKIP(test(GetParam(), onemkl::uplo::lower, onemkl::transpose::trans, + onemkl::diag::unit, 30, 5, 2, 42)); + EXPECT_TRUEORSKIP(test(GetParam(), onemkl::uplo::upper, onemkl::transpose::trans, + onemkl::diag::unit, 30, 5, 2, 42)); + EXPECT_TRUEORSKIP(test(GetParam(), onemkl::uplo::lower, onemkl::transpose::nontrans, + onemkl::diag::nonunit, 30, 5, 2, 42)); + EXPECT_TRUEORSKIP(test(GetParam(), onemkl::uplo::upper, onemkl::transpose::nontrans, + onemkl::diag::nonunit, 30, 5, 2, 42)); + EXPECT_TRUEORSKIP(test(GetParam(), onemkl::uplo::lower, onemkl::transpose::trans, + onemkl::diag::nonunit, 30, 5, 2, 42)); + EXPECT_TRUEORSKIP(test(GetParam(), onemkl::uplo::upper, onemkl::transpose::trans, + onemkl::diag::nonunit, 30, 5, 2, 42)); +} +TEST_P(TbsvUsmTests, ComplexSinglePrecision) { + EXPECT_TRUEORSKIP(test>(GetParam(), onemkl::uplo::lower, + onemkl::transpose::nontrans, onemkl::diag::unit, 30, + 5, 2, 42)); + EXPECT_TRUEORSKIP(test>(GetParam(), onemkl::uplo::upper, + onemkl::transpose::nontrans, onemkl::diag::unit, 30, + 5, 2, 42)); + EXPECT_TRUEORSKIP(test>(GetParam(), onemkl::uplo::lower, + onemkl::transpose::trans, onemkl::diag::unit, 30, 5, + 2, 42)); + EXPECT_TRUEORSKIP(test>(GetParam(), onemkl::uplo::upper, + onemkl::transpose::trans, onemkl::diag::unit, 30, 5, + 2, 42)); + EXPECT_TRUEORSKIP(test>(GetParam(), onemkl::uplo::lower, + onemkl::transpose::conjtrans, onemkl::diag::unit, + 30, 5, 2, 42)); + EXPECT_TRUEORSKIP(test>(GetParam(), onemkl::uplo::upper, + onemkl::transpose::conjtrans, onemkl::diag::unit, + 30, 5, 2, 42)); + EXPECT_TRUEORSKIP(test>(GetParam(), onemkl::uplo::lower, + 
onemkl::transpose::nontrans, onemkl::diag::nonunit, + 30, 5, 2, 42)); + EXPECT_TRUEORSKIP(test>(GetParam(), onemkl::uplo::upper, + onemkl::transpose::nontrans, onemkl::diag::nonunit, + 30, 5, 2, 42)); + EXPECT_TRUEORSKIP(test>(GetParam(), onemkl::uplo::lower, + onemkl::transpose::trans, onemkl::diag::nonunit, 30, + 5, 2, 42)); + EXPECT_TRUEORSKIP(test>(GetParam(), onemkl::uplo::upper, + onemkl::transpose::trans, onemkl::diag::nonunit, 30, + 5, 2, 42)); + EXPECT_TRUEORSKIP(test>(GetParam(), onemkl::uplo::lower, + onemkl::transpose::conjtrans, onemkl::diag::nonunit, + 30, 5, 2, 42)); + EXPECT_TRUEORSKIP(test>(GetParam(), onemkl::uplo::upper, + onemkl::transpose::conjtrans, onemkl::diag::nonunit, + 30, 5, 2, 42)); +} +TEST_P(TbsvUsmTests, ComplexDoublePrecision) { + EXPECT_TRUEORSKIP(test>(GetParam(), onemkl::uplo::lower, + onemkl::transpose::nontrans, onemkl::diag::unit, + 30, 5, 2, 42)); + EXPECT_TRUEORSKIP(test>(GetParam(), onemkl::uplo::upper, + onemkl::transpose::nontrans, onemkl::diag::unit, + 30, 5, 2, 42)); + EXPECT_TRUEORSKIP(test>(GetParam(), onemkl::uplo::lower, + onemkl::transpose::trans, onemkl::diag::unit, 30, + 5, 2, 42)); + EXPECT_TRUEORSKIP(test>(GetParam(), onemkl::uplo::upper, + onemkl::transpose::trans, onemkl::diag::unit, 30, + 5, 2, 42)); + EXPECT_TRUEORSKIP(test>(GetParam(), onemkl::uplo::lower, + onemkl::transpose::conjtrans, onemkl::diag::unit, + 30, 5, 2, 42)); + EXPECT_TRUEORSKIP(test>(GetParam(), onemkl::uplo::upper, + onemkl::transpose::conjtrans, onemkl::diag::unit, + 30, 5, 2, 42)); + EXPECT_TRUEORSKIP(test>(GetParam(), onemkl::uplo::lower, + onemkl::transpose::nontrans, onemkl::diag::nonunit, + 30, 5, 2, 42)); + EXPECT_TRUEORSKIP(test>(GetParam(), onemkl::uplo::upper, + onemkl::transpose::nontrans, onemkl::diag::nonunit, + 30, 5, 2, 42)); + EXPECT_TRUEORSKIP(test>(GetParam(), onemkl::uplo::lower, + onemkl::transpose::trans, onemkl::diag::nonunit, + 30, 5, 2, 42)); + EXPECT_TRUEORSKIP(test>(GetParam(), onemkl::uplo::upper, + 
onemkl::transpose::trans, onemkl::diag::nonunit, + 30, 5, 2, 42)); + EXPECT_TRUEORSKIP(test>(GetParam(), onemkl::uplo::lower, + onemkl::transpose::conjtrans, + onemkl::diag::nonunit, 30, 5, 2, 42)); + EXPECT_TRUEORSKIP(test>(GetParam(), onemkl::uplo::upper, + onemkl::transpose::conjtrans, + onemkl::diag::nonunit, 30, 5, 2, 42)); +} + +INSTANTIATE_TEST_SUITE_P(TbsvUsmTestSuite, TbsvUsmTests, ::testing::ValuesIn(devices), + ::DeviceNamePrint()); + +} // anonymous namespace diff --git a/tests/unit_tests/blas/level2/tpmv.cpp b/tests/unit_tests/blas/level2/tpmv.cpp index 678c837b3..c85dacf0d 100644 --- a/tests/unit_tests/blas/level2/tpmv.cpp +++ b/tests/unit_tests/blas/level2/tpmv.cpp @@ -43,8 +43,8 @@ extern std::vector devices; namespace { template -bool test(const device& dev, onemkl::uplo upper_lower, onemkl::transpose transa, - onemkl::diag unit_nonunit, int n, int incx) { +int test(const device& dev, onemkl::uplo upper_lower, onemkl::transpose transa, + onemkl::diag unit_nonunit, int n, int incx) { // Prepare data. vector x, x_ref, A; rand_vector(x, n, incx); @@ -95,6 +95,14 @@ bool test(const device& dev, onemkl::uplo upper_lower, onemkl::transpose transa, << "OpenCL status: " << e.get_cl_code() << std::endl; } + catch (const onemkl::backend_unsupported_exception& e) { + return test_skipped; + } + + catch (const std::runtime_error& error) { + std::cout << "Error raised during execution of TPMV:\n" << error.what() << std::endl; + } + // Compare the results of reference implementation and DPC++ implementation. 
bool good; { @@ -102,106 +110,106 @@ bool test(const device& dev, onemkl::uplo upper_lower, onemkl::transpose transa, good = check_equal_vector(x_accessor, x_ref, n, incx, n, std::cout); } - return good; + return (int)good; } class TpmvTests : public ::testing::TestWithParam {}; TEST_P(TpmvTests, RealSinglePrecision) { - EXPECT_TRUE(test(GetParam(), onemkl::uplo::lower, onemkl::transpose::nontrans, - onemkl::diag::unit, 30, 2)); - EXPECT_TRUE(test(GetParam(), onemkl::uplo::upper, onemkl::transpose::nontrans, - onemkl::diag::unit, 30, 2)); - EXPECT_TRUE(test(GetParam(), onemkl::uplo::lower, onemkl::transpose::trans, - onemkl::diag::unit, 30, 2)); - EXPECT_TRUE(test(GetParam(), onemkl::uplo::upper, onemkl::transpose::trans, - onemkl::diag::unit, 30, 2)); - EXPECT_TRUE(test(GetParam(), onemkl::uplo::lower, onemkl::transpose::nontrans, - onemkl::diag::nonunit, 30, 2)); - EXPECT_TRUE(test(GetParam(), onemkl::uplo::upper, onemkl::transpose::nontrans, - onemkl::diag::nonunit, 30, 2)); - EXPECT_TRUE(test(GetParam(), onemkl::uplo::lower, onemkl::transpose::trans, - onemkl::diag::nonunit, 30, 2)); - EXPECT_TRUE(test(GetParam(), onemkl::uplo::upper, onemkl::transpose::trans, - onemkl::diag::nonunit, 30, 2)); + EXPECT_TRUEORSKIP(test(GetParam(), onemkl::uplo::lower, onemkl::transpose::nontrans, + onemkl::diag::unit, 30, 2)); + EXPECT_TRUEORSKIP(test(GetParam(), onemkl::uplo::upper, onemkl::transpose::nontrans, + onemkl::diag::unit, 30, 2)); + EXPECT_TRUEORSKIP(test(GetParam(), onemkl::uplo::lower, onemkl::transpose::trans, + onemkl::diag::unit, 30, 2)); + EXPECT_TRUEORSKIP(test(GetParam(), onemkl::uplo::upper, onemkl::transpose::trans, + onemkl::diag::unit, 30, 2)); + EXPECT_TRUEORSKIP(test(GetParam(), onemkl::uplo::lower, onemkl::transpose::nontrans, + onemkl::diag::nonunit, 30, 2)); + EXPECT_TRUEORSKIP(test(GetParam(), onemkl::uplo::upper, onemkl::transpose::nontrans, + onemkl::diag::nonunit, 30, 2)); + EXPECT_TRUEORSKIP(test(GetParam(), onemkl::uplo::lower, 
onemkl::transpose::trans, + onemkl::diag::nonunit, 30, 2)); + EXPECT_TRUEORSKIP(test(GetParam(), onemkl::uplo::upper, onemkl::transpose::trans, + onemkl::diag::nonunit, 30, 2)); } TEST_P(TpmvTests, RealDoublePrecision) { - EXPECT_TRUE(test(GetParam(), onemkl::uplo::lower, onemkl::transpose::nontrans, - onemkl::diag::unit, 30, 2)); - EXPECT_TRUE(test(GetParam(), onemkl::uplo::upper, onemkl::transpose::nontrans, - onemkl::diag::unit, 30, 2)); - EXPECT_TRUE(test(GetParam(), onemkl::uplo::lower, onemkl::transpose::trans, - onemkl::diag::unit, 30, 2)); - EXPECT_TRUE(test(GetParam(), onemkl::uplo::upper, onemkl::transpose::trans, - onemkl::diag::unit, 30, 2)); - EXPECT_TRUE(test(GetParam(), onemkl::uplo::lower, onemkl::transpose::nontrans, - onemkl::diag::nonunit, 30, 2)); - EXPECT_TRUE(test(GetParam(), onemkl::uplo::upper, onemkl::transpose::nontrans, - onemkl::diag::nonunit, 30, 2)); - EXPECT_TRUE(test(GetParam(), onemkl::uplo::lower, onemkl::transpose::trans, - onemkl::diag::nonunit, 30, 2)); - EXPECT_TRUE(test(GetParam(), onemkl::uplo::upper, onemkl::transpose::trans, - onemkl::diag::nonunit, 30, 2)); + EXPECT_TRUEORSKIP(test(GetParam(), onemkl::uplo::lower, onemkl::transpose::nontrans, + onemkl::diag::unit, 30, 2)); + EXPECT_TRUEORSKIP(test(GetParam(), onemkl::uplo::upper, onemkl::transpose::nontrans, + onemkl::diag::unit, 30, 2)); + EXPECT_TRUEORSKIP(test(GetParam(), onemkl::uplo::lower, onemkl::transpose::trans, + onemkl::diag::unit, 30, 2)); + EXPECT_TRUEORSKIP(test(GetParam(), onemkl::uplo::upper, onemkl::transpose::trans, + onemkl::diag::unit, 30, 2)); + EXPECT_TRUEORSKIP(test(GetParam(), onemkl::uplo::lower, onemkl::transpose::nontrans, + onemkl::diag::nonunit, 30, 2)); + EXPECT_TRUEORSKIP(test(GetParam(), onemkl::uplo::upper, onemkl::transpose::nontrans, + onemkl::diag::nonunit, 30, 2)); + EXPECT_TRUEORSKIP(test(GetParam(), onemkl::uplo::lower, onemkl::transpose::trans, + onemkl::diag::nonunit, 30, 2)); + EXPECT_TRUEORSKIP(test(GetParam(), 
onemkl::uplo::upper, onemkl::transpose::trans, + onemkl::diag::nonunit, 30, 2)); } TEST_P(TpmvTests, ComplexSinglePrecision) { - EXPECT_TRUE(test>(GetParam(), onemkl::uplo::lower, - onemkl::transpose::nontrans, onemkl::diag::unit, 30, 2)); - EXPECT_TRUE(test>(GetParam(), onemkl::uplo::upper, - onemkl::transpose::nontrans, onemkl::diag::unit, 30, 2)); - EXPECT_TRUE(test>(GetParam(), onemkl::uplo::lower, onemkl::transpose::trans, - onemkl::diag::unit, 30, 2)); - EXPECT_TRUE(test>(GetParam(), onemkl::uplo::upper, onemkl::transpose::trans, - onemkl::diag::unit, 30, 2)); - EXPECT_TRUE(test>(GetParam(), onemkl::uplo::lower, - onemkl::transpose::conjtrans, onemkl::diag::unit, 30, 2)); - EXPECT_TRUE(test>(GetParam(), onemkl::uplo::upper, - onemkl::transpose::conjtrans, onemkl::diag::unit, 30, 2)); - EXPECT_TRUE(test>(GetParam(), onemkl::uplo::lower, - onemkl::transpose::nontrans, onemkl::diag::nonunit, 30, - 2)); - EXPECT_TRUE(test>(GetParam(), onemkl::uplo::upper, - onemkl::transpose::nontrans, onemkl::diag::nonunit, 30, - 2)); - EXPECT_TRUE(test>(GetParam(), onemkl::uplo::lower, onemkl::transpose::trans, - onemkl::diag::nonunit, 30, 2)); - EXPECT_TRUE(test>(GetParam(), onemkl::uplo::upper, onemkl::transpose::trans, - onemkl::diag::nonunit, 30, 2)); - EXPECT_TRUE(test>(GetParam(), onemkl::uplo::lower, - onemkl::transpose::conjtrans, onemkl::diag::nonunit, 30, - 2)); - EXPECT_TRUE(test>(GetParam(), onemkl::uplo::upper, - onemkl::transpose::conjtrans, onemkl::diag::nonunit, 30, - 2)); + EXPECT_TRUEORSKIP(test>( + GetParam(), onemkl::uplo::lower, onemkl::transpose::nontrans, onemkl::diag::unit, 30, 2)); + EXPECT_TRUEORSKIP(test>( + GetParam(), onemkl::uplo::upper, onemkl::transpose::nontrans, onemkl::diag::unit, 30, 2)); + EXPECT_TRUEORSKIP(test>( + GetParam(), onemkl::uplo::lower, onemkl::transpose::trans, onemkl::diag::unit, 30, 2)); + EXPECT_TRUEORSKIP(test>( + GetParam(), onemkl::uplo::upper, onemkl::transpose::trans, onemkl::diag::unit, 30, 2)); + 
EXPECT_TRUEORSKIP(test>( + GetParam(), onemkl::uplo::lower, onemkl::transpose::conjtrans, onemkl::diag::unit, 30, 2)); + EXPECT_TRUEORSKIP(test>( + GetParam(), onemkl::uplo::upper, onemkl::transpose::conjtrans, onemkl::diag::unit, 30, 2)); + EXPECT_TRUEORSKIP(test>(GetParam(), onemkl::uplo::lower, + onemkl::transpose::nontrans, onemkl::diag::nonunit, + 30, 2)); + EXPECT_TRUEORSKIP(test>(GetParam(), onemkl::uplo::upper, + onemkl::transpose::nontrans, onemkl::diag::nonunit, + 30, 2)); + EXPECT_TRUEORSKIP(test>( + GetParam(), onemkl::uplo::lower, onemkl::transpose::trans, onemkl::diag::nonunit, 30, 2)); + EXPECT_TRUEORSKIP(test>( + GetParam(), onemkl::uplo::upper, onemkl::transpose::trans, onemkl::diag::nonunit, 30, 2)); + EXPECT_TRUEORSKIP(test>(GetParam(), onemkl::uplo::lower, + onemkl::transpose::conjtrans, onemkl::diag::nonunit, + 30, 2)); + EXPECT_TRUEORSKIP(test>(GetParam(), onemkl::uplo::upper, + onemkl::transpose::conjtrans, onemkl::diag::nonunit, + 30, 2)); } TEST_P(TpmvTests, ComplexDoublePrecision) { - EXPECT_TRUE(test>(GetParam(), onemkl::uplo::lower, - onemkl::transpose::nontrans, onemkl::diag::unit, 30, 2)); - EXPECT_TRUE(test>(GetParam(), onemkl::uplo::upper, - onemkl::transpose::nontrans, onemkl::diag::unit, 30, 2)); - EXPECT_TRUE(test>(GetParam(), onemkl::uplo::lower, - onemkl::transpose::trans, onemkl::diag::unit, 30, 2)); - EXPECT_TRUE(test>(GetParam(), onemkl::uplo::upper, - onemkl::transpose::trans, onemkl::diag::unit, 30, 2)); - EXPECT_TRUE(test>( + EXPECT_TRUEORSKIP(test>( + GetParam(), onemkl::uplo::lower, onemkl::transpose::nontrans, onemkl::diag::unit, 30, 2)); + EXPECT_TRUEORSKIP(test>( + GetParam(), onemkl::uplo::upper, onemkl::transpose::nontrans, onemkl::diag::unit, 30, 2)); + EXPECT_TRUEORSKIP(test>( + GetParam(), onemkl::uplo::lower, onemkl::transpose::trans, onemkl::diag::unit, 30, 2)); + EXPECT_TRUEORSKIP(test>( + GetParam(), onemkl::uplo::upper, onemkl::transpose::trans, onemkl::diag::unit, 30, 2)); + EXPECT_TRUEORSKIP(test>( 
GetParam(), onemkl::uplo::lower, onemkl::transpose::conjtrans, onemkl::diag::unit, 30, 2)); - EXPECT_TRUE(test>( + EXPECT_TRUEORSKIP(test>( GetParam(), onemkl::uplo::upper, onemkl::transpose::conjtrans, onemkl::diag::unit, 30, 2)); - EXPECT_TRUE(test>(GetParam(), onemkl::uplo::lower, - onemkl::transpose::nontrans, onemkl::diag::nonunit, 30, - 2)); - EXPECT_TRUE(test>(GetParam(), onemkl::uplo::upper, - onemkl::transpose::nontrans, onemkl::diag::nonunit, 30, - 2)); - EXPECT_TRUE(test>(GetParam(), onemkl::uplo::lower, - onemkl::transpose::trans, onemkl::diag::nonunit, 30, 2)); - EXPECT_TRUE(test>(GetParam(), onemkl::uplo::upper, - onemkl::transpose::trans, onemkl::diag::nonunit, 30, 2)); - EXPECT_TRUE(test>(GetParam(), onemkl::uplo::lower, - onemkl::transpose::conjtrans, onemkl::diag::nonunit, 30, - 2)); - EXPECT_TRUE(test>(GetParam(), onemkl::uplo::upper, - onemkl::transpose::conjtrans, onemkl::diag::nonunit, 30, - 2)); + EXPECT_TRUEORSKIP(test>(GetParam(), onemkl::uplo::lower, + onemkl::transpose::nontrans, onemkl::diag::nonunit, + 30, 2)); + EXPECT_TRUEORSKIP(test>(GetParam(), onemkl::uplo::upper, + onemkl::transpose::nontrans, onemkl::diag::nonunit, + 30, 2)); + EXPECT_TRUEORSKIP(test>( + GetParam(), onemkl::uplo::lower, onemkl::transpose::trans, onemkl::diag::nonunit, 30, 2)); + EXPECT_TRUEORSKIP(test>( + GetParam(), onemkl::uplo::upper, onemkl::transpose::trans, onemkl::diag::nonunit, 30, 2)); + EXPECT_TRUEORSKIP(test>(GetParam(), onemkl::uplo::lower, + onemkl::transpose::conjtrans, + onemkl::diag::nonunit, 30, 2)); + EXPECT_TRUEORSKIP(test>(GetParam(), onemkl::uplo::upper, + onemkl::transpose::conjtrans, + onemkl::diag::nonunit, 30, 2)); } INSTANTIATE_TEST_SUITE_P(TpmvTestSuite, TpmvTests, ::testing::ValuesIn(devices), diff --git a/tests/unit_tests/blas/level2/tpmv_usm.cpp b/tests/unit_tests/blas/level2/tpmv_usm.cpp new file mode 100644 index 000000000..e208d28f2 --- /dev/null +++ b/tests/unit_tests/blas/level2/tpmv_usm.cpp @@ -0,0 +1,220 @@ 
+/******************************************************************************* +* Copyright 2020 Intel Corporation +* +* Licensed under the Apache License, Version 2.0 (the "License"); +* you may not use this file except in compliance with the License. +* You may obtain a copy of the License at +* +* http://www.apache.org/licenses/LICENSE-2.0 +* +* Unless required by applicable law or agreed to in writing, +* software distributed under the License is distributed on an "AS IS" BASIS, +* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +* See the License for the specific language governing permissions +* and limitations under the License. +* +* +* SPDX-License-Identifier: Apache-2.0 +*******************************************************************************/ + +#include +#include +#include +#include +#include +#include + +#include +#include "cblas.h" +#include "onemkl/detail/config.hpp" +#include "onemkl/onemkl.hpp" +#include "onemkl_blas_helper.hpp" +#include "reference_blas_templates.hpp" +#include "test_common.hpp" +#include "test_helper.hpp" + +#include + +using namespace cl::sycl; +using std::vector; + +extern std::vector devices; + +namespace { + +template +int test(const device& dev, onemkl::uplo upper_lower, onemkl::transpose transa, + onemkl::diag unit_nonunit, int n, int incx) { + // Catch asynchronous exceptions. + auto exception_handler = [](exception_list exceptions) { + for (std::exception_ptr const& e : exceptions) { + try { + std::rethrow_exception(e); + } + catch (exception const& e) { + std::cout << "Caught asynchronous SYCL exception during TPMV:\n" + << e.what() << std::endl + << "OpenCL status: " << e.get_cl_code() << std::endl; + } + } + }; + + queue main_queue(dev, exception_handler); + context cxt = main_queue.get_context(); + event done; + std::vector dependencies; + + // Prepare data. 
+ auto ua = usm_allocator(cxt, dev); + vector x(ua), A(ua); + rand_vector(x, n, incx); + rand_matrix(A, transa, n, n, n); + + auto x_ref = x; + + // Call Reference TPMV. + const int n_ref = n, incx_ref = incx; + using fp_ref = typename ref_type_info::type; + + ::tpmv(convert_to_cblas_uplo(upper_lower), convert_to_cblas_trans(transa), + convert_to_cblas_diag(unit_nonunit), &n_ref, (fp_ref*)A.data(), (fp_ref*)x_ref.data(), + &incx_ref); + + // Call DPC++ TPMV. + + try { +#ifdef CALL_RT_API + done = onemkl::blas::tpmv(main_queue, upper_lower, transa, unit_nonunit, n, A.data(), + x.data(), incx, dependencies); + done.wait(); +#else + TEST_RUN_CT(main_queue, onemkl::blas::tpmv, + (main_queue, upper_lower, transa, unit_nonunit, n, A.data(), x.data(), incx, + dependencies)); + main_queue.wait(); +#endif + } + catch (exception const& e) { + std::cout << "Caught synchronous SYCL exception during TPMV:\n" + << e.what() << std::endl + << "OpenCL status: " << e.get_cl_code() << std::endl; + } + + catch (const onemkl::backend_unsupported_exception& e) { + return test_skipped; + } + + catch (const std::runtime_error& error) { + std::cout << "Error raised during execution of TPMV:\n" << error.what() << std::endl; + } + + // Compare the results of reference implementation and DPC++ implementation. 
+ + bool good = check_equal_vector(x, x_ref, n, incx, n, std::cout); + + return (int)good; +} + +class TpmvUsmTests : public ::testing::TestWithParam {}; + +TEST_P(TpmvUsmTests, RealSinglePrecision) { + EXPECT_TRUEORSKIP(test(GetParam(), onemkl::uplo::lower, onemkl::transpose::nontrans, + onemkl::diag::unit, 30, 2)); + EXPECT_TRUEORSKIP(test(GetParam(), onemkl::uplo::upper, onemkl::transpose::nontrans, + onemkl::diag::unit, 30, 2)); + EXPECT_TRUEORSKIP(test(GetParam(), onemkl::uplo::lower, onemkl::transpose::trans, + onemkl::diag::unit, 30, 2)); + EXPECT_TRUEORSKIP(test(GetParam(), onemkl::uplo::upper, onemkl::transpose::trans, + onemkl::diag::unit, 30, 2)); + EXPECT_TRUEORSKIP(test(GetParam(), onemkl::uplo::lower, onemkl::transpose::nontrans, + onemkl::diag::nonunit, 30, 2)); + EXPECT_TRUEORSKIP(test(GetParam(), onemkl::uplo::upper, onemkl::transpose::nontrans, + onemkl::diag::nonunit, 30, 2)); + EXPECT_TRUEORSKIP(test(GetParam(), onemkl::uplo::lower, onemkl::transpose::trans, + onemkl::diag::nonunit, 30, 2)); + EXPECT_TRUEORSKIP(test(GetParam(), onemkl::uplo::upper, onemkl::transpose::trans, + onemkl::diag::nonunit, 30, 2)); +} +TEST_P(TpmvUsmTests, RealDoublePrecision) { + EXPECT_TRUEORSKIP(test(GetParam(), onemkl::uplo::lower, onemkl::transpose::nontrans, + onemkl::diag::unit, 30, 2)); + EXPECT_TRUEORSKIP(test(GetParam(), onemkl::uplo::upper, onemkl::transpose::nontrans, + onemkl::diag::unit, 30, 2)); + EXPECT_TRUEORSKIP(test(GetParam(), onemkl::uplo::lower, onemkl::transpose::trans, + onemkl::diag::unit, 30, 2)); + EXPECT_TRUEORSKIP(test(GetParam(), onemkl::uplo::upper, onemkl::transpose::trans, + onemkl::diag::unit, 30, 2)); + EXPECT_TRUEORSKIP(test(GetParam(), onemkl::uplo::lower, onemkl::transpose::nontrans, + onemkl::diag::nonunit, 30, 2)); + EXPECT_TRUEORSKIP(test(GetParam(), onemkl::uplo::upper, onemkl::transpose::nontrans, + onemkl::diag::nonunit, 30, 2)); + EXPECT_TRUEORSKIP(test(GetParam(), onemkl::uplo::lower, onemkl::transpose::trans, + 
onemkl::diag::nonunit, 30, 2)); + EXPECT_TRUEORSKIP(test(GetParam(), onemkl::uplo::upper, onemkl::transpose::trans, + onemkl::diag::nonunit, 30, 2)); +} +TEST_P(TpmvUsmTests, ComplexSinglePrecision) { + EXPECT_TRUEORSKIP(test>( + GetParam(), onemkl::uplo::lower, onemkl::transpose::nontrans, onemkl::diag::unit, 30, 2)); + EXPECT_TRUEORSKIP(test>( + GetParam(), onemkl::uplo::upper, onemkl::transpose::nontrans, onemkl::diag::unit, 30, 2)); + EXPECT_TRUEORSKIP(test>( + GetParam(), onemkl::uplo::lower, onemkl::transpose::trans, onemkl::diag::unit, 30, 2)); + EXPECT_TRUEORSKIP(test>( + GetParam(), onemkl::uplo::upper, onemkl::transpose::trans, onemkl::diag::unit, 30, 2)); + EXPECT_TRUEORSKIP(test>( + GetParam(), onemkl::uplo::lower, onemkl::transpose::conjtrans, onemkl::diag::unit, 30, 2)); + EXPECT_TRUEORSKIP(test>( + GetParam(), onemkl::uplo::upper, onemkl::transpose::conjtrans, onemkl::diag::unit, 30, 2)); + EXPECT_TRUEORSKIP(test>(GetParam(), onemkl::uplo::lower, + onemkl::transpose::nontrans, onemkl::diag::nonunit, + 30, 2)); + EXPECT_TRUEORSKIP(test>(GetParam(), onemkl::uplo::upper, + onemkl::transpose::nontrans, onemkl::diag::nonunit, + 30, 2)); + EXPECT_TRUEORSKIP(test>( + GetParam(), onemkl::uplo::lower, onemkl::transpose::trans, onemkl::diag::nonunit, 30, 2)); + EXPECT_TRUEORSKIP(test>( + GetParam(), onemkl::uplo::upper, onemkl::transpose::trans, onemkl::diag::nonunit, 30, 2)); + EXPECT_TRUEORSKIP(test>(GetParam(), onemkl::uplo::lower, + onemkl::transpose::conjtrans, onemkl::diag::nonunit, + 30, 2)); + EXPECT_TRUEORSKIP(test>(GetParam(), onemkl::uplo::upper, + onemkl::transpose::conjtrans, onemkl::diag::nonunit, + 30, 2)); +} +TEST_P(TpmvUsmTests, ComplexDoublePrecision) { + EXPECT_TRUEORSKIP(test>( + GetParam(), onemkl::uplo::lower, onemkl::transpose::nontrans, onemkl::diag::unit, 30, 2)); + EXPECT_TRUEORSKIP(test>( + GetParam(), onemkl::uplo::upper, onemkl::transpose::nontrans, onemkl::diag::unit, 30, 2)); + EXPECT_TRUEORSKIP(test>( + GetParam(), 
onemkl::uplo::lower, onemkl::transpose::trans, onemkl::diag::unit, 30, 2)); + EXPECT_TRUEORSKIP(test>( + GetParam(), onemkl::uplo::upper, onemkl::transpose::trans, onemkl::diag::unit, 30, 2)); + EXPECT_TRUEORSKIP(test>( + GetParam(), onemkl::uplo::lower, onemkl::transpose::conjtrans, onemkl::diag::unit, 30, 2)); + EXPECT_TRUEORSKIP(test>( + GetParam(), onemkl::uplo::upper, onemkl::transpose::conjtrans, onemkl::diag::unit, 30, 2)); + EXPECT_TRUEORSKIP(test>(GetParam(), onemkl::uplo::lower, + onemkl::transpose::nontrans, onemkl::diag::nonunit, + 30, 2)); + EXPECT_TRUEORSKIP(test>(GetParam(), onemkl::uplo::upper, + onemkl::transpose::nontrans, onemkl::diag::nonunit, + 30, 2)); + EXPECT_TRUEORSKIP(test>( + GetParam(), onemkl::uplo::lower, onemkl::transpose::trans, onemkl::diag::nonunit, 30, 2)); + EXPECT_TRUEORSKIP(test>( + GetParam(), onemkl::uplo::upper, onemkl::transpose::trans, onemkl::diag::nonunit, 30, 2)); + EXPECT_TRUEORSKIP(test>(GetParam(), onemkl::uplo::lower, + onemkl::transpose::conjtrans, + onemkl::diag::nonunit, 30, 2)); + EXPECT_TRUEORSKIP(test>(GetParam(), onemkl::uplo::upper, + onemkl::transpose::conjtrans, + onemkl::diag::nonunit, 30, 2)); +} + +INSTANTIATE_TEST_SUITE_P(TpmvUsmTestSuite, TpmvUsmTests, ::testing::ValuesIn(devices), + ::DeviceNamePrint()); + +} // anonymous namespace diff --git a/tests/unit_tests/blas/level2/tpsv.cpp b/tests/unit_tests/blas/level2/tpsv.cpp index 4e18af350..201b53471 100644 --- a/tests/unit_tests/blas/level2/tpsv.cpp +++ b/tests/unit_tests/blas/level2/tpsv.cpp @@ -43,8 +43,8 @@ extern std::vector devices; namespace { template -bool test(const device& dev, onemkl::uplo upper_lower, onemkl::transpose transa, - onemkl::diag unit_nonunit, int n, int incx) { +int test(const device& dev, onemkl::uplo upper_lower, onemkl::transpose transa, + onemkl::diag unit_nonunit, int n, int incx) { // Prepare data. 
vector x, x_ref, A; rand_vector(x, n, incx); @@ -95,6 +95,14 @@ bool test(const device& dev, onemkl::uplo upper_lower, onemkl::transpose transa, << "OpenCL status: " << e.get_cl_code() << std::endl; } + catch (const onemkl::backend_unsupported_exception& e) { + return test_skipped; + } + + catch (const std::runtime_error& error) { + std::cout << "Error raised during execution of TPSV:\n" << error.what() << std::endl; + } + // Compare the results of reference implementation and DPC++ implementation. bool good; { @@ -102,106 +110,106 @@ bool test(const device& dev, onemkl::uplo upper_lower, onemkl::transpose transa, good = check_equal_trsv_vector(x_accessor, x_ref, n, incx, n, std::cout); } - return good; + return (int)good; } class TpsvTests : public ::testing::TestWithParam {}; TEST_P(TpsvTests, RealSinglePrecision) { - EXPECT_TRUE(test(GetParam(), onemkl::uplo::lower, onemkl::transpose::nontrans, - onemkl::diag::unit, 30, 2)); - EXPECT_TRUE(test(GetParam(), onemkl::uplo::upper, onemkl::transpose::nontrans, - onemkl::diag::unit, 30, 2)); - EXPECT_TRUE(test(GetParam(), onemkl::uplo::lower, onemkl::transpose::trans, - onemkl::diag::unit, 30, 2)); - EXPECT_TRUE(test(GetParam(), onemkl::uplo::upper, onemkl::transpose::trans, - onemkl::diag::unit, 30, 2)); - EXPECT_TRUE(test(GetParam(), onemkl::uplo::lower, onemkl::transpose::nontrans, - onemkl::diag::nonunit, 30, 2)); - EXPECT_TRUE(test(GetParam(), onemkl::uplo::upper, onemkl::transpose::nontrans, - onemkl::diag::nonunit, 30, 2)); - EXPECT_TRUE(test(GetParam(), onemkl::uplo::lower, onemkl::transpose::trans, - onemkl::diag::nonunit, 30, 2)); - EXPECT_TRUE(test(GetParam(), onemkl::uplo::upper, onemkl::transpose::trans, - onemkl::diag::nonunit, 30, 2)); + EXPECT_TRUEORSKIP(test(GetParam(), onemkl::uplo::lower, onemkl::transpose::nontrans, + onemkl::diag::unit, 30, 2)); + EXPECT_TRUEORSKIP(test(GetParam(), onemkl::uplo::upper, onemkl::transpose::nontrans, + onemkl::diag::unit, 30, 2)); + EXPECT_TRUEORSKIP(test(GetParam(), 
onemkl::uplo::lower, onemkl::transpose::trans, + onemkl::diag::unit, 30, 2)); + EXPECT_TRUEORSKIP(test(GetParam(), onemkl::uplo::upper, onemkl::transpose::trans, + onemkl::diag::unit, 30, 2)); + EXPECT_TRUEORSKIP(test(GetParam(), onemkl::uplo::lower, onemkl::transpose::nontrans, + onemkl::diag::nonunit, 30, 2)); + EXPECT_TRUEORSKIP(test(GetParam(), onemkl::uplo::upper, onemkl::transpose::nontrans, + onemkl::diag::nonunit, 30, 2)); + EXPECT_TRUEORSKIP(test(GetParam(), onemkl::uplo::lower, onemkl::transpose::trans, + onemkl::diag::nonunit, 30, 2)); + EXPECT_TRUEORSKIP(test(GetParam(), onemkl::uplo::upper, onemkl::transpose::trans, + onemkl::diag::nonunit, 30, 2)); } TEST_P(TpsvTests, RealDoublePrecision) { - EXPECT_TRUE(test(GetParam(), onemkl::uplo::lower, onemkl::transpose::nontrans, - onemkl::diag::unit, 30, 2)); - EXPECT_TRUE(test(GetParam(), onemkl::uplo::upper, onemkl::transpose::nontrans, - onemkl::diag::unit, 30, 2)); - EXPECT_TRUE(test(GetParam(), onemkl::uplo::lower, onemkl::transpose::trans, - onemkl::diag::unit, 30, 2)); - EXPECT_TRUE(test(GetParam(), onemkl::uplo::upper, onemkl::transpose::trans, - onemkl::diag::unit, 30, 2)); - EXPECT_TRUE(test(GetParam(), onemkl::uplo::lower, onemkl::transpose::nontrans, - onemkl::diag::nonunit, 30, 2)); - EXPECT_TRUE(test(GetParam(), onemkl::uplo::upper, onemkl::transpose::nontrans, - onemkl::diag::nonunit, 30, 2)); - EXPECT_TRUE(test(GetParam(), onemkl::uplo::lower, onemkl::transpose::trans, - onemkl::diag::nonunit, 30, 2)); - EXPECT_TRUE(test(GetParam(), onemkl::uplo::upper, onemkl::transpose::trans, - onemkl::diag::nonunit, 30, 2)); + EXPECT_TRUEORSKIP(test(GetParam(), onemkl::uplo::lower, onemkl::transpose::nontrans, + onemkl::diag::unit, 30, 2)); + EXPECT_TRUEORSKIP(test(GetParam(), onemkl::uplo::upper, onemkl::transpose::nontrans, + onemkl::diag::unit, 30, 2)); + EXPECT_TRUEORSKIP(test(GetParam(), onemkl::uplo::lower, onemkl::transpose::trans, + onemkl::diag::unit, 30, 2)); + EXPECT_TRUEORSKIP(test(GetParam(), 
onemkl::uplo::upper, onemkl::transpose::trans, + onemkl::diag::unit, 30, 2)); + EXPECT_TRUEORSKIP(test(GetParam(), onemkl::uplo::lower, onemkl::transpose::nontrans, + onemkl::diag::nonunit, 30, 2)); + EXPECT_TRUEORSKIP(test(GetParam(), onemkl::uplo::upper, onemkl::transpose::nontrans, + onemkl::diag::nonunit, 30, 2)); + EXPECT_TRUEORSKIP(test(GetParam(), onemkl::uplo::lower, onemkl::transpose::trans, + onemkl::diag::nonunit, 30, 2)); + EXPECT_TRUEORSKIP(test(GetParam(), onemkl::uplo::upper, onemkl::transpose::trans, + onemkl::diag::nonunit, 30, 2)); } TEST_P(TpsvTests, ComplexSinglePrecision) { - EXPECT_TRUE(test>(GetParam(), onemkl::uplo::lower, - onemkl::transpose::nontrans, onemkl::diag::unit, 30, 2)); - EXPECT_TRUE(test>(GetParam(), onemkl::uplo::upper, - onemkl::transpose::nontrans, onemkl::diag::unit, 30, 2)); - EXPECT_TRUE(test>(GetParam(), onemkl::uplo::lower, onemkl::transpose::trans, - onemkl::diag::unit, 30, 2)); - EXPECT_TRUE(test>(GetParam(), onemkl::uplo::upper, onemkl::transpose::trans, - onemkl::diag::unit, 30, 2)); - EXPECT_TRUE(test>(GetParam(), onemkl::uplo::lower, - onemkl::transpose::conjtrans, onemkl::diag::unit, 30, 2)); - EXPECT_TRUE(test>(GetParam(), onemkl::uplo::upper, - onemkl::transpose::conjtrans, onemkl::diag::unit, 30, 2)); - EXPECT_TRUE(test>(GetParam(), onemkl::uplo::lower, - onemkl::transpose::nontrans, onemkl::diag::nonunit, 30, - 2)); - EXPECT_TRUE(test>(GetParam(), onemkl::uplo::upper, - onemkl::transpose::nontrans, onemkl::diag::nonunit, 30, - 2)); - EXPECT_TRUE(test>(GetParam(), onemkl::uplo::lower, onemkl::transpose::trans, - onemkl::diag::nonunit, 30, 2)); - EXPECT_TRUE(test>(GetParam(), onemkl::uplo::upper, onemkl::transpose::trans, - onemkl::diag::nonunit, 30, 2)); - EXPECT_TRUE(test>(GetParam(), onemkl::uplo::lower, - onemkl::transpose::conjtrans, onemkl::diag::nonunit, 30, - 2)); - EXPECT_TRUE(test>(GetParam(), onemkl::uplo::upper, - onemkl::transpose::conjtrans, onemkl::diag::nonunit, 30, - 2)); + 
EXPECT_TRUEORSKIP(test>( + GetParam(), onemkl::uplo::lower, onemkl::transpose::nontrans, onemkl::diag::unit, 30, 2)); + EXPECT_TRUEORSKIP(test>( + GetParam(), onemkl::uplo::upper, onemkl::transpose::nontrans, onemkl::diag::unit, 30, 2)); + EXPECT_TRUEORSKIP(test>( + GetParam(), onemkl::uplo::lower, onemkl::transpose::trans, onemkl::diag::unit, 30, 2)); + EXPECT_TRUEORSKIP(test>( + GetParam(), onemkl::uplo::upper, onemkl::transpose::trans, onemkl::diag::unit, 30, 2)); + EXPECT_TRUEORSKIP(test>( + GetParam(), onemkl::uplo::lower, onemkl::transpose::conjtrans, onemkl::diag::unit, 30, 2)); + EXPECT_TRUEORSKIP(test>( + GetParam(), onemkl::uplo::upper, onemkl::transpose::conjtrans, onemkl::diag::unit, 30, 2)); + EXPECT_TRUEORSKIP(test>(GetParam(), onemkl::uplo::lower, + onemkl::transpose::nontrans, onemkl::diag::nonunit, + 30, 2)); + EXPECT_TRUEORSKIP(test>(GetParam(), onemkl::uplo::upper, + onemkl::transpose::nontrans, onemkl::diag::nonunit, + 30, 2)); + EXPECT_TRUEORSKIP(test>( + GetParam(), onemkl::uplo::lower, onemkl::transpose::trans, onemkl::diag::nonunit, 30, 2)); + EXPECT_TRUEORSKIP(test>( + GetParam(), onemkl::uplo::upper, onemkl::transpose::trans, onemkl::diag::nonunit, 30, 2)); + EXPECT_TRUEORSKIP(test>(GetParam(), onemkl::uplo::lower, + onemkl::transpose::conjtrans, onemkl::diag::nonunit, + 30, 2)); + EXPECT_TRUEORSKIP(test>(GetParam(), onemkl::uplo::upper, + onemkl::transpose::conjtrans, onemkl::diag::nonunit, + 30, 2)); } TEST_P(TpsvTests, ComplexDoublePrecision) { - EXPECT_TRUE(test>(GetParam(), onemkl::uplo::lower, - onemkl::transpose::nontrans, onemkl::diag::unit, 30, 2)); - EXPECT_TRUE(test>(GetParam(), onemkl::uplo::upper, - onemkl::transpose::nontrans, onemkl::diag::unit, 30, 2)); - EXPECT_TRUE(test>(GetParam(), onemkl::uplo::lower, - onemkl::transpose::trans, onemkl::diag::unit, 30, 2)); - EXPECT_TRUE(test>(GetParam(), onemkl::uplo::upper, - onemkl::transpose::trans, onemkl::diag::unit, 30, 2)); - EXPECT_TRUE(test>( + EXPECT_TRUEORSKIP(test>( + 
GetParam(), onemkl::uplo::lower, onemkl::transpose::nontrans, onemkl::diag::unit, 30, 2)); + EXPECT_TRUEORSKIP(test>( + GetParam(), onemkl::uplo::upper, onemkl::transpose::nontrans, onemkl::diag::unit, 30, 2)); + EXPECT_TRUEORSKIP(test>( + GetParam(), onemkl::uplo::lower, onemkl::transpose::trans, onemkl::diag::unit, 30, 2)); + EXPECT_TRUEORSKIP(test>( + GetParam(), onemkl::uplo::upper, onemkl::transpose::trans, onemkl::diag::unit, 30, 2)); + EXPECT_TRUEORSKIP(test>( GetParam(), onemkl::uplo::lower, onemkl::transpose::conjtrans, onemkl::diag::unit, 30, 2)); - EXPECT_TRUE(test>( + EXPECT_TRUEORSKIP(test>( GetParam(), onemkl::uplo::upper, onemkl::transpose::conjtrans, onemkl::diag::unit, 30, 2)); - EXPECT_TRUE(test>(GetParam(), onemkl::uplo::lower, - onemkl::transpose::nontrans, onemkl::diag::nonunit, 30, - 2)); - EXPECT_TRUE(test>(GetParam(), onemkl::uplo::upper, - onemkl::transpose::nontrans, onemkl::diag::nonunit, 30, - 2)); - EXPECT_TRUE(test>(GetParam(), onemkl::uplo::lower, - onemkl::transpose::trans, onemkl::diag::nonunit, 30, 2)); - EXPECT_TRUE(test>(GetParam(), onemkl::uplo::upper, - onemkl::transpose::trans, onemkl::diag::nonunit, 30, 2)); - EXPECT_TRUE(test>(GetParam(), onemkl::uplo::lower, - onemkl::transpose::conjtrans, onemkl::diag::nonunit, 30, - 2)); - EXPECT_TRUE(test>(GetParam(), onemkl::uplo::upper, - onemkl::transpose::conjtrans, onemkl::diag::nonunit, 30, - 2)); + EXPECT_TRUEORSKIP(test>(GetParam(), onemkl::uplo::lower, + onemkl::transpose::nontrans, onemkl::diag::nonunit, + 30, 2)); + EXPECT_TRUEORSKIP(test>(GetParam(), onemkl::uplo::upper, + onemkl::transpose::nontrans, onemkl::diag::nonunit, + 30, 2)); + EXPECT_TRUEORSKIP(test>( + GetParam(), onemkl::uplo::lower, onemkl::transpose::trans, onemkl::diag::nonunit, 30, 2)); + EXPECT_TRUEORSKIP(test>( + GetParam(), onemkl::uplo::upper, onemkl::transpose::trans, onemkl::diag::nonunit, 30, 2)); + EXPECT_TRUEORSKIP(test>(GetParam(), onemkl::uplo::lower, + onemkl::transpose::conjtrans, + 
onemkl::diag::nonunit, 30, 2)); + EXPECT_TRUEORSKIP(test>(GetParam(), onemkl::uplo::upper, + onemkl::transpose::conjtrans, + onemkl::diag::nonunit, 30, 2)); } INSTANTIATE_TEST_SUITE_P(TpsvTestSuite, TpsvTests, ::testing::ValuesIn(devices), diff --git a/tests/unit_tests/blas/level2/tpsv_usm.cpp b/tests/unit_tests/blas/level2/tpsv_usm.cpp new file mode 100644 index 000000000..cb218dc99 --- /dev/null +++ b/tests/unit_tests/blas/level2/tpsv_usm.cpp @@ -0,0 +1,220 @@ +/******************************************************************************* +* Copyright 2020 Intel Corporation +* +* Licensed under the Apache License, Version 2.0 (the "License"); +* you may not use this file except in compliance with the License. +* You may obtain a copy of the License at +* +* http://www.apache.org/licenses/LICENSE-2.0 +* +* Unless required by applicable law or agreed to in writing, +* software distributed under the License is distributed on an "AS IS" BASIS, +* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +* See the License for the specific language governing permissions +* and limitations under the License. +* +* +* SPDX-License-Identifier: Apache-2.0 +*******************************************************************************/ + +#include +#include +#include +#include +#include +#include + +#include +#include "cblas.h" +#include "onemkl/detail/config.hpp" +#include "onemkl/onemkl.hpp" +#include "onemkl_blas_helper.hpp" +#include "reference_blas_templates.hpp" +#include "test_common.hpp" +#include "test_helper.hpp" + +#include + +using namespace cl::sycl; +using std::vector; + +extern std::vector devices; + +namespace { + +template +int test(const device& dev, onemkl::uplo upper_lower, onemkl::transpose transa, + onemkl::diag unit_nonunit, int n, int incx) { + // Catch asynchronous exceptions. 
+ auto exception_handler = [](exception_list exceptions) { + for (std::exception_ptr const& e : exceptions) { + try { + std::rethrow_exception(e); + } + catch (exception const& e) { + std::cout << "Caught asynchronous SYCL exception during TPSV:\n" + << e.what() << std::endl + << "OpenCL status: " << e.get_cl_code() << std::endl; + } + } + }; + + queue main_queue(dev, exception_handler); + context cxt = main_queue.get_context(); + event done; + std::vector dependencies; + + // Prepare data. + auto ua = usm_allocator(cxt, dev); + vector x(ua), A(ua); + rand_vector(x, n, incx); + rand_trsm_matrix(A, transa, n, n, n); + + auto x_ref = x; + + // Call Reference TPSV. + const int n_ref = n, incx_ref = incx; + using fp_ref = typename ref_type_info::type; + + ::tpsv(convert_to_cblas_uplo(upper_lower), convert_to_cblas_trans(transa), + convert_to_cblas_diag(unit_nonunit), &n_ref, (fp_ref*)A.data(), (fp_ref*)x_ref.data(), + &incx_ref); + + // Call DPC++ TPSV. + + try { +#ifdef CALL_RT_API + done = onemkl::blas::tpsv(main_queue, upper_lower, transa, unit_nonunit, n, A.data(), + x.data(), incx, dependencies); + done.wait(); +#else + TEST_RUN_CT(main_queue, onemkl::blas::tpsv, + (main_queue, upper_lower, transa, unit_nonunit, n, A.data(), x.data(), incx, + dependencies)); + main_queue.wait(); +#endif + } + catch (exception const& e) { + std::cout << "Caught synchronous SYCL exception during TPSV:\n" + << e.what() << std::endl + << "OpenCL status: " << e.get_cl_code() << std::endl; + } + + catch (const onemkl::backend_unsupported_exception& e) { + return test_skipped; + } + + catch (const std::runtime_error& error) { + std::cout << "Error raised during execution of TPSV:\n" << error.what() << std::endl; + } + + // Compare the results of reference implementation and DPC++ implementation. 
+ + bool good = check_equal_trsv_vector(x, x_ref, n, incx, n, std::cout); + + return (int)good; +} + +class TpsvUsmTests : public ::testing::TestWithParam {}; + +TEST_P(TpsvUsmTests, RealSinglePrecision) { + EXPECT_TRUEORSKIP(test(GetParam(), onemkl::uplo::lower, onemkl::transpose::nontrans, + onemkl::diag::unit, 30, 2)); + EXPECT_TRUEORSKIP(test(GetParam(), onemkl::uplo::upper, onemkl::transpose::nontrans, + onemkl::diag::unit, 30, 2)); + EXPECT_TRUEORSKIP(test(GetParam(), onemkl::uplo::lower, onemkl::transpose::trans, + onemkl::diag::unit, 30, 2)); + EXPECT_TRUEORSKIP(test(GetParam(), onemkl::uplo::upper, onemkl::transpose::trans, + onemkl::diag::unit, 30, 2)); + EXPECT_TRUEORSKIP(test(GetParam(), onemkl::uplo::lower, onemkl::transpose::nontrans, + onemkl::diag::nonunit, 30, 2)); + EXPECT_TRUEORSKIP(test(GetParam(), onemkl::uplo::upper, onemkl::transpose::nontrans, + onemkl::diag::nonunit, 30, 2)); + EXPECT_TRUEORSKIP(test(GetParam(), onemkl::uplo::lower, onemkl::transpose::trans, + onemkl::diag::nonunit, 30, 2)); + EXPECT_TRUEORSKIP(test(GetParam(), onemkl::uplo::upper, onemkl::transpose::trans, + onemkl::diag::nonunit, 30, 2)); +} +TEST_P(TpsvUsmTests, RealDoublePrecision) { + EXPECT_TRUEORSKIP(test(GetParam(), onemkl::uplo::lower, onemkl::transpose::nontrans, + onemkl::diag::unit, 30, 2)); + EXPECT_TRUEORSKIP(test(GetParam(), onemkl::uplo::upper, onemkl::transpose::nontrans, + onemkl::diag::unit, 30, 2)); + EXPECT_TRUEORSKIP(test(GetParam(), onemkl::uplo::lower, onemkl::transpose::trans, + onemkl::diag::unit, 30, 2)); + EXPECT_TRUEORSKIP(test(GetParam(), onemkl::uplo::upper, onemkl::transpose::trans, + onemkl::diag::unit, 30, 2)); + EXPECT_TRUEORSKIP(test(GetParam(), onemkl::uplo::lower, onemkl::transpose::nontrans, + onemkl::diag::nonunit, 30, 2)); + EXPECT_TRUEORSKIP(test(GetParam(), onemkl::uplo::upper, onemkl::transpose::nontrans, + onemkl::diag::nonunit, 30, 2)); + EXPECT_TRUEORSKIP(test(GetParam(), onemkl::uplo::lower, onemkl::transpose::trans, + 
onemkl::diag::nonunit, 30, 2)); + EXPECT_TRUEORSKIP(test(GetParam(), onemkl::uplo::upper, onemkl::transpose::trans, + onemkl::diag::nonunit, 30, 2)); +} +TEST_P(TpsvUsmTests, ComplexSinglePrecision) { + EXPECT_TRUEORSKIP(test>( + GetParam(), onemkl::uplo::lower, onemkl::transpose::nontrans, onemkl::diag::unit, 30, 2)); + EXPECT_TRUEORSKIP(test>( + GetParam(), onemkl::uplo::upper, onemkl::transpose::nontrans, onemkl::diag::unit, 30, 2)); + EXPECT_TRUEORSKIP(test>( + GetParam(), onemkl::uplo::lower, onemkl::transpose::trans, onemkl::diag::unit, 30, 2)); + EXPECT_TRUEORSKIP(test>( + GetParam(), onemkl::uplo::upper, onemkl::transpose::trans, onemkl::diag::unit, 30, 2)); + EXPECT_TRUEORSKIP(test>( + GetParam(), onemkl::uplo::lower, onemkl::transpose::conjtrans, onemkl::diag::unit, 30, 2)); + EXPECT_TRUEORSKIP(test>( + GetParam(), onemkl::uplo::upper, onemkl::transpose::conjtrans, onemkl::diag::unit, 30, 2)); + EXPECT_TRUEORSKIP(test>(GetParam(), onemkl::uplo::lower, + onemkl::transpose::nontrans, onemkl::diag::nonunit, + 30, 2)); + EXPECT_TRUEORSKIP(test>(GetParam(), onemkl::uplo::upper, + onemkl::transpose::nontrans, onemkl::diag::nonunit, + 30, 2)); + EXPECT_TRUEORSKIP(test>( + GetParam(), onemkl::uplo::lower, onemkl::transpose::trans, onemkl::diag::nonunit, 30, 2)); + EXPECT_TRUEORSKIP(test>( + GetParam(), onemkl::uplo::upper, onemkl::transpose::trans, onemkl::diag::nonunit, 30, 2)); + EXPECT_TRUEORSKIP(test>(GetParam(), onemkl::uplo::lower, + onemkl::transpose::conjtrans, onemkl::diag::nonunit, + 30, 2)); + EXPECT_TRUEORSKIP(test>(GetParam(), onemkl::uplo::upper, + onemkl::transpose::conjtrans, onemkl::diag::nonunit, + 30, 2)); +} +TEST_P(TpsvUsmTests, ComplexDoublePrecision) { + EXPECT_TRUEORSKIP(test>( + GetParam(), onemkl::uplo::lower, onemkl::transpose::nontrans, onemkl::diag::unit, 30, 2)); + EXPECT_TRUEORSKIP(test>( + GetParam(), onemkl::uplo::upper, onemkl::transpose::nontrans, onemkl::diag::unit, 30, 2)); + EXPECT_TRUEORSKIP(test>( + GetParam(), 
onemkl::uplo::lower, onemkl::transpose::trans, onemkl::diag::unit, 30, 2)); + EXPECT_TRUEORSKIP(test>( + GetParam(), onemkl::uplo::upper, onemkl::transpose::trans, onemkl::diag::unit, 30, 2)); + EXPECT_TRUEORSKIP(test>( + GetParam(), onemkl::uplo::lower, onemkl::transpose::conjtrans, onemkl::diag::unit, 30, 2)); + EXPECT_TRUEORSKIP(test>( + GetParam(), onemkl::uplo::upper, onemkl::transpose::conjtrans, onemkl::diag::unit, 30, 2)); + EXPECT_TRUEORSKIP(test>(GetParam(), onemkl::uplo::lower, + onemkl::transpose::nontrans, onemkl::diag::nonunit, + 30, 2)); + EXPECT_TRUEORSKIP(test>(GetParam(), onemkl::uplo::upper, + onemkl::transpose::nontrans, onemkl::diag::nonunit, + 30, 2)); + EXPECT_TRUEORSKIP(test>( + GetParam(), onemkl::uplo::lower, onemkl::transpose::trans, onemkl::diag::nonunit, 30, 2)); + EXPECT_TRUEORSKIP(test>( + GetParam(), onemkl::uplo::upper, onemkl::transpose::trans, onemkl::diag::nonunit, 30, 2)); + EXPECT_TRUEORSKIP(test>(GetParam(), onemkl::uplo::lower, + onemkl::transpose::conjtrans, + onemkl::diag::nonunit, 30, 2)); + EXPECT_TRUEORSKIP(test>(GetParam(), onemkl::uplo::upper, + onemkl::transpose::conjtrans, + onemkl::diag::nonunit, 30, 2)); +} + +INSTANTIATE_TEST_SUITE_P(TpsvUsmTestSuite, TpsvUsmTests, ::testing::ValuesIn(devices), + ::DeviceNamePrint()); + +} // anonymous namespace diff --git a/tests/unit_tests/blas/level2/trmv.cpp b/tests/unit_tests/blas/level2/trmv.cpp index e64b14155..5678da343 100644 --- a/tests/unit_tests/blas/level2/trmv.cpp +++ b/tests/unit_tests/blas/level2/trmv.cpp @@ -43,8 +43,8 @@ extern std::vector devices; namespace { template -bool test(const device& dev, onemkl::uplo upper_lower, onemkl::transpose transa, - onemkl::diag unit_nonunit, int n, int incx, int lda) { +int test(const device& dev, onemkl::uplo upper_lower, onemkl::transpose transa, + onemkl::diag unit_nonunit, int n, int incx, int lda) { // Prepare data. 
vector x, x_ref, A; rand_vector(x, n, incx); @@ -96,6 +96,14 @@ bool test(const device& dev, onemkl::uplo upper_lower, onemkl::transpose transa, << "OpenCL status: " << e.get_cl_code() << std::endl; } + catch (const onemkl::backend_unsupported_exception& e) { + return test_skipped; + } + + catch (const std::runtime_error& error) { + std::cout << "Error raised during execution of TRMV:\n" << error.what() << std::endl; + } + // Compare the results of reference implementation and DPC++ implementation. bool good; { @@ -103,116 +111,118 @@ bool test(const device& dev, onemkl::uplo upper_lower, onemkl::transpose transa, good = check_equal_vector(x_accessor, x_ref, n, incx, n, std::cout); } - return good; + return (int)good; } class TrmvTests : public ::testing::TestWithParam {}; TEST_P(TrmvTests, RealSinglePrecision) { - EXPECT_TRUE(test(GetParam(), onemkl::uplo::lower, onemkl::transpose::nontrans, - onemkl::diag::unit, 30, 2, 42)); - EXPECT_TRUE(test(GetParam(), onemkl::uplo::upper, onemkl::transpose::nontrans, - onemkl::diag::unit, 30, 2, 42)); - EXPECT_TRUE(test(GetParam(), onemkl::uplo::lower, onemkl::transpose::trans, - onemkl::diag::unit, 30, 2, 42)); - EXPECT_TRUE(test(GetParam(), onemkl::uplo::upper, onemkl::transpose::trans, - onemkl::diag::unit, 30, 2, 42)); - EXPECT_TRUE(test(GetParam(), onemkl::uplo::lower, onemkl::transpose::nontrans, - onemkl::diag::nonunit, 30, 2, 42)); - EXPECT_TRUE(test(GetParam(), onemkl::uplo::upper, onemkl::transpose::nontrans, - onemkl::diag::nonunit, 30, 2, 42)); - EXPECT_TRUE(test(GetParam(), onemkl::uplo::lower, onemkl::transpose::trans, - onemkl::diag::nonunit, 30, 2, 42)); - EXPECT_TRUE(test(GetParam(), onemkl::uplo::upper, onemkl::transpose::trans, - onemkl::diag::nonunit, 30, 2, 42)); + EXPECT_TRUEORSKIP(test(GetParam(), onemkl::uplo::lower, onemkl::transpose::nontrans, + onemkl::diag::unit, 30, 2, 42)); + EXPECT_TRUEORSKIP(test(GetParam(), onemkl::uplo::upper, onemkl::transpose::nontrans, + onemkl::diag::unit, 30, 2, 42)); + 
EXPECT_TRUEORSKIP(test(GetParam(), onemkl::uplo::lower, onemkl::transpose::trans, + onemkl::diag::unit, 30, 2, 42)); + EXPECT_TRUEORSKIP(test(GetParam(), onemkl::uplo::upper, onemkl::transpose::trans, + onemkl::diag::unit, 30, 2, 42)); + EXPECT_TRUEORSKIP(test(GetParam(), onemkl::uplo::lower, onemkl::transpose::nontrans, + onemkl::diag::nonunit, 30, 2, 42)); + EXPECT_TRUEORSKIP(test(GetParam(), onemkl::uplo::upper, onemkl::transpose::nontrans, + onemkl::diag::nonunit, 30, 2, 42)); + EXPECT_TRUEORSKIP(test(GetParam(), onemkl::uplo::lower, onemkl::transpose::trans, + onemkl::diag::nonunit, 30, 2, 42)); + EXPECT_TRUEORSKIP(test(GetParam(), onemkl::uplo::upper, onemkl::transpose::trans, + onemkl::diag::nonunit, 30, 2, 42)); } TEST_P(TrmvTests, RealDoublePrecision) { - EXPECT_TRUE(test(GetParam(), onemkl::uplo::lower, onemkl::transpose::nontrans, - onemkl::diag::unit, 30, 2, 42)); - EXPECT_TRUE(test(GetParam(), onemkl::uplo::upper, onemkl::transpose::nontrans, - onemkl::diag::unit, 30, 2, 42)); - EXPECT_TRUE(test(GetParam(), onemkl::uplo::lower, onemkl::transpose::trans, - onemkl::diag::unit, 30, 2, 42)); - EXPECT_TRUE(test(GetParam(), onemkl::uplo::upper, onemkl::transpose::trans, - onemkl::diag::unit, 30, 2, 42)); - EXPECT_TRUE(test(GetParam(), onemkl::uplo::lower, onemkl::transpose::nontrans, - onemkl::diag::nonunit, 30, 2, 42)); - EXPECT_TRUE(test(GetParam(), onemkl::uplo::upper, onemkl::transpose::nontrans, - onemkl::diag::nonunit, 30, 2, 42)); - EXPECT_TRUE(test(GetParam(), onemkl::uplo::lower, onemkl::transpose::trans, - onemkl::diag::nonunit, 30, 2, 42)); - EXPECT_TRUE(test(GetParam(), onemkl::uplo::upper, onemkl::transpose::trans, - onemkl::diag::nonunit, 30, 2, 42)); + EXPECT_TRUEORSKIP(test(GetParam(), onemkl::uplo::lower, onemkl::transpose::nontrans, + onemkl::diag::unit, 30, 2, 42)); + EXPECT_TRUEORSKIP(test(GetParam(), onemkl::uplo::upper, onemkl::transpose::nontrans, + onemkl::diag::unit, 30, 2, 42)); + EXPECT_TRUEORSKIP(test(GetParam(), 
onemkl::uplo::lower, onemkl::transpose::trans, + onemkl::diag::unit, 30, 2, 42)); + EXPECT_TRUEORSKIP(test(GetParam(), onemkl::uplo::upper, onemkl::transpose::trans, + onemkl::diag::unit, 30, 2, 42)); + EXPECT_TRUEORSKIP(test(GetParam(), onemkl::uplo::lower, onemkl::transpose::nontrans, + onemkl::diag::nonunit, 30, 2, 42)); + EXPECT_TRUEORSKIP(test(GetParam(), onemkl::uplo::upper, onemkl::transpose::nontrans, + onemkl::diag::nonunit, 30, 2, 42)); + EXPECT_TRUEORSKIP(test(GetParam(), onemkl::uplo::lower, onemkl::transpose::trans, + onemkl::diag::nonunit, 30, 2, 42)); + EXPECT_TRUEORSKIP(test(GetParam(), onemkl::uplo::upper, onemkl::transpose::trans, + onemkl::diag::nonunit, 30, 2, 42)); } TEST_P(TrmvTests, ComplexSinglePrecision) { - EXPECT_TRUE(test>(GetParam(), onemkl::uplo::lower, - onemkl::transpose::nontrans, onemkl::diag::unit, 30, 2, - 42)); - EXPECT_TRUE(test>(GetParam(), onemkl::uplo::upper, - onemkl::transpose::nontrans, onemkl::diag::unit, 30, 2, - 42)); - EXPECT_TRUE(test>(GetParam(), onemkl::uplo::lower, onemkl::transpose::trans, - onemkl::diag::unit, 30, 2, 42)); - EXPECT_TRUE(test>(GetParam(), onemkl::uplo::upper, onemkl::transpose::trans, - onemkl::diag::unit, 30, 2, 42)); - EXPECT_TRUE(test>(GetParam(), onemkl::uplo::lower, - onemkl::transpose::conjtrans, onemkl::diag::unit, 30, 2, - 42)); - EXPECT_TRUE(test>(GetParam(), onemkl::uplo::upper, - onemkl::transpose::conjtrans, onemkl::diag::unit, 30, 2, - 42)); - EXPECT_TRUE(test>(GetParam(), onemkl::uplo::lower, - onemkl::transpose::nontrans, onemkl::diag::nonunit, 30, 2, - 42)); - EXPECT_TRUE(test>(GetParam(), onemkl::uplo::upper, - onemkl::transpose::nontrans, onemkl::diag::nonunit, 30, 2, - 42)); - EXPECT_TRUE(test>(GetParam(), onemkl::uplo::lower, onemkl::transpose::trans, - onemkl::diag::nonunit, 30, 2, 42)); - EXPECT_TRUE(test>(GetParam(), onemkl::uplo::upper, onemkl::transpose::trans, - onemkl::diag::nonunit, 30, 2, 42)); - EXPECT_TRUE(test>(GetParam(), onemkl::uplo::lower, - 
onemkl::transpose::conjtrans, onemkl::diag::nonunit, 30, - 2, 42)); - EXPECT_TRUE(test>(GetParam(), onemkl::uplo::upper, - onemkl::transpose::conjtrans, onemkl::diag::nonunit, 30, - 2, 42)); + EXPECT_TRUEORSKIP(test>(GetParam(), onemkl::uplo::lower, + onemkl::transpose::nontrans, onemkl::diag::unit, 30, + 2, 42)); + EXPECT_TRUEORSKIP(test>(GetParam(), onemkl::uplo::upper, + onemkl::transpose::nontrans, onemkl::diag::unit, 30, + 2, 42)); + EXPECT_TRUEORSKIP(test>( + GetParam(), onemkl::uplo::lower, onemkl::transpose::trans, onemkl::diag::unit, 30, 2, 42)); + EXPECT_TRUEORSKIP(test>( + GetParam(), onemkl::uplo::upper, onemkl::transpose::trans, onemkl::diag::unit, 30, 2, 42)); + EXPECT_TRUEORSKIP(test>(GetParam(), onemkl::uplo::lower, + onemkl::transpose::conjtrans, onemkl::diag::unit, + 30, 2, 42)); + EXPECT_TRUEORSKIP(test>(GetParam(), onemkl::uplo::upper, + onemkl::transpose::conjtrans, onemkl::diag::unit, + 30, 2, 42)); + EXPECT_TRUEORSKIP(test>(GetParam(), onemkl::uplo::lower, + onemkl::transpose::nontrans, onemkl::diag::nonunit, + 30, 2, 42)); + EXPECT_TRUEORSKIP(test>(GetParam(), onemkl::uplo::upper, + onemkl::transpose::nontrans, onemkl::diag::nonunit, + 30, 2, 42)); + EXPECT_TRUEORSKIP(test>(GetParam(), onemkl::uplo::lower, + onemkl::transpose::trans, onemkl::diag::nonunit, 30, + 2, 42)); + EXPECT_TRUEORSKIP(test>(GetParam(), onemkl::uplo::upper, + onemkl::transpose::trans, onemkl::diag::nonunit, 30, + 2, 42)); + EXPECT_TRUEORSKIP(test>(GetParam(), onemkl::uplo::lower, + onemkl::transpose::conjtrans, onemkl::diag::nonunit, + 30, 2, 42)); + EXPECT_TRUEORSKIP(test>(GetParam(), onemkl::uplo::upper, + onemkl::transpose::conjtrans, onemkl::diag::nonunit, + 30, 2, 42)); } TEST_P(TrmvTests, ComplexDoublePrecision) { - EXPECT_TRUE(test>(GetParam(), onemkl::uplo::lower, - onemkl::transpose::nontrans, onemkl::diag::unit, 30, 2, - 42)); - EXPECT_TRUE(test>(GetParam(), onemkl::uplo::upper, - onemkl::transpose::nontrans, onemkl::diag::unit, 30, 2, - 42)); - 
EXPECT_TRUE(test>( + EXPECT_TRUEORSKIP(test>(GetParam(), onemkl::uplo::lower, + onemkl::transpose::nontrans, onemkl::diag::unit, + 30, 2, 42)); + EXPECT_TRUEORSKIP(test>(GetParam(), onemkl::uplo::upper, + onemkl::transpose::nontrans, onemkl::diag::unit, + 30, 2, 42)); + EXPECT_TRUEORSKIP(test>( GetParam(), onemkl::uplo::lower, onemkl::transpose::trans, onemkl::diag::unit, 30, 2, 42)); - EXPECT_TRUE(test>( + EXPECT_TRUEORSKIP(test>( GetParam(), onemkl::uplo::upper, onemkl::transpose::trans, onemkl::diag::unit, 30, 2, 42)); - EXPECT_TRUE(test>(GetParam(), onemkl::uplo::lower, - onemkl::transpose::conjtrans, onemkl::diag::unit, 30, 2, - 42)); - EXPECT_TRUE(test>(GetParam(), onemkl::uplo::upper, - onemkl::transpose::conjtrans, onemkl::diag::unit, 30, 2, - 42)); - EXPECT_TRUE(test>(GetParam(), onemkl::uplo::lower, - onemkl::transpose::nontrans, onemkl::diag::nonunit, 30, - 2, 42)); - EXPECT_TRUE(test>(GetParam(), onemkl::uplo::upper, - onemkl::transpose::nontrans, onemkl::diag::nonunit, 30, - 2, 42)); - EXPECT_TRUE(test>(GetParam(), onemkl::uplo::lower, - onemkl::transpose::trans, onemkl::diag::nonunit, 30, 2, - 42)); - EXPECT_TRUE(test>(GetParam(), onemkl::uplo::upper, - onemkl::transpose::trans, onemkl::diag::nonunit, 30, 2, - 42)); - EXPECT_TRUE(test>(GetParam(), onemkl::uplo::lower, - onemkl::transpose::conjtrans, onemkl::diag::nonunit, 30, - 2, 42)); - EXPECT_TRUE(test>(GetParam(), onemkl::uplo::upper, - onemkl::transpose::conjtrans, onemkl::diag::nonunit, 30, - 2, 42)); + EXPECT_TRUEORSKIP(test>(GetParam(), onemkl::uplo::lower, + onemkl::transpose::conjtrans, onemkl::diag::unit, + 30, 2, 42)); + EXPECT_TRUEORSKIP(test>(GetParam(), onemkl::uplo::upper, + onemkl::transpose::conjtrans, onemkl::diag::unit, + 30, 2, 42)); + EXPECT_TRUEORSKIP(test>(GetParam(), onemkl::uplo::lower, + onemkl::transpose::nontrans, onemkl::diag::nonunit, + 30, 2, 42)); + EXPECT_TRUEORSKIP(test>(GetParam(), onemkl::uplo::upper, + onemkl::transpose::nontrans, onemkl::diag::nonunit, + 30, 2, 
42)); + EXPECT_TRUEORSKIP(test>(GetParam(), onemkl::uplo::lower, + onemkl::transpose::trans, onemkl::diag::nonunit, + 30, 2, 42)); + EXPECT_TRUEORSKIP(test>(GetParam(), onemkl::uplo::upper, + onemkl::transpose::trans, onemkl::diag::nonunit, + 30, 2, 42)); + EXPECT_TRUEORSKIP(test>(GetParam(), onemkl::uplo::lower, + onemkl::transpose::conjtrans, + onemkl::diag::nonunit, 30, 2, 42)); + EXPECT_TRUEORSKIP(test>(GetParam(), onemkl::uplo::upper, + onemkl::transpose::conjtrans, + onemkl::diag::nonunit, 30, 2, 42)); } INSTANTIATE_TEST_SUITE_P(TrmvTestSuite, TrmvTests, ::testing::ValuesIn(devices), diff --git a/tests/unit_tests/blas/level2/trmv_usm.cpp b/tests/unit_tests/blas/level2/trmv_usm.cpp new file mode 100644 index 000000000..909536b56 --- /dev/null +++ b/tests/unit_tests/blas/level2/trmv_usm.cpp @@ -0,0 +1,232 @@ +/******************************************************************************* +* Copyright 2020 Intel Corporation +* +* Licensed under the Apache License, Version 2.0 (the "License"); +* you may not use this file except in compliance with the License. +* You may obtain a copy of the License at +* +* http://www.apache.org/licenses/LICENSE-2.0 +* +* Unless required by applicable law or agreed to in writing, +* software distributed under the License is distributed on an "AS IS" BASIS, +* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +* See the License for the specific language governing permissions +* and limitations under the License. 
+* +* +* SPDX-License-Identifier: Apache-2.0 +*******************************************************************************/ + +#include +#include +#include +#include +#include +#include + +#include +#include "cblas.h" +#include "onemkl/detail/config.hpp" +#include "onemkl/onemkl.hpp" +#include "onemkl_blas_helper.hpp" +#include "reference_blas_templates.hpp" +#include "test_common.hpp" +#include "test_helper.hpp" + +#include + +using namespace cl::sycl; +using std::vector; + +extern std::vector devices; + +namespace { + +template +int test(const device& dev, onemkl::uplo upper_lower, onemkl::transpose transa, + onemkl::diag unit_nonunit, int n, int incx, int lda) { + // Catch asynchronous exceptions. + auto exception_handler = [](exception_list exceptions) { + for (std::exception_ptr const& e : exceptions) { + try { + std::rethrow_exception(e); + } + catch (exception const& e) { + std::cout << "Caught asynchronous SYCL exception during TRMV:\n" + << e.what() << std::endl + << "OpenCL status: " << e.get_cl_code() << std::endl; + } + } + }; + + queue main_queue(dev, exception_handler); + context cxt = main_queue.get_context(); + event done; + std::vector dependencies; + + // Prepare data. + auto ua = usm_allocator(cxt, dev); + vector x(ua), A(ua); + rand_vector(x, n, incx); + rand_matrix(A, transa, n, n, lda); + + auto x_ref = x; + + // Call Reference TRMV. + const int n_ref = n, incx_ref = incx, lda_ref = lda; + using fp_ref = typename ref_type_info::type; + + ::trmv(convert_to_cblas_uplo(upper_lower), convert_to_cblas_trans(transa), + convert_to_cblas_diag(unit_nonunit), &n_ref, (fp_ref*)A.data(), &lda_ref, + (fp_ref*)x_ref.data(), &incx_ref); + + // Call DPC++ TRMV. 
+ + try { +#ifdef CALL_RT_API + done = onemkl::blas::trmv(main_queue, upper_lower, transa, unit_nonunit, n, A.data(), lda, + x.data(), incx, dependencies); + done.wait(); +#else + TEST_RUN_CT(main_queue, onemkl::blas::trmv, + (main_queue, upper_lower, transa, unit_nonunit, n, A.data(), lda, x.data(), + incx, dependencies)); + main_queue.wait(); +#endif + } + catch (exception const& e) { + std::cout << "Caught synchronous SYCL exception during TRMV:\n" + << e.what() << std::endl + << "OpenCL status: " << e.get_cl_code() << std::endl; + } + + catch (const onemkl::backend_unsupported_exception& e) { + return test_skipped; + } + + catch (const std::runtime_error& error) { + std::cout << "Error raised during execution of TRMV:\n" << error.what() << std::endl; + } + + // Compare the results of reference implementation and DPC++ implementation. + + bool good = check_equal_vector(x, x_ref, n, incx, n, std::cout); + + return (int)good; +} + +class TrmvUsmTests : public ::testing::TestWithParam {}; + +TEST_P(TrmvUsmTests, RealSinglePrecision) { + EXPECT_TRUEORSKIP(test(GetParam(), onemkl::uplo::lower, onemkl::transpose::nontrans, + onemkl::diag::unit, 30, 2, 42)); + EXPECT_TRUEORSKIP(test(GetParam(), onemkl::uplo::upper, onemkl::transpose::nontrans, + onemkl::diag::unit, 30, 2, 42)); + EXPECT_TRUEORSKIP(test(GetParam(), onemkl::uplo::lower, onemkl::transpose::trans, + onemkl::diag::unit, 30, 2, 42)); + EXPECT_TRUEORSKIP(test(GetParam(), onemkl::uplo::upper, onemkl::transpose::trans, + onemkl::diag::unit, 30, 2, 42)); + EXPECT_TRUEORSKIP(test(GetParam(), onemkl::uplo::lower, onemkl::transpose::nontrans, + onemkl::diag::nonunit, 30, 2, 42)); + EXPECT_TRUEORSKIP(test(GetParam(), onemkl::uplo::upper, onemkl::transpose::nontrans, + onemkl::diag::nonunit, 30, 2, 42)); + EXPECT_TRUEORSKIP(test(GetParam(), onemkl::uplo::lower, onemkl::transpose::trans, + onemkl::diag::nonunit, 30, 2, 42)); + EXPECT_TRUEORSKIP(test(GetParam(), onemkl::uplo::upper, onemkl::transpose::trans, + 
onemkl::diag::nonunit, 30, 2, 42)); +} +TEST_P(TrmvUsmTests, RealDoublePrecision) { + EXPECT_TRUEORSKIP(test(GetParam(), onemkl::uplo::lower, onemkl::transpose::nontrans, + onemkl::diag::unit, 30, 2, 42)); + EXPECT_TRUEORSKIP(test(GetParam(), onemkl::uplo::upper, onemkl::transpose::nontrans, + onemkl::diag::unit, 30, 2, 42)); + EXPECT_TRUEORSKIP(test(GetParam(), onemkl::uplo::lower, onemkl::transpose::trans, + onemkl::diag::unit, 30, 2, 42)); + EXPECT_TRUEORSKIP(test(GetParam(), onemkl::uplo::upper, onemkl::transpose::trans, + onemkl::diag::unit, 30, 2, 42)); + EXPECT_TRUEORSKIP(test(GetParam(), onemkl::uplo::lower, onemkl::transpose::nontrans, + onemkl::diag::nonunit, 30, 2, 42)); + EXPECT_TRUEORSKIP(test(GetParam(), onemkl::uplo::upper, onemkl::transpose::nontrans, + onemkl::diag::nonunit, 30, 2, 42)); + EXPECT_TRUEORSKIP(test(GetParam(), onemkl::uplo::lower, onemkl::transpose::trans, + onemkl::diag::nonunit, 30, 2, 42)); + EXPECT_TRUEORSKIP(test(GetParam(), onemkl::uplo::upper, onemkl::transpose::trans, + onemkl::diag::nonunit, 30, 2, 42)); +} +TEST_P(TrmvUsmTests, ComplexSinglePrecision) { + EXPECT_TRUEORSKIP(test>(GetParam(), onemkl::uplo::lower, + onemkl::transpose::nontrans, onemkl::diag::unit, 30, + 2, 42)); + EXPECT_TRUEORSKIP(test>(GetParam(), onemkl::uplo::upper, + onemkl::transpose::nontrans, onemkl::diag::unit, 30, + 2, 42)); + EXPECT_TRUEORSKIP(test>( + GetParam(), onemkl::uplo::lower, onemkl::transpose::trans, onemkl::diag::unit, 30, 2, 42)); + EXPECT_TRUEORSKIP(test>( + GetParam(), onemkl::uplo::upper, onemkl::transpose::trans, onemkl::diag::unit, 30, 2, 42)); + EXPECT_TRUEORSKIP(test>(GetParam(), onemkl::uplo::lower, + onemkl::transpose::conjtrans, onemkl::diag::unit, + 30, 2, 42)); + EXPECT_TRUEORSKIP(test>(GetParam(), onemkl::uplo::upper, + onemkl::transpose::conjtrans, onemkl::diag::unit, + 30, 2, 42)); + EXPECT_TRUEORSKIP(test>(GetParam(), onemkl::uplo::lower, + onemkl::transpose::nontrans, onemkl::diag::nonunit, + 30, 2, 42)); + 
EXPECT_TRUEORSKIP(test>(GetParam(), onemkl::uplo::upper, + onemkl::transpose::nontrans, onemkl::diag::nonunit, + 30, 2, 42)); + EXPECT_TRUEORSKIP(test>(GetParam(), onemkl::uplo::lower, + onemkl::transpose::trans, onemkl::diag::nonunit, 30, + 2, 42)); + EXPECT_TRUEORSKIP(test>(GetParam(), onemkl::uplo::upper, + onemkl::transpose::trans, onemkl::diag::nonunit, 30, + 2, 42)); + EXPECT_TRUEORSKIP(test>(GetParam(), onemkl::uplo::lower, + onemkl::transpose::conjtrans, onemkl::diag::nonunit, + 30, 2, 42)); + EXPECT_TRUEORSKIP(test>(GetParam(), onemkl::uplo::upper, + onemkl::transpose::conjtrans, onemkl::diag::nonunit, + 30, 2, 42)); +} +TEST_P(TrmvUsmTests, ComplexDoublePrecision) { + EXPECT_TRUEORSKIP(test>(GetParam(), onemkl::uplo::lower, + onemkl::transpose::nontrans, onemkl::diag::unit, + 30, 2, 42)); + EXPECT_TRUEORSKIP(test>(GetParam(), onemkl::uplo::upper, + onemkl::transpose::nontrans, onemkl::diag::unit, + 30, 2, 42)); + EXPECT_TRUEORSKIP(test>( + GetParam(), onemkl::uplo::lower, onemkl::transpose::trans, onemkl::diag::unit, 30, 2, 42)); + EXPECT_TRUEORSKIP(test>( + GetParam(), onemkl::uplo::upper, onemkl::transpose::trans, onemkl::diag::unit, 30, 2, 42)); + EXPECT_TRUEORSKIP(test>(GetParam(), onemkl::uplo::lower, + onemkl::transpose::conjtrans, onemkl::diag::unit, + 30, 2, 42)); + EXPECT_TRUEORSKIP(test>(GetParam(), onemkl::uplo::upper, + onemkl::transpose::conjtrans, onemkl::diag::unit, + 30, 2, 42)); + EXPECT_TRUEORSKIP(test>(GetParam(), onemkl::uplo::lower, + onemkl::transpose::nontrans, onemkl::diag::nonunit, + 30, 2, 42)); + EXPECT_TRUEORSKIP(test>(GetParam(), onemkl::uplo::upper, + onemkl::transpose::nontrans, onemkl::diag::nonunit, + 30, 2, 42)); + EXPECT_TRUEORSKIP(test>(GetParam(), onemkl::uplo::lower, + onemkl::transpose::trans, onemkl::diag::nonunit, + 30, 2, 42)); + EXPECT_TRUEORSKIP(test>(GetParam(), onemkl::uplo::upper, + onemkl::transpose::trans, onemkl::diag::nonunit, + 30, 2, 42)); + EXPECT_TRUEORSKIP(test>(GetParam(), onemkl::uplo::lower, + 
onemkl::transpose::conjtrans, + onemkl::diag::nonunit, 30, 2, 42)); + EXPECT_TRUEORSKIP(test>(GetParam(), onemkl::uplo::upper, + onemkl::transpose::conjtrans, + onemkl::diag::nonunit, 30, 2, 42)); +} + +INSTANTIATE_TEST_SUITE_P(TrmvUsmTestSuite, TrmvUsmTests, ::testing::ValuesIn(devices), + ::DeviceNamePrint()); + +} // anonymous namespace diff --git a/tests/unit_tests/blas/level2/trsv.cpp b/tests/unit_tests/blas/level2/trsv.cpp index 42daa93d9..1982bd5f4 100644 --- a/tests/unit_tests/blas/level2/trsv.cpp +++ b/tests/unit_tests/blas/level2/trsv.cpp @@ -43,8 +43,8 @@ extern std::vector devices; namespace { template -bool test(const device& dev, onemkl::uplo upper_lower, onemkl::transpose transa, - onemkl::diag unit_nonunit, int n, int incx, int lda) { +int test(const device& dev, onemkl::uplo upper_lower, onemkl::transpose transa, + onemkl::diag unit_nonunit, int n, int incx, int lda) { // Prepare data. vector x, x_ref, A; rand_vector(x, n, incx); @@ -96,6 +96,14 @@ bool test(const device& dev, onemkl::uplo upper_lower, onemkl::transpose transa, << "OpenCL status: " << e.get_cl_code() << std::endl; } + catch (const onemkl::backend_unsupported_exception& e) { + return test_skipped; + } + + catch (const std::runtime_error& error) { + std::cout << "Error raised during execution of TRSV:\n" << error.what() << std::endl; + } + // Compare the results of reference implementation and DPC++ implementation. 
bool good; { @@ -103,116 +111,118 @@ bool test(const device& dev, onemkl::uplo upper_lower, onemkl::transpose transa, good = check_equal_trsv_vector(x_accessor, x_ref, n, incx, n, std::cout); } - return good; + return (int)good; } class TrsvTests : public ::testing::TestWithParam {}; TEST_P(TrsvTests, RealSinglePrecision) { - EXPECT_TRUE(test(GetParam(), onemkl::uplo::lower, onemkl::transpose::nontrans, - onemkl::diag::unit, 30, 2, 42)); - EXPECT_TRUE(test(GetParam(), onemkl::uplo::upper, onemkl::transpose::nontrans, - onemkl::diag::unit, 30, 2, 42)); - EXPECT_TRUE(test(GetParam(), onemkl::uplo::lower, onemkl::transpose::trans, - onemkl::diag::unit, 30, 2, 42)); - EXPECT_TRUE(test(GetParam(), onemkl::uplo::upper, onemkl::transpose::trans, - onemkl::diag::unit, 30, 2, 42)); - EXPECT_TRUE(test(GetParam(), onemkl::uplo::lower, onemkl::transpose::nontrans, - onemkl::diag::nonunit, 30, 2, 42)); - EXPECT_TRUE(test(GetParam(), onemkl::uplo::upper, onemkl::transpose::nontrans, - onemkl::diag::nonunit, 30, 2, 42)); - EXPECT_TRUE(test(GetParam(), onemkl::uplo::lower, onemkl::transpose::trans, - onemkl::diag::nonunit, 30, 2, 42)); - EXPECT_TRUE(test(GetParam(), onemkl::uplo::upper, onemkl::transpose::trans, - onemkl::diag::nonunit, 30, 2, 42)); + EXPECT_TRUEORSKIP(test(GetParam(), onemkl::uplo::lower, onemkl::transpose::nontrans, + onemkl::diag::unit, 30, 2, 42)); + EXPECT_TRUEORSKIP(test(GetParam(), onemkl::uplo::upper, onemkl::transpose::nontrans, + onemkl::diag::unit, 30, 2, 42)); + EXPECT_TRUEORSKIP(test(GetParam(), onemkl::uplo::lower, onemkl::transpose::trans, + onemkl::diag::unit, 30, 2, 42)); + EXPECT_TRUEORSKIP(test(GetParam(), onemkl::uplo::upper, onemkl::transpose::trans, + onemkl::diag::unit, 30, 2, 42)); + EXPECT_TRUEORSKIP(test(GetParam(), onemkl::uplo::lower, onemkl::transpose::nontrans, + onemkl::diag::nonunit, 30, 2, 42)); + EXPECT_TRUEORSKIP(test(GetParam(), onemkl::uplo::upper, onemkl::transpose::nontrans, + onemkl::diag::nonunit, 30, 2, 42)); + 
EXPECT_TRUEORSKIP(test(GetParam(), onemkl::uplo::lower, onemkl::transpose::trans, + onemkl::diag::nonunit, 30, 2, 42)); + EXPECT_TRUEORSKIP(test(GetParam(), onemkl::uplo::upper, onemkl::transpose::trans, + onemkl::diag::nonunit, 30, 2, 42)); } TEST_P(TrsvTests, RealDoublePrecision) { - EXPECT_TRUE(test(GetParam(), onemkl::uplo::lower, onemkl::transpose::nontrans, - onemkl::diag::unit, 30, 2, 42)); - EXPECT_TRUE(test(GetParam(), onemkl::uplo::upper, onemkl::transpose::nontrans, - onemkl::diag::unit, 30, 2, 42)); - EXPECT_TRUE(test(GetParam(), onemkl::uplo::lower, onemkl::transpose::trans, - onemkl::diag::unit, 30, 2, 42)); - EXPECT_TRUE(test(GetParam(), onemkl::uplo::upper, onemkl::transpose::trans, - onemkl::diag::unit, 30, 2, 42)); - EXPECT_TRUE(test(GetParam(), onemkl::uplo::lower, onemkl::transpose::nontrans, - onemkl::diag::nonunit, 30, 2, 42)); - EXPECT_TRUE(test(GetParam(), onemkl::uplo::upper, onemkl::transpose::nontrans, - onemkl::diag::nonunit, 30, 2, 42)); - EXPECT_TRUE(test(GetParam(), onemkl::uplo::lower, onemkl::transpose::trans, - onemkl::diag::nonunit, 30, 2, 42)); - EXPECT_TRUE(test(GetParam(), onemkl::uplo::upper, onemkl::transpose::trans, - onemkl::diag::nonunit, 30, 2, 42)); + EXPECT_TRUEORSKIP(test(GetParam(), onemkl::uplo::lower, onemkl::transpose::nontrans, + onemkl::diag::unit, 30, 2, 42)); + EXPECT_TRUEORSKIP(test(GetParam(), onemkl::uplo::upper, onemkl::transpose::nontrans, + onemkl::diag::unit, 30, 2, 42)); + EXPECT_TRUEORSKIP(test(GetParam(), onemkl::uplo::lower, onemkl::transpose::trans, + onemkl::diag::unit, 30, 2, 42)); + EXPECT_TRUEORSKIP(test(GetParam(), onemkl::uplo::upper, onemkl::transpose::trans, + onemkl::diag::unit, 30, 2, 42)); + EXPECT_TRUEORSKIP(test(GetParam(), onemkl::uplo::lower, onemkl::transpose::nontrans, + onemkl::diag::nonunit, 30, 2, 42)); + EXPECT_TRUEORSKIP(test(GetParam(), onemkl::uplo::upper, onemkl::transpose::nontrans, + onemkl::diag::nonunit, 30, 2, 42)); + EXPECT_TRUEORSKIP(test(GetParam(), 
onemkl::uplo::lower, onemkl::transpose::trans, + onemkl::diag::nonunit, 30, 2, 42)); + EXPECT_TRUEORSKIP(test(GetParam(), onemkl::uplo::upper, onemkl::transpose::trans, + onemkl::diag::nonunit, 30, 2, 42)); } TEST_P(TrsvTests, ComplexSinglePrecision) { - EXPECT_TRUE(test>(GetParam(), onemkl::uplo::lower, - onemkl::transpose::nontrans, onemkl::diag::unit, 30, 2, - 42)); - EXPECT_TRUE(test>(GetParam(), onemkl::uplo::upper, - onemkl::transpose::nontrans, onemkl::diag::unit, 30, 2, - 42)); - EXPECT_TRUE(test>(GetParam(), onemkl::uplo::lower, onemkl::transpose::trans, - onemkl::diag::unit, 30, 2, 42)); - EXPECT_TRUE(test>(GetParam(), onemkl::uplo::upper, onemkl::transpose::trans, - onemkl::diag::unit, 30, 2, 42)); - EXPECT_TRUE(test>(GetParam(), onemkl::uplo::lower, - onemkl::transpose::conjtrans, onemkl::diag::unit, 30, 2, - 42)); - EXPECT_TRUE(test>(GetParam(), onemkl::uplo::upper, - onemkl::transpose::conjtrans, onemkl::diag::unit, 30, 2, - 42)); - EXPECT_TRUE(test>(GetParam(), onemkl::uplo::lower, - onemkl::transpose::nontrans, onemkl::diag::nonunit, 30, 2, - 42)); - EXPECT_TRUE(test>(GetParam(), onemkl::uplo::upper, - onemkl::transpose::nontrans, onemkl::diag::nonunit, 30, 2, - 42)); - EXPECT_TRUE(test>(GetParam(), onemkl::uplo::lower, onemkl::transpose::trans, - onemkl::diag::nonunit, 30, 2, 42)); - EXPECT_TRUE(test>(GetParam(), onemkl::uplo::upper, onemkl::transpose::trans, - onemkl::diag::nonunit, 30, 2, 42)); - EXPECT_TRUE(test>(GetParam(), onemkl::uplo::lower, - onemkl::transpose::conjtrans, onemkl::diag::nonunit, 30, - 2, 42)); - EXPECT_TRUE(test>(GetParam(), onemkl::uplo::upper, - onemkl::transpose::conjtrans, onemkl::diag::nonunit, 30, - 2, 42)); + EXPECT_TRUEORSKIP(test>(GetParam(), onemkl::uplo::lower, + onemkl::transpose::nontrans, onemkl::diag::unit, 30, + 2, 42)); + EXPECT_TRUEORSKIP(test>(GetParam(), onemkl::uplo::upper, + onemkl::transpose::nontrans, onemkl::diag::unit, 30, + 2, 42)); + EXPECT_TRUEORSKIP(test>( + GetParam(), onemkl::uplo::lower, 
onemkl::transpose::trans, onemkl::diag::unit, 30, 2, 42)); + EXPECT_TRUEORSKIP(test>( + GetParam(), onemkl::uplo::upper, onemkl::transpose::trans, onemkl::diag::unit, 30, 2, 42)); + EXPECT_TRUEORSKIP(test>(GetParam(), onemkl::uplo::lower, + onemkl::transpose::conjtrans, onemkl::diag::unit, + 30, 2, 42)); + EXPECT_TRUEORSKIP(test>(GetParam(), onemkl::uplo::upper, + onemkl::transpose::conjtrans, onemkl::diag::unit, + 30, 2, 42)); + EXPECT_TRUEORSKIP(test>(GetParam(), onemkl::uplo::lower, + onemkl::transpose::nontrans, onemkl::diag::nonunit, + 30, 2, 42)); + EXPECT_TRUEORSKIP(test>(GetParam(), onemkl::uplo::upper, + onemkl::transpose::nontrans, onemkl::diag::nonunit, + 30, 2, 42)); + EXPECT_TRUEORSKIP(test>(GetParam(), onemkl::uplo::lower, + onemkl::transpose::trans, onemkl::diag::nonunit, 30, + 2, 42)); + EXPECT_TRUEORSKIP(test>(GetParam(), onemkl::uplo::upper, + onemkl::transpose::trans, onemkl::diag::nonunit, 30, + 2, 42)); + EXPECT_TRUEORSKIP(test>(GetParam(), onemkl::uplo::lower, + onemkl::transpose::conjtrans, onemkl::diag::nonunit, + 30, 2, 42)); + EXPECT_TRUEORSKIP(test>(GetParam(), onemkl::uplo::upper, + onemkl::transpose::conjtrans, onemkl::diag::nonunit, + 30, 2, 42)); } TEST_P(TrsvTests, ComplexDoublePrecision) { - EXPECT_TRUE(test>(GetParam(), onemkl::uplo::lower, - onemkl::transpose::nontrans, onemkl::diag::unit, 30, 2, - 42)); - EXPECT_TRUE(test>(GetParam(), onemkl::uplo::upper, - onemkl::transpose::nontrans, onemkl::diag::unit, 30, 2, - 42)); - EXPECT_TRUE(test>( + EXPECT_TRUEORSKIP(test>(GetParam(), onemkl::uplo::lower, + onemkl::transpose::nontrans, onemkl::diag::unit, + 30, 2, 42)); + EXPECT_TRUEORSKIP(test>(GetParam(), onemkl::uplo::upper, + onemkl::transpose::nontrans, onemkl::diag::unit, + 30, 2, 42)); + EXPECT_TRUEORSKIP(test>( GetParam(), onemkl::uplo::lower, onemkl::transpose::trans, onemkl::diag::unit, 30, 2, 42)); - EXPECT_TRUE(test>( + EXPECT_TRUEORSKIP(test>( GetParam(), onemkl::uplo::upper, onemkl::transpose::trans, onemkl::diag::unit, 
30, 2, 42)); - EXPECT_TRUE(test>(GetParam(), onemkl::uplo::lower, - onemkl::transpose::conjtrans, onemkl::diag::unit, 30, 2, - 42)); - EXPECT_TRUE(test>(GetParam(), onemkl::uplo::upper, - onemkl::transpose::conjtrans, onemkl::diag::unit, 30, 2, - 42)); - EXPECT_TRUE(test>(GetParam(), onemkl::uplo::lower, - onemkl::transpose::nontrans, onemkl::diag::nonunit, 30, - 2, 42)); - EXPECT_TRUE(test>(GetParam(), onemkl::uplo::upper, - onemkl::transpose::nontrans, onemkl::diag::nonunit, 30, - 2, 42)); - EXPECT_TRUE(test>(GetParam(), onemkl::uplo::lower, - onemkl::transpose::trans, onemkl::diag::nonunit, 30, 2, - 42)); - EXPECT_TRUE(test>(GetParam(), onemkl::uplo::upper, - onemkl::transpose::trans, onemkl::diag::nonunit, 30, 2, - 42)); - EXPECT_TRUE(test>(GetParam(), onemkl::uplo::lower, - onemkl::transpose::conjtrans, onemkl::diag::nonunit, 30, - 2, 42)); - EXPECT_TRUE(test>(GetParam(), onemkl::uplo::upper, - onemkl::transpose::conjtrans, onemkl::diag::nonunit, 30, - 2, 42)); + EXPECT_TRUEORSKIP(test>(GetParam(), onemkl::uplo::lower, + onemkl::transpose::conjtrans, onemkl::diag::unit, + 30, 2, 42)); + EXPECT_TRUEORSKIP(test>(GetParam(), onemkl::uplo::upper, + onemkl::transpose::conjtrans, onemkl::diag::unit, + 30, 2, 42)); + EXPECT_TRUEORSKIP(test>(GetParam(), onemkl::uplo::lower, + onemkl::transpose::nontrans, onemkl::diag::nonunit, + 30, 2, 42)); + EXPECT_TRUEORSKIP(test>(GetParam(), onemkl::uplo::upper, + onemkl::transpose::nontrans, onemkl::diag::nonunit, + 30, 2, 42)); + EXPECT_TRUEORSKIP(test>(GetParam(), onemkl::uplo::lower, + onemkl::transpose::trans, onemkl::diag::nonunit, + 30, 2, 42)); + EXPECT_TRUEORSKIP(test>(GetParam(), onemkl::uplo::upper, + onemkl::transpose::trans, onemkl::diag::nonunit, + 30, 2, 42)); + EXPECT_TRUEORSKIP(test>(GetParam(), onemkl::uplo::lower, + onemkl::transpose::conjtrans, + onemkl::diag::nonunit, 30, 2, 42)); + EXPECT_TRUEORSKIP(test>(GetParam(), onemkl::uplo::upper, + onemkl::transpose::conjtrans, + onemkl::diag::nonunit, 30, 2, 42)); } 
INSTANTIATE_TEST_SUITE_P(TrsvTestSuite, TrsvTests, ::testing::ValuesIn(devices), diff --git a/tests/unit_tests/blas/level2/trsv_usm.cpp b/tests/unit_tests/blas/level2/trsv_usm.cpp new file mode 100644 index 000000000..1c76f541d --- /dev/null +++ b/tests/unit_tests/blas/level2/trsv_usm.cpp @@ -0,0 +1,232 @@ +/******************************************************************************* +* Copyright 2020 Intel Corporation +* +* Licensed under the Apache License, Version 2.0 (the "License"); +* you may not use this file except in compliance with the License. +* You may obtain a copy of the License at +* +* http://www.apache.org/licenses/LICENSE-2.0 +* +* Unless required by applicable law or agreed to in writing, +* software distributed under the License is distributed on an "AS IS" BASIS, +* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +* See the License for the specific language governing permissions +* and limitations under the License. +* +* +* SPDX-License-Identifier: Apache-2.0 +*******************************************************************************/ + +#include +#include +#include +#include +#include +#include + +#include +#include "cblas.h" +#include "onemkl/detail/config.hpp" +#include "onemkl/onemkl.hpp" +#include "onemkl_blas_helper.hpp" +#include "reference_blas_templates.hpp" +#include "test_common.hpp" +#include "test_helper.hpp" + +#include + +using namespace cl::sycl; +using std::vector; + +extern std::vector devices; + +namespace { + +template +int test(const device& dev, onemkl::uplo upper_lower, onemkl::transpose transa, + onemkl::diag unit_nonunit, int n, int incx, int lda) { + // Catch asynchronous exceptions. 
+ auto exception_handler = [](exception_list exceptions) { + for (std::exception_ptr const& e : exceptions) { + try { + std::rethrow_exception(e); + } + catch (exception const& e) { + std::cout << "Caught asynchronous SYCL exception during TRSV:\n" + << e.what() << std::endl + << "OpenCL status: " << e.get_cl_code() << std::endl; + } + } + }; + + queue main_queue(dev, exception_handler); + context cxt = main_queue.get_context(); + event done; + std::vector dependencies; + + // Prepare data. + auto ua = usm_allocator(cxt, dev); + vector x(ua), A(ua); + rand_vector(x, n, incx); + rand_trsm_matrix(A, transa, n, n, lda); + + auto x_ref = x; + + // Call Reference TRSV. + const int n_ref = n, incx_ref = incx, lda_ref = lda; + using fp_ref = typename ref_type_info::type; + + ::trsv(convert_to_cblas_uplo(upper_lower), convert_to_cblas_trans(transa), + convert_to_cblas_diag(unit_nonunit), &n_ref, (fp_ref*)A.data(), &lda_ref, + (fp_ref*)x_ref.data(), &incx_ref); + + // Call DPC++ TRSV. + + try { +#ifdef CALL_RT_API + done = onemkl::blas::trsv(main_queue, upper_lower, transa, unit_nonunit, n, A.data(), lda, + x.data(), incx, dependencies); + done.wait(); +#else + TEST_RUN_CT(main_queue, onemkl::blas::trsv, + (main_queue, upper_lower, transa, unit_nonunit, n, A.data(), lda, x.data(), + incx, dependencies)); + main_queue.wait(); +#endif + } + catch (exception const& e) { + std::cout << "Caught synchronous SYCL exception during TRSV:\n" + << e.what() << std::endl + << "OpenCL status: " << e.get_cl_code() << std::endl; + } + + catch (const onemkl::backend_unsupported_exception& e) { + return test_skipped; + } + + catch (const std::runtime_error& error) { + std::cout << "Error raised during execution of TRSV:\n" << error.what() << std::endl; + } + + // Compare the results of reference implementation and DPC++ implementation. 
+ + bool good = check_equal_trsv_vector(x, x_ref, n, incx, n, std::cout); + + return (int)good; +} + +class TrsvUsmTests : public ::testing::TestWithParam {}; + +TEST_P(TrsvUsmTests, RealSinglePrecision) { + EXPECT_TRUEORSKIP(test(GetParam(), onemkl::uplo::lower, onemkl::transpose::nontrans, + onemkl::diag::unit, 30, 2, 42)); + EXPECT_TRUEORSKIP(test(GetParam(), onemkl::uplo::upper, onemkl::transpose::nontrans, + onemkl::diag::unit, 30, 2, 42)); + EXPECT_TRUEORSKIP(test(GetParam(), onemkl::uplo::lower, onemkl::transpose::trans, + onemkl::diag::unit, 30, 2, 42)); + EXPECT_TRUEORSKIP(test(GetParam(), onemkl::uplo::upper, onemkl::transpose::trans, + onemkl::diag::unit, 30, 2, 42)); + EXPECT_TRUEORSKIP(test(GetParam(), onemkl::uplo::lower, onemkl::transpose::nontrans, + onemkl::diag::nonunit, 30, 2, 42)); + EXPECT_TRUEORSKIP(test(GetParam(), onemkl::uplo::upper, onemkl::transpose::nontrans, + onemkl::diag::nonunit, 30, 2, 42)); + EXPECT_TRUEORSKIP(test(GetParam(), onemkl::uplo::lower, onemkl::transpose::trans, + onemkl::diag::nonunit, 30, 2, 42)); + EXPECT_TRUEORSKIP(test(GetParam(), onemkl::uplo::upper, onemkl::transpose::trans, + onemkl::diag::nonunit, 30, 2, 42)); +} +TEST_P(TrsvUsmTests, RealDoublePrecision) { + EXPECT_TRUEORSKIP(test(GetParam(), onemkl::uplo::lower, onemkl::transpose::nontrans, + onemkl::diag::unit, 30, 2, 42)); + EXPECT_TRUEORSKIP(test(GetParam(), onemkl::uplo::upper, onemkl::transpose::nontrans, + onemkl::diag::unit, 30, 2, 42)); + EXPECT_TRUEORSKIP(test(GetParam(), onemkl::uplo::lower, onemkl::transpose::trans, + onemkl::diag::unit, 30, 2, 42)); + EXPECT_TRUEORSKIP(test(GetParam(), onemkl::uplo::upper, onemkl::transpose::trans, + onemkl::diag::unit, 30, 2, 42)); + EXPECT_TRUEORSKIP(test(GetParam(), onemkl::uplo::lower, onemkl::transpose::nontrans, + onemkl::diag::nonunit, 30, 2, 42)); + EXPECT_TRUEORSKIP(test(GetParam(), onemkl::uplo::upper, onemkl::transpose::nontrans, + onemkl::diag::nonunit, 30, 2, 42)); + EXPECT_TRUEORSKIP(test(GetParam(), 
onemkl::uplo::lower, onemkl::transpose::trans, + onemkl::diag::nonunit, 30, 2, 42)); + EXPECT_TRUEORSKIP(test(GetParam(), onemkl::uplo::upper, onemkl::transpose::trans, + onemkl::diag::nonunit, 30, 2, 42)); +} +TEST_P(TrsvUsmTests, ComplexSinglePrecision) { + EXPECT_TRUEORSKIP(test>(GetParam(), onemkl::uplo::lower, + onemkl::transpose::nontrans, onemkl::diag::unit, 30, + 2, 42)); + EXPECT_TRUEORSKIP(test>(GetParam(), onemkl::uplo::upper, + onemkl::transpose::nontrans, onemkl::diag::unit, 30, + 2, 42)); + EXPECT_TRUEORSKIP(test>( + GetParam(), onemkl::uplo::lower, onemkl::transpose::trans, onemkl::diag::unit, 30, 2, 42)); + EXPECT_TRUEORSKIP(test>( + GetParam(), onemkl::uplo::upper, onemkl::transpose::trans, onemkl::diag::unit, 30, 2, 42)); + EXPECT_TRUEORSKIP(test>(GetParam(), onemkl::uplo::lower, + onemkl::transpose::conjtrans, onemkl::diag::unit, + 30, 2, 42)); + EXPECT_TRUEORSKIP(test>(GetParam(), onemkl::uplo::upper, + onemkl::transpose::conjtrans, onemkl::diag::unit, + 30, 2, 42)); + EXPECT_TRUEORSKIP(test>(GetParam(), onemkl::uplo::lower, + onemkl::transpose::nontrans, onemkl::diag::nonunit, + 30, 2, 42)); + EXPECT_TRUEORSKIP(test>(GetParam(), onemkl::uplo::upper, + onemkl::transpose::nontrans, onemkl::diag::nonunit, + 30, 2, 42)); + EXPECT_TRUEORSKIP(test>(GetParam(), onemkl::uplo::lower, + onemkl::transpose::trans, onemkl::diag::nonunit, 30, + 2, 42)); + EXPECT_TRUEORSKIP(test>(GetParam(), onemkl::uplo::upper, + onemkl::transpose::trans, onemkl::diag::nonunit, 30, + 2, 42)); + EXPECT_TRUEORSKIP(test>(GetParam(), onemkl::uplo::lower, + onemkl::transpose::conjtrans, onemkl::diag::nonunit, + 30, 2, 42)); + EXPECT_TRUEORSKIP(test>(GetParam(), onemkl::uplo::upper, + onemkl::transpose::conjtrans, onemkl::diag::nonunit, + 30, 2, 42)); +} +TEST_P(TrsvUsmTests, ComplexDoublePrecision) { + EXPECT_TRUEORSKIP(test>(GetParam(), onemkl::uplo::lower, + onemkl::transpose::nontrans, onemkl::diag::unit, + 30, 2, 42)); + EXPECT_TRUEORSKIP(test>(GetParam(), 
onemkl::uplo::upper, + onemkl::transpose::nontrans, onemkl::diag::unit, + 30, 2, 42)); + EXPECT_TRUEORSKIP(test>( + GetParam(), onemkl::uplo::lower, onemkl::transpose::trans, onemkl::diag::unit, 30, 2, 42)); + EXPECT_TRUEORSKIP(test>( + GetParam(), onemkl::uplo::upper, onemkl::transpose::trans, onemkl::diag::unit, 30, 2, 42)); + EXPECT_TRUEORSKIP(test>(GetParam(), onemkl::uplo::lower, + onemkl::transpose::conjtrans, onemkl::diag::unit, + 30, 2, 42)); + EXPECT_TRUEORSKIP(test>(GetParam(), onemkl::uplo::upper, + onemkl::transpose::conjtrans, onemkl::diag::unit, + 30, 2, 42)); + EXPECT_TRUEORSKIP(test>(GetParam(), onemkl::uplo::lower, + onemkl::transpose::nontrans, onemkl::diag::nonunit, + 30, 2, 42)); + EXPECT_TRUEORSKIP(test>(GetParam(), onemkl::uplo::upper, + onemkl::transpose::nontrans, onemkl::diag::nonunit, + 30, 2, 42)); + EXPECT_TRUEORSKIP(test>(GetParam(), onemkl::uplo::lower, + onemkl::transpose::trans, onemkl::diag::nonunit, + 30, 2, 42)); + EXPECT_TRUEORSKIP(test>(GetParam(), onemkl::uplo::upper, + onemkl::transpose::trans, onemkl::diag::nonunit, + 30, 2, 42)); + EXPECT_TRUEORSKIP(test>(GetParam(), onemkl::uplo::lower, + onemkl::transpose::conjtrans, + onemkl::diag::nonunit, 30, 2, 42)); + EXPECT_TRUEORSKIP(test>(GetParam(), onemkl::uplo::upper, + onemkl::transpose::conjtrans, + onemkl::diag::nonunit, 30, 2, 42)); +} + +INSTANTIATE_TEST_SUITE_P(TrsvUsmTestSuite, TrsvUsmTests, ::testing::ValuesIn(devices), + ::DeviceNamePrint()); + +} // anonymous namespace diff --git a/tests/unit_tests/blas/level3/CMakeLists.txt b/tests/unit_tests/blas/level3/CMakeLists.txt index 6845dd95f..a00b64328 100644 --- a/tests/unit_tests/blas/level3/CMakeLists.txt +++ b/tests/unit_tests/blas/level3/CMakeLists.txt @@ -18,7 +18,7 @@ #=============================================================================== # Build object from all test sources -set(L3_SOURCES "gemm.cpp" "symm.cpp" "syrk.cpp" "hemm.cpp" "herk.cpp" "syr2k.cpp" "her2k.cpp" "trmm.cpp" "trsm.cpp") +set(L3_SOURCES 
"gemm.cpp" "symm.cpp" "syrk.cpp" "hemm.cpp" "herk.cpp" "syr2k.cpp" "her2k.cpp" "trmm.cpp" "trsm.cpp" "gemm_usm.cpp" "symm_usm.cpp" "syrk_usm.cpp" "hemm_usm.cpp" "herk_usm.cpp" "syr2k_usm.cpp" "her2k_usm.cpp" "trmm_usm.cpp" "trsm_usm.cpp") if(BUILD_SHARED_LIBS) add_library(blas_level3_rt OBJECT ${L3_SOURCES}) diff --git a/tests/unit_tests/blas/level3/gemm.cpp b/tests/unit_tests/blas/level3/gemm.cpp index 0693b33f5..51124cf4c 100644 --- a/tests/unit_tests/blas/level3/gemm.cpp +++ b/tests/unit_tests/blas/level3/gemm.cpp @@ -44,8 +44,8 @@ extern std::vector devices; namespace { template -bool test(const device& dev, onemkl::transpose transa, onemkl::transpose transb, int m, int n, - int k, int lda, int ldb, int ldc, fp alpha, fp beta) { +int test(const device& dev, onemkl::transpose transa, onemkl::transpose transb, int m, int n, int k, + int lda, int ldb, int ldc, fp alpha, fp beta) { // Prepare data. vector> A, B, C, C_ref; rand_matrix(A, transa, m, k, lda); @@ -101,11 +101,19 @@ bool test(const device& dev, onemkl::transpose transa, onemkl::transpose transb, << "OpenCL status: " << e.get_cl_code() << std::endl; } + catch (const onemkl::backend_unsupported_exception& e) { + return test_skipped; + } + + catch (const std::runtime_error& error) { + std::cout << "Error raised during execution of GEMM:\n" << error.what() << std::endl; + } + // Compare the results of reference implementation and DPC++ implementation. 
auto C_accessor = C_buffer.template get_access(); bool good = check_equal_matrix(C_accessor, C_ref, m, n, ldc, 10 * k, std::cout); - return good; + return (int)good; } class GemmTests : public ::testing::TestWithParam {}; @@ -113,91 +121,95 @@ class GemmTests : public ::testing::TestWithParam {}; TEST_P(GemmTests, RealSinglePrecision) { float alpha(2.0); float beta(3.0); - EXPECT_TRUE(test(GetParam(), onemkl::transpose::nontrans, onemkl::transpose::nontrans, - 79, 83, 91, 103, 105, 106, alpha, beta)); - EXPECT_TRUE(test(GetParam(), onemkl::transpose::nontrans, onemkl::transpose::trans, 79, - 83, 91, 103, 105, 106, alpha, beta)); - EXPECT_TRUE(test(GetParam(), onemkl::transpose::trans, onemkl::transpose::nontrans, 79, - 83, 91, 103, 105, 106, alpha, beta)); - EXPECT_TRUE(test(GetParam(), onemkl::transpose::trans, onemkl::transpose::trans, 79, 83, - 91, 103, 105, 106, alpha, beta)); + EXPECT_TRUEORSKIP(test(GetParam(), onemkl::transpose::nontrans, + onemkl::transpose::nontrans, 79, 83, 91, 103, 105, 106, alpha, + beta)); + EXPECT_TRUEORSKIP(test(GetParam(), onemkl::transpose::nontrans, onemkl::transpose::trans, + 79, 83, 91, 103, 105, 106, alpha, beta)); + EXPECT_TRUEORSKIP(test(GetParam(), onemkl::transpose::trans, onemkl::transpose::nontrans, + 79, 83, 91, 103, 105, 106, alpha, beta)); + EXPECT_TRUEORSKIP(test(GetParam(), onemkl::transpose::trans, onemkl::transpose::trans, + 79, 83, 91, 103, 105, 106, alpha, beta)); } TEST_P(GemmTests, RealDoublePrecision) { double alpha(2.0); double beta(3.0); - EXPECT_TRUE(test(GetParam(), onemkl::transpose::nontrans, onemkl::transpose::nontrans, - 79, 83, 91, 103, 105, 106, alpha, beta)); - EXPECT_TRUE(test(GetParam(), onemkl::transpose::nontrans, onemkl::transpose::trans, 79, - 83, 91, 103, 105, 106, alpha, beta)); - EXPECT_TRUE(test(GetParam(), onemkl::transpose::trans, onemkl::transpose::nontrans, 79, - 83, 91, 103, 105, 106, alpha, beta)); - EXPECT_TRUE(test(GetParam(), onemkl::transpose::trans, onemkl::transpose::trans, 79, 
83, - 91, 103, 105, 106, alpha, beta)); + EXPECT_TRUEORSKIP(test(GetParam(), onemkl::transpose::nontrans, + onemkl::transpose::nontrans, 79, 83, 91, 103, 105, 106, alpha, + beta)); + EXPECT_TRUEORSKIP(test(GetParam(), onemkl::transpose::nontrans, + onemkl::transpose::trans, 79, 83, 91, 103, 105, 106, alpha, + beta)); + EXPECT_TRUEORSKIP(test(GetParam(), onemkl::transpose::trans, + onemkl::transpose::nontrans, 79, 83, 91, 103, 105, 106, alpha, + beta)); + EXPECT_TRUEORSKIP(test(GetParam(), onemkl::transpose::trans, onemkl::transpose::trans, + 79, 83, 91, 103, 105, 106, alpha, beta)); } TEST_P(GemmTests, ComplexSinglePrecision) { std::complex alpha(2.0, -0.5); std::complex beta(3.0, -1.5); - EXPECT_TRUE(test>(GetParam(), onemkl::transpose::nontrans, - onemkl::transpose::nontrans, 79, 83, 91, 103, 105, 106, - alpha, beta)); - EXPECT_TRUE(test>(GetParam(), onemkl::transpose::nontrans, - onemkl::transpose::trans, 79, 83, 91, 103, 105, 106, - alpha, beta)); - EXPECT_TRUE(test>(GetParam(), onemkl::transpose::trans, - onemkl::transpose::nontrans, 79, 83, 91, 103, 105, 106, - alpha, beta)); - EXPECT_TRUE(test>(GetParam(), onemkl::transpose::trans, - onemkl::transpose::trans, 79, 83, 91, 103, 105, 106, - alpha, beta)); - EXPECT_TRUE(test>(GetParam(), onemkl::transpose::nontrans, - onemkl::transpose::conjtrans, 79, 83, 91, 103, 105, 106, - alpha, beta)); - EXPECT_TRUE(test>(GetParam(), onemkl::transpose::trans, - onemkl::transpose::conjtrans, 79, 83, 91, 103, 105, 106, - alpha, beta)); - EXPECT_TRUE(test>(GetParam(), onemkl::transpose::conjtrans, - onemkl::transpose::nontrans, 79, 83, 91, 103, 105, 106, - alpha, beta)); - EXPECT_TRUE(test>(GetParam(), onemkl::transpose::conjtrans, - onemkl::transpose::trans, 79, 83, 91, 103, 105, 106, - alpha, beta)); - EXPECT_TRUE(test>(GetParam(), onemkl::transpose::conjtrans, - onemkl::transpose::conjtrans, 79, 83, 91, 103, 105, 106, - alpha, beta)); + EXPECT_TRUEORSKIP(test>(GetParam(), onemkl::transpose::nontrans, + 
onemkl::transpose::nontrans, 79, 83, 91, 103, 105, + 106, alpha, beta)); + EXPECT_TRUEORSKIP(test>(GetParam(), onemkl::transpose::nontrans, + onemkl::transpose::trans, 79, 83, 91, 103, 105, 106, + alpha, beta)); + EXPECT_TRUEORSKIP(test>(GetParam(), onemkl::transpose::trans, + onemkl::transpose::nontrans, 79, 83, 91, 103, 105, + 106, alpha, beta)); + EXPECT_TRUEORSKIP(test>(GetParam(), onemkl::transpose::trans, + onemkl::transpose::trans, 79, 83, 91, 103, 105, 106, + alpha, beta)); + EXPECT_TRUEORSKIP(test>(GetParam(), onemkl::transpose::nontrans, + onemkl::transpose::conjtrans, 79, 83, 91, 103, 105, + 106, alpha, beta)); + EXPECT_TRUEORSKIP(test>(GetParam(), onemkl::transpose::trans, + onemkl::transpose::conjtrans, 79, 83, 91, 103, 105, + 106, alpha, beta)); + EXPECT_TRUEORSKIP(test>(GetParam(), onemkl::transpose::conjtrans, + onemkl::transpose::nontrans, 79, 83, 91, 103, 105, + 106, alpha, beta)); + EXPECT_TRUEORSKIP(test>(GetParam(), onemkl::transpose::conjtrans, + onemkl::transpose::trans, 79, 83, 91, 103, 105, 106, + alpha, beta)); + EXPECT_TRUEORSKIP(test>(GetParam(), onemkl::transpose::conjtrans, + onemkl::transpose::conjtrans, 79, 83, 91, 103, 105, + 106, alpha, beta)); } TEST_P(GemmTests, ComplexDoublePrecision) { std::complex alpha(2.0, -0.5); std::complex beta(3.0, -1.5); - EXPECT_TRUE(test>(GetParam(), onemkl::transpose::nontrans, - onemkl::transpose::nontrans, 79, 83, 91, 103, 105, 106, - alpha, beta)); - EXPECT_TRUE(test>(GetParam(), onemkl::transpose::nontrans, - onemkl::transpose::trans, 79, 83, 91, 103, 105, 106, - alpha, beta)); - EXPECT_TRUE(test>(GetParam(), onemkl::transpose::trans, - onemkl::transpose::nontrans, 79, 83, 91, 103, 105, 106, - alpha, beta)); - EXPECT_TRUE(test>(GetParam(), onemkl::transpose::trans, - onemkl::transpose::trans, 79, 83, 91, 103, 105, 106, - alpha, beta)); - EXPECT_TRUE(test>(GetParam(), onemkl::transpose::nontrans, - onemkl::transpose::conjtrans, 79, 83, 91, 103, 105, 106, - alpha, beta)); - 
EXPECT_TRUE(test>(GetParam(), onemkl::transpose::trans, - onemkl::transpose::conjtrans, 79, 83, 91, 103, 105, 106, - alpha, beta)); - EXPECT_TRUE(test>(GetParam(), onemkl::transpose::conjtrans, - onemkl::transpose::nontrans, 79, 83, 91, 103, 105, 106, - alpha, beta)); - EXPECT_TRUE(test>(GetParam(), onemkl::transpose::conjtrans, - onemkl::transpose::trans, 79, 83, 91, 103, 105, 106, - alpha, beta)); - EXPECT_TRUE(test>(GetParam(), onemkl::transpose::conjtrans, - onemkl::transpose::conjtrans, 79, 83, 91, 103, 105, 106, - alpha, beta)); + EXPECT_TRUEORSKIP(test>(GetParam(), onemkl::transpose::nontrans, + onemkl::transpose::nontrans, 79, 83, 91, 103, 105, + 106, alpha, beta)); + EXPECT_TRUEORSKIP(test>(GetParam(), onemkl::transpose::nontrans, + onemkl::transpose::trans, 79, 83, 91, 103, 105, + 106, alpha, beta)); + EXPECT_TRUEORSKIP(test>(GetParam(), onemkl::transpose::trans, + onemkl::transpose::nontrans, 79, 83, 91, 103, 105, + 106, alpha, beta)); + EXPECT_TRUEORSKIP(test>(GetParam(), onemkl::transpose::trans, + onemkl::transpose::trans, 79, 83, 91, 103, 105, + 106, alpha, beta)); + EXPECT_TRUEORSKIP(test>(GetParam(), onemkl::transpose::nontrans, + onemkl::transpose::conjtrans, 79, 83, 91, 103, 105, + 106, alpha, beta)); + EXPECT_TRUEORSKIP(test>(GetParam(), onemkl::transpose::trans, + onemkl::transpose::conjtrans, 79, 83, 91, 103, 105, + 106, alpha, beta)); + EXPECT_TRUEORSKIP(test>(GetParam(), onemkl::transpose::conjtrans, + onemkl::transpose::nontrans, 79, 83, 91, 103, 105, + 106, alpha, beta)); + EXPECT_TRUEORSKIP(test>(GetParam(), onemkl::transpose::conjtrans, + onemkl::transpose::trans, 79, 83, 91, 103, 105, + 106, alpha, beta)); + EXPECT_TRUEORSKIP(test>(GetParam(), onemkl::transpose::conjtrans, + onemkl::transpose::conjtrans, 79, 83, 91, 103, 105, + 106, alpha, beta)); } INSTANTIATE_TEST_SUITE_P(GemmTestSuite, GemmTests, ::testing::ValuesIn(devices), diff --git a/tests/unit_tests/blas/level3/gemm_usm.cpp b/tests/unit_tests/blas/level3/gemm_usm.cpp new file 
mode 100644 index 000000000..6753fe1f3 --- /dev/null +++ b/tests/unit_tests/blas/level3/gemm_usm.cpp @@ -0,0 +1,220 @@ +/******************************************************************************* +* Copyright 2020 Intel Corporation +* +* Licensed under the Apache License, Version 2.0 (the "License"); +* you may not use this file except in compliance with the License. +* You may obtain a copy of the License at +* +* http://www.apache.org/licenses/LICENSE-2.0 +* +* Unless required by applicable law or agreed to in writing, +* software distributed under the License is distributed on an "AS IS" BASIS, +* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +* See the License for the specific language governing permissions +* and limitations under the License. +* +* +* SPDX-License-Identifier: Apache-2.0 +*******************************************************************************/ + +#include +#include +#include +#include +#include +#include + +#include +#include "cblas.h" +#include "onemkl/detail/config.hpp" +#include "onemkl/onemkl.hpp" +#include "onemkl_blas_helper.hpp" +#include "reference_blas_templates.hpp" +#include "test_common.hpp" +#include "test_helper.hpp" + +#include + +using namespace cl::sycl; +using std::vector; + +extern std::vector devices; + +namespace { + +template +int test(const device& dev, onemkl::transpose transa, onemkl::transpose transb, int m, int n, int k, + int lda, int ldb, int ldc, fp alpha, fp beta) { + // Catch asynchronous exceptions. 
+ auto exception_handler = [](exception_list exceptions) { + for (std::exception_ptr const& e : exceptions) { + try { + std::rethrow_exception(e); + } + catch (exception const& e) { + std::cout << "Caught asynchronous SYCL exception during GEMM:\n" + << e.what() << std::endl + << "OpenCL status: " << e.get_cl_code() << std::endl; + } + } + }; + + queue main_queue(dev, exception_handler); + context cxt = main_queue.get_context(); + event done; + std::vector dependencies; + + // Prepare data. + auto ua = usm_allocator(cxt, dev); + vector A(ua), B(ua), C(ua); + rand_matrix(A, transa, m, k, lda); + rand_matrix(B, transb, k, n, ldb); + rand_matrix(C, onemkl::transpose::nontrans, m, n, ldc); + + auto C_ref = C; + + // Call Reference GEMM. + const int m_ref = m, n_ref = n, k_ref = k; + const int lda_ref = lda, ldb_ref = ldb, ldc_ref = ldc; + + using fp_ref = typename ref_type_info::type; + + ::gemm(convert_to_cblas_trans(transa), convert_to_cblas_trans(transb), &m_ref, &n_ref, &k_ref, + (fp_ref*)&alpha, (fp_ref*)A.data(), &lda_ref, (fp_ref*)B.data(), &ldb_ref, + (fp_ref*)&beta, (fp_ref*)C_ref.data(), &ldc_ref); + + // Call DPC++ GEMM. 
+ + try { +#ifdef CALL_RT_API + done = onemkl::blas::gemm(main_queue, transa, transb, m, n, k, alpha, A.data(), lda, + B.data(), ldb, beta, C.data(), ldc, dependencies); + done.wait(); +#else + TEST_RUN_CT(main_queue, onemkl::blas::gemm, + (main_queue, transa, transb, m, n, k, alpha, A.data(), lda, B.data(), ldb, beta, + C.data(), ldc, dependencies)); + main_queue.wait(); +#endif + } + catch (exception const& e) { + std::cout << "Caught synchronous SYCL exception during GEMM:\n" + << e.what() << std::endl + << "OpenCL status: " << e.get_cl_code() << std::endl; + } + + catch (const onemkl::backend_unsupported_exception& e) { + return test_skipped; + } + + catch (const std::runtime_error& error) { + std::cout << "Error raised during execution of GEMM:\n" << error.what() << std::endl; + } + + // Compare the results of reference implementation and DPC++ implementation. + + bool good = check_equal_matrix(C, C_ref, m, n, ldc, 10 * k, std::cout); + + return (int)good; +} + +class GemmUsmTests : public ::testing::TestWithParam {}; + +TEST_P(GemmUsmTests, RealSinglePrecision) { + float alpha(2.0); + float beta(3.0); + EXPECT_TRUEORSKIP(test(GetParam(), onemkl::transpose::nontrans, + onemkl::transpose::nontrans, 79, 83, 91, 103, 105, 106, alpha, + beta)); + EXPECT_TRUEORSKIP(test(GetParam(), onemkl::transpose::nontrans, onemkl::transpose::trans, + 79, 83, 91, 103, 105, 106, alpha, beta)); + EXPECT_TRUEORSKIP(test(GetParam(), onemkl::transpose::trans, onemkl::transpose::nontrans, + 79, 83, 91, 103, 105, 106, alpha, beta)); + EXPECT_TRUEORSKIP(test(GetParam(), onemkl::transpose::trans, onemkl::transpose::trans, + 79, 83, 91, 103, 105, 106, alpha, beta)); +} + +TEST_P(GemmUsmTests, RealDoublePrecision) { + double alpha(2.0); + double beta(3.0); + EXPECT_TRUEORSKIP(test(GetParam(), onemkl::transpose::nontrans, + onemkl::transpose::nontrans, 79, 83, 91, 103, 105, 106, alpha, + beta)); + EXPECT_TRUEORSKIP(test(GetParam(), onemkl::transpose::nontrans, + onemkl::transpose::trans, 
79, 83, 91, 103, 105, 106, alpha, + beta)); + EXPECT_TRUEORSKIP(test(GetParam(), onemkl::transpose::trans, + onemkl::transpose::nontrans, 79, 83, 91, 103, 105, 106, alpha, + beta)); + EXPECT_TRUEORSKIP(test(GetParam(), onemkl::transpose::trans, onemkl::transpose::trans, + 79, 83, 91, 103, 105, 106, alpha, beta)); +} + +TEST_P(GemmUsmTests, ComplexSinglePrecision) { + std::complex alpha(2.0, -0.5); + std::complex beta(3.0, -1.5); + EXPECT_TRUEORSKIP(test>(GetParam(), onemkl::transpose::nontrans, + onemkl::transpose::nontrans, 79, 83, 91, 103, 105, + 106, alpha, beta)); + EXPECT_TRUEORSKIP(test>(GetParam(), onemkl::transpose::nontrans, + onemkl::transpose::trans, 79, 83, 91, 103, 105, 106, + alpha, beta)); + EXPECT_TRUEORSKIP(test>(GetParam(), onemkl::transpose::trans, + onemkl::transpose::nontrans, 79, 83, 91, 103, 105, + 106, alpha, beta)); + EXPECT_TRUEORSKIP(test>(GetParam(), onemkl::transpose::trans, + onemkl::transpose::trans, 79, 83, 91, 103, 105, 106, + alpha, beta)); + EXPECT_TRUEORSKIP(test>(GetParam(), onemkl::transpose::nontrans, + onemkl::transpose::conjtrans, 79, 83, 91, 103, 105, + 106, alpha, beta)); + EXPECT_TRUEORSKIP(test>(GetParam(), onemkl::transpose::trans, + onemkl::transpose::conjtrans, 79, 83, 91, 103, 105, + 106, alpha, beta)); + EXPECT_TRUEORSKIP(test>(GetParam(), onemkl::transpose::conjtrans, + onemkl::transpose::nontrans, 79, 83, 91, 103, 105, + 106, alpha, beta)); + EXPECT_TRUEORSKIP(test>(GetParam(), onemkl::transpose::conjtrans, + onemkl::transpose::trans, 79, 83, 91, 103, 105, 106, + alpha, beta)); + EXPECT_TRUEORSKIP(test>(GetParam(), onemkl::transpose::conjtrans, + onemkl::transpose::conjtrans, 79, 83, 91, 103, 105, + 106, alpha, beta)); +} + +TEST_P(GemmUsmTests, ComplexDoublePrecision) { + std::complex alpha(2.0, -0.5); + std::complex beta(3.0, -1.5); + EXPECT_TRUEORSKIP(test>(GetParam(), onemkl::transpose::nontrans, + onemkl::transpose::nontrans, 79, 83, 91, 103, 105, + 106, alpha, beta)); + EXPECT_TRUEORSKIP(test>(GetParam(), 
onemkl::transpose::nontrans, + onemkl::transpose::trans, 79, 83, 91, 103, 105, + 106, alpha, beta)); + EXPECT_TRUEORSKIP(test>(GetParam(), onemkl::transpose::trans, + onemkl::transpose::nontrans, 79, 83, 91, 103, 105, + 106, alpha, beta)); + EXPECT_TRUEORSKIP(test>(GetParam(), onemkl::transpose::trans, + onemkl::transpose::trans, 79, 83, 91, 103, 105, + 106, alpha, beta)); + EXPECT_TRUEORSKIP(test>(GetParam(), onemkl::transpose::nontrans, + onemkl::transpose::conjtrans, 79, 83, 91, 103, 105, + 106, alpha, beta)); + EXPECT_TRUEORSKIP(test>(GetParam(), onemkl::transpose::trans, + onemkl::transpose::conjtrans, 79, 83, 91, 103, 105, + 106, alpha, beta)); + EXPECT_TRUEORSKIP(test>(GetParam(), onemkl::transpose::conjtrans, + onemkl::transpose::nontrans, 79, 83, 91, 103, 105, + 106, alpha, beta)); + EXPECT_TRUEORSKIP(test>(GetParam(), onemkl::transpose::conjtrans, + onemkl::transpose::trans, 79, 83, 91, 103, 105, + 106, alpha, beta)); + EXPECT_TRUEORSKIP(test>(GetParam(), onemkl::transpose::conjtrans, + onemkl::transpose::conjtrans, 79, 83, 91, 103, 105, + 106, alpha, beta)); +} + +INSTANTIATE_TEST_SUITE_P(GemmUsmTestSuite, GemmUsmTests, ::testing::ValuesIn(devices), + ::DeviceNamePrint()); + +} // anonymous namespace diff --git a/tests/unit_tests/blas/level3/hemm.cpp b/tests/unit_tests/blas/level3/hemm.cpp index 4359d9758..15b25ee4a 100644 --- a/tests/unit_tests/blas/level3/hemm.cpp +++ b/tests/unit_tests/blas/level3/hemm.cpp @@ -44,8 +44,8 @@ extern std::vector devices; namespace { template -bool test(const device& dev, onemkl::side left_right, onemkl::uplo upper_lower, int m, int n, - int lda, int ldb, int ldc, fp alpha, fp beta) { +int test(const device& dev, onemkl::side left_right, onemkl::uplo upper_lower, int m, int n, + int lda, int ldb, int ldc, fp alpha, fp beta) { // Prepare data. 
vector> A, B, C, C_ref; if (left_right == onemkl::side::left) @@ -104,13 +104,21 @@ bool test(const device& dev, onemkl::side left_right, onemkl::uplo upper_lower, << "OpenCL status: " << e.get_cl_code() << std::endl; } + catch (const onemkl::backend_unsupported_exception& e) { + return test_skipped; + } + + catch (const std::runtime_error& error) { + std::cout << "Error raised during execution of HEMM:\n" << error.what() << std::endl; + } + // Compare the results of reference implementation and DPC++ implementation. bool good; { auto C_accessor = C_buffer.template get_access(); good = check_equal_matrix(C_accessor, C_ref, m, n, ldc, 10 * std::max(m, n), std::cout); } - return good; + return (int)good; } class HemmTests : public ::testing::TestWithParam {}; @@ -118,26 +126,26 @@ class HemmTests : public ::testing::TestWithParam {}; TEST_P(HemmTests, ComplexSinglePrecision) { std::complex alpha(2.0, -0.5); std::complex beta(3.0, -1.5); - EXPECT_TRUE(test>(GetParam(), onemkl::side::left, onemkl::uplo::lower, 72, - 27, 101, 102, 103, alpha, beta)); - EXPECT_TRUE(test>(GetParam(), onemkl::side::left, onemkl::uplo::upper, 72, - 27, 101, 102, 103, alpha, beta)); - EXPECT_TRUE(test>(GetParam(), onemkl::side::right, onemkl::uplo::lower, 72, - 27, 101, 102, 103, alpha, beta)); - EXPECT_TRUE(test>(GetParam(), onemkl::side::right, onemkl::uplo::upper, 72, - 27, 101, 102, 103, alpha, beta)); + EXPECT_TRUEORSKIP(test>(GetParam(), onemkl::side::left, onemkl::uplo::lower, + 72, 27, 101, 102, 103, alpha, beta)); + EXPECT_TRUEORSKIP(test>(GetParam(), onemkl::side::left, onemkl::uplo::upper, + 72, 27, 101, 102, 103, alpha, beta)); + EXPECT_TRUEORSKIP(test>( + GetParam(), onemkl::side::right, onemkl::uplo::lower, 72, 27, 101, 102, 103, alpha, beta)); + EXPECT_TRUEORSKIP(test>( + GetParam(), onemkl::side::right, onemkl::uplo::upper, 72, 27, 101, 102, 103, alpha, beta)); } TEST_P(HemmTests, ComplexDoublePrecision) { std::complex alpha(2.0, -0.5); std::complex beta(3.0, -1.5); - 
EXPECT_TRUE(test>(GetParam(), onemkl::side::left, onemkl::uplo::lower, 72, - 27, 101, 102, 103, alpha, beta)); - EXPECT_TRUE(test>(GetParam(), onemkl::side::left, onemkl::uplo::upper, 72, - 27, 101, 102, 103, alpha, beta)); - EXPECT_TRUE(test>(GetParam(), onemkl::side::right, onemkl::uplo::lower, 72, - 27, 101, 102, 103, alpha, beta)); - EXPECT_TRUE(test>(GetParam(), onemkl::side::right, onemkl::uplo::upper, 72, - 27, 101, 102, 103, alpha, beta)); + EXPECT_TRUEORSKIP(test>( + GetParam(), onemkl::side::left, onemkl::uplo::lower, 72, 27, 101, 102, 103, alpha, beta)); + EXPECT_TRUEORSKIP(test>( + GetParam(), onemkl::side::left, onemkl::uplo::upper, 72, 27, 101, 102, 103, alpha, beta)); + EXPECT_TRUEORSKIP(test>( + GetParam(), onemkl::side::right, onemkl::uplo::lower, 72, 27, 101, 102, 103, alpha, beta)); + EXPECT_TRUEORSKIP(test>( + GetParam(), onemkl::side::right, onemkl::uplo::upper, 72, 27, 101, 102, 103, alpha, beta)); } INSTANTIATE_TEST_SUITE_P(HemmTestSuite, HemmTests, ::testing::ValuesIn(devices), diff --git a/tests/unit_tests/blas/level3/hemm_usm.cpp b/tests/unit_tests/blas/level3/hemm_usm.cpp new file mode 100644 index 000000000..39dece022 --- /dev/null +++ b/tests/unit_tests/blas/level3/hemm_usm.cpp @@ -0,0 +1,154 @@ +/******************************************************************************* +* Copyright 2020 Intel Corporation +* +* Licensed under the Apache License, Version 2.0 (the "License"); +* you may not use this file except in compliance with the License. +* You may obtain a copy of the License at +* +* http://www.apache.org/licenses/LICENSE-2.0 +* +* Unless required by applicable law or agreed to in writing, +* software distributed under the License is distributed on an "AS IS" BASIS, +* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +* See the License for the specific language governing permissions +* and limitations under the License. 
+* +* +* SPDX-License-Identifier: Apache-2.0 +*******************************************************************************/ + +#include +#include +#include +#include +#include +#include + +#include +#include "cblas.h" +#include "onemkl/detail/config.hpp" +#include "onemkl/onemkl.hpp" +#include "onemkl_blas_helper.hpp" +#include "reference_blas_templates.hpp" +#include "test_common.hpp" +#include "test_helper.hpp" + +#include + +using namespace cl::sycl; +using std::vector; + +extern std::vector devices; + +namespace { + +template +int test(const device& dev, onemkl::side left_right, onemkl::uplo upper_lower, int m, int n, + int lda, int ldb, int ldc, fp alpha, fp beta) { + // Catch asynchronous exceptions. + auto exception_handler = [](exception_list exceptions) { + for (std::exception_ptr const& e : exceptions) { + try { + std::rethrow_exception(e); + } + catch (exception const& e) { + std::cout << "Caught asynchronous SYCL exception during HEMM:\n" + << e.what() << std::endl + << "OpenCL status: " << e.get_cl_code() << std::endl; + } + } + }; + + queue main_queue(dev, exception_handler); + context cxt = main_queue.get_context(); + event done; + std::vector dependencies; + + // Prepare data. + auto ua = usm_allocator(cxt, dev); + vector A(ua), B(ua), C(ua); + if (left_right == onemkl::side::left) + rand_matrix(A, onemkl::transpose::nontrans, m, m, lda); + else + rand_matrix(A, onemkl::transpose::nontrans, n, n, lda); + rand_matrix(B, onemkl::transpose::nontrans, m, n, ldb); + rand_matrix(C, onemkl::transpose::nontrans, m, n, ldc); + + auto C_ref = C; + + // Call Reference HEMM. + const int m_ref = m, n_ref = n; + const int lda_ref = lda, ldb_ref = ldb, ldc_ref = ldc; + + using fp_ref = typename ref_type_info::type; + + ::hemm(convert_to_cblas_side(left_right), convert_to_cblas_uplo(upper_lower), &m_ref, &n_ref, + (fp_ref*)&alpha, (fp_ref*)A.data(), &lda_ref, (fp_ref*)B.data(), &ldb_ref, + (fp_ref*)&beta, (fp_ref*)C_ref.data(), &ldc_ref); + + // Call DPC++ HEMM. 
+ + try { +#ifdef CALL_RT_API + done = onemkl::blas::hemm(main_queue, left_right, upper_lower, m, n, alpha, A.data(), lda, + B.data(), ldb, beta, C.data(), ldc, dependencies); + done.wait(); +#else + TEST_RUN_CT(main_queue, onemkl::blas::hemm, + (main_queue, left_right, upper_lower, m, n, alpha, A.data(), lda, B.data(), ldb, + beta, C.data(), ldc, dependencies)); + main_queue.wait(); +#endif + } + catch (exception const& e) { + std::cout << "Caught synchronous SYCL exception during HEMM:\n" + << e.what() << std::endl + << "OpenCL status: " << e.get_cl_code() << std::endl; + } + + catch (const onemkl::backend_unsupported_exception& e) { + return test_skipped; + } + + catch (const std::runtime_error& error) { + std::cout << "Error raised during execution of HEMM:\n" << error.what() << std::endl; + } + + // Compare the results of reference implementation and DPC++ implementation. + + bool good = check_equal_matrix(C, C_ref, m, n, ldc, 10 * std::max(m, n), std::cout); + + return (int)good; +} + +class HemmUsmTests : public ::testing::TestWithParam {}; + +TEST_P(HemmUsmTests, ComplexSinglePrecision) { + std::complex alpha(2.0, -0.5); + std::complex beta(3.0, -1.5); + EXPECT_TRUEORSKIP(test>(GetParam(), onemkl::side::left, onemkl::uplo::lower, + 72, 27, 101, 102, 103, alpha, beta)); + EXPECT_TRUEORSKIP(test>(GetParam(), onemkl::side::left, onemkl::uplo::upper, + 72, 27, 101, 102, 103, alpha, beta)); + EXPECT_TRUEORSKIP(test>( + GetParam(), onemkl::side::right, onemkl::uplo::lower, 72, 27, 101, 102, 103, alpha, beta)); + EXPECT_TRUEORSKIP(test>( + GetParam(), onemkl::side::right, onemkl::uplo::upper, 72, 27, 101, 102, 103, alpha, beta)); +} +TEST_P(HemmUsmTests, ComplexDoublePrecision) { + std::complex alpha(2.0, -0.5); + std::complex beta(3.0, -1.5); + EXPECT_TRUEORSKIP(test>( + GetParam(), onemkl::side::left, onemkl::uplo::lower, 72, 27, 101, 102, 103, alpha, beta)); + EXPECT_TRUEORSKIP(test>( + GetParam(), onemkl::side::left, onemkl::uplo::upper, 72, 27, 101, 102, 103, 
alpha, beta)); + EXPECT_TRUEORSKIP(test>( + GetParam(), onemkl::side::right, onemkl::uplo::lower, 72, 27, 101, 102, 103, alpha, beta)); + EXPECT_TRUEORSKIP(test>( + GetParam(), onemkl::side::right, onemkl::uplo::upper, 72, 27, 101, 102, 103, alpha, beta)); +} + +INSTANTIATE_TEST_SUITE_P(HemmUsmTestSuite, HemmUsmTests, ::testing::ValuesIn(devices), + ::DeviceNamePrint()); + +} // anonymous namespace diff --git a/tests/unit_tests/blas/level3/her2k.cpp b/tests/unit_tests/blas/level3/her2k.cpp index 202ec860e..e7a208a9c 100644 --- a/tests/unit_tests/blas/level3/her2k.cpp +++ b/tests/unit_tests/blas/level3/her2k.cpp @@ -44,8 +44,8 @@ extern std::vector devices; namespace { template -bool test(const device& dev, onemkl::uplo upper_lower, onemkl::transpose trans, int n, int k, - int lda, int ldb, int ldc, fp alpha, fp_scalar beta) { +int test(const device& dev, onemkl::uplo upper_lower, onemkl::transpose trans, int n, int k, + int lda, int ldb, int ldc, fp alpha, fp_scalar beta) { // Prepare data. vector> A, B, C, C_ref; rand_matrix(A, trans, n, k, lda); @@ -102,6 +102,14 @@ bool test(const device& dev, onemkl::uplo upper_lower, onemkl::transpose trans, << "OpenCL status: " << e.get_cl_code() << std::endl; } + catch (const onemkl::backend_unsupported_exception& e) { + return test_skipped; + } + + catch (const std::runtime_error& error) { + std::cout << "Error raised during execution of HER2K:\n" << error.what() << std::endl; + } + // Compare the results of reference implementation and DPC++ implementation. 
bool good; { @@ -109,7 +117,7 @@ bool test(const device& dev, onemkl::uplo upper_lower, onemkl::transpose trans, good = check_equal_matrix(C_accessor, C_ref, n, n, ldc, 10 * std::max(n, k), std::cout); } - return good; + return (int)good; } class Her2kTests : public ::testing::TestWithParam {}; @@ -117,34 +125,34 @@ class Her2kTests : public ::testing::TestWithParam {}; TEST_P(Her2kTests, ComplexSinglePrecision) { std::complex alpha(2.0, -0.5); float beta(1.0); - EXPECT_TRUE((test, float>(GetParam(), onemkl::uplo::lower, - onemkl::transpose::nontrans, 72, 27, 101, 102, - 103, alpha, beta))); - EXPECT_TRUE((test, float>(GetParam(), onemkl::uplo::upper, - onemkl::transpose::nontrans, 72, 27, 101, 102, - 103, alpha, beta))); - EXPECT_TRUE((test, float>(GetParam(), onemkl::uplo::lower, - onemkl::transpose::conjtrans, 72, 27, 101, 102, - 103, alpha, beta))); - EXPECT_TRUE((test, float>(GetParam(), onemkl::uplo::upper, - onemkl::transpose::conjtrans, 72, 27, 101, 102, - 103, alpha, beta))); + EXPECT_TRUEORSKIP((test, float>(GetParam(), onemkl::uplo::lower, + onemkl::transpose::nontrans, 72, 27, 101, + 102, 103, alpha, beta))); + EXPECT_TRUEORSKIP((test, float>(GetParam(), onemkl::uplo::upper, + onemkl::transpose::nontrans, 72, 27, 101, + 102, 103, alpha, beta))); + EXPECT_TRUEORSKIP((test, float>(GetParam(), onemkl::uplo::lower, + onemkl::transpose::conjtrans, 72, 27, 101, + 102, 103, alpha, beta))); + EXPECT_TRUEORSKIP((test, float>(GetParam(), onemkl::uplo::upper, + onemkl::transpose::conjtrans, 72, 27, 101, + 102, 103, alpha, beta))); } TEST_P(Her2kTests, ComplexDoublePrecision) { std::complex alpha(2.0, -0.5); double beta(1.0); - EXPECT_TRUE((test, double>(GetParam(), onemkl::uplo::lower, - onemkl::transpose::nontrans, 72, 27, 101, 102, - 103, alpha, beta))); - EXPECT_TRUE((test, double>(GetParam(), onemkl::uplo::upper, - onemkl::transpose::nontrans, 72, 27, 101, 102, - 103, alpha, beta))); - EXPECT_TRUE((test, double>(GetParam(), onemkl::uplo::lower, - 
onemkl::transpose::conjtrans, 72, 27, 101, 102, - 103, alpha, beta))); - EXPECT_TRUE((test, double>(GetParam(), onemkl::uplo::upper, - onemkl::transpose::conjtrans, 72, 27, 101, 102, - 103, alpha, beta))); + EXPECT_TRUEORSKIP((test, double>(GetParam(), onemkl::uplo::lower, + onemkl::transpose::nontrans, 72, 27, 101, + 102, 103, alpha, beta))); + EXPECT_TRUEORSKIP((test, double>(GetParam(), onemkl::uplo::upper, + onemkl::transpose::nontrans, 72, 27, 101, + 102, 103, alpha, beta))); + EXPECT_TRUEORSKIP((test, double>(GetParam(), onemkl::uplo::lower, + onemkl::transpose::conjtrans, 72, 27, 101, + 102, 103, alpha, beta))); + EXPECT_TRUEORSKIP((test, double>(GetParam(), onemkl::uplo::upper, + onemkl::transpose::conjtrans, 72, 27, 101, + 102, 103, alpha, beta))); } INSTANTIATE_TEST_SUITE_P(Her2kTestSuite, Her2kTests, ::testing::ValuesIn(devices), diff --git a/tests/unit_tests/blas/level3/her2k_usm.cpp b/tests/unit_tests/blas/level3/her2k_usm.cpp new file mode 100644 index 000000000..7f66a777a --- /dev/null +++ b/tests/unit_tests/blas/level3/her2k_usm.cpp @@ -0,0 +1,160 @@ +/******************************************************************************* +* Copyright 2020 Intel Corporation +* +* Licensed under the Apache License, Version 2.0 (the "License"); +* you may not use this file except in compliance with the License. +* You may obtain a copy of the License at +* +* http://www.apache.org/licenses/LICENSE-2.0 +* +* Unless required by applicable law or agreed to in writing, +* software distributed under the License is distributed on an "AS IS" BASIS, +* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +* See the License for the specific language governing permissions +* and limitations under the License. 
+* +* +* SPDX-License-Identifier: Apache-2.0 +*******************************************************************************/ + +#include +#include +#include +#include +#include +#include + +#include +#include "cblas.h" +#include "onemkl/detail/config.hpp" +#include "onemkl/onemkl.hpp" +#include "onemkl_blas_helper.hpp" +#include "reference_blas_templates.hpp" +#include "test_common.hpp" +#include "test_helper.hpp" + +#include + +using namespace cl::sycl; +using std::vector; + +extern std::vector devices; + +namespace { + +template +int test(const device& dev, onemkl::uplo upper_lower, onemkl::transpose trans, int n, int k, + int lda, int ldb, int ldc, fp alpha, fp_scalar beta) { + // Catch asynchronous exceptions. + auto exception_handler = [](exception_list exceptions) { + for (std::exception_ptr const& e : exceptions) { + try { + std::rethrow_exception(e); + } + catch (exception const& e) { + std::cout << "Caught asynchronous SYCL exception during HER2K:\n" + << e.what() << std::endl + << "OpenCL status: " << e.get_cl_code() << std::endl; + } + } + }; + + queue main_queue(dev, exception_handler); + context cxt = main_queue.get_context(); + event done; + std::vector dependencies; + + // Prepare data. + auto ua = usm_allocator(cxt, dev); + vector A(ua), B(ua), C(ua); + rand_matrix(A, trans, n, k, lda); + rand_matrix(B, trans, n, k, ldb); + rand_matrix(C, onemkl::transpose::nontrans, n, n, ldc); + + auto C_ref = C; + + // Call Reference HER2K. + const int n_ref = n, k_ref = k; + const int lda_ref = lda, ldb_ref = ldb, ldc_ref = ldc; + + using fp_ref = typename ref_type_info::type; + using fp_scalar_mkl = typename ref_type_info::type; + + ::her2k(convert_to_cblas_uplo(upper_lower), convert_to_cblas_trans(trans), &n_ref, &k_ref, + (fp_ref*)&alpha, (fp_ref*)A.data(), &lda_ref, (fp_ref*)B.data(), &ldb_ref, + (fp_scalar_mkl*)&beta, (fp_ref*)C_ref.data(), &ldc_ref); + + // Call DPC++ HER2K. 
+ + try { +#ifdef CALL_RT_API + done = onemkl::blas::her2k(main_queue, upper_lower, trans, n, k, alpha, A.data(), lda, + B.data(), ldb, beta, C.data(), ldc, dependencies); + done.wait(); +#else + TEST_RUN_CT(main_queue, onemkl::blas::her2k, + (main_queue, upper_lower, trans, n, k, alpha, A.data(), lda, B.data(), ldb, + beta, C.data(), ldc, dependencies)); + main_queue.wait(); +#endif + } + catch (exception const& e) { + std::cout << "Caught synchronous SYCL exception during HER2K:\n" + << e.what() << std::endl + << "OpenCL status: " << e.get_cl_code() << std::endl; + } + + catch (const onemkl::backend_unsupported_exception& e) { + return test_skipped; + } + + catch (const std::runtime_error& error) { + std::cout << "Error raised during execution of HER2K:\n" << error.what() << std::endl; + } + + // Compare the results of reference implementation and DPC++ implementation. + + bool good = check_equal_matrix(C, C_ref, n, n, ldc, 10 * std::max(n, k), std::cout); + + return (int)good; +} + +class Her2kUsmTests : public ::testing::TestWithParam {}; + +TEST_P(Her2kUsmTests, ComplexSinglePrecision) { + std::complex alpha(2.0, -0.5); + float beta(1.0); + EXPECT_TRUEORSKIP((test, float>(GetParam(), onemkl::uplo::lower, + onemkl::transpose::nontrans, 72, 27, 101, + 102, 103, alpha, beta))); + EXPECT_TRUEORSKIP((test, float>(GetParam(), onemkl::uplo::upper, + onemkl::transpose::nontrans, 72, 27, 101, + 102, 103, alpha, beta))); + EXPECT_TRUEORSKIP((test, float>(GetParam(), onemkl::uplo::lower, + onemkl::transpose::conjtrans, 72, 27, 101, + 102, 103, alpha, beta))); + EXPECT_TRUEORSKIP((test, float>(GetParam(), onemkl::uplo::upper, + onemkl::transpose::conjtrans, 72, 27, 101, + 102, 103, alpha, beta))); +} +TEST_P(Her2kUsmTests, ComplexDoublePrecision) { + std::complex alpha(2.0, -0.5); + double beta(1.0); + EXPECT_TRUEORSKIP((test, double>(GetParam(), onemkl::uplo::lower, + onemkl::transpose::nontrans, 72, 27, 101, + 102, 103, alpha, beta))); + EXPECT_TRUEORSKIP((test, 
double>(GetParam(), onemkl::uplo::upper, + onemkl::transpose::nontrans, 72, 27, 101, + 102, 103, alpha, beta))); + EXPECT_TRUEORSKIP((test, double>(GetParam(), onemkl::uplo::lower, + onemkl::transpose::conjtrans, 72, 27, 101, + 102, 103, alpha, beta))); + EXPECT_TRUEORSKIP((test, double>(GetParam(), onemkl::uplo::upper, + onemkl::transpose::conjtrans, 72, 27, 101, + 102, 103, alpha, beta))); +} + +INSTANTIATE_TEST_SUITE_P(Her2kUsmTestSuite, Her2kUsmTests, ::testing::ValuesIn(devices), + ::DeviceNamePrint()); + +} // anonymous namespace diff --git a/tests/unit_tests/blas/level3/herk.cpp b/tests/unit_tests/blas/level3/herk.cpp index 6e414156c..143589bc4 100644 --- a/tests/unit_tests/blas/level3/herk.cpp +++ b/tests/unit_tests/blas/level3/herk.cpp @@ -44,8 +44,8 @@ extern std::vector devices; namespace { template -bool test(const device& dev, onemkl::uplo upper_lower, onemkl::transpose trans, int n, int k, - int lda, int ldc, fp_scalar alpha, fp_scalar beta) { +int test(const device& dev, onemkl::uplo upper_lower, onemkl::transpose trans, int n, int k, + int lda, int ldc, fp_scalar alpha, fp_scalar beta) { // Prepare data. vector> A, C, C_ref; rand_matrix(A, trans, n, k, lda); @@ -99,6 +99,14 @@ bool test(const device& dev, onemkl::uplo upper_lower, onemkl::transpose trans, << "OpenCL status: " << e.get_cl_code() << std::endl; } + catch (const onemkl::backend_unsupported_exception& e) { + return test_skipped; + } + + catch (const std::runtime_error& error) { + std::cout << "Error raised during execution of HERK:\n" << error.what() << std::endl; + } + // Compare the results of reference implementation and DPC++ implementation. 
bool good; { @@ -106,7 +114,7 @@ bool test(const device& dev, onemkl::uplo upper_lower, onemkl::transpose trans, good = check_equal_matrix(C_accessor, C_ref, n, n, ldc, 10 * std::max(n, k), std::cout); } - return good; + return (int)good; } class HerkTests : public ::testing::TestWithParam {}; @@ -114,34 +122,34 @@ class HerkTests : public ::testing::TestWithParam {}; TEST_P(HerkTests, ComplexSinglePrecision) { float alpha(2.0); float beta(3.0); - EXPECT_TRUE((test, float>(GetParam(), onemkl::uplo::lower, - onemkl::transpose::nontrans, 72, 27, 101, 103, - alpha, beta))); - EXPECT_TRUE((test, float>(GetParam(), onemkl::uplo::upper, - onemkl::transpose::nontrans, 72, 27, 101, 103, - alpha, beta))); - EXPECT_TRUE((test, float>(GetParam(), onemkl::uplo::lower, - onemkl::transpose::conjtrans, 72, 27, 101, 103, - alpha, beta))); - EXPECT_TRUE((test, float>(GetParam(), onemkl::uplo::upper, - onemkl::transpose::conjtrans, 72, 27, 101, 103, - alpha, beta))); + EXPECT_TRUEORSKIP((test, float>(GetParam(), onemkl::uplo::lower, + onemkl::transpose::nontrans, 72, 27, 101, + 103, alpha, beta))); + EXPECT_TRUEORSKIP((test, float>(GetParam(), onemkl::uplo::upper, + onemkl::transpose::nontrans, 72, 27, 101, + 103, alpha, beta))); + EXPECT_TRUEORSKIP((test, float>(GetParam(), onemkl::uplo::lower, + onemkl::transpose::conjtrans, 72, 27, 101, + 103, alpha, beta))); + EXPECT_TRUEORSKIP((test, float>(GetParam(), onemkl::uplo::upper, + onemkl::transpose::conjtrans, 72, 27, 101, + 103, alpha, beta))); } TEST_P(HerkTests, ComplexDoublePrecision) { double alpha(2.0); double beta(3.0); - EXPECT_TRUE((test, double>(GetParam(), onemkl::uplo::lower, - onemkl::transpose::nontrans, 72, 27, 101, 103, - alpha, beta))); - EXPECT_TRUE((test, double>(GetParam(), onemkl::uplo::upper, - onemkl::transpose::nontrans, 72, 27, 101, 103, - alpha, beta))); - EXPECT_TRUE((test, double>(GetParam(), onemkl::uplo::lower, - onemkl::transpose::conjtrans, 72, 27, 101, 103, - alpha, beta))); - EXPECT_TRUE((test, 
double>(GetParam(), onemkl::uplo::upper, - onemkl::transpose::conjtrans, 72, 27, 101, 103, - alpha, beta))); + EXPECT_TRUEORSKIP((test, double>(GetParam(), onemkl::uplo::lower, + onemkl::transpose::nontrans, 72, 27, 101, + 103, alpha, beta))); + EXPECT_TRUEORSKIP((test, double>(GetParam(), onemkl::uplo::upper, + onemkl::transpose::nontrans, 72, 27, 101, + 103, alpha, beta))); + EXPECT_TRUEORSKIP((test, double>(GetParam(), onemkl::uplo::lower, + onemkl::transpose::conjtrans, 72, 27, 101, + 103, alpha, beta))); + EXPECT_TRUEORSKIP((test, double>(GetParam(), onemkl::uplo::upper, + onemkl::transpose::conjtrans, 72, 27, 101, + 103, alpha, beta))); } INSTANTIATE_TEST_SUITE_P(HerkTestSuite, HerkTests, ::testing::ValuesIn(devices), diff --git a/tests/unit_tests/blas/level3/herk_usm.cpp b/tests/unit_tests/blas/level3/herk_usm.cpp new file mode 100644 index 000000000..82d42140e --- /dev/null +++ b/tests/unit_tests/blas/level3/herk_usm.cpp @@ -0,0 +1,158 @@ +/******************************************************************************* +* Copyright 2020 Intel Corporation +* +* Licensed under the Apache License, Version 2.0 (the "License"); +* you may not use this file except in compliance with the License. +* You may obtain a copy of the License at +* +* http://www.apache.org/licenses/LICENSE-2.0 +* +* Unless required by applicable law or agreed to in writing, +* software distributed under the License is distributed on an "AS IS" BASIS, +* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +* See the License for the specific language governing permissions +* and limitations under the License. 
+* +* +* SPDX-License-Identifier: Apache-2.0 +*******************************************************************************/ + +#include +#include +#include +#include +#include +#include + +#include +#include "cblas.h" +#include "onemkl/detail/config.hpp" +#include "onemkl/onemkl.hpp" +#include "onemkl_blas_helper.hpp" +#include "reference_blas_templates.hpp" +#include "test_common.hpp" +#include "test_helper.hpp" + +#include + +using namespace cl::sycl; +using std::vector; + +extern std::vector devices; + +namespace { + +template +int test(const device& dev, onemkl::uplo upper_lower, onemkl::transpose trans, int n, int k, + int lda, int ldc, fp_scalar alpha, fp_scalar beta) { + // Catch asynchronous exceptions. + auto exception_handler = [](exception_list exceptions) { + for (std::exception_ptr const& e : exceptions) { + try { + std::rethrow_exception(e); + } + catch (exception const& e) { + std::cout << "Caught asynchronous SYCL exception during HERK:\n" + << e.what() << std::endl + << "OpenCL status: " << e.get_cl_code() << std::endl; + } + } + }; + + queue main_queue(dev, exception_handler); + context cxt = main_queue.get_context(); + event done; + std::vector dependencies; + + // Prepare data. + auto ua = usm_allocator(cxt, dev); + vector A(ua), C(ua); + rand_matrix(A, trans, n, k, lda); + rand_matrix(C, onemkl::transpose::nontrans, n, n, ldc); + + auto C_ref = C; + + // Call Reference HERK. + const int n_ref = n, k_ref = k; + const int lda_ref = lda, ldc_ref = ldc; + + using fp_ref = typename ref_type_info::type; + + ::herk(convert_to_cblas_uplo(upper_lower), convert_to_cblas_trans(trans), &n_ref, &k_ref, + (fp_scalar*)&alpha, (fp_ref*)A.data(), &lda_ref, (fp_scalar*)&beta, + (fp_ref*)C_ref.data(), &ldc_ref); + + // Call DPC++ HERK. 
+ + try { +#ifdef CALL_RT_API + done = onemkl::blas::herk(main_queue, upper_lower, trans, n, k, alpha, A.data(), lda, beta, + C.data(), ldc, dependencies); + done.wait(); +#else + TEST_RUN_CT(main_queue, onemkl::blas::herk, + (main_queue, upper_lower, trans, n, k, alpha, A.data(), lda, beta, C.data(), + ldc, dependencies)); + main_queue.wait(); +#endif + } + catch (exception const& e) { + std::cout << "Caught synchronous SYCL exception during HERK:\n" + << e.what() << std::endl + << "OpenCL status: " << e.get_cl_code() << std::endl; + } + + catch (const onemkl::backend_unsupported_exception& e) { + return test_skipped; + } + + catch (const std::runtime_error& error) { + std::cout << "Error raised during execution of HERK:\n" << error.what() << std::endl; + } + + // Compare the results of reference implementation and DPC++ implementation. + + bool good = check_equal_matrix(C, C_ref, n, n, ldc, 10 * std::max(n, k), std::cout); + + return (int)good; +} + +class HerkUsmTests : public ::testing::TestWithParam {}; + +TEST_P(HerkUsmTests, ComplexSinglePrecision) { + float alpha(2.0); + float beta(3.0); + EXPECT_TRUEORSKIP((test, float>(GetParam(), onemkl::uplo::lower, + onemkl::transpose::nontrans, 72, 27, 101, + 103, alpha, beta))); + EXPECT_TRUEORSKIP((test, float>(GetParam(), onemkl::uplo::upper, + onemkl::transpose::nontrans, 72, 27, 101, + 103, alpha, beta))); + EXPECT_TRUEORSKIP((test, float>(GetParam(), onemkl::uplo::lower, + onemkl::transpose::conjtrans, 72, 27, 101, + 103, alpha, beta))); + EXPECT_TRUEORSKIP((test, float>(GetParam(), onemkl::uplo::upper, + onemkl::transpose::conjtrans, 72, 27, 101, + 103, alpha, beta))); +} +TEST_P(HerkUsmTests, ComplexDoublePrecision) { + double alpha(2.0); + double beta(3.0); + EXPECT_TRUEORSKIP((test, double>(GetParam(), onemkl::uplo::lower, + onemkl::transpose::nontrans, 72, 27, 101, + 103, alpha, beta))); + EXPECT_TRUEORSKIP((test, double>(GetParam(), onemkl::uplo::upper, + onemkl::transpose::nontrans, 72, 27, 101, + 103, 
alpha, beta))); + EXPECT_TRUEORSKIP((test, double>(GetParam(), onemkl::uplo::lower, + onemkl::transpose::conjtrans, 72, 27, 101, + 103, alpha, beta))); + EXPECT_TRUEORSKIP((test, double>(GetParam(), onemkl::uplo::upper, + onemkl::transpose::conjtrans, 72, 27, 101, + 103, alpha, beta))); +} + +INSTANTIATE_TEST_SUITE_P(HerkUsmTestSuite, HerkUsmTests, ::testing::ValuesIn(devices), + ::DeviceNamePrint()); + +} // anonymous namespace diff --git a/tests/unit_tests/blas/level3/symm.cpp b/tests/unit_tests/blas/level3/symm.cpp index aa75d51c4..48586680c 100644 --- a/tests/unit_tests/blas/level3/symm.cpp +++ b/tests/unit_tests/blas/level3/symm.cpp @@ -44,8 +44,8 @@ extern std::vector devices; namespace { template -bool test(const device& dev, onemkl::side left_right, onemkl::uplo upper_lower, int m, int n, - int lda, int ldb, int ldc, fp alpha, fp beta) { +int test(const device& dev, onemkl::side left_right, onemkl::uplo upper_lower, int m, int n, + int lda, int ldb, int ldc, fp alpha, fp beta) { // Prepare data. vector> A, B, C, C_ref; if (left_right == onemkl::side::left) @@ -104,6 +104,14 @@ bool test(const device& dev, onemkl::side left_right, onemkl::uplo upper_lower, << "OpenCL status: " << e.get_cl_code() << std::endl; } + catch (const onemkl::backend_unsupported_exception& e) { + return test_skipped; + } + + catch (const std::runtime_error& error) { + std::cout << "Error raised during execution of SYMM:\n" << error.what() << std::endl; + } + // Compare the results of reference implementation and DPC++ implementation. 
bool good; { @@ -111,7 +119,7 @@ bool test(const device& dev, onemkl::side left_right, onemkl::uplo upper_lower, good = check_equal_matrix(C_accessor, C_ref, m, n, ldc, 10 * std::max(m, n), std::cout); } - return good; + return (int)good; } class SymmTests : public ::testing::TestWithParam {}; @@ -119,50 +127,50 @@ class SymmTests : public ::testing::TestWithParam {}; TEST_P(SymmTests, RealSinglePrecision) { float alpha(2.0); float beta(3.0); - EXPECT_TRUE(test(GetParam(), onemkl::side::left, onemkl::uplo::lower, 72, 27, 101, 102, - 103, alpha, beta)); - EXPECT_TRUE(test(GetParam(), onemkl::side::right, onemkl::uplo::lower, 72, 27, 101, 102, - 103, alpha, beta)); - EXPECT_TRUE(test(GetParam(), onemkl::side::left, onemkl::uplo::upper, 72, 27, 101, 102, - 103, alpha, beta)); - EXPECT_TRUE(test(GetParam(), onemkl::side::right, onemkl::uplo::upper, 72, 27, 101, 102, - 103, alpha, beta)); + EXPECT_TRUEORSKIP(test(GetParam(), onemkl::side::left, onemkl::uplo::lower, 72, 27, 101, + 102, 103, alpha, beta)); + EXPECT_TRUEORSKIP(test(GetParam(), onemkl::side::right, onemkl::uplo::lower, 72, 27, 101, + 102, 103, alpha, beta)); + EXPECT_TRUEORSKIP(test(GetParam(), onemkl::side::left, onemkl::uplo::upper, 72, 27, 101, + 102, 103, alpha, beta)); + EXPECT_TRUEORSKIP(test(GetParam(), onemkl::side::right, onemkl::uplo::upper, 72, 27, 101, + 102, 103, alpha, beta)); } TEST_P(SymmTests, RealDoublePrecision) { double alpha(2.0); double beta(3.0); - EXPECT_TRUE(test(GetParam(), onemkl::side::left, onemkl::uplo::lower, 72, 27, 101, 102, - 103, alpha, beta)); - EXPECT_TRUE(test(GetParam(), onemkl::side::right, onemkl::uplo::lower, 72, 27, 101, 102, - 103, alpha, beta)); - EXPECT_TRUE(test(GetParam(), onemkl::side::left, onemkl::uplo::upper, 72, 27, 101, 102, - 103, alpha, beta)); - EXPECT_TRUE(test(GetParam(), onemkl::side::right, onemkl::uplo::upper, 72, 27, 101, 102, - 103, alpha, beta)); + EXPECT_TRUEORSKIP(test(GetParam(), onemkl::side::left, onemkl::uplo::lower, 72, 27, 101, + 102, 
103, alpha, beta)); + EXPECT_TRUEORSKIP(test(GetParam(), onemkl::side::right, onemkl::uplo::lower, 72, 27, + 101, 102, 103, alpha, beta)); + EXPECT_TRUEORSKIP(test(GetParam(), onemkl::side::left, onemkl::uplo::upper, 72, 27, 101, + 102, 103, alpha, beta)); + EXPECT_TRUEORSKIP(test(GetParam(), onemkl::side::right, onemkl::uplo::upper, 72, 27, + 101, 102, 103, alpha, beta)); } TEST_P(SymmTests, ComplexSinglePrecision) { std::complex alpha(2.0, -0.5); std::complex beta(3.0, -1.5); - EXPECT_TRUE(test>(GetParam(), onemkl::side::left, onemkl::uplo::lower, 72, - 27, 101, 102, 103, alpha, beta)); - EXPECT_TRUE(test>(GetParam(), onemkl::side::right, onemkl::uplo::lower, 72, - 27, 101, 102, 103, alpha, beta)); - EXPECT_TRUE(test>(GetParam(), onemkl::side::left, onemkl::uplo::upper, 72, - 27, 101, 102, 103, alpha, beta)); - EXPECT_TRUE(test>(GetParam(), onemkl::side::right, onemkl::uplo::upper, 72, - 27, 101, 102, 103, alpha, beta)); + EXPECT_TRUEORSKIP(test>(GetParam(), onemkl::side::left, onemkl::uplo::lower, + 72, 27, 101, 102, 103, alpha, beta)); + EXPECT_TRUEORSKIP(test>( + GetParam(), onemkl::side::right, onemkl::uplo::lower, 72, 27, 101, 102, 103, alpha, beta)); + EXPECT_TRUEORSKIP(test>(GetParam(), onemkl::side::left, onemkl::uplo::upper, + 72, 27, 101, 102, 103, alpha, beta)); + EXPECT_TRUEORSKIP(test>( + GetParam(), onemkl::side::right, onemkl::uplo::upper, 72, 27, 101, 102, 103, alpha, beta)); } TEST_P(SymmTests, ComplexDoublePrecision) { std::complex alpha(2.0, -0.5); std::complex beta(3.0, -1.5); - EXPECT_TRUE(test>(GetParam(), onemkl::side::left, onemkl::uplo::lower, 72, - 27, 101, 102, 103, alpha, beta)); - EXPECT_TRUE(test>(GetParam(), onemkl::side::right, onemkl::uplo::lower, 72, - 27, 101, 102, 103, alpha, beta)); - EXPECT_TRUE(test>(GetParam(), onemkl::side::left, onemkl::uplo::upper, 72, - 27, 101, 102, 103, alpha, beta)); - EXPECT_TRUE(test>(GetParam(), onemkl::side::right, onemkl::uplo::upper, 72, - 27, 101, 102, 103, alpha, beta)); + 
EXPECT_TRUEORSKIP(test>( + GetParam(), onemkl::side::left, onemkl::uplo::lower, 72, 27, 101, 102, 103, alpha, beta)); + EXPECT_TRUEORSKIP(test>( + GetParam(), onemkl::side::right, onemkl::uplo::lower, 72, 27, 101, 102, 103, alpha, beta)); + EXPECT_TRUEORSKIP(test>( + GetParam(), onemkl::side::left, onemkl::uplo::upper, 72, 27, 101, 102, 103, alpha, beta)); + EXPECT_TRUEORSKIP(test>( + GetParam(), onemkl::side::right, onemkl::uplo::upper, 72, 27, 101, 102, 103, alpha, beta)); } INSTANTIATE_TEST_SUITE_P(SymmTestSuite, SymmTests, ::testing::ValuesIn(devices), diff --git a/tests/unit_tests/blas/level3/symm_usm.cpp b/tests/unit_tests/blas/level3/symm_usm.cpp new file mode 100644 index 000000000..b2cadde68 --- /dev/null +++ b/tests/unit_tests/blas/level3/symm_usm.cpp @@ -0,0 +1,178 @@ +/******************************************************************************* +* Copyright 2020 Intel Corporation +* +* Licensed under the Apache License, Version 2.0 (the "License"); +* you may not use this file except in compliance with the License. +* You may obtain a copy of the License at +* +* http://www.apache.org/licenses/LICENSE-2.0 +* +* Unless required by applicable law or agreed to in writing, +* software distributed under the License is distributed on an "AS IS" BASIS, +* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +* See the License for the specific language governing permissions +* and limitations under the License. 
+* +* +* SPDX-License-Identifier: Apache-2.0 +*******************************************************************************/ + +#include +#include +#include +#include +#include +#include + +#include +#include "cblas.h" +#include "onemkl/detail/config.hpp" +#include "onemkl/onemkl.hpp" +#include "onemkl_blas_helper.hpp" +#include "reference_blas_templates.hpp" +#include "test_common.hpp" +#include "test_helper.hpp" + +#include + +using namespace cl::sycl; +using std::vector; + +extern std::vector devices; + +namespace { + +template +int test(const device& dev, onemkl::side left_right, onemkl::uplo upper_lower, int m, int n, + int lda, int ldb, int ldc, fp alpha, fp beta) { + // Catch asynchronous exceptions. + auto exception_handler = [](exception_list exceptions) { + for (std::exception_ptr const& e : exceptions) { + try { + std::rethrow_exception(e); + } + catch (exception const& e) { + std::cout << "Caught asynchronous SYCL exception during SYMM:\n" + << e.what() << std::endl + << "OpenCL status: " << e.get_cl_code() << std::endl; + } + } + }; + + queue main_queue(dev, exception_handler); + context cxt = main_queue.get_context(); + event done; + std::vector dependencies; + + // Prepare data. + auto ua = usm_allocator(cxt, dev); + vector A(ua), B(ua), C(ua); + if (left_right == onemkl::side::left) + rand_matrix(A, onemkl::transpose::nontrans, m, m, lda); + else + rand_matrix(A, onemkl::transpose::nontrans, n, n, lda); + rand_matrix(B, onemkl::transpose::nontrans, m, n, ldb); + rand_matrix(C, onemkl::transpose::nontrans, m, n, ldc); + + auto C_ref = C; + + // Call Reference SYMM. + const int m_ref = m, n_ref = n; + const int lda_ref = lda, ldb_ref = ldb, ldc_ref = ldc; + + using fp_ref = typename ref_type_info::type; + + ::symm(convert_to_cblas_side(left_right), convert_to_cblas_uplo(upper_lower), &m_ref, &n_ref, + (fp_ref*)&alpha, (fp_ref*)A.data(), &lda_ref, (fp_ref*)B.data(), &ldb_ref, + (fp_ref*)&beta, (fp_ref*)C_ref.data(), &ldc_ref); + + // Call DPC++ SYMM. 
+ + try { +#ifdef CALL_RT_API + done = onemkl::blas::symm(main_queue, left_right, upper_lower, m, n, alpha, A.data(), lda, + B.data(), ldb, beta, C.data(), ldc, dependencies); + done.wait(); +#else + TEST_RUN_CT(main_queue, onemkl::blas::symm, + (main_queue, left_right, upper_lower, m, n, alpha, A.data(), lda, B.data(), ldb, + beta, C.data(), ldc, dependencies)); + main_queue.wait(); +#endif + } + catch (exception const& e) { + std::cout << "Caught synchronous SYCL exception during SYMM:\n" + << e.what() << std::endl + << "OpenCL status: " << e.get_cl_code() << std::endl; + } + + catch (const onemkl::backend_unsupported_exception& e) { + return test_skipped; + } + + catch (const std::runtime_error& error) { + std::cout << "Error raised during execution of SYMM:\n" << error.what() << std::endl; + } + + // Compare the results of reference implementation and DPC++ implementation. + + bool good = check_equal_matrix(C, C_ref, m, n, ldc, 10 * std::max(m, n), std::cout); + + return (int)good; +} + +class SymmUsmTests : public ::testing::TestWithParam {}; + +TEST_P(SymmUsmTests, RealSinglePrecision) { + float alpha(2.0); + float beta(3.0); + EXPECT_TRUEORSKIP(test(GetParam(), onemkl::side::left, onemkl::uplo::lower, 72, 27, 101, + 102, 103, alpha, beta)); + EXPECT_TRUEORSKIP(test(GetParam(), onemkl::side::right, onemkl::uplo::lower, 72, 27, 101, + 102, 103, alpha, beta)); + EXPECT_TRUEORSKIP(test(GetParam(), onemkl::side::left, onemkl::uplo::upper, 72, 27, 101, + 102, 103, alpha, beta)); + EXPECT_TRUEORSKIP(test(GetParam(), onemkl::side::right, onemkl::uplo::upper, 72, 27, 101, + 102, 103, alpha, beta)); +} +TEST_P(SymmUsmTests, RealDoublePrecision) { + double alpha(2.0); + double beta(3.0); + EXPECT_TRUEORSKIP(test(GetParam(), onemkl::side::left, onemkl::uplo::lower, 72, 27, 101, + 102, 103, alpha, beta)); + EXPECT_TRUEORSKIP(test(GetParam(), onemkl::side::right, onemkl::uplo::lower, 72, 27, + 101, 102, 103, alpha, beta)); + EXPECT_TRUEORSKIP(test(GetParam(), 
onemkl::side::left, onemkl::uplo::upper, 72, 27, 101, + 102, 103, alpha, beta)); + EXPECT_TRUEORSKIP(test(GetParam(), onemkl::side::right, onemkl::uplo::upper, 72, 27, + 101, 102, 103, alpha, beta)); +} +TEST_P(SymmUsmTests, ComplexSinglePrecision) { + std::complex alpha(2.0, -0.5); + std::complex beta(3.0, -1.5); + EXPECT_TRUEORSKIP(test>(GetParam(), onemkl::side::left, onemkl::uplo::lower, + 72, 27, 101, 102, 103, alpha, beta)); + EXPECT_TRUEORSKIP(test>( + GetParam(), onemkl::side::right, onemkl::uplo::lower, 72, 27, 101, 102, 103, alpha, beta)); + EXPECT_TRUEORSKIP(test>(GetParam(), onemkl::side::left, onemkl::uplo::upper, + 72, 27, 101, 102, 103, alpha, beta)); + EXPECT_TRUEORSKIP(test>( + GetParam(), onemkl::side::right, onemkl::uplo::upper, 72, 27, 101, 102, 103, alpha, beta)); +} +TEST_P(SymmUsmTests, ComplexDoublePrecision) { + std::complex alpha(2.0, -0.5); + std::complex beta(3.0, -1.5); + EXPECT_TRUEORSKIP(test>( + GetParam(), onemkl::side::left, onemkl::uplo::lower, 72, 27, 101, 102, 103, alpha, beta)); + EXPECT_TRUEORSKIP(test>( + GetParam(), onemkl::side::right, onemkl::uplo::lower, 72, 27, 101, 102, 103, alpha, beta)); + EXPECT_TRUEORSKIP(test>( + GetParam(), onemkl::side::left, onemkl::uplo::upper, 72, 27, 101, 102, 103, alpha, beta)); + EXPECT_TRUEORSKIP(test>( + GetParam(), onemkl::side::right, onemkl::uplo::upper, 72, 27, 101, 102, 103, alpha, beta)); +} + +INSTANTIATE_TEST_SUITE_P(SymmUsmTestSuite, SymmUsmTests, ::testing::ValuesIn(devices), + ::DeviceNamePrint()); + +} // anonymous namespace diff --git a/tests/unit_tests/blas/level3/syr2k.cpp b/tests/unit_tests/blas/level3/syr2k.cpp index 98ea7332d..ade60a741 100644 --- a/tests/unit_tests/blas/level3/syr2k.cpp +++ b/tests/unit_tests/blas/level3/syr2k.cpp @@ -44,8 +44,8 @@ extern std::vector devices; namespace { template -bool test(const device& dev, onemkl::uplo upper_lower, onemkl::transpose trans, int n, int k, - int lda, int ldb, int ldc, fp alpha, fp beta) { +int test(const device& dev, 
onemkl::uplo upper_lower, onemkl::transpose trans, int n, int k, + int lda, int ldb, int ldc, fp alpha, fp beta) { // Prepare data. vector> A, B, C, C_ref; rand_matrix(A, trans, n, k, lda); @@ -101,6 +101,14 @@ bool test(const device& dev, onemkl::uplo upper_lower, onemkl::transpose trans, << "OpenCL status: " << e.get_cl_code() << std::endl; } + catch (const onemkl::backend_unsupported_exception& e) { + return test_skipped; + } + + catch (const std::runtime_error& error) { + std::cout << "Error raised during execution of SYR2K:\n" << error.what() << std::endl; + } + // Compare the results of reference implementation and DPC++ implementation. bool good; { @@ -108,7 +116,7 @@ bool test(const device& dev, onemkl::uplo upper_lower, onemkl::transpose trans, good = check_equal_matrix(C_accessor, C_ref, n, n, ldc, 10 * std::max(n, k), std::cout); } - return good; + return (int)good; } class Syr2kTests : public ::testing::TestWithParam {}; @@ -116,56 +124,58 @@ class Syr2kTests : public ::testing::TestWithParam {}; TEST_P(Syr2kTests, RealSinglePrecision) { float alpha(3.0); float beta(3.0); - EXPECT_TRUE(test(GetParam(), onemkl::uplo::lower, onemkl::transpose::nontrans, 73, 27, - 101, 102, 103, alpha, beta)); - EXPECT_TRUE(test(GetParam(), onemkl::uplo::upper, onemkl::transpose::nontrans, 73, 27, - 101, 102, 103, alpha, beta)); - EXPECT_TRUE(test(GetParam(), onemkl::uplo::lower, onemkl::transpose::trans, 73, 27, 101, - 102, 103, alpha, beta)); - EXPECT_TRUE(test(GetParam(), onemkl::uplo::upper, onemkl::transpose::trans, 73, 27, 101, - 102, 103, alpha, beta)); + EXPECT_TRUEORSKIP(test(GetParam(), onemkl::uplo::lower, onemkl::transpose::nontrans, 73, + 27, 101, 102, 103, alpha, beta)); + EXPECT_TRUEORSKIP(test(GetParam(), onemkl::uplo::upper, onemkl::transpose::nontrans, 73, + 27, 101, 102, 103, alpha, beta)); + EXPECT_TRUEORSKIP(test(GetParam(), onemkl::uplo::lower, onemkl::transpose::trans, 73, 27, + 101, 102, 103, alpha, beta)); + EXPECT_TRUEORSKIP(test(GetParam(), 
onemkl::uplo::upper, onemkl::transpose::trans, 73, 27, + 101, 102, 103, alpha, beta)); } TEST_P(Syr2kTests, RealDoublePrecision) { double alpha(3.0); double beta(3.0); - EXPECT_TRUE(test(GetParam(), onemkl::uplo::lower, onemkl::transpose::nontrans, 73, 27, - 101, 102, 103, alpha, beta)); - EXPECT_TRUE(test(GetParam(), onemkl::uplo::upper, onemkl::transpose::nontrans, 73, 27, - 101, 102, 103, alpha, beta)); - EXPECT_TRUE(test(GetParam(), onemkl::uplo::lower, onemkl::transpose::trans, 73, 27, 101, - 102, 103, alpha, beta)); - EXPECT_TRUE(test(GetParam(), onemkl::uplo::upper, onemkl::transpose::trans, 73, 27, 101, - 102, 103, alpha, beta)); + EXPECT_TRUEORSKIP(test(GetParam(), onemkl::uplo::lower, onemkl::transpose::nontrans, 73, + 27, 101, 102, 103, alpha, beta)); + EXPECT_TRUEORSKIP(test(GetParam(), onemkl::uplo::upper, onemkl::transpose::nontrans, 73, + 27, 101, 102, 103, alpha, beta)); + EXPECT_TRUEORSKIP(test(GetParam(), onemkl::uplo::lower, onemkl::transpose::trans, 73, + 27, 101, 102, 103, alpha, beta)); + EXPECT_TRUEORSKIP(test(GetParam(), onemkl::uplo::upper, onemkl::transpose::trans, 73, + 27, 101, 102, 103, alpha, beta)); } TEST_P(Syr2kTests, ComplexSinglePrecision) { std::complex alpha(3.0, -0.5); std::complex beta(3.0, -1.5); - EXPECT_TRUE(test>(GetParam(), onemkl::uplo::lower, - onemkl::transpose::nontrans, 73, 27, 101, 102, 103, alpha, - beta)); - EXPECT_TRUE(test>(GetParam(), onemkl::uplo::upper, - onemkl::transpose::nontrans, 73, 27, 101, 102, 103, alpha, - beta)); - EXPECT_TRUE(test>(GetParam(), onemkl::uplo::lower, onemkl::transpose::trans, - 73, 27, 101, 102, 103, alpha, beta)); - EXPECT_TRUE(test>(GetParam(), onemkl::uplo::upper, onemkl::transpose::trans, - 73, 27, 101, 102, 103, alpha, beta)); + EXPECT_TRUEORSKIP(test>(GetParam(), onemkl::uplo::lower, + onemkl::transpose::nontrans, 73, 27, 101, 102, 103, + alpha, beta)); + EXPECT_TRUEORSKIP(test>(GetParam(), onemkl::uplo::upper, + onemkl::transpose::nontrans, 73, 27, 101, 102, 103, + alpha, 
beta)); + EXPECT_TRUEORSKIP(test>(GetParam(), onemkl::uplo::lower, + onemkl::transpose::trans, 73, 27, 101, 102, 103, + alpha, beta)); + EXPECT_TRUEORSKIP(test>(GetParam(), onemkl::uplo::upper, + onemkl::transpose::trans, 73, 27, 101, 102, 103, + alpha, beta)); } TEST_P(Syr2kTests, ComplexDoublePrecision) { std::complex alpha(3.0, -0.5); std::complex beta(3.0, -1.5); - EXPECT_TRUE(test>(GetParam(), onemkl::uplo::lower, - onemkl::transpose::nontrans, 73, 27, 101, 102, 103, - alpha, beta)); - EXPECT_TRUE(test>(GetParam(), onemkl::uplo::upper, - onemkl::transpose::nontrans, 73, 27, 101, 102, 103, - alpha, beta)); - EXPECT_TRUE(test>(GetParam(), onemkl::uplo::lower, - onemkl::transpose::trans, 73, 27, 101, 102, 103, alpha, - beta)); - EXPECT_TRUE(test>(GetParam(), onemkl::uplo::upper, - onemkl::transpose::trans, 73, 27, 101, 102, 103, alpha, - beta)); + EXPECT_TRUEORSKIP(test>(GetParam(), onemkl::uplo::lower, + onemkl::transpose::nontrans, 73, 27, 101, 102, 103, + alpha, beta)); + EXPECT_TRUEORSKIP(test>(GetParam(), onemkl::uplo::upper, + onemkl::transpose::nontrans, 73, 27, 101, 102, 103, + alpha, beta)); + EXPECT_TRUEORSKIP(test>(GetParam(), onemkl::uplo::lower, + onemkl::transpose::trans, 73, 27, 101, 102, 103, + alpha, beta)); + EXPECT_TRUEORSKIP(test>(GetParam(), onemkl::uplo::upper, + onemkl::transpose::trans, 73, 27, 101, 102, 103, + alpha, beta)); } INSTANTIATE_TEST_SUITE_P(Syr2kTestSuite, Syr2kTests, ::testing::ValuesIn(devices), diff --git a/tests/unit_tests/blas/level3/syr2k_usm.cpp b/tests/unit_tests/blas/level3/syr2k_usm.cpp new file mode 100644 index 000000000..df6432083 --- /dev/null +++ b/tests/unit_tests/blas/level3/syr2k_usm.cpp @@ -0,0 +1,183 @@ +/******************************************************************************* +* Copyright 2020 Intel Corporation +* +* Licensed under the Apache License, Version 2.0 (the "License"); +* you may not use this file except in compliance with the License. 
+* You may obtain a copy of the License at +* +* http://www.apache.org/licenses/LICENSE-2.0 +* +* Unless required by applicable law or agreed to in writing, +* software distributed under the License is distributed on an "AS IS" BASIS, +* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +* See the License for the specific language governing permissions +* and limitations under the License. +* +* +* SPDX-License-Identifier: Apache-2.0 +*******************************************************************************/ + +#include +#include +#include +#include +#include +#include + +#include +#include "cblas.h" +#include "onemkl/detail/config.hpp" +#include "onemkl/onemkl.hpp" +#include "onemkl_blas_helper.hpp" +#include "reference_blas_templates.hpp" +#include "test_common.hpp" +#include "test_helper.hpp" + +#include + +using namespace cl::sycl; +using std::vector; + +extern std::vector devices; + +namespace { + +template +int test(const device& dev, onemkl::uplo upper_lower, onemkl::transpose trans, int n, int k, + int lda, int ldb, int ldc, fp alpha, fp beta) { + // Catch asynchronous exceptions. + auto exception_handler = [](exception_list exceptions) { + for (std::exception_ptr const& e : exceptions) { + try { + std::rethrow_exception(e); + } + catch (exception const& e) { + std::cout << "Caught asynchronous SYCL exception during SYR2K:\n" + << e.what() << std::endl + << "OpenCL status: " << e.get_cl_code() << std::endl; + } + } + }; + + queue main_queue(dev, exception_handler); + context cxt = main_queue.get_context(); + event done; + std::vector dependencies; + + // Prepare data. + auto ua = usm_allocator(cxt, dev); + vector A(ua), B(ua), C(ua); + rand_matrix(A, trans, n, k, lda); + rand_matrix(B, trans, n, k, ldb); + rand_matrix(C, onemkl::transpose::nontrans, n, n, ldc); + + auto C_ref = C; + + // Call Reference SYR2K. 
+ const int n_ref = n, k_ref = k; + const int lda_ref = lda, ldb_ref = ldb, ldc_ref = ldc; + + using fp_ref = typename ref_type_info::type; + + ::syr2k(convert_to_cblas_uplo(upper_lower), convert_to_cblas_trans(trans), &n_ref, &k_ref, + (fp_ref*)&alpha, (fp_ref*)A.data(), &lda_ref, (fp_ref*)B.data(), &ldb_ref, + (fp_ref*)&beta, (fp_ref*)C_ref.data(), &ldc_ref); + + // Call DPC++ SYR2K. + + try { +#ifdef CALL_RT_API + done = onemkl::blas::syr2k(main_queue, upper_lower, trans, n, k, alpha, A.data(), lda, + B.data(), ldb, beta, C.data(), ldc, dependencies); + done.wait(); +#else + TEST_RUN_CT(main_queue, onemkl::blas::syr2k, + (main_queue, upper_lower, trans, n, k, alpha, A.data(), lda, B.data(), ldb, + beta, C.data(), ldc, dependencies)); + main_queue.wait(); +#endif + } + catch (exception const& e) { + std::cout << "Caught synchronous SYCL exception during SYR2K:\n" + << e.what() << std::endl + << "OpenCL status: " << e.get_cl_code() << std::endl; + } + + catch (const onemkl::backend_unsupported_exception& e) { + return test_skipped; + } + + catch (const std::runtime_error& error) { + std::cout << "Error raised during execution of SYR2K:\n" << error.what() << std::endl; + } + + // Compare the results of reference implementation and DPC++ implementation. 
+ + bool good = check_equal_matrix(C, C_ref, n, n, ldc, 10 * std::max(n, k), std::cout); + + return (int)good; +} + +class Syr2kUsmTests : public ::testing::TestWithParam {}; + +TEST_P(Syr2kUsmTests, RealSinglePrecision) { + float alpha(3.0); + float beta(3.0); + EXPECT_TRUEORSKIP(test(GetParam(), onemkl::uplo::lower, onemkl::transpose::nontrans, 73, + 27, 101, 102, 103, alpha, beta)); + EXPECT_TRUEORSKIP(test(GetParam(), onemkl::uplo::upper, onemkl::transpose::nontrans, 73, + 27, 101, 102, 103, alpha, beta)); + EXPECT_TRUEORSKIP(test(GetParam(), onemkl::uplo::lower, onemkl::transpose::trans, 73, 27, + 101, 102, 103, alpha, beta)); + EXPECT_TRUEORSKIP(test(GetParam(), onemkl::uplo::upper, onemkl::transpose::trans, 73, 27, + 101, 102, 103, alpha, beta)); +} +TEST_P(Syr2kUsmTests, RealDoublePrecision) { + double alpha(3.0); + double beta(3.0); + EXPECT_TRUEORSKIP(test(GetParam(), onemkl::uplo::lower, onemkl::transpose::nontrans, 73, + 27, 101, 102, 103, alpha, beta)); + EXPECT_TRUEORSKIP(test(GetParam(), onemkl::uplo::upper, onemkl::transpose::nontrans, 73, + 27, 101, 102, 103, alpha, beta)); + EXPECT_TRUEORSKIP(test(GetParam(), onemkl::uplo::lower, onemkl::transpose::trans, 73, + 27, 101, 102, 103, alpha, beta)); + EXPECT_TRUEORSKIP(test(GetParam(), onemkl::uplo::upper, onemkl::transpose::trans, 73, + 27, 101, 102, 103, alpha, beta)); +} +TEST_P(Syr2kUsmTests, ComplexSinglePrecision) { + std::complex alpha(3.0, -0.5); + std::complex beta(3.0, -1.5); + EXPECT_TRUEORSKIP(test>(GetParam(), onemkl::uplo::lower, + onemkl::transpose::nontrans, 73, 27, 101, 102, 103, + alpha, beta)); + EXPECT_TRUEORSKIP(test>(GetParam(), onemkl::uplo::upper, + onemkl::transpose::nontrans, 73, 27, 101, 102, 103, + alpha, beta)); + EXPECT_TRUEORSKIP(test>(GetParam(), onemkl::uplo::lower, + onemkl::transpose::trans, 73, 27, 101, 102, 103, + alpha, beta)); + EXPECT_TRUEORSKIP(test>(GetParam(), onemkl::uplo::upper, + onemkl::transpose::trans, 73, 27, 101, 102, 103, + alpha, beta)); +} 
+TEST_P(Syr2kUsmTests, ComplexDoublePrecision) { + std::complex alpha(3.0, -0.5); + std::complex beta(3.0, -1.5); + EXPECT_TRUEORSKIP(test>(GetParam(), onemkl::uplo::lower, + onemkl::transpose::nontrans, 73, 27, 101, 102, 103, + alpha, beta)); + EXPECT_TRUEORSKIP(test>(GetParam(), onemkl::uplo::upper, + onemkl::transpose::nontrans, 73, 27, 101, 102, 103, + alpha, beta)); + EXPECT_TRUEORSKIP(test>(GetParam(), onemkl::uplo::lower, + onemkl::transpose::trans, 73, 27, 101, 102, 103, + alpha, beta)); + EXPECT_TRUEORSKIP(test>(GetParam(), onemkl::uplo::upper, + onemkl::transpose::trans, 73, 27, 101, 102, 103, + alpha, beta)); +} + +INSTANTIATE_TEST_SUITE_P(Syr2kUsmTestSuite, Syr2kUsmTests, ::testing::ValuesIn(devices), + ::DeviceNamePrint()); + +} // anonymous namespace diff --git a/tests/unit_tests/blas/level3/syrk.cpp b/tests/unit_tests/blas/level3/syrk.cpp index 05b4d9e8c..66df33a3f 100644 --- a/tests/unit_tests/blas/level3/syrk.cpp +++ b/tests/unit_tests/blas/level3/syrk.cpp @@ -44,8 +44,8 @@ extern std::vector devices; namespace { template -bool test(const device& dev, onemkl::uplo upper_lower, onemkl::transpose trans, int n, int k, - int lda, int ldc, fp alpha, fp beta) { +int test(const device& dev, onemkl::uplo upper_lower, onemkl::transpose trans, int n, int k, + int lda, int ldc, fp alpha, fp beta) { // Prepare data. vector> A, C, C_ref; rand_matrix(A, trans, n, k, lda); @@ -99,6 +99,14 @@ bool test(const device& dev, onemkl::uplo upper_lower, onemkl::transpose trans, << "OpenCL status: " << e.get_cl_code() << std::endl; } + catch (const onemkl::backend_unsupported_exception& e) { + return test_skipped; + } + + catch (const std::runtime_error& error) { + std::cout << "Error raised during execution of SYRK:\n" << error.what() << std::endl; + } + // Compare the results of reference implementation and DPC++ implementation. 
bool good; { @@ -106,7 +114,7 @@ bool test(const device& dev, onemkl::uplo upper_lower, onemkl::transpose trans, good = check_equal_matrix(C_accessor, C_ref, n, n, ldc, 10 * std::max(n, k), std::cout); } - return good; + return (int)good; } class SyrkTests : public ::testing::TestWithParam {}; @@ -114,53 +122,53 @@ class SyrkTests : public ::testing::TestWithParam {}; TEST_P(SyrkTests, RealSinglePrecision) { float alpha(3.0); float beta(3.0); - EXPECT_TRUE(test(GetParam(), onemkl::uplo::lower, onemkl::transpose::nontrans, 73, 27, - 101, 103, alpha, beta)); - EXPECT_TRUE(test(GetParam(), onemkl::uplo::upper, onemkl::transpose::nontrans, 73, 27, - 101, 103, alpha, beta)); - EXPECT_TRUE(test(GetParam(), onemkl::uplo::lower, onemkl::transpose::trans, 73, 27, 101, - 103, alpha, beta)); - EXPECT_TRUE(test(GetParam(), onemkl::uplo::upper, onemkl::transpose::trans, 73, 27, 101, - 103, alpha, beta)); + EXPECT_TRUEORSKIP(test(GetParam(), onemkl::uplo::lower, onemkl::transpose::nontrans, 73, + 27, 101, 103, alpha, beta)); + EXPECT_TRUEORSKIP(test(GetParam(), onemkl::uplo::upper, onemkl::transpose::nontrans, 73, + 27, 101, 103, alpha, beta)); + EXPECT_TRUEORSKIP(test(GetParam(), onemkl::uplo::lower, onemkl::transpose::trans, 73, 27, + 101, 103, alpha, beta)); + EXPECT_TRUEORSKIP(test(GetParam(), onemkl::uplo::upper, onemkl::transpose::trans, 73, 27, + 101, 103, alpha, beta)); } TEST_P(SyrkTests, RealDoublePrecision) { double alpha(3.0); double beta(3.0); - EXPECT_TRUE(test(GetParam(), onemkl::uplo::lower, onemkl::transpose::nontrans, 73, 27, - 101, 103, alpha, beta)); - EXPECT_TRUE(test(GetParam(), onemkl::uplo::upper, onemkl::transpose::nontrans, 73, 27, - 101, 103, alpha, beta)); - EXPECT_TRUE(test(GetParam(), onemkl::uplo::lower, onemkl::transpose::trans, 73, 27, 101, - 103, alpha, beta)); - EXPECT_TRUE(test(GetParam(), onemkl::uplo::upper, onemkl::transpose::trans, 73, 27, 101, - 103, alpha, beta)); + EXPECT_TRUEORSKIP(test(GetParam(), onemkl::uplo::lower, 
onemkl::transpose::nontrans, 73, + 27, 101, 103, alpha, beta)); + EXPECT_TRUEORSKIP(test(GetParam(), onemkl::uplo::upper, onemkl::transpose::nontrans, 73, + 27, 101, 103, alpha, beta)); + EXPECT_TRUEORSKIP(test(GetParam(), onemkl::uplo::lower, onemkl::transpose::trans, 73, + 27, 101, 103, alpha, beta)); + EXPECT_TRUEORSKIP(test(GetParam(), onemkl::uplo::upper, onemkl::transpose::trans, 73, + 27, 101, 103, alpha, beta)); } TEST_P(SyrkTests, ComplexSinglePrecision) { std::complex alpha(3.0, -0.5); std::complex beta(3.0, -1.5); - EXPECT_TRUE(test>(GetParam(), onemkl::uplo::lower, - onemkl::transpose::nontrans, 73, 27, 101, 103, alpha, - beta)); - EXPECT_TRUE(test>(GetParam(), onemkl::uplo::upper, - onemkl::transpose::nontrans, 73, 27, 101, 103, alpha, - beta)); - EXPECT_TRUE(test>(GetParam(), onemkl::uplo::lower, onemkl::transpose::trans, - 73, 27, 101, 103, alpha, beta)); - EXPECT_TRUE(test>(GetParam(), onemkl::uplo::upper, onemkl::transpose::trans, - 73, 27, 101, 103, alpha, beta)); + EXPECT_TRUEORSKIP(test>(GetParam(), onemkl::uplo::lower, + onemkl::transpose::nontrans, 73, 27, 101, 103, + alpha, beta)); + EXPECT_TRUEORSKIP(test>(GetParam(), onemkl::uplo::upper, + onemkl::transpose::nontrans, 73, 27, 101, 103, + alpha, beta)); + EXPECT_TRUEORSKIP(test>( + GetParam(), onemkl::uplo::lower, onemkl::transpose::trans, 73, 27, 101, 103, alpha, beta)); + EXPECT_TRUEORSKIP(test>( + GetParam(), onemkl::uplo::upper, onemkl::transpose::trans, 73, 27, 101, 103, alpha, beta)); } TEST_P(SyrkTests, ComplexDoublePrecision) { std::complex alpha(3.0, -0.5); std::complex beta(3.0, -1.5); - EXPECT_TRUE(test>(GetParam(), onemkl::uplo::lower, - onemkl::transpose::nontrans, 73, 27, 101, 103, alpha, - beta)); - EXPECT_TRUE(test>(GetParam(), onemkl::uplo::upper, - onemkl::transpose::nontrans, 73, 27, 101, 103, alpha, - beta)); - EXPECT_TRUE(test>( + EXPECT_TRUEORSKIP(test>(GetParam(), onemkl::uplo::lower, + onemkl::transpose::nontrans, 73, 27, 101, 103, + alpha, beta)); + 
EXPECT_TRUEORSKIP(test>(GetParam(), onemkl::uplo::upper, + onemkl::transpose::nontrans, 73, 27, 101, 103, + alpha, beta)); + EXPECT_TRUEORSKIP(test>( GetParam(), onemkl::uplo::lower, onemkl::transpose::trans, 73, 27, 101, 103, alpha, beta)); - EXPECT_TRUE(test>( + EXPECT_TRUEORSKIP(test>( GetParam(), onemkl::uplo::upper, onemkl::transpose::trans, 73, 27, 101, 103, alpha, beta)); } diff --git a/tests/unit_tests/blas/level3/syrk_usm.cpp b/tests/unit_tests/blas/level3/syrk_usm.cpp new file mode 100644 index 000000000..8f2463f5e --- /dev/null +++ b/tests/unit_tests/blas/level3/syrk_usm.cpp @@ -0,0 +1,178 @@ +/******************************************************************************* +* Copyright 2020 Intel Corporation +* +* Licensed under the Apache License, Version 2.0 (the "License"); +* you may not use this file except in compliance with the License. +* You may obtain a copy of the License at +* +* http://www.apache.org/licenses/LICENSE-2.0 +* +* Unless required by applicable law or agreed to in writing, +* software distributed under the License is distributed on an "AS IS" BASIS, +* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +* See the License for the specific language governing permissions +* and limitations under the License. +* +* +* SPDX-License-Identifier: Apache-2.0 +*******************************************************************************/ + +#include +#include +#include +#include +#include +#include + +#include +#include "cblas.h" +#include "onemkl/detail/config.hpp" +#include "onemkl/onemkl.hpp" +#include "onemkl_blas_helper.hpp" +#include "reference_blas_templates.hpp" +#include "test_common.hpp" +#include "test_helper.hpp" + +#include + +using namespace cl::sycl; +using std::vector; + +extern std::vector devices; + +namespace { + +template +int test(const device& dev, onemkl::uplo upper_lower, onemkl::transpose trans, int n, int k, + int lda, int ldc, fp alpha, fp beta) { + // Catch asynchronous exceptions. 
+ auto exception_handler = [](exception_list exceptions) { + for (std::exception_ptr const& e : exceptions) { + try { + std::rethrow_exception(e); + } + catch (exception const& e) { + std::cout << "Caught asynchronous SYCL exception during SYRK:\n" + << e.what() << std::endl + << "OpenCL status: " << e.get_cl_code() << std::endl; + } + } + }; + + queue main_queue(dev, exception_handler); + context cxt = main_queue.get_context(); + event done; + std::vector dependencies; + + // Prepare data. + auto ua = usm_allocator(cxt, dev); + vector A(ua), C(ua); + rand_matrix(A, trans, n, k, lda); + rand_matrix(C, onemkl::transpose::nontrans, n, n, ldc); + + auto C_ref = C; + + // Call Reference SYRK. + const int n_ref = n, k_ref = k; + const int lda_ref = lda, ldc_ref = ldc; + + using fp_ref = typename ref_type_info::type; + + ::syrk(convert_to_cblas_uplo(upper_lower), convert_to_cblas_trans(trans), &n_ref, &k_ref, + (fp_ref*)&alpha, (fp_ref*)A.data(), &lda_ref, (fp_ref*)&beta, (fp_ref*)C_ref.data(), + &ldc_ref); + + // Call DPC++ SYRK. + + try { +#ifdef CALL_RT_API + done = onemkl::blas::syrk(main_queue, upper_lower, trans, n, k, alpha, A.data(), lda, beta, + C.data(), ldc, dependencies); + done.wait(); +#else + TEST_RUN_CT(main_queue, onemkl::blas::syrk, + (main_queue, upper_lower, trans, n, k, alpha, A.data(), lda, beta, C.data(), + ldc, dependencies)); + main_queue.wait(); +#endif + } + catch (exception const& e) { + std::cout << "Caught synchronous SYCL exception during SYRK:\n" + << e.what() << std::endl + << "OpenCL status: " << e.get_cl_code() << std::endl; + } + + catch (const onemkl::backend_unsupported_exception& e) { + return test_skipped; + } + + catch (const std::runtime_error& error) { + std::cout << "Error raised during execution of SYRK:\n" << error.what() << std::endl; + } + + // Compare the results of reference implementation and DPC++ implementation. 
+ + bool good = check_equal_matrix(C, C_ref, n, n, ldc, 10 * std::max(n, k), std::cout); + + return (int)good; +} + +class SyrkUsmTests : public ::testing::TestWithParam {}; + +TEST_P(SyrkUsmTests, RealSinglePrecision) { + float alpha(3.0); + float beta(3.0); + EXPECT_TRUEORSKIP(test(GetParam(), onemkl::uplo::lower, onemkl::transpose::nontrans, 73, + 27, 101, 103, alpha, beta)); + EXPECT_TRUEORSKIP(test(GetParam(), onemkl::uplo::upper, onemkl::transpose::nontrans, 73, + 27, 101, 103, alpha, beta)); + EXPECT_TRUEORSKIP(test(GetParam(), onemkl::uplo::lower, onemkl::transpose::trans, 73, 27, + 101, 103, alpha, beta)); + EXPECT_TRUEORSKIP(test(GetParam(), onemkl::uplo::upper, onemkl::transpose::trans, 73, 27, + 101, 103, alpha, beta)); +} +TEST_P(SyrkUsmTests, RealDoublePrecision) { + double alpha(3.0); + double beta(3.0); + EXPECT_TRUEORSKIP(test(GetParam(), onemkl::uplo::lower, onemkl::transpose::nontrans, 73, + 27, 101, 103, alpha, beta)); + EXPECT_TRUEORSKIP(test(GetParam(), onemkl::uplo::upper, onemkl::transpose::nontrans, 73, + 27, 101, 103, alpha, beta)); + EXPECT_TRUEORSKIP(test(GetParam(), onemkl::uplo::lower, onemkl::transpose::trans, 73, + 27, 101, 103, alpha, beta)); + EXPECT_TRUEORSKIP(test(GetParam(), onemkl::uplo::upper, onemkl::transpose::trans, 73, + 27, 101, 103, alpha, beta)); +} +TEST_P(SyrkUsmTests, ComplexSinglePrecision) { + std::complex alpha(3.0, -0.5); + std::complex beta(3.0, -1.5); + EXPECT_TRUEORSKIP(test>(GetParam(), onemkl::uplo::lower, + onemkl::transpose::nontrans, 73, 27, 101, 103, + alpha, beta)); + EXPECT_TRUEORSKIP(test>(GetParam(), onemkl::uplo::upper, + onemkl::transpose::nontrans, 73, 27, 101, 103, + alpha, beta)); + EXPECT_TRUEORSKIP(test>( + GetParam(), onemkl::uplo::lower, onemkl::transpose::trans, 73, 27, 101, 103, alpha, beta)); + EXPECT_TRUEORSKIP(test>( + GetParam(), onemkl::uplo::upper, onemkl::transpose::trans, 73, 27, 101, 103, alpha, beta)); +} +TEST_P(SyrkUsmTests, ComplexDoublePrecision) { + std::complex alpha(3.0, 
-0.5); + std::complex beta(3.0, -1.5); + EXPECT_TRUEORSKIP(test>(GetParam(), onemkl::uplo::lower, + onemkl::transpose::nontrans, 73, 27, 101, 103, + alpha, beta)); + EXPECT_TRUEORSKIP(test>(GetParam(), onemkl::uplo::upper, + onemkl::transpose::nontrans, 73, 27, 101, 103, + alpha, beta)); + EXPECT_TRUEORSKIP(test>( + GetParam(), onemkl::uplo::lower, onemkl::transpose::trans, 73, 27, 101, 103, alpha, beta)); + EXPECT_TRUEORSKIP(test>( + GetParam(), onemkl::uplo::upper, onemkl::transpose::trans, 73, 27, 101, 103, alpha, beta)); +} + +INSTANTIATE_TEST_SUITE_P(SyrkUsmTestSuite, SyrkUsmTests, ::testing::ValuesIn(devices), + ::DeviceNamePrint()); + +} // anonymous namespace diff --git a/tests/unit_tests/blas/level3/trmm.cpp b/tests/unit_tests/blas/level3/trmm.cpp index abe25fdbd..b7395db68 100644 --- a/tests/unit_tests/blas/level3/trmm.cpp +++ b/tests/unit_tests/blas/level3/trmm.cpp @@ -44,9 +44,9 @@ extern std::vector devices; namespace { template -bool test(const device& dev, onemkl::side left_right, onemkl::uplo upper_lower, - onemkl::transpose transa, onemkl::diag unit_nonunit, int m, int n, int lda, int ldb, - fp alpha) { +int test(const device& dev, onemkl::side left_right, onemkl::uplo upper_lower, + onemkl::transpose transa, onemkl::diag unit_nonunit, int m, int n, int lda, int ldb, + fp alpha) { // Prepare data. vector> A, B, B_ref; if (left_right == onemkl::side::right) @@ -104,6 +104,14 @@ bool test(const device& dev, onemkl::side left_right, onemkl::uplo upper_lower, << "OpenCL status: " << e.get_cl_code() << std::endl; } + catch (const onemkl::backend_unsupported_exception& e) { + return test_skipped; + } + + catch (const std::runtime_error& error) { + std::cout << "Error raised during execution of TRMM:\n" << error.what() << std::endl; + } + // Compare the results of reference implementation and DPC++ implementation. 
bool good; { @@ -111,166 +119,166 @@ bool test(const device& dev, onemkl::side left_right, onemkl::uplo upper_lower, good = check_equal_matrix(B_accessor, B_ref, m, n, ldb, 10 * std::max(m, n), std::cout); } - return good; + return (int)good; } class TrmmTests : public ::testing::TestWithParam {}; TEST_P(TrmmTests, RealSinglePrecision) { float alpha(2.0); - EXPECT_TRUE(test(GetParam(), onemkl::side::left, onemkl::uplo::lower, - onemkl::transpose::nontrans, onemkl::diag::unit, 72, 27, 101, 102, - alpha)); - EXPECT_TRUE(test(GetParam(), onemkl::side::right, onemkl::uplo::lower, - onemkl::transpose::nontrans, onemkl::diag::unit, 72, 27, 101, 102, - alpha)); - EXPECT_TRUE(test(GetParam(), onemkl::side::left, onemkl::uplo::lower, - onemkl::transpose::nontrans, onemkl::diag::nonunit, 72, 27, 101, 102, - alpha)); - EXPECT_TRUE(test(GetParam(), onemkl::side::right, onemkl::uplo::lower, - onemkl::transpose::nontrans, onemkl::diag::nonunit, 72, 27, 101, 102, - alpha)); - EXPECT_TRUE(test(GetParam(), onemkl::side::left, onemkl::uplo::upper, - onemkl::transpose::nontrans, onemkl::diag::nonunit, 72, 27, 101, 102, - alpha)); - EXPECT_TRUE(test(GetParam(), onemkl::side::right, onemkl::uplo::upper, - onemkl::transpose::nontrans, onemkl::diag::nonunit, 72, 27, 101, 102, - alpha)); - EXPECT_TRUE(test(GetParam(), onemkl::side::left, onemkl::uplo::lower, - onemkl::transpose::trans, onemkl::diag::nonunit, 72, 27, 101, 102, - alpha)); - EXPECT_TRUE(test(GetParam(), onemkl::side::right, onemkl::uplo::lower, - onemkl::transpose::trans, onemkl::diag::nonunit, 72, 27, 101, 102, - alpha)); - EXPECT_TRUE(test(GetParam(), onemkl::side::left, onemkl::uplo::upper, - onemkl::transpose::trans, onemkl::diag::nonunit, 72, 27, 101, 102, - alpha)); - EXPECT_TRUE(test(GetParam(), onemkl::side::right, onemkl::uplo::upper, - onemkl::transpose::trans, onemkl::diag::nonunit, 72, 27, 101, 102, - alpha)); + EXPECT_TRUEORSKIP(test(GetParam(), onemkl::side::left, onemkl::uplo::lower, + 
onemkl::transpose::nontrans, onemkl::diag::unit, 72, 27, 101, 102, + alpha)); + EXPECT_TRUEORSKIP(test(GetParam(), onemkl::side::right, onemkl::uplo::lower, + onemkl::transpose::nontrans, onemkl::diag::unit, 72, 27, 101, 102, + alpha)); + EXPECT_TRUEORSKIP(test(GetParam(), onemkl::side::left, onemkl::uplo::lower, + onemkl::transpose::nontrans, onemkl::diag::nonunit, 72, 27, 101, + 102, alpha)); + EXPECT_TRUEORSKIP(test(GetParam(), onemkl::side::right, onemkl::uplo::lower, + onemkl::transpose::nontrans, onemkl::diag::nonunit, 72, 27, 101, + 102, alpha)); + EXPECT_TRUEORSKIP(test(GetParam(), onemkl::side::left, onemkl::uplo::upper, + onemkl::transpose::nontrans, onemkl::diag::nonunit, 72, 27, 101, + 102, alpha)); + EXPECT_TRUEORSKIP(test(GetParam(), onemkl::side::right, onemkl::uplo::upper, + onemkl::transpose::nontrans, onemkl::diag::nonunit, 72, 27, 101, + 102, alpha)); + EXPECT_TRUEORSKIP(test(GetParam(), onemkl::side::left, onemkl::uplo::lower, + onemkl::transpose::trans, onemkl::diag::nonunit, 72, 27, 101, 102, + alpha)); + EXPECT_TRUEORSKIP(test(GetParam(), onemkl::side::right, onemkl::uplo::lower, + onemkl::transpose::trans, onemkl::diag::nonunit, 72, 27, 101, 102, + alpha)); + EXPECT_TRUEORSKIP(test(GetParam(), onemkl::side::left, onemkl::uplo::upper, + onemkl::transpose::trans, onemkl::diag::nonunit, 72, 27, 101, 102, + alpha)); + EXPECT_TRUEORSKIP(test(GetParam(), onemkl::side::right, onemkl::uplo::upper, + onemkl::transpose::trans, onemkl::diag::nonunit, 72, 27, 101, 102, + alpha)); } TEST_P(TrmmTests, RealDoublePrecision) { double alpha(2.0); - EXPECT_TRUE(test(GetParam(), onemkl::side::left, onemkl::uplo::lower, - onemkl::transpose::nontrans, onemkl::diag::unit, 72, 27, 101, 102, - alpha)); - EXPECT_TRUE(test(GetParam(), onemkl::side::right, onemkl::uplo::lower, - onemkl::transpose::nontrans, onemkl::diag::unit, 72, 27, 101, 102, - alpha)); - EXPECT_TRUE(test(GetParam(), onemkl::side::left, onemkl::uplo::lower, - onemkl::transpose::nontrans, 
onemkl::diag::nonunit, 72, 27, 101, 102, - alpha)); - EXPECT_TRUE(test(GetParam(), onemkl::side::right, onemkl::uplo::lower, - onemkl::transpose::nontrans, onemkl::diag::nonunit, 72, 27, 101, 102, - alpha)); - EXPECT_TRUE(test(GetParam(), onemkl::side::left, onemkl::uplo::upper, - onemkl::transpose::nontrans, onemkl::diag::nonunit, 72, 27, 101, 102, - alpha)); - EXPECT_TRUE(test(GetParam(), onemkl::side::right, onemkl::uplo::upper, - onemkl::transpose::nontrans, onemkl::diag::nonunit, 72, 27, 101, 102, - alpha)); - EXPECT_TRUE(test(GetParam(), onemkl::side::left, onemkl::uplo::lower, - onemkl::transpose::trans, onemkl::diag::nonunit, 72, 27, 101, 102, - alpha)); - EXPECT_TRUE(test(GetParam(), onemkl::side::right, onemkl::uplo::lower, - onemkl::transpose::trans, onemkl::diag::nonunit, 72, 27, 101, 102, - alpha)); - EXPECT_TRUE(test(GetParam(), onemkl::side::left, onemkl::uplo::upper, - onemkl::transpose::trans, onemkl::diag::nonunit, 72, 27, 101, 102, - alpha)); - EXPECT_TRUE(test(GetParam(), onemkl::side::right, onemkl::uplo::upper, - onemkl::transpose::trans, onemkl::diag::nonunit, 72, 27, 101, 102, - alpha)); + EXPECT_TRUEORSKIP(test(GetParam(), onemkl::side::left, onemkl::uplo::lower, + onemkl::transpose::nontrans, onemkl::diag::unit, 72, 27, 101, + 102, alpha)); + EXPECT_TRUEORSKIP(test(GetParam(), onemkl::side::right, onemkl::uplo::lower, + onemkl::transpose::nontrans, onemkl::diag::unit, 72, 27, 101, + 102, alpha)); + EXPECT_TRUEORSKIP(test(GetParam(), onemkl::side::left, onemkl::uplo::lower, + onemkl::transpose::nontrans, onemkl::diag::nonunit, 72, 27, 101, + 102, alpha)); + EXPECT_TRUEORSKIP(test(GetParam(), onemkl::side::right, onemkl::uplo::lower, + onemkl::transpose::nontrans, onemkl::diag::nonunit, 72, 27, 101, + 102, alpha)); + EXPECT_TRUEORSKIP(test(GetParam(), onemkl::side::left, onemkl::uplo::upper, + onemkl::transpose::nontrans, onemkl::diag::nonunit, 72, 27, 101, + 102, alpha)); + EXPECT_TRUEORSKIP(test(GetParam(), onemkl::side::right, 
onemkl::uplo::upper, + onemkl::transpose::nontrans, onemkl::diag::nonunit, 72, 27, 101, + 102, alpha)); + EXPECT_TRUEORSKIP(test(GetParam(), onemkl::side::left, onemkl::uplo::lower, + onemkl::transpose::trans, onemkl::diag::nonunit, 72, 27, 101, + 102, alpha)); + EXPECT_TRUEORSKIP(test(GetParam(), onemkl::side::right, onemkl::uplo::lower, + onemkl::transpose::trans, onemkl::diag::nonunit, 72, 27, 101, + 102, alpha)); + EXPECT_TRUEORSKIP(test(GetParam(), onemkl::side::left, onemkl::uplo::upper, + onemkl::transpose::trans, onemkl::diag::nonunit, 72, 27, 101, + 102, alpha)); + EXPECT_TRUEORSKIP(test(GetParam(), onemkl::side::right, onemkl::uplo::upper, + onemkl::transpose::trans, onemkl::diag::nonunit, 72, 27, 101, + 102, alpha)); } TEST_P(TrmmTests, ComplexSinglePrecision) { std::complex alpha(2.0, -0.5); - EXPECT_TRUE(test>(GetParam(), onemkl::side::left, onemkl::uplo::lower, - onemkl::transpose::nontrans, onemkl::diag::unit, 72, 27, - 101, 102, alpha)); - EXPECT_TRUE(test>(GetParam(), onemkl::side::right, onemkl::uplo::lower, - onemkl::transpose::nontrans, onemkl::diag::unit, 72, 27, - 101, 102, alpha)); - EXPECT_TRUE(test>(GetParam(), onemkl::side::left, onemkl::uplo::lower, - onemkl::transpose::nontrans, onemkl::diag::nonunit, 72, - 27, 101, 102, alpha)); - EXPECT_TRUE(test>(GetParam(), onemkl::side::right, onemkl::uplo::lower, - onemkl::transpose::nontrans, onemkl::diag::nonunit, 72, - 27, 101, 102, alpha)); - EXPECT_TRUE(test>(GetParam(), onemkl::side::left, onemkl::uplo::upper, - onemkl::transpose::nontrans, onemkl::diag::nonunit, 72, - 27, 101, 102, alpha)); - EXPECT_TRUE(test>(GetParam(), onemkl::side::right, onemkl::uplo::upper, - onemkl::transpose::nontrans, onemkl::diag::nonunit, 72, - 27, 101, 102, alpha)); - EXPECT_TRUE(test>(GetParam(), onemkl::side::left, onemkl::uplo::lower, - onemkl::transpose::trans, onemkl::diag::nonunit, 72, 27, - 101, 102, alpha)); - EXPECT_TRUE(test>(GetParam(), onemkl::side::right, onemkl::uplo::lower, - 
onemkl::transpose::trans, onemkl::diag::nonunit, 72, 27, - 101, 102, alpha)); - EXPECT_TRUE(test>(GetParam(), onemkl::side::left, onemkl::uplo::upper, - onemkl::transpose::trans, onemkl::diag::nonunit, 72, 27, - 101, 102, alpha)); - EXPECT_TRUE(test>(GetParam(), onemkl::side::right, onemkl::uplo::upper, - onemkl::transpose::trans, onemkl::diag::nonunit, 72, 27, - 101, 102, alpha)); - EXPECT_TRUE(test>(GetParam(), onemkl::side::left, onemkl::uplo::lower, - onemkl::transpose::conjtrans, onemkl::diag::nonunit, 72, - 27, 101, 102, alpha)); - EXPECT_TRUE(test>(GetParam(), onemkl::side::right, onemkl::uplo::lower, - onemkl::transpose::conjtrans, onemkl::diag::nonunit, 72, - 27, 101, 102, alpha)); - EXPECT_TRUE(test>(GetParam(), onemkl::side::left, onemkl::uplo::upper, - onemkl::transpose::conjtrans, onemkl::diag::nonunit, 72, - 27, 101, 102, alpha)); - EXPECT_TRUE(test>(GetParam(), onemkl::side::right, onemkl::uplo::upper, - onemkl::transpose::conjtrans, onemkl::diag::nonunit, 72, - 27, 101, 102, alpha)); + EXPECT_TRUEORSKIP(test>(GetParam(), onemkl::side::left, onemkl::uplo::lower, + onemkl::transpose::nontrans, onemkl::diag::unit, 72, + 27, 101, 102, alpha)); + EXPECT_TRUEORSKIP(test>(GetParam(), onemkl::side::right, + onemkl::uplo::lower, onemkl::transpose::nontrans, + onemkl::diag::unit, 72, 27, 101, 102, alpha)); + EXPECT_TRUEORSKIP(test>(GetParam(), onemkl::side::left, onemkl::uplo::lower, + onemkl::transpose::nontrans, onemkl::diag::nonunit, + 72, 27, 101, 102, alpha)); + EXPECT_TRUEORSKIP(test>(GetParam(), onemkl::side::right, + onemkl::uplo::lower, onemkl::transpose::nontrans, + onemkl::diag::nonunit, 72, 27, 101, 102, alpha)); + EXPECT_TRUEORSKIP(test>(GetParam(), onemkl::side::left, onemkl::uplo::upper, + onemkl::transpose::nontrans, onemkl::diag::nonunit, + 72, 27, 101, 102, alpha)); + EXPECT_TRUEORSKIP(test>(GetParam(), onemkl::side::right, + onemkl::uplo::upper, onemkl::transpose::nontrans, + onemkl::diag::nonunit, 72, 27, 101, 102, alpha)); + 
EXPECT_TRUEORSKIP(test>(GetParam(), onemkl::side::left, onemkl::uplo::lower, + onemkl::transpose::trans, onemkl::diag::nonunit, 72, + 27, 101, 102, alpha)); + EXPECT_TRUEORSKIP(test>(GetParam(), onemkl::side::right, + onemkl::uplo::lower, onemkl::transpose::trans, + onemkl::diag::nonunit, 72, 27, 101, 102, alpha)); + EXPECT_TRUEORSKIP(test>(GetParam(), onemkl::side::left, onemkl::uplo::upper, + onemkl::transpose::trans, onemkl::diag::nonunit, 72, + 27, 101, 102, alpha)); + EXPECT_TRUEORSKIP(test>(GetParam(), onemkl::side::right, + onemkl::uplo::upper, onemkl::transpose::trans, + onemkl::diag::nonunit, 72, 27, 101, 102, alpha)); + EXPECT_TRUEORSKIP(test>(GetParam(), onemkl::side::left, onemkl::uplo::lower, + onemkl::transpose::conjtrans, onemkl::diag::nonunit, + 72, 27, 101, 102, alpha)); + EXPECT_TRUEORSKIP(test>(GetParam(), onemkl::side::right, + onemkl::uplo::lower, onemkl::transpose::conjtrans, + onemkl::diag::nonunit, 72, 27, 101, 102, alpha)); + EXPECT_TRUEORSKIP(test>(GetParam(), onemkl::side::left, onemkl::uplo::upper, + onemkl::transpose::conjtrans, onemkl::diag::nonunit, + 72, 27, 101, 102, alpha)); + EXPECT_TRUEORSKIP(test>(GetParam(), onemkl::side::right, + onemkl::uplo::upper, onemkl::transpose::conjtrans, + onemkl::diag::nonunit, 72, 27, 101, 102, alpha)); } TEST_P(TrmmTests, ComplexDoublePrecision) { std::complex alpha(2.0, -0.5); - EXPECT_TRUE(test>(GetParam(), onemkl::side::left, onemkl::uplo::lower, - onemkl::transpose::nontrans, onemkl::diag::unit, 72, 27, - 101, 102, alpha)); - EXPECT_TRUE(test>(GetParam(), onemkl::side::right, onemkl::uplo::lower, - onemkl::transpose::nontrans, onemkl::diag::unit, 72, 27, - 101, 102, alpha)); - EXPECT_TRUE(test>(GetParam(), onemkl::side::left, onemkl::uplo::lower, - onemkl::transpose::nontrans, onemkl::diag::nonunit, 72, - 27, 101, 102, alpha)); - EXPECT_TRUE(test>(GetParam(), onemkl::side::right, onemkl::uplo::lower, - onemkl::transpose::nontrans, onemkl::diag::nonunit, 72, - 27, 101, 102, alpha)); - 
EXPECT_TRUE(test>(GetParam(), onemkl::side::left, onemkl::uplo::upper, - onemkl::transpose::nontrans, onemkl::diag::nonunit, 72, - 27, 101, 102, alpha)); - EXPECT_TRUE(test>(GetParam(), onemkl::side::right, onemkl::uplo::upper, - onemkl::transpose::nontrans, onemkl::diag::nonunit, 72, - 27, 101, 102, alpha)); - EXPECT_TRUE(test>(GetParam(), onemkl::side::left, onemkl::uplo::lower, - onemkl::transpose::trans, onemkl::diag::nonunit, 72, 27, - 101, 102, alpha)); - EXPECT_TRUE(test>(GetParam(), onemkl::side::right, onemkl::uplo::lower, - onemkl::transpose::trans, onemkl::diag::nonunit, 72, 27, - 101, 102, alpha)); - EXPECT_TRUE(test>(GetParam(), onemkl::side::left, onemkl::uplo::upper, - onemkl::transpose::trans, onemkl::diag::nonunit, 72, 27, - 101, 102, alpha)); - EXPECT_TRUE(test>(GetParam(), onemkl::side::right, onemkl::uplo::upper, - onemkl::transpose::trans, onemkl::diag::nonunit, 72, 27, - 101, 102, alpha)); - EXPECT_TRUE(test>(GetParam(), onemkl::side::left, onemkl::uplo::lower, - onemkl::transpose::conjtrans, onemkl::diag::nonunit, 72, - 27, 101, 102, alpha)); - EXPECT_TRUE(test>(GetParam(), onemkl::side::right, onemkl::uplo::lower, - onemkl::transpose::conjtrans, onemkl::diag::nonunit, 72, - 27, 101, 102, alpha)); - EXPECT_TRUE(test>(GetParam(), onemkl::side::left, onemkl::uplo::upper, - onemkl::transpose::conjtrans, onemkl::diag::nonunit, 72, - 27, 101, 102, alpha)); - EXPECT_TRUE(test>(GetParam(), onemkl::side::right, onemkl::uplo::upper, - onemkl::transpose::conjtrans, onemkl::diag::nonunit, 72, - 27, 101, 102, alpha)); + EXPECT_TRUEORSKIP(test>(GetParam(), onemkl::side::left, + onemkl::uplo::lower, onemkl::transpose::nontrans, + onemkl::diag::unit, 72, 27, 101, 102, alpha)); + EXPECT_TRUEORSKIP(test>(GetParam(), onemkl::side::right, + onemkl::uplo::lower, onemkl::transpose::nontrans, + onemkl::diag::unit, 72, 27, 101, 102, alpha)); + EXPECT_TRUEORSKIP(test>(GetParam(), onemkl::side::left, + onemkl::uplo::lower, onemkl::transpose::nontrans, + 
onemkl::diag::nonunit, 72, 27, 101, 102, alpha)); + EXPECT_TRUEORSKIP(test>(GetParam(), onemkl::side::right, + onemkl::uplo::lower, onemkl::transpose::nontrans, + onemkl::diag::nonunit, 72, 27, 101, 102, alpha)); + EXPECT_TRUEORSKIP(test>(GetParam(), onemkl::side::left, + onemkl::uplo::upper, onemkl::transpose::nontrans, + onemkl::diag::nonunit, 72, 27, 101, 102, alpha)); + EXPECT_TRUEORSKIP(test>(GetParam(), onemkl::side::right, + onemkl::uplo::upper, onemkl::transpose::nontrans, + onemkl::diag::nonunit, 72, 27, 101, 102, alpha)); + EXPECT_TRUEORSKIP(test>(GetParam(), onemkl::side::left, + onemkl::uplo::lower, onemkl::transpose::trans, + onemkl::diag::nonunit, 72, 27, 101, 102, alpha)); + EXPECT_TRUEORSKIP(test>(GetParam(), onemkl::side::right, + onemkl::uplo::lower, onemkl::transpose::trans, + onemkl::diag::nonunit, 72, 27, 101, 102, alpha)); + EXPECT_TRUEORSKIP(test>(GetParam(), onemkl::side::left, + onemkl::uplo::upper, onemkl::transpose::trans, + onemkl::diag::nonunit, 72, 27, 101, 102, alpha)); + EXPECT_TRUEORSKIP(test>(GetParam(), onemkl::side::right, + onemkl::uplo::upper, onemkl::transpose::trans, + onemkl::diag::nonunit, 72, 27, 101, 102, alpha)); + EXPECT_TRUEORSKIP(test>(GetParam(), onemkl::side::left, + onemkl::uplo::lower, onemkl::transpose::conjtrans, + onemkl::diag::nonunit, 72, 27, 101, 102, alpha)); + EXPECT_TRUEORSKIP(test>(GetParam(), onemkl::side::right, + onemkl::uplo::lower, onemkl::transpose::conjtrans, + onemkl::diag::nonunit, 72, 27, 101, 102, alpha)); + EXPECT_TRUEORSKIP(test>(GetParam(), onemkl::side::left, + onemkl::uplo::upper, onemkl::transpose::conjtrans, + onemkl::diag::nonunit, 72, 27, 101, 102, alpha)); + EXPECT_TRUEORSKIP(test>(GetParam(), onemkl::side::right, + onemkl::uplo::upper, onemkl::transpose::conjtrans, + onemkl::diag::nonunit, 72, 27, 101, 102, alpha)); } INSTANTIATE_TEST_SUITE_P(TrmmTestSuite, TrmmTests, ::testing::ValuesIn(devices), diff --git a/tests/unit_tests/blas/level3/trmm_usm.cpp 
b/tests/unit_tests/blas/level3/trmm_usm.cpp new file mode 100644 index 000000000..991dac0f9 --- /dev/null +++ b/tests/unit_tests/blas/level3/trmm_usm.cpp @@ -0,0 +1,287 @@ +/******************************************************************************* +* Copyright 2020 Intel Corporation +* +* Licensed under the Apache License, Version 2.0 (the "License"); +* you may not use this file except in compliance with the License. +* You may obtain a copy of the License at +* +* http://www.apache.org/licenses/LICENSE-2.0 +* +* Unless required by applicable law or agreed to in writing, +* software distributed under the License is distributed on an "AS IS" BASIS, +* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +* See the License for the specific language governing permissions +* and limitations under the License. +* +* +* SPDX-License-Identifier: Apache-2.0 +*******************************************************************************/ + +#include +#include +#include +#include +#include +#include + +#include +#include "cblas.h" +#include "onemkl/detail/config.hpp" +#include "onemkl/onemkl.hpp" +#include "onemkl_blas_helper.hpp" +#include "reference_blas_templates.hpp" +#include "test_common.hpp" +#include "test_helper.hpp" + +#include + +using namespace cl::sycl; +using std::vector; + +extern std::vector devices; + +namespace { + +template +int test(const device& dev, onemkl::side left_right, onemkl::uplo upper_lower, + onemkl::transpose transa, onemkl::diag unit_nonunit, int m, int n, int lda, int ldb, + fp alpha) { + // Catch asynchronous exceptions. 
+ auto exception_handler = [](exception_list exceptions) { + for (std::exception_ptr const& e : exceptions) { + try { + std::rethrow_exception(e); + } + catch (exception const& e) { + std::cout << "Caught asynchronous SYCL exception during TRMM:\n" + << e.what() << std::endl + << "OpenCL status: " << e.get_cl_code() << std::endl; + } + } + }; + + queue main_queue(dev, exception_handler); + context cxt = main_queue.get_context(); + event done; + std::vector dependencies; + + // Prepare data. + auto ua = usm_allocator(cxt, dev); + vector A(ua), B(ua); + if (left_right == onemkl::side::right) + rand_matrix(A, transa, n, n, lda); + else + rand_matrix(A, transa, m, m, lda); + + rand_matrix(B, onemkl::transpose::nontrans, m, n, ldb); + + auto B_ref = B; + + // Call Reference TRMM. + const int m_ref = m, n_ref = n; + const int lda_ref = lda, ldb_ref = ldb; + + using fp_ref = typename ref_type_info::type; + + ::trmm(convert_to_cblas_side(left_right), convert_to_cblas_uplo(upper_lower), + convert_to_cblas_trans(transa), convert_to_cblas_diag(unit_nonunit), &m_ref, &n_ref, + (fp_ref*)&alpha, (fp_ref*)A.data(), &lda_ref, (fp_ref*)B_ref.data(), &ldb_ref); + + // Call DPC++ TRMM. 
+ + try { +#ifdef CALL_RT_API + done = onemkl::blas::trmm(main_queue, left_right, upper_lower, transa, unit_nonunit, m, n, + alpha, A.data(), lda, B.data(), ldb, dependencies); + done.wait(); +#else + TEST_RUN_CT(main_queue, onemkl::blas::trmm, + (main_queue, left_right, upper_lower, transa, unit_nonunit, m, n, alpha, + A.data(), lda, B.data(), ldb, dependencies)); + main_queue.wait(); +#endif + } + catch (exception const& e) { + std::cout << "Caught synchronous SYCL exception during TRMM:\n" + << e.what() << std::endl + << "OpenCL status: " << e.get_cl_code() << std::endl; + } + + catch (const onemkl::backend_unsupported_exception& e) { + return test_skipped; + } + + catch (const std::runtime_error& error) { + std::cout << "Error raised during execution of TRMM:\n" << error.what() << std::endl; + } + + // Compare the results of reference implementation and DPC++ implementation. + + bool good = check_equal_matrix(B, B_ref, m, n, ldb, 10 * std::max(m, n), std::cout); + + return (int)good; +} + +class TrmmUsmTests : public ::testing::TestWithParam {}; + +TEST_P(TrmmUsmTests, RealSinglePrecision) { + float alpha(2.0); + EXPECT_TRUEORSKIP(test(GetParam(), onemkl::side::left, onemkl::uplo::lower, + onemkl::transpose::nontrans, onemkl::diag::unit, 72, 27, 101, 102, + alpha)); + EXPECT_TRUEORSKIP(test(GetParam(), onemkl::side::right, onemkl::uplo::lower, + onemkl::transpose::nontrans, onemkl::diag::unit, 72, 27, 101, 102, + alpha)); + EXPECT_TRUEORSKIP(test(GetParam(), onemkl::side::left, onemkl::uplo::lower, + onemkl::transpose::nontrans, onemkl::diag::nonunit, 72, 27, 101, + 102, alpha)); + EXPECT_TRUEORSKIP(test(GetParam(), onemkl::side::right, onemkl::uplo::lower, + onemkl::transpose::nontrans, onemkl::diag::nonunit, 72, 27, 101, + 102, alpha)); + EXPECT_TRUEORSKIP(test(GetParam(), onemkl::side::left, onemkl::uplo::upper, + onemkl::transpose::nontrans, onemkl::diag::nonunit, 72, 27, 101, + 102, alpha)); + EXPECT_TRUEORSKIP(test(GetParam(), onemkl::side::right, 
onemkl::uplo::upper, + onemkl::transpose::nontrans, onemkl::diag::nonunit, 72, 27, 101, + 102, alpha)); + EXPECT_TRUEORSKIP(test(GetParam(), onemkl::side::left, onemkl::uplo::lower, + onemkl::transpose::trans, onemkl::diag::nonunit, 72, 27, 101, 102, + alpha)); + EXPECT_TRUEORSKIP(test(GetParam(), onemkl::side::right, onemkl::uplo::lower, + onemkl::transpose::trans, onemkl::diag::nonunit, 72, 27, 101, 102, + alpha)); + EXPECT_TRUEORSKIP(test(GetParam(), onemkl::side::left, onemkl::uplo::upper, + onemkl::transpose::trans, onemkl::diag::nonunit, 72, 27, 101, 102, + alpha)); + EXPECT_TRUEORSKIP(test(GetParam(), onemkl::side::right, onemkl::uplo::upper, + onemkl::transpose::trans, onemkl::diag::nonunit, 72, 27, 101, 102, + alpha)); +} +TEST_P(TrmmUsmTests, RealDoublePrecision) { + double alpha(2.0); + EXPECT_TRUEORSKIP(test(GetParam(), onemkl::side::left, onemkl::uplo::lower, + onemkl::transpose::nontrans, onemkl::diag::unit, 72, 27, 101, + 102, alpha)); + EXPECT_TRUEORSKIP(test(GetParam(), onemkl::side::right, onemkl::uplo::lower, + onemkl::transpose::nontrans, onemkl::diag::unit, 72, 27, 101, + 102, alpha)); + EXPECT_TRUEORSKIP(test(GetParam(), onemkl::side::left, onemkl::uplo::lower, + onemkl::transpose::nontrans, onemkl::diag::nonunit, 72, 27, 101, + 102, alpha)); + EXPECT_TRUEORSKIP(test(GetParam(), onemkl::side::right, onemkl::uplo::lower, + onemkl::transpose::nontrans, onemkl::diag::nonunit, 72, 27, 101, + 102, alpha)); + EXPECT_TRUEORSKIP(test(GetParam(), onemkl::side::left, onemkl::uplo::upper, + onemkl::transpose::nontrans, onemkl::diag::nonunit, 72, 27, 101, + 102, alpha)); + EXPECT_TRUEORSKIP(test(GetParam(), onemkl::side::right, onemkl::uplo::upper, + onemkl::transpose::nontrans, onemkl::diag::nonunit, 72, 27, 101, + 102, alpha)); + EXPECT_TRUEORSKIP(test(GetParam(), onemkl::side::left, onemkl::uplo::lower, + onemkl::transpose::trans, onemkl::diag::nonunit, 72, 27, 101, + 102, alpha)); + EXPECT_TRUEORSKIP(test(GetParam(), onemkl::side::right, 
onemkl::uplo::lower, + onemkl::transpose::trans, onemkl::diag::nonunit, 72, 27, 101, + 102, alpha)); + EXPECT_TRUEORSKIP(test(GetParam(), onemkl::side::left, onemkl::uplo::upper, + onemkl::transpose::trans, onemkl::diag::nonunit, 72, 27, 101, + 102, alpha)); + EXPECT_TRUEORSKIP(test(GetParam(), onemkl::side::right, onemkl::uplo::upper, + onemkl::transpose::trans, onemkl::diag::nonunit, 72, 27, 101, + 102, alpha)); +} +TEST_P(TrmmUsmTests, ComplexSinglePrecision) { + std::complex alpha(2.0, -0.5); + EXPECT_TRUEORSKIP(test>(GetParam(), onemkl::side::left, onemkl::uplo::lower, + onemkl::transpose::nontrans, onemkl::diag::unit, 72, + 27, 101, 102, alpha)); + EXPECT_TRUEORSKIP(test>(GetParam(), onemkl::side::right, + onemkl::uplo::lower, onemkl::transpose::nontrans, + onemkl::diag::unit, 72, 27, 101, 102, alpha)); + EXPECT_TRUEORSKIP(test>(GetParam(), onemkl::side::left, onemkl::uplo::lower, + onemkl::transpose::nontrans, onemkl::diag::nonunit, + 72, 27, 101, 102, alpha)); + EXPECT_TRUEORSKIP(test>(GetParam(), onemkl::side::right, + onemkl::uplo::lower, onemkl::transpose::nontrans, + onemkl::diag::nonunit, 72, 27, 101, 102, alpha)); + EXPECT_TRUEORSKIP(test>(GetParam(), onemkl::side::left, onemkl::uplo::upper, + onemkl::transpose::nontrans, onemkl::diag::nonunit, + 72, 27, 101, 102, alpha)); + EXPECT_TRUEORSKIP(test>(GetParam(), onemkl::side::right, + onemkl::uplo::upper, onemkl::transpose::nontrans, + onemkl::diag::nonunit, 72, 27, 101, 102, alpha)); + EXPECT_TRUEORSKIP(test>(GetParam(), onemkl::side::left, onemkl::uplo::lower, + onemkl::transpose::trans, onemkl::diag::nonunit, 72, + 27, 101, 102, alpha)); + EXPECT_TRUEORSKIP(test>(GetParam(), onemkl::side::right, + onemkl::uplo::lower, onemkl::transpose::trans, + onemkl::diag::nonunit, 72, 27, 101, 102, alpha)); + EXPECT_TRUEORSKIP(test>(GetParam(), onemkl::side::left, onemkl::uplo::upper, + onemkl::transpose::trans, onemkl::diag::nonunit, 72, + 27, 101, 102, alpha)); + EXPECT_TRUEORSKIP(test>(GetParam(), 
onemkl::side::right, + onemkl::uplo::upper, onemkl::transpose::trans, + onemkl::diag::nonunit, 72, 27, 101, 102, alpha)); + EXPECT_TRUEORSKIP(test>(GetParam(), onemkl::side::left, onemkl::uplo::lower, + onemkl::transpose::conjtrans, onemkl::diag::nonunit, + 72, 27, 101, 102, alpha)); + EXPECT_TRUEORSKIP(test>(GetParam(), onemkl::side::right, + onemkl::uplo::lower, onemkl::transpose::conjtrans, + onemkl::diag::nonunit, 72, 27, 101, 102, alpha)); + EXPECT_TRUEORSKIP(test>(GetParam(), onemkl::side::left, onemkl::uplo::upper, + onemkl::transpose::conjtrans, onemkl::diag::nonunit, + 72, 27, 101, 102, alpha)); + EXPECT_TRUEORSKIP(test>(GetParam(), onemkl::side::right, + onemkl::uplo::upper, onemkl::transpose::conjtrans, + onemkl::diag::nonunit, 72, 27, 101, 102, alpha)); +} +TEST_P(TrmmUsmTests, ComplexDoublePrecision) { + std::complex alpha(2.0, -0.5); + EXPECT_TRUEORSKIP(test>(GetParam(), onemkl::side::left, + onemkl::uplo::lower, onemkl::transpose::nontrans, + onemkl::diag::unit, 72, 27, 101, 102, alpha)); + EXPECT_TRUEORSKIP(test>(GetParam(), onemkl::side::right, + onemkl::uplo::lower, onemkl::transpose::nontrans, + onemkl::diag::unit, 72, 27, 101, 102, alpha)); + EXPECT_TRUEORSKIP(test>(GetParam(), onemkl::side::left, + onemkl::uplo::lower, onemkl::transpose::nontrans, + onemkl::diag::nonunit, 72, 27, 101, 102, alpha)); + EXPECT_TRUEORSKIP(test>(GetParam(), onemkl::side::right, + onemkl::uplo::lower, onemkl::transpose::nontrans, + onemkl::diag::nonunit, 72, 27, 101, 102, alpha)); + EXPECT_TRUEORSKIP(test>(GetParam(), onemkl::side::left, + onemkl::uplo::upper, onemkl::transpose::nontrans, + onemkl::diag::nonunit, 72, 27, 101, 102, alpha)); + EXPECT_TRUEORSKIP(test>(GetParam(), onemkl::side::right, + onemkl::uplo::upper, onemkl::transpose::nontrans, + onemkl::diag::nonunit, 72, 27, 101, 102, alpha)); + EXPECT_TRUEORSKIP(test>(GetParam(), onemkl::side::left, + onemkl::uplo::lower, onemkl::transpose::trans, + onemkl::diag::nonunit, 72, 27, 101, 102, alpha)); + 
EXPECT_TRUEORSKIP(test>(GetParam(), onemkl::side::right, + onemkl::uplo::lower, onemkl::transpose::trans, + onemkl::diag::nonunit, 72, 27, 101, 102, alpha)); + EXPECT_TRUEORSKIP(test>(GetParam(), onemkl::side::left, + onemkl::uplo::upper, onemkl::transpose::trans, + onemkl::diag::nonunit, 72, 27, 101, 102, alpha)); + EXPECT_TRUEORSKIP(test>(GetParam(), onemkl::side::right, + onemkl::uplo::upper, onemkl::transpose::trans, + onemkl::diag::nonunit, 72, 27, 101, 102, alpha)); + EXPECT_TRUEORSKIP(test>(GetParam(), onemkl::side::left, + onemkl::uplo::lower, onemkl::transpose::conjtrans, + onemkl::diag::nonunit, 72, 27, 101, 102, alpha)); + EXPECT_TRUEORSKIP(test>(GetParam(), onemkl::side::right, + onemkl::uplo::lower, onemkl::transpose::conjtrans, + onemkl::diag::nonunit, 72, 27, 101, 102, alpha)); + EXPECT_TRUEORSKIP(test>(GetParam(), onemkl::side::left, + onemkl::uplo::upper, onemkl::transpose::conjtrans, + onemkl::diag::nonunit, 72, 27, 101, 102, alpha)); + EXPECT_TRUEORSKIP(test>(GetParam(), onemkl::side::right, + onemkl::uplo::upper, onemkl::transpose::conjtrans, + onemkl::diag::nonunit, 72, 27, 101, 102, alpha)); +} + +INSTANTIATE_TEST_SUITE_P(TrmmUsmTestSuite, TrmmUsmTests, ::testing::ValuesIn(devices), + ::DeviceNamePrint()); + +} // anonymous namespace diff --git a/tests/unit_tests/blas/level3/trsm.cpp b/tests/unit_tests/blas/level3/trsm.cpp index e742fcbee..3dbfbbcc2 100644 --- a/tests/unit_tests/blas/level3/trsm.cpp +++ b/tests/unit_tests/blas/level3/trsm.cpp @@ -44,9 +44,9 @@ extern std::vector devices; namespace { template -bool test(const device& dev, onemkl::side left_right, onemkl::uplo upper_lower, - onemkl::transpose transa, onemkl::diag unit_nonunit, int m, int n, int lda, int ldb, - fp alpha) { +int test(const device& dev, onemkl::side left_right, onemkl::uplo upper_lower, + onemkl::transpose transa, onemkl::diag unit_nonunit, int m, int n, int lda, int ldb, + fp alpha) { // Prepare data. 
vector> A, B, B_ref; if (left_right == onemkl::side::right) @@ -104,6 +104,14 @@ bool test(const device& dev, onemkl::side left_right, onemkl::uplo upper_lower, << "OpenCL status: " << e.get_cl_code() << std::endl; } + catch (const onemkl::backend_unsupported_exception& e) { + return test_skipped; + } + + catch (const std::runtime_error& error) { + std::cout << "Error raised during execution of TRSM:\n" << error.what() << std::endl; + } + // Compare the results of reference implementation and DPC++ implementation. bool good; { @@ -112,258 +120,262 @@ bool test(const device& dev, onemkl::side left_right, onemkl::uplo upper_lower, check_equal_trsm_matrix(B_accessor, B_ref, m, n, ldb, 10 * std::max(m, n), std::cout); } - return good; + return (int)good; } class TrsmTests : public ::testing::TestWithParam {}; TEST_P(TrsmTests, RealSinglePrecision) { float alpha(2.0); - EXPECT_TRUE(test(GetParam(), onemkl::side::left, onemkl::uplo::lower, - onemkl::transpose::nontrans, onemkl::diag::unit, 72, 27, 101, 102, - alpha)); - EXPECT_TRUE(test(GetParam(), onemkl::side::right, onemkl::uplo::lower, - onemkl::transpose::nontrans, onemkl::diag::unit, 72, 27, 101, 102, - alpha)); - EXPECT_TRUE(test(GetParam(), onemkl::side::left, onemkl::uplo::upper, - onemkl::transpose::nontrans, onemkl::diag::unit, 72, 27, 101, 102, - alpha)); - EXPECT_TRUE(test(GetParam(), onemkl::side::right, onemkl::uplo::upper, - onemkl::transpose::nontrans, onemkl::diag::unit, 72, 27, 101, 102, - alpha)); - EXPECT_TRUE(test(GetParam(), onemkl::side::left, onemkl::uplo::lower, - onemkl::transpose::trans, onemkl::diag::unit, 72, 27, 101, 102, alpha)); - EXPECT_TRUE(test(GetParam(), onemkl::side::right, onemkl::uplo::lower, - onemkl::transpose::trans, onemkl::diag::unit, 72, 27, 101, 102, alpha)); - EXPECT_TRUE(test(GetParam(), onemkl::side::left, onemkl::uplo::upper, - onemkl::transpose::trans, onemkl::diag::unit, 72, 27, 101, 102, alpha)); - EXPECT_TRUE(test(GetParam(), onemkl::side::right, 
onemkl::uplo::upper, - onemkl::transpose::trans, onemkl::diag::unit, 72, 27, 101, 102, alpha)); - EXPECT_TRUE(test(GetParam(), onemkl::side::left, onemkl::uplo::lower, - onemkl::transpose::nontrans, onemkl::diag::nonunit, 72, 27, 101, 102, - alpha)); - EXPECT_TRUE(test(GetParam(), onemkl::side::right, onemkl::uplo::lower, - onemkl::transpose::nontrans, onemkl::diag::nonunit, 72, 27, 101, 102, - alpha)); - EXPECT_TRUE(test(GetParam(), onemkl::side::left, onemkl::uplo::upper, - onemkl::transpose::nontrans, onemkl::diag::nonunit, 72, 27, 101, 102, - alpha)); - EXPECT_TRUE(test(GetParam(), onemkl::side::right, onemkl::uplo::upper, - onemkl::transpose::nontrans, onemkl::diag::nonunit, 72, 27, 101, 102, - alpha)); - EXPECT_TRUE(test(GetParam(), onemkl::side::left, onemkl::uplo::lower, - onemkl::transpose::trans, onemkl::diag::nonunit, 72, 27, 101, 102, - alpha)); - EXPECT_TRUE(test(GetParam(), onemkl::side::right, onemkl::uplo::lower, - onemkl::transpose::trans, onemkl::diag::nonunit, 72, 27, 101, 102, - alpha)); - EXPECT_TRUE(test(GetParam(), onemkl::side::left, onemkl::uplo::upper, - onemkl::transpose::trans, onemkl::diag::nonunit, 72, 27, 101, 102, - alpha)); - EXPECT_TRUE(test(GetParam(), onemkl::side::right, onemkl::uplo::upper, - onemkl::transpose::trans, onemkl::diag::nonunit, 72, 27, 101, 102, - alpha)); + EXPECT_TRUEORSKIP(test(GetParam(), onemkl::side::left, onemkl::uplo::lower, + onemkl::transpose::nontrans, onemkl::diag::unit, 72, 27, 101, 102, + alpha)); + EXPECT_TRUEORSKIP(test(GetParam(), onemkl::side::right, onemkl::uplo::lower, + onemkl::transpose::nontrans, onemkl::diag::unit, 72, 27, 101, 102, + alpha)); + EXPECT_TRUEORSKIP(test(GetParam(), onemkl::side::left, onemkl::uplo::upper, + onemkl::transpose::nontrans, onemkl::diag::unit, 72, 27, 101, 102, + alpha)); + EXPECT_TRUEORSKIP(test(GetParam(), onemkl::side::right, onemkl::uplo::upper, + onemkl::transpose::nontrans, onemkl::diag::unit, 72, 27, 101, 102, + alpha)); + EXPECT_TRUEORSKIP(test(GetParam(), 
onemkl::side::left, onemkl::uplo::lower, + onemkl::transpose::trans, onemkl::diag::unit, 72, 27, 101, 102, + alpha)); + EXPECT_TRUEORSKIP(test(GetParam(), onemkl::side::right, onemkl::uplo::lower, + onemkl::transpose::trans, onemkl::diag::unit, 72, 27, 101, 102, + alpha)); + EXPECT_TRUEORSKIP(test(GetParam(), onemkl::side::left, onemkl::uplo::upper, + onemkl::transpose::trans, onemkl::diag::unit, 72, 27, 101, 102, + alpha)); + EXPECT_TRUEORSKIP(test(GetParam(), onemkl::side::right, onemkl::uplo::upper, + onemkl::transpose::trans, onemkl::diag::unit, 72, 27, 101, 102, + alpha)); + EXPECT_TRUEORSKIP(test(GetParam(), onemkl::side::left, onemkl::uplo::lower, + onemkl::transpose::nontrans, onemkl::diag::nonunit, 72, 27, 101, + 102, alpha)); + EXPECT_TRUEORSKIP(test(GetParam(), onemkl::side::right, onemkl::uplo::lower, + onemkl::transpose::nontrans, onemkl::diag::nonunit, 72, 27, 101, + 102, alpha)); + EXPECT_TRUEORSKIP(test(GetParam(), onemkl::side::left, onemkl::uplo::upper, + onemkl::transpose::nontrans, onemkl::diag::nonunit, 72, 27, 101, + 102, alpha)); + EXPECT_TRUEORSKIP(test(GetParam(), onemkl::side::right, onemkl::uplo::upper, + onemkl::transpose::nontrans, onemkl::diag::nonunit, 72, 27, 101, + 102, alpha)); + EXPECT_TRUEORSKIP(test(GetParam(), onemkl::side::left, onemkl::uplo::lower, + onemkl::transpose::trans, onemkl::diag::nonunit, 72, 27, 101, 102, + alpha)); + EXPECT_TRUEORSKIP(test(GetParam(), onemkl::side::right, onemkl::uplo::lower, + onemkl::transpose::trans, onemkl::diag::nonunit, 72, 27, 101, 102, + alpha)); + EXPECT_TRUEORSKIP(test(GetParam(), onemkl::side::left, onemkl::uplo::upper, + onemkl::transpose::trans, onemkl::diag::nonunit, 72, 27, 101, 102, + alpha)); + EXPECT_TRUEORSKIP(test(GetParam(), onemkl::side::right, onemkl::uplo::upper, + onemkl::transpose::trans, onemkl::diag::nonunit, 72, 27, 101, 102, + alpha)); } TEST_P(TrsmTests, RealDoublePrecision) { double alpha(2.0); - EXPECT_TRUE(test(GetParam(), onemkl::side::left, onemkl::uplo::lower, 
- onemkl::transpose::nontrans, onemkl::diag::unit, 72, 27, 101, 102, - alpha)); - EXPECT_TRUE(test(GetParam(), onemkl::side::right, onemkl::uplo::lower, - onemkl::transpose::nontrans, onemkl::diag::unit, 72, 27, 101, 102, - alpha)); - EXPECT_TRUE(test(GetParam(), onemkl::side::left, onemkl::uplo::upper, - onemkl::transpose::nontrans, onemkl::diag::unit, 72, 27, 101, 102, - alpha)); - EXPECT_TRUE(test(GetParam(), onemkl::side::right, onemkl::uplo::upper, - onemkl::transpose::nontrans, onemkl::diag::unit, 72, 27, 101, 102, - alpha)); - EXPECT_TRUE(test(GetParam(), onemkl::side::left, onemkl::uplo::lower, - onemkl::transpose::trans, onemkl::diag::unit, 72, 27, 101, 102, - alpha)); - EXPECT_TRUE(test(GetParam(), onemkl::side::right, onemkl::uplo::lower, - onemkl::transpose::trans, onemkl::diag::unit, 72, 27, 101, 102, - alpha)); - EXPECT_TRUE(test(GetParam(), onemkl::side::left, onemkl::uplo::upper, - onemkl::transpose::trans, onemkl::diag::unit, 72, 27, 101, 102, - alpha)); - EXPECT_TRUE(test(GetParam(), onemkl::side::right, onemkl::uplo::upper, - onemkl::transpose::trans, onemkl::diag::unit, 72, 27, 101, 102, - alpha)); - EXPECT_TRUE(test(GetParam(), onemkl::side::left, onemkl::uplo::lower, - onemkl::transpose::nontrans, onemkl::diag::nonunit, 72, 27, 101, 102, - alpha)); - EXPECT_TRUE(test(GetParam(), onemkl::side::right, onemkl::uplo::lower, - onemkl::transpose::nontrans, onemkl::diag::nonunit, 72, 27, 101, 102, - alpha)); - EXPECT_TRUE(test(GetParam(), onemkl::side::left, onemkl::uplo::upper, - onemkl::transpose::nontrans, onemkl::diag::nonunit, 72, 27, 101, 102, - alpha)); - EXPECT_TRUE(test(GetParam(), onemkl::side::right, onemkl::uplo::upper, - onemkl::transpose::nontrans, onemkl::diag::nonunit, 72, 27, 101, 102, - alpha)); - EXPECT_TRUE(test(GetParam(), onemkl::side::left, onemkl::uplo::lower, - onemkl::transpose::trans, onemkl::diag::nonunit, 72, 27, 101, 102, - alpha)); - EXPECT_TRUE(test(GetParam(), onemkl::side::right, onemkl::uplo::lower, - 
onemkl::transpose::trans, onemkl::diag::nonunit, 72, 27, 101, 102, - alpha)); - EXPECT_TRUE(test(GetParam(), onemkl::side::left, onemkl::uplo::upper, - onemkl::transpose::trans, onemkl::diag::nonunit, 72, 27, 101, 102, - alpha)); - EXPECT_TRUE(test(GetParam(), onemkl::side::right, onemkl::uplo::upper, - onemkl::transpose::trans, onemkl::diag::nonunit, 72, 27, 101, 102, - alpha)); + EXPECT_TRUEORSKIP(test(GetParam(), onemkl::side::left, onemkl::uplo::lower, + onemkl::transpose::nontrans, onemkl::diag::unit, 72, 27, 101, + 102, alpha)); + EXPECT_TRUEORSKIP(test(GetParam(), onemkl::side::right, onemkl::uplo::lower, + onemkl::transpose::nontrans, onemkl::diag::unit, 72, 27, 101, + 102, alpha)); + EXPECT_TRUEORSKIP(test(GetParam(), onemkl::side::left, onemkl::uplo::upper, + onemkl::transpose::nontrans, onemkl::diag::unit, 72, 27, 101, + 102, alpha)); + EXPECT_TRUEORSKIP(test(GetParam(), onemkl::side::right, onemkl::uplo::upper, + onemkl::transpose::nontrans, onemkl::diag::unit, 72, 27, 101, + 102, alpha)); + EXPECT_TRUEORSKIP(test(GetParam(), onemkl::side::left, onemkl::uplo::lower, + onemkl::transpose::trans, onemkl::diag::unit, 72, 27, 101, 102, + alpha)); + EXPECT_TRUEORSKIP(test(GetParam(), onemkl::side::right, onemkl::uplo::lower, + onemkl::transpose::trans, onemkl::diag::unit, 72, 27, 101, 102, + alpha)); + EXPECT_TRUEORSKIP(test(GetParam(), onemkl::side::left, onemkl::uplo::upper, + onemkl::transpose::trans, onemkl::diag::unit, 72, 27, 101, 102, + alpha)); + EXPECT_TRUEORSKIP(test(GetParam(), onemkl::side::right, onemkl::uplo::upper, + onemkl::transpose::trans, onemkl::diag::unit, 72, 27, 101, 102, + alpha)); + EXPECT_TRUEORSKIP(test(GetParam(), onemkl::side::left, onemkl::uplo::lower, + onemkl::transpose::nontrans, onemkl::diag::nonunit, 72, 27, 101, + 102, alpha)); + EXPECT_TRUEORSKIP(test(GetParam(), onemkl::side::right, onemkl::uplo::lower, + onemkl::transpose::nontrans, onemkl::diag::nonunit, 72, 27, 101, + 102, alpha)); + EXPECT_TRUEORSKIP(test(GetParam(), 
onemkl::side::left, onemkl::uplo::upper, + onemkl::transpose::nontrans, onemkl::diag::nonunit, 72, 27, 101, + 102, alpha)); + EXPECT_TRUEORSKIP(test(GetParam(), onemkl::side::right, onemkl::uplo::upper, + onemkl::transpose::nontrans, onemkl::diag::nonunit, 72, 27, 101, + 102, alpha)); + EXPECT_TRUEORSKIP(test(GetParam(), onemkl::side::left, onemkl::uplo::lower, + onemkl::transpose::trans, onemkl::diag::nonunit, 72, 27, 101, + 102, alpha)); + EXPECT_TRUEORSKIP(test(GetParam(), onemkl::side::right, onemkl::uplo::lower, + onemkl::transpose::trans, onemkl::diag::nonunit, 72, 27, 101, + 102, alpha)); + EXPECT_TRUEORSKIP(test(GetParam(), onemkl::side::left, onemkl::uplo::upper, + onemkl::transpose::trans, onemkl::diag::nonunit, 72, 27, 101, + 102, alpha)); + EXPECT_TRUEORSKIP(test(GetParam(), onemkl::side::right, onemkl::uplo::upper, + onemkl::transpose::trans, onemkl::diag::nonunit, 72, 27, 101, + 102, alpha)); } TEST_P(TrsmTests, ComplexSinglePrecision) { std::complex alpha(2.0, -0.5); - EXPECT_TRUE(test>(GetParam(), onemkl::side::left, onemkl::uplo::lower, - onemkl::transpose::nontrans, onemkl::diag::unit, 72, 27, - 101, 102, alpha)); - EXPECT_TRUE(test>(GetParam(), onemkl::side::right, onemkl::uplo::lower, - onemkl::transpose::nontrans, onemkl::diag::unit, 72, 27, - 101, 102, alpha)); - EXPECT_TRUE(test>(GetParam(), onemkl::side::left, onemkl::uplo::upper, - onemkl::transpose::nontrans, onemkl::diag::unit, 72, 27, - 101, 102, alpha)); - EXPECT_TRUE(test>(GetParam(), onemkl::side::right, onemkl::uplo::upper, - onemkl::transpose::nontrans, onemkl::diag::unit, 72, 27, - 101, 102, alpha)); - EXPECT_TRUE(test>(GetParam(), onemkl::side::left, onemkl::uplo::lower, - onemkl::transpose::trans, onemkl::diag::unit, 72, 27, 101, - 102, alpha)); - EXPECT_TRUE(test>(GetParam(), onemkl::side::right, onemkl::uplo::lower, - onemkl::transpose::trans, onemkl::diag::unit, 72, 27, 101, - 102, alpha)); - EXPECT_TRUE(test>(GetParam(), onemkl::side::left, onemkl::uplo::upper, - 
onemkl::transpose::trans, onemkl::diag::unit, 72, 27, 101, - 102, alpha)); - EXPECT_TRUE(test>(GetParam(), onemkl::side::right, onemkl::uplo::upper, - onemkl::transpose::trans, onemkl::diag::unit, 72, 27, 101, - 102, alpha)); - EXPECT_TRUE(test>(GetParam(), onemkl::side::left, onemkl::uplo::lower, - onemkl::transpose::conjtrans, onemkl::diag::unit, 72, 27, - 101, 102, alpha)); - EXPECT_TRUE(test>(GetParam(), onemkl::side::right, onemkl::uplo::lower, - onemkl::transpose::conjtrans, onemkl::diag::unit, 72, 27, - 101, 102, alpha)); - EXPECT_TRUE(test>(GetParam(), onemkl::side::left, onemkl::uplo::upper, - onemkl::transpose::conjtrans, onemkl::diag::unit, 72, 27, - 101, 102, alpha)); - EXPECT_TRUE(test>(GetParam(), onemkl::side::right, onemkl::uplo::upper, - onemkl::transpose::conjtrans, onemkl::diag::unit, 72, 27, - 101, 102, alpha)); - EXPECT_TRUE(test>(GetParam(), onemkl::side::left, onemkl::uplo::lower, - onemkl::transpose::nontrans, onemkl::diag::nonunit, 72, - 27, 101, 102, alpha)); - EXPECT_TRUE(test>(GetParam(), onemkl::side::right, onemkl::uplo::lower, - onemkl::transpose::nontrans, onemkl::diag::nonunit, 72, - 27, 101, 102, alpha)); - EXPECT_TRUE(test>(GetParam(), onemkl::side::left, onemkl::uplo::upper, - onemkl::transpose::nontrans, onemkl::diag::nonunit, 72, - 27, 101, 102, alpha)); - EXPECT_TRUE(test>(GetParam(), onemkl::side::right, onemkl::uplo::upper, - onemkl::transpose::nontrans, onemkl::diag::nonunit, 72, - 27, 101, 102, alpha)); - EXPECT_TRUE(test>(GetParam(), onemkl::side::left, onemkl::uplo::lower, - onemkl::transpose::trans, onemkl::diag::nonunit, 72, 27, - 101, 102, alpha)); - EXPECT_TRUE(test>(GetParam(), onemkl::side::right, onemkl::uplo::lower, - onemkl::transpose::trans, onemkl::diag::nonunit, 72, 27, - 101, 102, alpha)); - EXPECT_TRUE(test>(GetParam(), onemkl::side::left, onemkl::uplo::upper, - onemkl::transpose::trans, onemkl::diag::nonunit, 72, 27, - 101, 102, alpha)); - EXPECT_TRUE(test>(GetParam(), onemkl::side::right, 
onemkl::uplo::upper, - onemkl::transpose::trans, onemkl::diag::nonunit, 72, 27, - 101, 102, alpha)); - EXPECT_TRUE(test>(GetParam(), onemkl::side::left, onemkl::uplo::lower, - onemkl::transpose::conjtrans, onemkl::diag::nonunit, 72, - 27, 101, 102, alpha)); - EXPECT_TRUE(test>(GetParam(), onemkl::side::right, onemkl::uplo::lower, - onemkl::transpose::conjtrans, onemkl::diag::nonunit, 72, - 27, 101, 102, alpha)); - EXPECT_TRUE(test>(GetParam(), onemkl::side::left, onemkl::uplo::upper, - onemkl::transpose::conjtrans, onemkl::diag::nonunit, 72, - 27, 101, 102, alpha)); - EXPECT_TRUE(test>(GetParam(), onemkl::side::right, onemkl::uplo::upper, - onemkl::transpose::conjtrans, onemkl::diag::nonunit, 72, - 27, 101, 102, alpha)); + EXPECT_TRUEORSKIP(test>(GetParam(), onemkl::side::left, onemkl::uplo::lower, + onemkl::transpose::nontrans, onemkl::diag::unit, 72, + 27, 101, 102, alpha)); + EXPECT_TRUEORSKIP(test>(GetParam(), onemkl::side::right, + onemkl::uplo::lower, onemkl::transpose::nontrans, + onemkl::diag::unit, 72, 27, 101, 102, alpha)); + EXPECT_TRUEORSKIP(test>(GetParam(), onemkl::side::left, onemkl::uplo::upper, + onemkl::transpose::nontrans, onemkl::diag::unit, 72, + 27, 101, 102, alpha)); + EXPECT_TRUEORSKIP(test>(GetParam(), onemkl::side::right, + onemkl::uplo::upper, onemkl::transpose::nontrans, + onemkl::diag::unit, 72, 27, 101, 102, alpha)); + EXPECT_TRUEORSKIP(test>(GetParam(), onemkl::side::left, onemkl::uplo::lower, + onemkl::transpose::trans, onemkl::diag::unit, 72, + 27, 101, 102, alpha)); + EXPECT_TRUEORSKIP(test>(GetParam(), onemkl::side::right, + onemkl::uplo::lower, onemkl::transpose::trans, + onemkl::diag::unit, 72, 27, 101, 102, alpha)); + EXPECT_TRUEORSKIP(test>(GetParam(), onemkl::side::left, onemkl::uplo::upper, + onemkl::transpose::trans, onemkl::diag::unit, 72, + 27, 101, 102, alpha)); + EXPECT_TRUEORSKIP(test>(GetParam(), onemkl::side::right, + onemkl::uplo::upper, onemkl::transpose::trans, + onemkl::diag::unit, 72, 27, 101, 102, alpha)); + 
EXPECT_TRUEORSKIP(test>(GetParam(), onemkl::side::left, onemkl::uplo::lower, + onemkl::transpose::conjtrans, onemkl::diag::unit, + 72, 27, 101, 102, alpha)); + EXPECT_TRUEORSKIP(test>(GetParam(), onemkl::side::right, + onemkl::uplo::lower, onemkl::transpose::conjtrans, + onemkl::diag::unit, 72, 27, 101, 102, alpha)); + EXPECT_TRUEORSKIP(test>(GetParam(), onemkl::side::left, onemkl::uplo::upper, + onemkl::transpose::conjtrans, onemkl::diag::unit, + 72, 27, 101, 102, alpha)); + EXPECT_TRUEORSKIP(test>(GetParam(), onemkl::side::right, + onemkl::uplo::upper, onemkl::transpose::conjtrans, + onemkl::diag::unit, 72, 27, 101, 102, alpha)); + EXPECT_TRUEORSKIP(test>(GetParam(), onemkl::side::left, onemkl::uplo::lower, + onemkl::transpose::nontrans, onemkl::diag::nonunit, + 72, 27, 101, 102, alpha)); + EXPECT_TRUEORSKIP(test>(GetParam(), onemkl::side::right, + onemkl::uplo::lower, onemkl::transpose::nontrans, + onemkl::diag::nonunit, 72, 27, 101, 102, alpha)); + EXPECT_TRUEORSKIP(test>(GetParam(), onemkl::side::left, onemkl::uplo::upper, + onemkl::transpose::nontrans, onemkl::diag::nonunit, + 72, 27, 101, 102, alpha)); + EXPECT_TRUEORSKIP(test>(GetParam(), onemkl::side::right, + onemkl::uplo::upper, onemkl::transpose::nontrans, + onemkl::diag::nonunit, 72, 27, 101, 102, alpha)); + EXPECT_TRUEORSKIP(test>(GetParam(), onemkl::side::left, onemkl::uplo::lower, + onemkl::transpose::trans, onemkl::diag::nonunit, 72, + 27, 101, 102, alpha)); + EXPECT_TRUEORSKIP(test>(GetParam(), onemkl::side::right, + onemkl::uplo::lower, onemkl::transpose::trans, + onemkl::diag::nonunit, 72, 27, 101, 102, alpha)); + EXPECT_TRUEORSKIP(test>(GetParam(), onemkl::side::left, onemkl::uplo::upper, + onemkl::transpose::trans, onemkl::diag::nonunit, 72, + 27, 101, 102, alpha)); + EXPECT_TRUEORSKIP(test>(GetParam(), onemkl::side::right, + onemkl::uplo::upper, onemkl::transpose::trans, + onemkl::diag::nonunit, 72, 27, 101, 102, alpha)); + EXPECT_TRUEORSKIP(test>(GetParam(), onemkl::side::left, 
onemkl::uplo::lower, + onemkl::transpose::conjtrans, onemkl::diag::nonunit, + 72, 27, 101, 102, alpha)); + EXPECT_TRUEORSKIP(test>(GetParam(), onemkl::side::right, + onemkl::uplo::lower, onemkl::transpose::conjtrans, + onemkl::diag::nonunit, 72, 27, 101, 102, alpha)); + EXPECT_TRUEORSKIP(test>(GetParam(), onemkl::side::left, onemkl::uplo::upper, + onemkl::transpose::conjtrans, onemkl::diag::nonunit, + 72, 27, 101, 102, alpha)); + EXPECT_TRUEORSKIP(test>(GetParam(), onemkl::side::right, + onemkl::uplo::upper, onemkl::transpose::conjtrans, + onemkl::diag::nonunit, 72, 27, 101, 102, alpha)); } TEST_P(TrsmTests, ComplexDoublePrecision) { std::complex alpha(2.0, -0.5); - EXPECT_TRUE(test>(GetParam(), onemkl::side::left, onemkl::uplo::lower, - onemkl::transpose::nontrans, onemkl::diag::unit, 72, 27, - 101, 102, alpha)); - EXPECT_TRUE(test>(GetParam(), onemkl::side::right, onemkl::uplo::lower, - onemkl::transpose::nontrans, onemkl::diag::unit, 72, 27, - 101, 102, alpha)); - EXPECT_TRUE(test>(GetParam(), onemkl::side::left, onemkl::uplo::upper, - onemkl::transpose::nontrans, onemkl::diag::unit, 72, 27, - 101, 102, alpha)); - EXPECT_TRUE(test>(GetParam(), onemkl::side::right, onemkl::uplo::upper, - onemkl::transpose::nontrans, onemkl::diag::unit, 72, 27, - 101, 102, alpha)); - EXPECT_TRUE(test>(GetParam(), onemkl::side::left, onemkl::uplo::lower, - onemkl::transpose::trans, onemkl::diag::unit, 72, 27, - 101, 102, alpha)); - EXPECT_TRUE(test>(GetParam(), onemkl::side::right, onemkl::uplo::lower, - onemkl::transpose::trans, onemkl::diag::unit, 72, 27, - 101, 102, alpha)); - EXPECT_TRUE(test>(GetParam(), onemkl::side::left, onemkl::uplo::upper, - onemkl::transpose::trans, onemkl::diag::unit, 72, 27, - 101, 102, alpha)); - EXPECT_TRUE(test>(GetParam(), onemkl::side::right, onemkl::uplo::upper, - onemkl::transpose::trans, onemkl::diag::unit, 72, 27, - 101, 102, alpha)); - EXPECT_TRUE(test>(GetParam(), onemkl::side::left, onemkl::uplo::lower, - onemkl::transpose::conjtrans, 
onemkl::diag::unit, 72, 27, - 101, 102, alpha)); - EXPECT_TRUE(test>(GetParam(), onemkl::side::right, onemkl::uplo::lower, - onemkl::transpose::conjtrans, onemkl::diag::unit, 72, 27, - 101, 102, alpha)); - EXPECT_TRUE(test>(GetParam(), onemkl::side::left, onemkl::uplo::upper, - onemkl::transpose::conjtrans, onemkl::diag::unit, 72, 27, - 101, 102, alpha)); - EXPECT_TRUE(test>(GetParam(), onemkl::side::right, onemkl::uplo::upper, - onemkl::transpose::conjtrans, onemkl::diag::unit, 72, 27, - 101, 102, alpha)); - EXPECT_TRUE(test>(GetParam(), onemkl::side::left, onemkl::uplo::lower, - onemkl::transpose::nontrans, onemkl::diag::nonunit, 72, - 27, 101, 102, alpha)); - EXPECT_TRUE(test>(GetParam(), onemkl::side::right, onemkl::uplo::lower, - onemkl::transpose::nontrans, onemkl::diag::nonunit, 72, - 27, 101, 102, alpha)); - EXPECT_TRUE(test>(GetParam(), onemkl::side::left, onemkl::uplo::upper, - onemkl::transpose::nontrans, onemkl::diag::nonunit, 72, - 27, 101, 102, alpha)); - EXPECT_TRUE(test>(GetParam(), onemkl::side::right, onemkl::uplo::upper, - onemkl::transpose::nontrans, onemkl::diag::nonunit, 72, - 27, 101, 102, alpha)); - EXPECT_TRUE(test>(GetParam(), onemkl::side::left, onemkl::uplo::lower, - onemkl::transpose::trans, onemkl::diag::nonunit, 72, 27, - 101, 102, alpha)); - EXPECT_TRUE(test>(GetParam(), onemkl::side::right, onemkl::uplo::lower, - onemkl::transpose::trans, onemkl::diag::nonunit, 72, 27, - 101, 102, alpha)); - EXPECT_TRUE(test>(GetParam(), onemkl::side::left, onemkl::uplo::upper, - onemkl::transpose::trans, onemkl::diag::nonunit, 72, 27, - 101, 102, alpha)); - EXPECT_TRUE(test>(GetParam(), onemkl::side::right, onemkl::uplo::upper, - onemkl::transpose::trans, onemkl::diag::nonunit, 72, 27, - 101, 102, alpha)); - EXPECT_TRUE(test>(GetParam(), onemkl::side::left, onemkl::uplo::lower, - onemkl::transpose::conjtrans, onemkl::diag::nonunit, 72, - 27, 101, 102, alpha)); - EXPECT_TRUE(test>(GetParam(), onemkl::side::right, onemkl::uplo::lower, - 
onemkl::transpose::conjtrans, onemkl::diag::nonunit, 72, - 27, 101, 102, alpha)); - EXPECT_TRUE(test>(GetParam(), onemkl::side::left, onemkl::uplo::upper, - onemkl::transpose::conjtrans, onemkl::diag::nonunit, 72, - 27, 101, 102, alpha)); - EXPECT_TRUE(test>(GetParam(), onemkl::side::right, onemkl::uplo::upper, - onemkl::transpose::conjtrans, onemkl::diag::nonunit, 72, - 27, 101, 102, alpha)); + EXPECT_TRUEORSKIP(test>(GetParam(), onemkl::side::left, + onemkl::uplo::lower, onemkl::transpose::nontrans, + onemkl::diag::unit, 72, 27, 101, 102, alpha)); + EXPECT_TRUEORSKIP(test>(GetParam(), onemkl::side::right, + onemkl::uplo::lower, onemkl::transpose::nontrans, + onemkl::diag::unit, 72, 27, 101, 102, alpha)); + EXPECT_TRUEORSKIP(test>(GetParam(), onemkl::side::left, + onemkl::uplo::upper, onemkl::transpose::nontrans, + onemkl::diag::unit, 72, 27, 101, 102, alpha)); + EXPECT_TRUEORSKIP(test>(GetParam(), onemkl::side::right, + onemkl::uplo::upper, onemkl::transpose::nontrans, + onemkl::diag::unit, 72, 27, 101, 102, alpha)); + EXPECT_TRUEORSKIP(test>(GetParam(), onemkl::side::left, + onemkl::uplo::lower, onemkl::transpose::trans, + onemkl::diag::unit, 72, 27, 101, 102, alpha)); + EXPECT_TRUEORSKIP(test>(GetParam(), onemkl::side::right, + onemkl::uplo::lower, onemkl::transpose::trans, + onemkl::diag::unit, 72, 27, 101, 102, alpha)); + EXPECT_TRUEORSKIP(test>(GetParam(), onemkl::side::left, + onemkl::uplo::upper, onemkl::transpose::trans, + onemkl::diag::unit, 72, 27, 101, 102, alpha)); + EXPECT_TRUEORSKIP(test>(GetParam(), onemkl::side::right, + onemkl::uplo::upper, onemkl::transpose::trans, + onemkl::diag::unit, 72, 27, 101, 102, alpha)); + EXPECT_TRUEORSKIP(test>(GetParam(), onemkl::side::left, + onemkl::uplo::lower, onemkl::transpose::conjtrans, + onemkl::diag::unit, 72, 27, 101, 102, alpha)); + EXPECT_TRUEORSKIP(test>(GetParam(), onemkl::side::right, + onemkl::uplo::lower, onemkl::transpose::conjtrans, + onemkl::diag::unit, 72, 27, 101, 102, alpha)); + 
EXPECT_TRUEORSKIP(test>(GetParam(), onemkl::side::left, + onemkl::uplo::upper, onemkl::transpose::conjtrans, + onemkl::diag::unit, 72, 27, 101, 102, alpha)); + EXPECT_TRUEORSKIP(test>(GetParam(), onemkl::side::right, + onemkl::uplo::upper, onemkl::transpose::conjtrans, + onemkl::diag::unit, 72, 27, 101, 102, alpha)); + EXPECT_TRUEORSKIP(test>(GetParam(), onemkl::side::left, + onemkl::uplo::lower, onemkl::transpose::nontrans, + onemkl::diag::nonunit, 72, 27, 101, 102, alpha)); + EXPECT_TRUEORSKIP(test>(GetParam(), onemkl::side::right, + onemkl::uplo::lower, onemkl::transpose::nontrans, + onemkl::diag::nonunit, 72, 27, 101, 102, alpha)); + EXPECT_TRUEORSKIP(test>(GetParam(), onemkl::side::left, + onemkl::uplo::upper, onemkl::transpose::nontrans, + onemkl::diag::nonunit, 72, 27, 101, 102, alpha)); + EXPECT_TRUEORSKIP(test>(GetParam(), onemkl::side::right, + onemkl::uplo::upper, onemkl::transpose::nontrans, + onemkl::diag::nonunit, 72, 27, 101, 102, alpha)); + EXPECT_TRUEORSKIP(test>(GetParam(), onemkl::side::left, + onemkl::uplo::lower, onemkl::transpose::trans, + onemkl::diag::nonunit, 72, 27, 101, 102, alpha)); + EXPECT_TRUEORSKIP(test>(GetParam(), onemkl::side::right, + onemkl::uplo::lower, onemkl::transpose::trans, + onemkl::diag::nonunit, 72, 27, 101, 102, alpha)); + EXPECT_TRUEORSKIP(test>(GetParam(), onemkl::side::left, + onemkl::uplo::upper, onemkl::transpose::trans, + onemkl::diag::nonunit, 72, 27, 101, 102, alpha)); + EXPECT_TRUEORSKIP(test>(GetParam(), onemkl::side::right, + onemkl::uplo::upper, onemkl::transpose::trans, + onemkl::diag::nonunit, 72, 27, 101, 102, alpha)); + EXPECT_TRUEORSKIP(test>(GetParam(), onemkl::side::left, + onemkl::uplo::lower, onemkl::transpose::conjtrans, + onemkl::diag::nonunit, 72, 27, 101, 102, alpha)); + EXPECT_TRUEORSKIP(test>(GetParam(), onemkl::side::right, + onemkl::uplo::lower, onemkl::transpose::conjtrans, + onemkl::diag::nonunit, 72, 27, 101, 102, alpha)); + EXPECT_TRUEORSKIP(test>(GetParam(), onemkl::side::left, + 
onemkl::uplo::upper, onemkl::transpose::conjtrans, + onemkl::diag::nonunit, 72, 27, 101, 102, alpha)); + EXPECT_TRUEORSKIP(test>(GetParam(), onemkl::side::right, + onemkl::uplo::upper, onemkl::transpose::conjtrans, + onemkl::diag::nonunit, 72, 27, 101, 102, alpha)); } INSTANTIATE_TEST_SUITE_P(TrsmTestSuite, TrsmTests, ::testing::ValuesIn(devices), diff --git a/tests/unit_tests/blas/level3/trsm_usm.cpp b/tests/unit_tests/blas/level3/trsm_usm.cpp new file mode 100644 index 000000000..453095ccc --- /dev/null +++ b/tests/unit_tests/blas/level3/trsm_usm.cpp @@ -0,0 +1,383 @@ +/******************************************************************************* +* Copyright 2020 Intel Corporation +* +* Licensed under the Apache License, Version 2.0 (the "License"); +* you may not use this file except in compliance with the License. +* You may obtain a copy of the License at +* +* http://www.apache.org/licenses/LICENSE-2.0 +* +* Unless required by applicable law or agreed to in writing, +* software distributed under the License is distributed on an "AS IS" BASIS, +* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +* See the License for the specific language governing permissions +* and limitations under the License. 
+* +* +* SPDX-License-Identifier: Apache-2.0 +*******************************************************************************/ + +#include +#include +#include +#include +#include +#include + +#include +#include "cblas.h" +#include "onemkl/detail/config.hpp" +#include "onemkl/onemkl.hpp" +#include "onemkl_blas_helper.hpp" +#include "reference_blas_templates.hpp" +#include "test_common.hpp" +#include "test_helper.hpp" + +#include + +using namespace cl::sycl; +using std::vector; + +extern std::vector devices; + +namespace { + +template +int test(const device& dev, onemkl::side left_right, onemkl::uplo upper_lower, + onemkl::transpose transa, onemkl::diag unit_nonunit, int m, int n, int lda, int ldb, + fp alpha) { + // Catch asynchronous exceptions. + auto exception_handler = [](exception_list exceptions) { + for (std::exception_ptr const& e : exceptions) { + try { + std::rethrow_exception(e); + } + catch (exception const& e) { + std::cout << "Caught asynchronous SYCL exception during TRSM:\n" + << e.what() << std::endl + << "OpenCL status: " << e.get_cl_code() << std::endl; + } + } + }; + + queue main_queue(dev, exception_handler); + context cxt = main_queue.get_context(); + event done; + std::vector dependencies; + + // Prepare data. + auto ua = usm_allocator(cxt, dev); + vector A(ua), B(ua); + if (left_right == onemkl::side::right) + rand_trsm_matrix(A, transa, n, n, lda); + else + rand_trsm_matrix(A, transa, m, m, lda); + + rand_matrix(B, onemkl::transpose::nontrans, m, n, ldb); + + auto B_ref = B; + + // Call Reference TRSM. + const int m_ref = m, n_ref = n; + const int lda_ref = lda, ldb_ref = ldb; + + using fp_ref = typename ref_type_info::type; + + ::trsm(convert_to_cblas_side(left_right), convert_to_cblas_uplo(upper_lower), + convert_to_cblas_trans(transa), convert_to_cblas_diag(unit_nonunit), &m_ref, &n_ref, + (fp_ref*)&alpha, (fp_ref*)A.data(), &lda_ref, (fp_ref*)B_ref.data(), &ldb_ref); + + // Call DPC++ TRSM. 
+ + try { +#ifdef CALL_RT_API + done = onemkl::blas::trsm(main_queue, left_right, upper_lower, transa, unit_nonunit, m, n, + alpha, A.data(), lda, B.data(), ldb, dependencies); + done.wait(); +#else + TEST_RUN_CT(main_queue, onemkl::blas::trsm, + (main_queue, left_right, upper_lower, transa, unit_nonunit, m, n, alpha, + A.data(), lda, B.data(), ldb, dependencies)); + main_queue.wait(); +#endif + } + catch (exception const& e) { + std::cout << "Caught synchronous SYCL exception during TRSM:\n" + << e.what() << std::endl + << "OpenCL status: " << e.get_cl_code() << std::endl; + } + + catch (const onemkl::backend_unsupported_exception& e) { + return test_skipped; + } + + catch (const std::runtime_error& error) { + std::cout << "Error raised during execution of TRSM:\n" << error.what() << std::endl; + } + + // Compare the results of reference implementation and DPC++ implementation. + + bool good = check_equal_trsm_matrix(B, B_ref, m, n, ldb, 10 * std::max(m, n), std::cout); + + return (int)good; +} + +class TrsmUsmTests : public ::testing::TestWithParam {}; + +TEST_P(TrsmUsmTests, RealSinglePrecision) { + float alpha(2.0); + EXPECT_TRUEORSKIP(test(GetParam(), onemkl::side::left, onemkl::uplo::lower, + onemkl::transpose::nontrans, onemkl::diag::unit, 72, 27, 101, 102, + alpha)); + EXPECT_TRUEORSKIP(test(GetParam(), onemkl::side::right, onemkl::uplo::lower, + onemkl::transpose::nontrans, onemkl::diag::unit, 72, 27, 101, 102, + alpha)); + EXPECT_TRUEORSKIP(test(GetParam(), onemkl::side::left, onemkl::uplo::upper, + onemkl::transpose::nontrans, onemkl::diag::unit, 72, 27, 101, 102, + alpha)); + EXPECT_TRUEORSKIP(test(GetParam(), onemkl::side::right, onemkl::uplo::upper, + onemkl::transpose::nontrans, onemkl::diag::unit, 72, 27, 101, 102, + alpha)); + EXPECT_TRUEORSKIP(test(GetParam(), onemkl::side::left, onemkl::uplo::lower, + onemkl::transpose::trans, onemkl::diag::unit, 72, 27, 101, 102, + alpha)); + EXPECT_TRUEORSKIP(test(GetParam(), onemkl::side::right, 
onemkl::uplo::lower, + onemkl::transpose::trans, onemkl::diag::unit, 72, 27, 101, 102, + alpha)); + EXPECT_TRUEORSKIP(test(GetParam(), onemkl::side::left, onemkl::uplo::upper, + onemkl::transpose::trans, onemkl::diag::unit, 72, 27, 101, 102, + alpha)); + EXPECT_TRUEORSKIP(test(GetParam(), onemkl::side::right, onemkl::uplo::upper, + onemkl::transpose::trans, onemkl::diag::unit, 72, 27, 101, 102, + alpha)); + EXPECT_TRUEORSKIP(test(GetParam(), onemkl::side::left, onemkl::uplo::lower, + onemkl::transpose::nontrans, onemkl::diag::nonunit, 72, 27, 101, + 102, alpha)); + EXPECT_TRUEORSKIP(test(GetParam(), onemkl::side::right, onemkl::uplo::lower, + onemkl::transpose::nontrans, onemkl::diag::nonunit, 72, 27, 101, + 102, alpha)); + EXPECT_TRUEORSKIP(test(GetParam(), onemkl::side::left, onemkl::uplo::upper, + onemkl::transpose::nontrans, onemkl::diag::nonunit, 72, 27, 101, + 102, alpha)); + EXPECT_TRUEORSKIP(test(GetParam(), onemkl::side::right, onemkl::uplo::upper, + onemkl::transpose::nontrans, onemkl::diag::nonunit, 72, 27, 101, + 102, alpha)); + EXPECT_TRUEORSKIP(test(GetParam(), onemkl::side::left, onemkl::uplo::lower, + onemkl::transpose::trans, onemkl::diag::nonunit, 72, 27, 101, 102, + alpha)); + EXPECT_TRUEORSKIP(test(GetParam(), onemkl::side::right, onemkl::uplo::lower, + onemkl::transpose::trans, onemkl::diag::nonunit, 72, 27, 101, 102, + alpha)); + EXPECT_TRUEORSKIP(test(GetParam(), onemkl::side::left, onemkl::uplo::upper, + onemkl::transpose::trans, onemkl::diag::nonunit, 72, 27, 101, 102, + alpha)); + EXPECT_TRUEORSKIP(test(GetParam(), onemkl::side::right, onemkl::uplo::upper, + onemkl::transpose::trans, onemkl::diag::nonunit, 72, 27, 101, 102, + alpha)); +} +TEST_P(TrsmUsmTests, RealDoublePrecision) { + double alpha(2.0); + EXPECT_TRUEORSKIP(test(GetParam(), onemkl::side::left, onemkl::uplo::lower, + onemkl::transpose::nontrans, onemkl::diag::unit, 72, 27, 101, + 102, alpha)); + EXPECT_TRUEORSKIP(test(GetParam(), onemkl::side::right, onemkl::uplo::lower, + 
onemkl::transpose::nontrans, onemkl::diag::unit, 72, 27, 101, + 102, alpha)); + EXPECT_TRUEORSKIP(test(GetParam(), onemkl::side::left, onemkl::uplo::upper, + onemkl::transpose::nontrans, onemkl::diag::unit, 72, 27, 101, + 102, alpha)); + EXPECT_TRUEORSKIP(test(GetParam(), onemkl::side::right, onemkl::uplo::upper, + onemkl::transpose::nontrans, onemkl::diag::unit, 72, 27, 101, + 102, alpha)); + EXPECT_TRUEORSKIP(test(GetParam(), onemkl::side::left, onemkl::uplo::lower, + onemkl::transpose::trans, onemkl::diag::unit, 72, 27, 101, 102, + alpha)); + EXPECT_TRUEORSKIP(test(GetParam(), onemkl::side::right, onemkl::uplo::lower, + onemkl::transpose::trans, onemkl::diag::unit, 72, 27, 101, 102, + alpha)); + EXPECT_TRUEORSKIP(test(GetParam(), onemkl::side::left, onemkl::uplo::upper, + onemkl::transpose::trans, onemkl::diag::unit, 72, 27, 101, 102, + alpha)); + EXPECT_TRUEORSKIP(test(GetParam(), onemkl::side::right, onemkl::uplo::upper, + onemkl::transpose::trans, onemkl::diag::unit, 72, 27, 101, 102, + alpha)); + EXPECT_TRUEORSKIP(test(GetParam(), onemkl::side::left, onemkl::uplo::lower, + onemkl::transpose::nontrans, onemkl::diag::nonunit, 72, 27, 101, + 102, alpha)); + EXPECT_TRUEORSKIP(test(GetParam(), onemkl::side::right, onemkl::uplo::lower, + onemkl::transpose::nontrans, onemkl::diag::nonunit, 72, 27, 101, + 102, alpha)); + EXPECT_TRUEORSKIP(test(GetParam(), onemkl::side::left, onemkl::uplo::upper, + onemkl::transpose::nontrans, onemkl::diag::nonunit, 72, 27, 101, + 102, alpha)); + EXPECT_TRUEORSKIP(test(GetParam(), onemkl::side::right, onemkl::uplo::upper, + onemkl::transpose::nontrans, onemkl::diag::nonunit, 72, 27, 101, + 102, alpha)); + EXPECT_TRUEORSKIP(test(GetParam(), onemkl::side::left, onemkl::uplo::lower, + onemkl::transpose::trans, onemkl::diag::nonunit, 72, 27, 101, + 102, alpha)); + EXPECT_TRUEORSKIP(test(GetParam(), onemkl::side::right, onemkl::uplo::lower, + onemkl::transpose::trans, onemkl::diag::nonunit, 72, 27, 101, + 102, alpha)); + 
EXPECT_TRUEORSKIP(test(GetParam(), onemkl::side::left, onemkl::uplo::upper, + onemkl::transpose::trans, onemkl::diag::nonunit, 72, 27, 101, + 102, alpha)); + EXPECT_TRUEORSKIP(test(GetParam(), onemkl::side::right, onemkl::uplo::upper, + onemkl::transpose::trans, onemkl::diag::nonunit, 72, 27, 101, + 102, alpha)); +} +TEST_P(TrsmUsmTests, ComplexSinglePrecision) { + std::complex alpha(2.0, -0.5); + EXPECT_TRUEORSKIP(test>(GetParam(), onemkl::side::left, onemkl::uplo::lower, + onemkl::transpose::nontrans, onemkl::diag::unit, 72, + 27, 101, 102, alpha)); + EXPECT_TRUEORSKIP(test>(GetParam(), onemkl::side::right, + onemkl::uplo::lower, onemkl::transpose::nontrans, + onemkl::diag::unit, 72, 27, 101, 102, alpha)); + EXPECT_TRUEORSKIP(test>(GetParam(), onemkl::side::left, onemkl::uplo::upper, + onemkl::transpose::nontrans, onemkl::diag::unit, 72, + 27, 101, 102, alpha)); + EXPECT_TRUEORSKIP(test>(GetParam(), onemkl::side::right, + onemkl::uplo::upper, onemkl::transpose::nontrans, + onemkl::diag::unit, 72, 27, 101, 102, alpha)); + EXPECT_TRUEORSKIP(test>(GetParam(), onemkl::side::left, onemkl::uplo::lower, + onemkl::transpose::trans, onemkl::diag::unit, 72, + 27, 101, 102, alpha)); + EXPECT_TRUEORSKIP(test>(GetParam(), onemkl::side::right, + onemkl::uplo::lower, onemkl::transpose::trans, + onemkl::diag::unit, 72, 27, 101, 102, alpha)); + EXPECT_TRUEORSKIP(test>(GetParam(), onemkl::side::left, onemkl::uplo::upper, + onemkl::transpose::trans, onemkl::diag::unit, 72, + 27, 101, 102, alpha)); + EXPECT_TRUEORSKIP(test>(GetParam(), onemkl::side::right, + onemkl::uplo::upper, onemkl::transpose::trans, + onemkl::diag::unit, 72, 27, 101, 102, alpha)); + EXPECT_TRUEORSKIP(test>(GetParam(), onemkl::side::left, onemkl::uplo::lower, + onemkl::transpose::conjtrans, onemkl::diag::unit, + 72, 27, 101, 102, alpha)); + EXPECT_TRUEORSKIP(test>(GetParam(), onemkl::side::right, + onemkl::uplo::lower, onemkl::transpose::conjtrans, + onemkl::diag::unit, 72, 27, 101, 102, alpha)); + 
EXPECT_TRUEORSKIP(test>(GetParam(), onemkl::side::left, onemkl::uplo::upper, + onemkl::transpose::conjtrans, onemkl::diag::unit, + 72, 27, 101, 102, alpha)); + EXPECT_TRUEORSKIP(test>(GetParam(), onemkl::side::right, + onemkl::uplo::upper, onemkl::transpose::conjtrans, + onemkl::diag::unit, 72, 27, 101, 102, alpha)); + EXPECT_TRUEORSKIP(test>(GetParam(), onemkl::side::left, onemkl::uplo::lower, + onemkl::transpose::nontrans, onemkl::diag::nonunit, + 72, 27, 101, 102, alpha)); + EXPECT_TRUEORSKIP(test>(GetParam(), onemkl::side::right, + onemkl::uplo::lower, onemkl::transpose::nontrans, + onemkl::diag::nonunit, 72, 27, 101, 102, alpha)); + EXPECT_TRUEORSKIP(test>(GetParam(), onemkl::side::left, onemkl::uplo::upper, + onemkl::transpose::nontrans, onemkl::diag::nonunit, + 72, 27, 101, 102, alpha)); + EXPECT_TRUEORSKIP(test>(GetParam(), onemkl::side::right, + onemkl::uplo::upper, onemkl::transpose::nontrans, + onemkl::diag::nonunit, 72, 27, 101, 102, alpha)); + EXPECT_TRUEORSKIP(test>(GetParam(), onemkl::side::left, onemkl::uplo::lower, + onemkl::transpose::trans, onemkl::diag::nonunit, 72, + 27, 101, 102, alpha)); + EXPECT_TRUEORSKIP(test>(GetParam(), onemkl::side::right, + onemkl::uplo::lower, onemkl::transpose::trans, + onemkl::diag::nonunit, 72, 27, 101, 102, alpha)); + EXPECT_TRUEORSKIP(test>(GetParam(), onemkl::side::left, onemkl::uplo::upper, + onemkl::transpose::trans, onemkl::diag::nonunit, 72, + 27, 101, 102, alpha)); + EXPECT_TRUEORSKIP(test>(GetParam(), onemkl::side::right, + onemkl::uplo::upper, onemkl::transpose::trans, + onemkl::diag::nonunit, 72, 27, 101, 102, alpha)); + EXPECT_TRUEORSKIP(test>(GetParam(), onemkl::side::left, onemkl::uplo::lower, + onemkl::transpose::conjtrans, onemkl::diag::nonunit, + 72, 27, 101, 102, alpha)); + EXPECT_TRUEORSKIP(test>(GetParam(), onemkl::side::right, + onemkl::uplo::lower, onemkl::transpose::conjtrans, + onemkl::diag::nonunit, 72, 27, 101, 102, alpha)); + EXPECT_TRUEORSKIP(test>(GetParam(), onemkl::side::left, 
onemkl::uplo::upper, + onemkl::transpose::conjtrans, onemkl::diag::nonunit, + 72, 27, 101, 102, alpha)); + EXPECT_TRUEORSKIP(test>(GetParam(), onemkl::side::right, + onemkl::uplo::upper, onemkl::transpose::conjtrans, + onemkl::diag::nonunit, 72, 27, 101, 102, alpha)); +} +TEST_P(TrsmUsmTests, ComplexDoublePrecision) { + std::complex alpha(2.0, -0.5); + EXPECT_TRUEORSKIP(test>(GetParam(), onemkl::side::left, + onemkl::uplo::lower, onemkl::transpose::nontrans, + onemkl::diag::unit, 72, 27, 101, 102, alpha)); + EXPECT_TRUEORSKIP(test>(GetParam(), onemkl::side::right, + onemkl::uplo::lower, onemkl::transpose::nontrans, + onemkl::diag::unit, 72, 27, 101, 102, alpha)); + EXPECT_TRUEORSKIP(test>(GetParam(), onemkl::side::left, + onemkl::uplo::upper, onemkl::transpose::nontrans, + onemkl::diag::unit, 72, 27, 101, 102, alpha)); + EXPECT_TRUEORSKIP(test>(GetParam(), onemkl::side::right, + onemkl::uplo::upper, onemkl::transpose::nontrans, + onemkl::diag::unit, 72, 27, 101, 102, alpha)); + EXPECT_TRUEORSKIP(test>(GetParam(), onemkl::side::left, + onemkl::uplo::lower, onemkl::transpose::trans, + onemkl::diag::unit, 72, 27, 101, 102, alpha)); + EXPECT_TRUEORSKIP(test>(GetParam(), onemkl::side::right, + onemkl::uplo::lower, onemkl::transpose::trans, + onemkl::diag::unit, 72, 27, 101, 102, alpha)); + EXPECT_TRUEORSKIP(test>(GetParam(), onemkl::side::left, + onemkl::uplo::upper, onemkl::transpose::trans, + onemkl::diag::unit, 72, 27, 101, 102, alpha)); + EXPECT_TRUEORSKIP(test>(GetParam(), onemkl::side::right, + onemkl::uplo::upper, onemkl::transpose::trans, + onemkl::diag::unit, 72, 27, 101, 102, alpha)); + EXPECT_TRUEORSKIP(test>(GetParam(), onemkl::side::left, + onemkl::uplo::lower, onemkl::transpose::conjtrans, + onemkl::diag::unit, 72, 27, 101, 102, alpha)); + EXPECT_TRUEORSKIP(test>(GetParam(), onemkl::side::right, + onemkl::uplo::lower, onemkl::transpose::conjtrans, + onemkl::diag::unit, 72, 27, 101, 102, alpha)); + EXPECT_TRUEORSKIP(test>(GetParam(), onemkl::side::left, + 
onemkl::uplo::upper, onemkl::transpose::conjtrans, + onemkl::diag::unit, 72, 27, 101, 102, alpha)); + EXPECT_TRUEORSKIP(test>(GetParam(), onemkl::side::right, + onemkl::uplo::upper, onemkl::transpose::conjtrans, + onemkl::diag::unit, 72, 27, 101, 102, alpha)); + EXPECT_TRUEORSKIP(test>(GetParam(), onemkl::side::left, + onemkl::uplo::lower, onemkl::transpose::nontrans, + onemkl::diag::nonunit, 72, 27, 101, 102, alpha)); + EXPECT_TRUEORSKIP(test>(GetParam(), onemkl::side::right, + onemkl::uplo::lower, onemkl::transpose::nontrans, + onemkl::diag::nonunit, 72, 27, 101, 102, alpha)); + EXPECT_TRUEORSKIP(test>(GetParam(), onemkl::side::left, + onemkl::uplo::upper, onemkl::transpose::nontrans, + onemkl::diag::nonunit, 72, 27, 101, 102, alpha)); + EXPECT_TRUEORSKIP(test>(GetParam(), onemkl::side::right, + onemkl::uplo::upper, onemkl::transpose::nontrans, + onemkl::diag::nonunit, 72, 27, 101, 102, alpha)); + EXPECT_TRUEORSKIP(test>(GetParam(), onemkl::side::left, + onemkl::uplo::lower, onemkl::transpose::trans, + onemkl::diag::nonunit, 72, 27, 101, 102, alpha)); + EXPECT_TRUEORSKIP(test>(GetParam(), onemkl::side::right, + onemkl::uplo::lower, onemkl::transpose::trans, + onemkl::diag::nonunit, 72, 27, 101, 102, alpha)); + EXPECT_TRUEORSKIP(test>(GetParam(), onemkl::side::left, + onemkl::uplo::upper, onemkl::transpose::trans, + onemkl::diag::nonunit, 72, 27, 101, 102, alpha)); + EXPECT_TRUEORSKIP(test>(GetParam(), onemkl::side::right, + onemkl::uplo::upper, onemkl::transpose::trans, + onemkl::diag::nonunit, 72, 27, 101, 102, alpha)); + EXPECT_TRUEORSKIP(test>(GetParam(), onemkl::side::left, + onemkl::uplo::lower, onemkl::transpose::conjtrans, + onemkl::diag::nonunit, 72, 27, 101, 102, alpha)); + EXPECT_TRUEORSKIP(test>(GetParam(), onemkl::side::right, + onemkl::uplo::lower, onemkl::transpose::conjtrans, + onemkl::diag::nonunit, 72, 27, 101, 102, alpha)); + EXPECT_TRUEORSKIP(test>(GetParam(), onemkl::side::left, + onemkl::uplo::upper, onemkl::transpose::conjtrans, + 
onemkl::diag::nonunit, 72, 27, 101, 102, alpha)); + EXPECT_TRUEORSKIP(test>(GetParam(), onemkl::side::right, + onemkl::uplo::upper, onemkl::transpose::conjtrans, + onemkl::diag::nonunit, 72, 27, 101, 102, alpha)); +} + +INSTANTIATE_TEST_SUITE_P(TrsmUsmTestSuite, TrsmUsmTests, ::testing::ValuesIn(devices), + ::DeviceNamePrint()); + +} // anonymous namespace diff --git a/tests/unit_tests/include/test_helper.hpp b/tests/unit_tests/include/test_helper.hpp index 442dd712c..4e72d6ece 100644 --- a/tests/unit_tests/include/test_helper.hpp +++ b/tests/unit_tests/include/test_helper.hpp @@ -29,6 +29,19 @@ #include #endif +#define test_failed 0 +#define test_passed 1 +#define test_skipped 2 + +#define EXPECT_TRUEORSKIP(a) \ + do { \ + int res = a; \ + if (res == test_skipped) \ + GTEST_SKIP(); \ + else \ + EXPECT_EQ(res, test_passed); \ + } while (0); + #ifdef ENABLE_MKLCPU_BACKEND #define TEST_RUN_INTELCPU(q, func, args) \ func args @@ -95,6 +108,33 @@ static inline void aligned_free(void *p) { ::free(p); #endif } + +/* Support for Unified Shared Memory allocations for different backends */ +static inline void *malloc_shared(size_t align, size_t size, cl::sycl::device dev, + cl::sycl::context ctx) { +#ifdef _WIN64 + return cl::sycl::malloc_shared(size, dev, ctx); +#else + #ifdef ENABLE_CUBLAS_BACKEND + return ::aligned_alloc(align, size); + #else + return cl::sycl::malloc_shared(size, dev, ctx); + #endif +#endif +} + +static inline void free_shared(void *p, cl::sycl::context ctx) { +#ifdef _WIN64 + cl::sycl::free(p, ctx); +#else + #ifdef ENABLE_CUBLAS_BACKEND + ::free(p); + #else + cl::sycl::free(p, ctx); + #endif +#endif +} + } // namespace onemkl #endif // _TEST_HELPER_HPP_ From 549da82131f066dd117e6470224a101e8da4affb Mon Sep 17 00:00:00 2001 From: "Meterelliyoz, Mesut" Date: Wed, 27 May 2020 23:32:52 -0700 Subject: [PATCH 2/5] Fix missing symbol in windows --- .../mklgpu/mkl_internal_blas_gpu_wrappers.cpp | 16 +++---- .../mklgpu/mkl_internal_blas_sycl_gpu.hpp | 42 
+++++++++---------- 2 files changed, 29 insertions(+), 29 deletions(-) diff --git a/src/blas/backends/mklgpu/mkl_internal_blas_gpu_wrappers.cpp b/src/blas/backends/mklgpu/mkl_internal_blas_gpu_wrappers.cpp index ea02a5cc1..95bd9b73c 100644 --- a/src/blas/backends/mklgpu/mkl_internal_blas_gpu_wrappers.cpp +++ b/src/blas/backends/mklgpu/mkl_internal_blas_gpu_wrappers.cpp @@ -2559,8 +2559,8 @@ cl::sycl::event axpy_batch(cl::sycl::queue &queue, std::int64_t *n, float *alpha std::int64_t total_group_size = 0; for (std::int64_t i = 0; i < group_count; i++) { cl::sycl::event *axpy_batch_event = new cl::sycl::event( - mkl::gpu::saxpy_batch(queue, n[i], alpha[i], x, incx[i], y, incy[i], group_size[i], - total_group_size, dependencies)); + mkl::gpu::saxpy_batch_sycl(&queue, n[i], alpha[i], x, incx[i], y, incy[i], + group_size[i], total_group_size, dependencies)); coalesced_events.push_back(axpy_batch_event); total_group_size += group_size[i]; } @@ -2576,8 +2576,8 @@ cl::sycl::event axpy_batch(cl::sycl::queue &queue, std::int64_t *n, double *alph std::int64_t total_group_size = 0; for (std::int64_t i = 0; i < group_count; i++) { cl::sycl::event *axpy_batch_event = new cl::sycl::event( - mkl::gpu::daxpy_batch(queue, n[i], alpha[i], x, incx[i], y, incy[i], group_size[i], - total_group_size, dependencies)); + mkl::gpu::daxpy_batch_sycl(&queue, n[i], alpha[i], x, incx[i], y, incy[i], + group_size[i], total_group_size, dependencies)); coalesced_events.push_back(axpy_batch_event); total_group_size += group_size[i]; } @@ -2594,8 +2594,8 @@ cl::sycl::event axpy_batch(cl::sycl::queue &queue, std::int64_t *n, std::complex std::int64_t total_group_size = 0; for (std::int64_t i = 0; i < group_count; i++) { cl::sycl::event *axpy_batch_event = new cl::sycl::event( - mkl::gpu::caxpy_batch(queue, n[i], alpha[i], x, incx[i], y, incy[i], group_size[i], - total_group_size, dependencies)); + mkl::gpu::caxpy_batch_sycl(&queue, n[i], alpha[i], x, incx[i], y, incy[i], + group_size[i], 
total_group_size, dependencies)); coalesced_events.push_back(axpy_batch_event); total_group_size += group_size[i]; } @@ -2612,8 +2612,8 @@ cl::sycl::event axpy_batch(cl::sycl::queue &queue, std::int64_t *n, std::complex std::int64_t total_group_size = 0; for (std::int64_t i = 0; i < group_count; i++) { cl::sycl::event *axpy_batch_event = new cl::sycl::event( - mkl::gpu::zaxpy_batch(queue, n[i], alpha[i], x, incx[i], y, incy[i], group_size[i], - total_group_size, dependencies)); + mkl::gpu::zaxpy_batch_sycl(&queue, n[i], alpha[i], x, incx[i], y, incy[i], + group_size[i], total_group_size, dependencies)); coalesced_events.push_back(axpy_batch_event); total_group_size += group_size[i]; } diff --git a/src/blas/backends/mklgpu/mkl_internal_blas_sycl_gpu.hpp b/src/blas/backends/mklgpu/mkl_internal_blas_sycl_gpu.hpp index 6a68a73a0..33ea034ee 100644 --- a/src/blas/backends/mklgpu/mkl_internal_blas_sycl_gpu.hpp +++ b/src/blas/backends/mklgpu/mkl_internal_blas_sycl_gpu.hpp @@ -1575,27 +1575,27 @@ cl::sycl::event zgemm_batch(cl::sycl::queue &queue, MKL_TRANSPOSE transa, MKL_TR int64_t group_size, const cl::sycl::vector_class &dependencies); -cl::sycl::event saxpy_batch(cl::sycl::queue &queue, std::int64_t n, float alpha, const float **x, - std::int64_t incx, float **y, std::int64_t incy, - std::int64_t batch_size, std::int64_t offset, - const cl::sycl::vector_class &dependencies); - -cl::sycl::event daxpy_batch(cl::sycl::queue &queue, std::int64_t n, double alpha, const double **x, - std::int64_t incx, double **y, std::int64_t incy, - std::int64_t batch_size, std::int64_t offset, - const cl::sycl::vector_class &dependencies); - -cl::sycl::event caxpy_batch(cl::sycl::queue &queue, std::int64_t n, std::complex alpha, - const std::complex **x, std::int64_t incx, - std::complex **y, std::int64_t incy, std::int64_t batch_size, - std::int64_t offset, - const cl::sycl::vector_class &dependencies); - -cl::sycl::event zaxpy_batch(cl::sycl::queue &queue, std::int64_t n, std::complex 
alpha, - const std::complex **x, std::int64_t incx, - std::complex **y, std::int64_t incy, std::int64_t batch_size, - std::int64_t offset, - const cl::sycl::vector_class &dependencies); +cl::sycl::event saxpy_batch_sycl(cl::sycl::queue *queue, std::int64_t n, float alpha, + const float **x, std::int64_t incx, float **y, std::int64_t incy, + std::int64_t batch_size, std::int64_t offset, + const cl::sycl::vector_class &dependencies); + +cl::sycl::event daxpy_batch_sycl(cl::sycl::queue *queue, std::int64_t n, double alpha, + const double **x, std::int64_t incx, double **y, std::int64_t incy, + std::int64_t batch_size, std::int64_t offset, + const cl::sycl::vector_class &dependencies); + +cl::sycl::event caxpy_batch_sycl(cl::sycl::queue *queue, std::int64_t n, std::complex alpha, + const std::complex **x, std::int64_t incx, + std::complex **y, std::int64_t incy, + std::int64_t batch_size, std::int64_t offset, + const cl::sycl::vector_class &dependencies); + +cl::sycl::event zaxpy_batch_sycl(cl::sycl::queue *queue, std::int64_t n, std::complex alpha, + const std::complex **x, std::int64_t incx, + std::complex **y, std::int64_t incy, + std::int64_t batch_size, std::int64_t offset, + const cl::sycl::vector_class &dependencies); cl::sycl::event sgemmt_sycl(cl::sycl::queue *queue, MKL_UPLO upper_lower, MKL_TRANSPOSE transa, MKL_TRANSPOSE transb, int64_t n, int64_t k, float alpha, const float *a, From f4f856cd5f40494aeba40b552eb86ec64fb27e5e Mon Sep 17 00:00:00 2001 From: "Meterelliyoz, Mesut" Date: Thu, 28 May 2020 15:06:42 -0700 Subject: [PATCH 3/5] Fix link line for mklgpu backend --- cmake/FindMKL.cmake | 2 +- src/blas/backends/mklgpu/CMakeLists.txt | 1 + 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/cmake/FindMKL.cmake b/cmake/FindMKL.cmake index 9a210fba7..9f9e0316c 100644 --- a/cmake/FindMKL.cmake +++ b/cmake/FindMKL.cmake @@ -85,7 +85,7 @@ if (ENABLE_MKLCPU_BACKEND OR ENABLE_MKLGPU_BACKEND) list(APPEND MKL_LINK_C ${TBB_LINK}) endif() 
if(ENABLE_MKLGPU_BACKEND) - set(MKL_LINK_SYCL ${MKL_LINK_PREFIX} ${LIB_PREFIX}${MKL_SYCL} ${MKL_LINK_C} ${LIB_PREFIX}${OPENCL_LIBNAME} ${SYCL_LINK_FLAGS}) + set(MKL_LINK_SYCL ${SYCL_LINK_FLAGS} ${MKL_LINK_PREFIX} ${LIB_PREFIX}${MKL_SYCL} ${MKL_LINK_C} ${LIB_PREFIX}${OPENCL_LIBNAME} ) endif() endif() diff --git a/src/blas/backends/mklgpu/CMakeLists.txt b/src/blas/backends/mklgpu/CMakeLists.txt index f2f35c831..07fd972ab 100644 --- a/src/blas/backends/mklgpu/CMakeLists.txt +++ b/src/blas/backends/mklgpu/CMakeLists.txt @@ -45,6 +45,7 @@ set_target_properties(${LIB_OBJ} PROPERTIES ) target_link_libraries(${LIB_NAME} PUBLIC ${LIB_OBJ}) +#Set libraries as not transitive for dynamic if(BUILD_SHARED_LIBS) set_target_properties(${LIB_NAME} PROPERTIES INTERFACE_LINK_LIBRARIES ONEMKL::SYCL::SYCL From a58b6e98ce38ea0e90e2f361ac8982ba3ec26f61 Mon Sep 17 00:00:00 2001 From: "Meterelliyoz, Mesut" Date: Thu, 28 May 2020 15:25:57 -0700 Subject: [PATCH 4/5] Update sycl linker --- cmake/FindCompiler.cmake | 5 +++-- cmake/FindMKL.cmake | 3 +-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/cmake/FindCompiler.cmake b/cmake/FindCompiler.cmake index e9125af8f..2f5013e24 100644 --- a/cmake/FindCompiler.cmake +++ b/cmake/FindCompiler.cmake @@ -35,6 +35,7 @@ if(SYCL_FOUND AND is_dpcpp) add_library(ONEMKL::SYCL::SYCL INTERFACE IMPORTED) set_target_properties(ONEMKL::SYCL::SYCL PROPERTIES - INTERFACE_COMPILE_OPTIONS "-fsycl" - INTERFACE_LINK_LIBRARIES ${SYCL_LIBRARY}) + INTERFACE_COMPILE_OPTIONS "-fsycl" + INTERFACE_LINK_OPTIONS "-fsycl" + INTERFACE_LINK_LIBRARIES ${SYCL_LIBRARY}) endif() diff --git a/cmake/FindMKL.cmake b/cmake/FindMKL.cmake index 9f9e0316c..a358dd0c0 100644 --- a/cmake/FindMKL.cmake +++ b/cmake/FindMKL.cmake @@ -66,7 +66,6 @@ if(UNIX) list(APPEND MKL_LINK_PREFIX "-L${MKL_LIB_DIR}") set(LIB_PREFIX "-l") set(OPENCL_LIBNAME "OpenCL") - set(SYCL_LINK_FLAGS "-fsycl") else() if(${BUILD_SHARED_LIBS}) set(MKL_COPT ${MKL_COPT} "-Donemkl_EXPORTS") @@ -85,7 +84,7 @@ if 
(ENABLE_MKLCPU_BACKEND OR ENABLE_MKLGPU_BACKEND) list(APPEND MKL_LINK_C ${TBB_LINK}) endif() if(ENABLE_MKLGPU_BACKEND) - set(MKL_LINK_SYCL ${SYCL_LINK_FLAGS} ${MKL_LINK_PREFIX} ${LIB_PREFIX}${MKL_SYCL} ${MKL_LINK_C} ${LIB_PREFIX}${OPENCL_LIBNAME} ) + set(MKL_LINK_SYCL ${MKL_LINK_PREFIX} ${LIB_PREFIX}${MKL_SYCL} ${MKL_LINK_C} ${LIB_PREFIX}${OPENCL_LIBNAME} ) endif() endif() From ebb4d6a6efff8d122eec01b3a19842042767935f Mon Sep 17 00:00:00 2001 From: "Meterelliyoz, Mesut" Date: Thu, 28 May 2020 21:33:06 -0700 Subject: [PATCH 5/5] Remove warnings due to fsycl in Windows --- cmake/FindCompiler.cmake | 15 +++++++++++---- 1 file changed, 11 insertions(+), 4 deletions(-) diff --git a/cmake/FindCompiler.cmake b/cmake/FindCompiler.cmake index 2f5013e24..ab0504af8 100644 --- a/cmake/FindCompiler.cmake +++ b/cmake/FindCompiler.cmake @@ -34,8 +34,15 @@ if(SYCL_FOUND AND is_dpcpp) find_library(SYCL_LIBRARY NAMES sycl PATHS "${SYCL_BINARY_DIR}/../lib") add_library(ONEMKL::SYCL::SYCL INTERFACE IMPORTED) - set_target_properties(ONEMKL::SYCL::SYCL PROPERTIES - INTERFACE_COMPILE_OPTIONS "-fsycl" - INTERFACE_LINK_OPTIONS "-fsycl" - INTERFACE_LINK_LIBRARIES ${SYCL_LIBRARY}) + if(UNIX) + set_target_properties(ONEMKL::SYCL::SYCL PROPERTIES + INTERFACE_COMPILE_OPTIONS "-fsycl" + INTERFACE_LINK_OPTIONS "-fsycl" + INTERFACE_LINK_LIBRARIES ${SYCL_LIBRARY}) + else() + set_target_properties(ONEMKL::SYCL::SYCL PROPERTIES + INTERFACE_COMPILE_OPTIONS "-fsycl" + INTERFACE_LINK_LIBRARIES ${SYCL_LIBRARY}) + endif() + endif()